support sweeping multiline option in AutoML #5148

Merged: 9 commits, May 21, 2020
3 changes: 3 additions & 0 deletions src/Microsoft.ML.AutoML/ColumnInference/ColumnInferenceApi.cs
@@ -55,6 +55,7 @@ public static ColumnInferenceResults InferColumns(MLContext context, string path
Separators = new[] { splitInference.Separator.Value },
AllowSparse = splitInference.AllowSparse,
AllowQuoting = splitInference.AllowQuote,
ReadMultilines = splitInference.ReadMultilines,
HasHeader = hasHeader,
TrimWhitespace = trimWhitespace
};
@@ -91,6 +92,7 @@ public static ColumnInferenceResults InferColumns(MLContext context, string path
AllowQuoting = splitInference.AllowQuote,
AllowSparse = splitInference.AllowSparse,
Separators = new char[] { splitInference.Separator.Value },
ReadMultilines = splitInference.ReadMultilines,
HasHeader = hasHeader,
TrimWhitespace = trimWhitespace
};
@@ -139,6 +141,7 @@ private static ColumnTypeInference.InferenceResult InferColumnTypes(MLContext co
Separator = splitInference.Separator.Value,
AllowSparse = splitInference.AllowSparse,
AllowQuote = splitInference.AllowQuote,
ReadMultilines = splitInference.ReadMultilines,
HasHeader = hasHeader,
LabelColumnIndex = labelColumnIndex,
Label = label
@@ -32,6 +32,7 @@ internal sealed class Arguments
public int MaxRowsToRead;
public uint? LabelColumnIndex;
public string Label;
public bool ReadMultilines;

public Arguments()
{
@@ -262,6 +263,7 @@ private static InferenceResult InferTextFileColumnTypesCore(MLContext context, I
Separators = new[] { args.Separator },
AllowSparse = args.AllowSparse,
AllowQuoting = args.AllowQuote,
ReadMultilines = args.ReadMultilines,
};
var textLoader = context.Data.CreateTextLoader(textLoaderOptions);
var idv = textLoader.Load(fileSource);
15 changes: 10 additions & 5 deletions src/Microsoft.ML.AutoML/ColumnInference/TextFileContents.cs
@@ -23,14 +23,16 @@ public class ColumnSplitResult

public bool AllowQuote { get; set; }
public bool AllowSparse { get; set; }
public bool ReadMultilines { get; set; }

public ColumnSplitResult(bool isSuccess, char? separator, bool allowQuote, bool allowSparse, int columnCount)
public ColumnSplitResult(bool isSuccess, char? separator, bool allowQuote, bool readMultilines, bool allowSparse, int columnCount)
Contributor:

Following the naming convention, this would be:

Suggested change
public ColumnSplitResult(bool isSuccess, char? separator, bool allowQuote, bool readMultilines, bool allowSparse, int columnCount)
public ColumnSplitResult(bool isSuccess, char? separator, bool allowQuote, bool allowMultiline, bool allowSparse, int columnCount)

Contributor Author:

TextLoader.Options uses ReadMultilines (as it does allowQuote and allowSparse), and we'd better follow its naming style.

Contributor Author:

Resolved

{
IsSuccess = isSuccess;
Separator = separator;
AllowQuote = allowQuote;
AllowSparse = allowSparse;
ColumnCount = columnCount;
ReadMultilines = readMultilines;
}
}

@@ -50,12 +52,14 @@ public static ColumnSplitResult TrySplitColumns(MLContext context, IMultiStreamS
{
var sparse = new[] { false, true };
var quote = new[] { true, false };
var tryMultiline = new[] { false, true };
var foundAny = false;
var result = default(ColumnSplitResult);
foreach (var perm in (from _allowSparse in sparse
from _allowQuote in quote
from _sep in separatorCandidates
select new { _allowSparse, _allowQuote, _sep }))
from _tryMultiline in tryMultiline
Contributor:

The naming pattern would be:

Suggested change
from _tryMultiline in tryMultiline
from _allowMultiline in multiline

select new { _allowSparse, _allowQuote, _sep, _tryMultiline }))
{
var options = new TextLoader.Options
{
@@ -66,7 +70,8 @@ from _sep in separatorCandidates
} },
Separators = new[] { perm._sep },
AllowQuoting = perm._allowQuote,
AllowSparse = perm._allowSparse
AllowSparse = perm._allowSparse,
ReadMultilines = perm._tryMultiline,
};

if (TryParseFile(context, options, source, out result))
Expand All @@ -75,7 +80,7 @@ from _sep in separatorCandidates
break;
}
}
return foundAny ? result : new ColumnSplitResult(false, null, true, true, 0);
return foundAny ? result : new ColumnSplitResult(false, null, true, true, true, 0);
}

private static bool TryParseFile(MLContext context, TextLoader.Options options, IMultiStreamSource source,
@@ -111,7 +116,7 @@ private static bool TryParseFile(MLContext context, TextLoader.Options options,
// disallow single-column case
if (mostCommon.Key <= 1) { return false; }

result = new ColumnSplitResult(true, options.Separators.First(), options.AllowQuoting, options.AllowSparse, mostCommon.Key);
result = new ColumnSplitResult(true, options.Separators.First(), options.AllowQuoting, options.ReadMultilines, options.AllowSparse, mostCommon.Key);
return true;
}
// fail gracefully if unable to instantiate data view with swept arguments
37 changes: 36 additions & 1 deletion test/Microsoft.ML.AutoML.Tests/ColumnInferenceTests.cs
@@ -2,6 +2,7 @@
using System.Collections.Generic;
using System.IO;
using System.Linq;
using FluentAssertions;
using Microsoft.ML.Data;
using Microsoft.ML.TestFramework;
using Xunit;
@@ -186,5 +187,39 @@ public void InferColumnsColumnInfoParam()
Assert.Equal(DefaultColumnNames.Features, result.ColumnInformation.NumericColumnNames.First());
Assert.Null(result.ColumnInformation.ExampleWeightColumnName);
}

[Fact]
public void TrySplitColumns_should_split_on_dataset_with_newline_between_double_quotes()
{
var context = new MLContext();
var dataset = Path.Combine("TestData", "DatasetWithNewlineBetweenQuotes.txt");
var sample = TextFileSample.CreateFromFullFile(dataset);
var result = TextFileContents.TrySplitColumns(context, sample, TextFileContents.DefaultSeparators);

result.ColumnCount.Should().Be(4);
result.Separator.Should().Be(',');
result.IsSuccess.Should().BeTrue();
}

[Fact]
public void InferColumnsFromMultilineInputFile()
{
// Check if we can infer the column information
// from an input file which has escaped newlines inside quotes
var dataPath = GetDataPath("multiline.csv");
MLContext mlContext = new MLContext();
var inputColumnInformation = new ColumnInformation();
inputColumnInformation.LabelColumnName = @"id";
var result = mlContext.Auto().InferColumns(dataPath, inputColumnInformation);

// File has 3 columns: "id", "description" and "animal"
Assert.NotNull(result.ColumnInformation.LabelColumnName);
Assert.Equal(1, result.ColumnInformation.TextColumnNames.Count);
Assert.Equal(1, result.ColumnInformation.CategoricalColumnNames.Count);

Assert.Equal("id", result.ColumnInformation.LabelColumnName);
Assert.Equal("description", result.ColumnInformation.TextColumnNames.First());
Assert.Equal("animal", result.ColumnInformation.CategoricalColumnNames.First());
}
}
}
}
@@ -7,10 +7,14 @@
</ItemGroup>

<ItemGroup>
<PackageReference Include="FluentAssertions" Version="5.10.3" />
<PackageReference Include="SciSharp.TensorFlow.Redist" Version="$(TensorFlowVersion)" />
</ItemGroup>

<ItemGroup>
<None Update="TestData\DatasetWithNewlineBetweenQuotes.txt">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Update="TestData\DatasetWithDefaultColumnNames.txt">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
@@ -0,0 +1,36 @@
id,Column1,Column2,Column3
1,this is a description, 1,2
2,"this is a quote description",1,2
Comment on lines +1 to +3

Member:

(pinned to unrelated code to allow threaded discussion)

@harishsk and I were under the impression that, in order for ModelBuilder to support the new readMultilines option, some changes might be needed in the CodeGenerator, since the code generated by ModelBuilder should also include readMultilines = true whenever it was found necessary to use that parameter.

Are these changes not necessary?

Contributor Author:

Yes and no... The CodeGen project should only refer to ML.NET packages that are already published; that's why we want to pin CodeGen to a specific version of ML.NET, to avoid API changes breaking it. So from that standpoint we would need to update CodeGen. But we can't change CodeGen before the release, because if we updated it now, it would need to refer to a nightly build of ML.NET to generate projects, which is not available externally.

My suggestion is to not update CodeGen this time, because it won't affect much. The generated project uses the LoadFromTextFile API with a user-defined ModelInput class, which should work even if there are newlines between quotes, if I recall correctly.

Member:

I see, then it's ok.

"The generated project uses the LoadFromTextFile API with a user-defined ModelInput class, which should work even if there are newlines between quotes, if I recall correctly."

Using the generic LoadFromTextFile API you mention will still use the default value for readMultilines, which is false. It would need to be set explicitly to true where necessary.

Notice that until recently LoadFromTextFile only accepted a fixed set of parameters in its signature, and I couldn't add the new readMultilines parameter there because that would be a breaking API change (which we are not allowed to do until ML.NET version 2). So instead, what I did was add a new LoadFromTextFile(path, options) overload that accepts a TextLoader.Options object (see #5134).

So if users want to use the generic LoadFromTextFile<> and also use the new options (such as readMultilines, or the ones being introduced in #5147 or #5145), they will have to use the new public overload. In this sense, I think the CodeGenerator should also start using this overload to have access to the new options.
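
To make that concrete, here is a minimal sketch of a loading call that goes through the options-based overload described above. The ModelInput class and its column layout are hypothetical (they are not part of this PR); the overload and the AllowQuoting/ReadMultilines flags are the ones discussed here.

using Microsoft.ML;
using Microsoft.ML.Data;

// Hypothetical input schema for a file shaped like multiline.csv; column indices are illustrative.
public class ModelInput
{
    [LoadColumn(0)] public float Id { get; set; }
    [LoadColumn(1)] public string Description { get; set; }
    [LoadColumn(2)] public string Animal { get; set; }
}

public static class Program
{
    public static void Main()
    {
        var mlContext = new MLContext();

        // The options-based overload (see #5134) is what exposes the newer loader flags.
        var loaderOptions = new TextLoader.Options
        {
            Separators = new[] { ',' },
            HasHeader = true,
            AllowQuoting = true,
            ReadMultilines = true   // quoted fields may span multiple lines
        };

        IDataView data = mlContext.Data.LoadFromTextFile<ModelInput>("multiline.csv", loaderOptions);
    }
}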

Contributor (@justinormont, May 20, 2020):

Unless I'm missing part of it, if the CLI / Model Builder calls this new version of AutoML, you will have to update CodeGen.

Otherwise the multi-line files will be read by the AutoML code, but the CodeGen'd code will not read the user's file.
(update: I see @antoniovs1029 added a comment before me which also explains this)

"The CodeGen project should only refer to ML.NET packages that are already published."

All of Microsoft.ML.* should be self-consistent with each other. No package pinning needed. Almost all changes to the AutoML code should have a mirroring change in the CodeGen code.

When I import [email protected] into my project, I'm expecting it to reference [email protected] & [email protected]. Let's say my project has [email protected] to do prediction, and then I want to call CodeGen (yes, only an internal call currently). Then my project needs both [email protected] and [email protected], causing a version conflict. Consistent releases are better.

"But we can't change CodeGen before the release, because if we updated it now, it would need to refer to a nightly build of ML.NET to generate projects, which is not available externally."

What's stopping CodeGen from being updated before the release? Why would a change to CodeGen cause CLI/ModelBuilder to depend on a nightly build instead of the released version?

Contributor Author (@LittleLittleCloud, May 21, 2020):

"What's stopping CodeGen from being updated before the release? Why would a change to CodeGen cause CLI/ModelBuilder to depend on a nightly build instead of the released version?"

It's not that CLI/ModelBuilder depends on a nightly build; in short, it's that the project generated by CLI/ModelBuilder needs to depend on a released build of ML.NET.

"All of Microsoft.ML.* should be self-consistent with each other. No package pinning needed. Almost all changes to the AutoML code should have a mirroring change in the CodeGen code."

I can hardly agree: there's no guarantee that every change in AutoML/ML.NET will be released, while projects generated by CodeGen should refer to a released version of ML.NET. If we mirror every change in AutoML/ML.NET into CodeGen, CodeGen will be left in an unusable state between releases, because the ML.NET/AutoML that CodeGen refers to differs from what its generated project refers to.

"When I import [email protected] into my project, I'm expecting it to reference [email protected] & [email protected]. Let's say my project has [email protected] to do prediction, and then I want to call CodeGen (yes, only an internal call currently). Then my project needs both [email protected] and [email protected], causing a version conflict. Consistent releases are better."

I don't see how that's related: if you want to call CodeGen, you need to call the CodeGen that is pinned to [email protected] instead of [email protected].

In fact, if we can make sure that before each release CodeGen is the last package to update, and that between two releases we make no changes to CodeGen, then consistent releases work. But if between two releases there is any breaking API change, then we both must and must not update the CodeGenerator, for the reason I mention above: either changing or not changing CodeGen leaves it in a broken state.

One way out of this dilemma is to pin CodeGen to a released version, but that breaks consistent releases and we would need to publish twice per release cycle: once for the ML.NET packages and once for CodeGen.

Another solution is to make sure ModelBuilder pins to a released version of AutoML/CodeGen, but that means between two releases we can't fix bugs caused by CodeGen, and again there's no guarantee.

Meanwhile, we have the same dilemma for AutoML, but unlike CodeGen we can use an internal build of AutoML, and most of its bugs are not breaking bugs, so that's not a big deal.

A few more words, sorry to be verbose here. I don't see why CodeGen has to refer to AutoML/ML.NET at all; the only relationship between CodeGen and AutoML is the Pipeline class (maybe also the TextLoaderOptions class), which should be defined in CodeGen as a contract class. On the contrary, AutoML should refer to CodeGen to tell it how to generate projects. By doing that we can make CodeGen independent of ML.NET and resolve the dilemma we have now.
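
To make that last suggestion concrete, here is a purely hypothetical sketch of what such contract types might look like if they were owned by CodeGen; none of these type names exist in the repository, they only illustrate the direction being proposed.

using System.Collections.Generic;

// Hypothetical contract types owned by CodeGen. AutoML would populate these and hand
// them to CodeGen, instead of CodeGen referencing AutoML/ML.NET types directly.
public sealed class TextLoaderOptionsContract
{
    public char Separator { get; set; }
    public bool HasHeader { get; set; }
    public bool AllowQuoting { get; set; }
    public bool ReadMultilines { get; set; }   // the option being swept in this PR
}

public sealed class PipelineNodeContract
{
    public string EstimatorName { get; set; }  // e.g. "OneHotEncoding"
    public IDictionary<string, object> Properties { get; set; } = new Dictionary<string, object>();
}

public sealed class PipelineContract
{
    public TextLoaderOptionsContract LoaderOptions { get; set; }
    public IList<PipelineNodeContract> Nodes { get; set; } = new List<PipelineNodeContract>();
}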

3,"this is a quote description with double quote("")",1,2
4,"this is a quote description with ""a pair of double quote""",1,2
5,"this is a quote description with new line
quote",1,2
6,"this is a quote description with
new line1 and
new line2 and empty line

and double quote""",1,2
7, this is a description with single quote("),1,2
// empty line between quotes
8,"",1,2
// single quote between quotes
9,"""",1,2
// simply newline between quotes
10,"



",1,2
// simply single quote and newline between quotes
11,"

""""

""

""

",1,2



Contributor Author:

Resolved

Contributor:

There are still some missing, like:

public static ColumnInferenceResults InferColumns(MLContext context, string path, string labelColumn,
char? separatorChar, bool? allowQuotedStrings, bool? supportSparse, bool trimWhitespace, bool groupColumns)

Contributor Author:

I prefer not exposing the readMultiline option in these APIs; simply sweeping it in Infersplit should be enough.
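
For context on why the sweep alone may suffice, here is a minimal sketch of the consumer-side flow, using the file name and label column from the tests above and assuming (as elsewhere in AutoML) that the inferred settings are exposed through ColumnInferenceResults.TextLoaderOptions: the swept ReadMultilines value surfaces through the inferred loader options rather than through the InferColumns signature.

using System;
using Microsoft.ML;
using Microsoft.ML.AutoML;

public static class InferenceSketch
{
    public static void Run()
    {
        var mlContext = new MLContext();

        // Column/split inference sweeps AllowQuoting, AllowSparse and, with this PR, ReadMultilines.
        var columnInfo = new ColumnInformation { LabelColumnName = "id" };
        ColumnInferenceResults inference = mlContext.Auto().InferColumns("multiline.csv", columnInfo);

        // Whatever combination the sweep settled on is carried in the inferred loader options...
        Console.WriteLine($"ReadMultilines inferred as: {inference.TextLoaderOptions.ReadMultilines}");

        // ...so the caller loads the data with those options and needs no explicit multiline switch.
        var loader = mlContext.Data.CreateTextLoader(inference.TextLoaderOptions);
        IDataView data = loader.Load("multiline.csv");
    }
}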