Skip to content

Added decimal marker option in TextLoader #5145

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
May 22, 2020
Merged
Prev Previous commit
Next Next commit
Added DecimalMarker in DoubleParser
  • Loading branch information
mstfbl committed May 21, 2020
commit ece551852a2804a9a70e5ddd1a2bc4b6b6139577
16 changes: 11 additions & 5 deletions src/Microsoft.ML.Core/Utilities/DoubleParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,13 @@ internal static class DoubleParser
private const ulong TopThreeBits = 0xE000000000000000UL;
private const char InfinitySymbol = '\u221E';

// The decimal marker that separates the integer part from the fractional part of a number
// written in decimal from can vary across different cultures as either '.' or ','. The
// default decimal marker in ML .NET is '.', however through this static char variable,
// we allow users to specify the decimal marker used in their datasets as ',' as well.
[BestFriend]
internal static char DecimalMarker = '.';

// REVIEW: casting ulong to Double doesn't always do the right thing, for example
// with 0x84595161401484A0UL. Hence the gymnastics several places in this code. Note that
// long to Double does work. The work around is:
Expand Down Expand Up @@ -527,8 +534,6 @@ private static bool TryParseCore(ReadOnlySpan<char> span, ref int ich, ref bool
Contracts.Assert(num == 0);
Contracts.Assert(exp == 0);

const char decimalMarker = '.';

if (ich >= span.Length)
return false;

Expand Down Expand Up @@ -556,7 +561,8 @@ private static bool TryParseCore(ReadOnlySpan<char> span, ref int ich, ref bool
return false;
break;

case decimalMarker:
case '.':
case ',':
goto LPoint;

// The common cases.
Expand Down Expand Up @@ -595,12 +601,12 @@ private static bool TryParseCore(ReadOnlySpan<char> span, ref int ich, ref bool
}
Contracts.Assert(i < span.Length);

if (span[i] != decimalMarker)
if (span[i] != DecimalMarker)
goto LAfterDigits;

LPoint:
Contracts.Assert(i < span.Length);
Contracts.Assert(span[i] == decimalMarker);
Contracts.Assert(span[i] == DecimalMarker);

// Get the digits after the decimal marker, which may be '.' or ','
for (; ; )
Expand Down
10 changes: 6 additions & 4 deletions src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -709,11 +709,11 @@ public Bindings(TextLoader parent, Column[] cols, IMultiStreamSource headerFile,
ch.Assert(0 <= inputSize & inputSize < SrcLim);
List<ReadOnlyMemory<char>> lines = null;
if (headerFile != null)
Cursor.GetSomeLines(headerFile, 1, parent.ReadMultilines, parent._separators, ref lines, parent._decimalMarker);
Cursor.GetSomeLines(headerFile, 1, parent.ReadMultilines, parent._separators, ref lines);
if (needInputSize && inputSize == 0)
Cursor.GetSomeLines(dataSample, 100, parent.ReadMultilines, parent._separators, ref lines, parent._decimalMarker);
Cursor.GetSomeLines(dataSample, 100, parent.ReadMultilines, parent._separators, ref lines);
else if (headerFile == null && parent.HasHeader)
Cursor.GetSomeLines(dataSample, 1, parent.ReadMultilines, parent._separators, ref lines, parent._decimalMarker);
Cursor.GetSomeLines(dataSample, 1, parent.ReadMultilines, parent._separators, ref lines);

if (needInputSize && inputSize == 0)
{
Expand Down Expand Up @@ -1219,7 +1219,7 @@ internal TextLoader(IHostEnvironment env, Options options = null, IMultiStreamSo
}
}

if (options.DecimalMarker == ',' && _separators.Contains<char>(','))
if (_separators.Contains(options.DecimalMarker))
throw _host.ExceptUserArg(nameof(Options.DecimalMarker), "Decimal marker and separator cannot be the same '{0}' character.", options.DecimalMarker);
_decimalMarker = options.DecimalMarker;
_bindings = new Bindings(this, cols, headerFile, dataSample);
Expand Down Expand Up @@ -1607,13 +1607,15 @@ public BoundLoader(TextLoader loader, IMultiStreamSource files)
public DataViewRowCursor GetRowCursor(IEnumerable<DataViewSchema.Column> columnsNeeded, Random rand = null)
{
_host.CheckValueOrNull(rand);
DoubleParser.DecimalMarker = _loader._decimalMarker;
var active = Utils.BuildArray(_loader._bindings.OutputSchema.Count, columnsNeeded);
return Cursor.Create(_loader, _files, active);
}

public DataViewRowCursor[] GetRowCursorSet(IEnumerable<DataViewSchema.Column> columnsNeeded, int n, Random rand = null)
{
_host.CheckValueOrNull(rand);
DoubleParser.DecimalMarker = _loader._decimalMarker;
var active = Utils.BuildArray(_loader._bindings.OutputSchema.Count, columnsNeeded);
return Cursor.CreateSet(_loader, _files, active, n);
}
Expand Down
12 changes: 5 additions & 7 deletions src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ public static DataViewRowCursor Create(TextLoader parent, IMultiStreamSource fil
SetupCursor(parent, active, 0, out srcNeeded, out cthd);
Contracts.Assert(cthd > 0);

var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._separators, parent._maxRows, 1, parent._decimalMarker);
var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._separators, parent._maxRows, 1);
var stats = new ParseStats(parent._host, 1);
return new Cursor(parent, stats, active, reader, srcNeeded, cthd);
}
Expand All @@ -163,7 +163,7 @@ public static DataViewRowCursor[] CreateSet(TextLoader parent, IMultiStreamSourc
SetupCursor(parent, active, n, out srcNeeded, out cthd);
Contracts.Assert(cthd > 0);

var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._separators, parent._maxRows, cthd, parent._decimalMarker);
var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._separators, parent._maxRows, cthd);
var stats = new ParseStats(parent._host, cthd);
if (cthd <= 1)
return new DataViewRowCursor[1] { new Cursor(parent, stats, active, reader, srcNeeded, 1) };
Expand Down Expand Up @@ -205,7 +205,7 @@ public override ValueGetter<DataViewRowId> GetIdGetter()
};
}

public static void GetSomeLines(IMultiStreamSource source, int count, bool readMultilines, char[] separators, ref List<ReadOnlyMemory<char>> lines, char decimalMarker)
public static void GetSomeLines(IMultiStreamSource source, int count, bool readMultilines, char[] separators, ref List<ReadOnlyMemory<char>> lines)
{
Contracts.AssertValue(source);
Contracts.Assert(count > 0);
Expand All @@ -215,7 +215,7 @@ public static void GetSomeLines(IMultiStreamSource source, int count, bool readM
count = 2;

LineBatch batch;
var reader = new LineReader(source, count, 1, false, readMultilines, separators, count, 1, decimalMarker);
var reader = new LineReader(source, count, 1, false, readMultilines, separators, count, 1);
try
{
batch = reader.GetBatch();
Expand Down Expand Up @@ -404,7 +404,6 @@ private sealed class LineReader
private readonly bool _hasHeader;
private readonly bool _readMultilines;
private readonly char[] _separators;
private readonly char _decimalMarker;
private readonly int _batchSize;
private readonly IMultiStreamSource _files;

Expand All @@ -414,7 +413,7 @@ private sealed class LineReader
private Task _thdRead;
private volatile bool _abort;

public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool hasHeader, bool readMultilines, char[] separators, long limit, int cref, char decimalMarker)
public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool hasHeader, bool readMultilines, char[] separators, long limit, int cref)
{
// Note that files is allowed to be empty.
Contracts.AssertValue(files);
Expand All @@ -431,7 +430,6 @@ public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool has
_separators = separators;
_files = files;
_cref = cref;
_decimalMarker = decimalMarker;

_queue = new BlockingQueue<LineBatch>(bufSize);
_thdRead = Utils.RunOnBackgroundThreadAsync(ThreadProc);
Expand Down
2 changes: 0 additions & 2 deletions src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -633,7 +633,6 @@ public void Clear()
}

private readonly char[] _separators;
private readonly char _decimalMarker;
private readonly OptionFlags _flags;
private readonly int _inputSize;
private readonly ColInfo[] _infos;
Expand Down Expand Up @@ -684,7 +683,6 @@ public Parser(TextLoader parent)
}

_separators = parent._separators;
_decimalMarker = parent._decimalMarker;
_flags = parent._flags;
_inputSize = parent._inputSize;
Contracts.Assert(_inputSize >= 0);
Expand Down