Skip to content

Allow TextLoader to load empty float/double fields as NaN instead of 0 #5198

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 22 commits into from
Jun 9, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion docs/code/IDataViewTypeSystem.md
Original file line number Diff line number Diff line change
Expand Up @@ -540,7 +540,10 @@ is first processed entirely as `TX` values, then parsed, or processed directly
into numeric values, that is, parsing as the row is processed. In the latter
case, it is simple to map implicit items (suppressed due to sparsity) to zero.
In the former case, these items are first mapped to the empty text value. To
get the same result, we need empty text to map to zero.
get the same result, we need empty text to map to zero. The TextLoader permits
one exception to this rule: it has an option to load empty `TX` values as `NaN`
into `R4` and `R8` columns, instead of applying the default conversion of empty
`TX` to the numeric default `0`.

### To Text

Expand Down
25 changes: 19 additions & 6 deletions src/Microsoft.ML.Core/Utilities/DoubleParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,11 @@ internal enum OptionFlags : uint
// a number and its decimal part). If this isn't set, then
// default behavior is to use "." as decimal marker.
UseCommaAsDecimalMarker = 0x01,

// If this flag is set, then empty spans (or those with only white-space)
// will be parsed as NaN. If it isn't set, then default behavior
// is to return them as 0.
EmptyAsNaN = 0x02,
}

private const ulong TopBit = 0x8000000000000000UL;
Expand Down Expand Up @@ -81,22 +86,22 @@ public enum Result
}

/// <summary>
/// Parses <paramref name="span"/> as a <see cref="Single"/>.
/// An empty span (or one containing only white-space) yields zero, or NaN when
/// <see cref="DoubleParser.OptionFlags.EmptyAsNaN"/> is set in <paramref name="flags"/>.
/// </summary>
/// <param name="span">The characters to parse.</param>
/// <param name="value">Receives the value produced by <c>Parse</c>.</param>
/// <param name="flags">Options forwarded to <c>Parse</c>.</param>
/// <returns>
/// <see langword="true"/> when the parse result is <see cref="Result.Empty"/> or a
/// result ordered before it; <see langword="false"/> otherwise.
/// </returns>
public static bool TryParse(ReadOnlySpan<char> span, out Single value, OptionFlags flags = OptionFlags.Default)
{
    var res = Parse(span, out value, flags);

    // For empty input the value must be exactly what the flag selects:
    // NaN when EmptyAsNaN is set, zero when it is not.
    Contracts.Assert(res != Result.Empty
        || ((flags & OptionFlags.EmptyAsNaN) != 0 ? Single.IsNaN(value) : value == 0));

    return res <= Result.Empty;
}

/// <summary>
/// Parses <paramref name="span"/> as a <see cref="Double"/>.
/// An empty span (or one containing only white-space) yields zero, or NaN when
/// <see cref="DoubleParser.OptionFlags.EmptyAsNaN"/> is set in <paramref name="flags"/>.
/// </summary>
/// <param name="span">The characters to parse.</param>
/// <param name="value">Receives the value produced by <c>Parse</c>.</param>
/// <param name="flags">Options forwarded to <c>Parse</c>.</param>
/// <returns>
/// <see langword="true"/> when the parse result is <see cref="Result.Empty"/> or a
/// result ordered before it; <see langword="false"/> otherwise.
/// </returns>
public static bool TryParse(ReadOnlySpan<char> span, out Double value, OptionFlags flags = OptionFlags.Default)
{
    var res = Parse(span, out value, flags);

    // For empty input the value must be exactly what the flag selects:
    // NaN when EmptyAsNaN is set, zero when it is not.
    Contracts.Assert(res != Result.Empty
        || ((flags & OptionFlags.EmptyAsNaN) != 0 ? Double.IsNaN(value) : value == 0));

    return res <= Result.Empty;
}

Expand All @@ -107,7 +112,11 @@ public static Result Parse(ReadOnlySpan<char> span, out Single value, OptionFlag
{
if (ich >= span.Length)
{
value = 0;
if ((flags & OptionFlags.EmptyAsNaN) == 0)
value = 0;
else
value = Single.NaN;

return Result.Empty;
}
if (!char.IsWhiteSpace(span[ich]))
Expand Down Expand Up @@ -155,7 +164,11 @@ public static Result Parse(ReadOnlySpan<char> span, out Double value, OptionFlag
{
if (ich >= span.Length)
{
value = 0;
if ((flags & OptionFlags.EmptyAsNaN) == 0)
value = 0;
else
value = Double.NaN;

return Result.Empty;
}
if (!char.IsWhiteSpace(span[ich]))
Expand Down
15 changes: 13 additions & 2 deletions src/Microsoft.ML.Data/Data/Conversion.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1369,7 +1369,8 @@ private void TryParseSigned(long max, in TX text, out long? result)
}

/// <summary>
/// This produces zero for empty. It returns false if the text is not parsable.
/// This produces zero for empty, or NaN when <see cref="DoubleParser.OptionFlags.EmptyAsNaN"/> is set.
/// It returns false if the text is not parsable.
/// On failure, it sets dst to the NA value.
/// </summary>
public bool TryParse(in TX src, out R4 dst)
Expand All @@ -1382,7 +1383,8 @@ public bool TryParse(in TX src, out R4 dst)
}

/// <summary>
/// This produces zero for empty. It returns false if the text is not parsable.
/// This produces zero for empty, or NaN when <see cref="DoubleParser.OptionFlags.EmptyAsNaN"/> is set.
/// It returns false if the text is not parsable.
/// On failure, it sets dst to the NA value.
/// </summary>
public bool TryParse(in TX src, out R8 dst)
Expand All @@ -1394,6 +1396,9 @@ public bool TryParse(in TX src, out R8 dst)
return IsStdMissing(ref span);
}

/// <summary>
/// This produces default for empty.
/// </summary>
public bool TryParse(in TX src, out TS dst)
{
if (src.IsEmpty)
Expand All @@ -1408,6 +1413,9 @@ public bool TryParse(in TX src, out TS dst)
return false;
}

/// <summary>
/// This produces default for empty.
/// </summary>
public bool TryParse(in TX src, out DT dst)
{
if (src.IsEmpty)
Expand All @@ -1422,6 +1430,9 @@ public bool TryParse(in TX src, out DT dst)
return false;
}

/// <summary>
/// This produces default for empty.
/// </summary>
public bool TryParse(in TX src, out DZ dst)
{
if (src.IsEmpty)
Expand Down
45 changes: 38 additions & 7 deletions src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -433,10 +433,9 @@ public class Options
/// </summary>
[Argument(ArgumentType.AtMostOnce,
HelpText =
"Whether the input may include quoted values, which can contain separator characters, colons," +
" and distinguish empty values from missing values. When true, consecutive separators denote a" +
" missing value and an empty value is denoted by \"\". When false, consecutive separators" +
" denote an empty value.",
"Whether the input may include double-quoted values. This parameter is used to distinguish separator characters in an input value " +
"from actual separators. When true, separators within double quotes are treated as part of the input value. When false, all " +
"separators, even those within quotes, are treated as delimiting a new column.",
ShortName = "quote")]
public bool AllowQuoting = Defaults.AllowQuoting;

Expand Down Expand Up @@ -533,6 +532,15 @@ public class Options
[Argument(ArgumentType.AtMostOnce, HelpText = "Character to use to escape quotes inside quoted fields. It can't be a character used as separator.", ShortName = "escapechar")]
public char EscapeChar = Defaults.EscapeChar;

/// <summary>
/// If true, missing real fields (i.e. double or single fields) will be loaded as NaN.
/// If false, they'll be loaded as 0. Default is false.
/// A field is considered "missing" if it's empty, if it only has whitespace, or if there are missing columns
/// at the end of a given row.
/// </summary>
[Argument(ArgumentType.AtMostOnce, HelpText = "If true, empty float fields will be loaded as NaN. If false, they'll be loaded as 0. Default is false.", ShortName = "missingrealnan")]
public bool MissingRealsAsNaNs = Defaults.MissingRealsAsNaNs;

/// <summary>
/// Checks that all column specifications are valid (that is, ranges are disjoint and have min&lt;=max).
/// </summary>
Expand All @@ -552,6 +560,7 @@ internal static class Defaults
internal const bool TrimWhitespace = false;
internal const bool ReadMultilines = false;
internal const char EscapeChar = '"';
internal const bool MissingRealsAsNaNs = false;
}

/// <summary>
Expand Down Expand Up @@ -1078,7 +1087,8 @@ private static VersionInfo GetVersionInfo()
//verWrittenCur: 0x0001000A, // Added ForceVector in Range
//verWrittenCur: 0x0001000B, // Header now retained if used and present
//verWrittenCur: 0x0001000C, // Removed Min and Contiguous from KeyType, and added ReadMultilines flag to OptionFlags
verWrittenCur: 0x0001000D, // Added escapeChar option and decimal marker option to allow for ',' to be a decimal marker
//verWrittenCur: 0x0001000D, // Added escapeChar and decimalMarker chars
verWrittenCur: 0x0001000E, // Added MissingRealsAsNaNs flag
verReadableCur: 0x0001000A,
verWeCanReadBack: 0x00010009,
loaderSignature: LoaderSignature,
Expand All @@ -1097,7 +1107,8 @@ private enum OptionFlags : uint
AllowQuoting = 0x04,
AllowSparse = 0x08,
ReadMultilines = 0x10,
All = TrimWhitespace | HasHeader | AllowQuoting | AllowSparse | ReadMultilines
MissingRealsAsNaNs = 0x20,
All = TrimWhitespace | HasHeader | AllowQuoting | AllowSparse | ReadMultilines | MissingRealsAsNaNs
}

// This is reserved to mean the range extends to the end (the segment is variable).
Expand Down Expand Up @@ -1179,6 +1190,8 @@ internal TextLoader(IHostEnvironment env, Options options = null, IMultiStreamSo
_flags |= OptionFlags.AllowSparse;
if (options.AllowQuoting && options.ReadMultilines)
_flags |= OptionFlags.ReadMultilines;
if (options.MissingRealsAsNaNs)
_flags |= OptionFlags.MissingRealsAsNaNs;

// REVIEW: This should be persisted (if it should be maintained).
_maxRows = options.MaxRows ?? long.MaxValue;
Expand Down Expand Up @@ -1407,7 +1420,25 @@ private TextLoader(IHost host, ModelLoadContext ctx)
_maxRows = ctx.Reader.ReadInt64();
host.CheckDecode(_maxRows > 0);
_flags = (OptionFlags)ctx.Reader.ReadUInt32();
host.CheckDecode((_flags & ~OptionFlags.All) == 0);

// Flags introduced with the first ML.NET commit:
var acceptableFlags = OptionFlags.TrimWhitespace;
acceptableFlags |= OptionFlags.HasHeader;
acceptableFlags |= OptionFlags.AllowQuoting;
acceptableFlags |= OptionFlags.AllowSparse;

// Flags added on later versions of TextLoader:
if(ctx.Header.ModelVerWritten >= 0x0001000C)
{
acceptableFlags |= OptionFlags.ReadMultilines;
}
if(ctx.Header.ModelVerWritten >= 0x0001000E)
{
acceptableFlags |= OptionFlags.MissingRealsAsNaNs;
}

host.CheckDecode((_flags & ~acceptableFlags) == 0);

_inputSize = ctx.Reader.ReadInt32();
host.CheckDecode(0 <= _inputSize && _inputSize < SrcLim);

Expand Down
52 changes: 51 additions & 1 deletion src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,8 @@ private abstract class ColumnPipe

public abstract bool HasNA { get; }

public abstract bool IsReal { get; } // If the type of the ColumnPipe is either Single or Double

protected ColumnPipe(RowSet rows)
{
Contracts.AssertValue(rows);
Expand All @@ -251,6 +253,8 @@ private sealed class PrimitivePipe<TResult> : ColumnPipe

public override bool HasNA { get; }

public override bool IsReal { get; }

public PrimitivePipe(RowSet rows, PrimitiveDataViewType type, TryParseMapper<TResult> conv)
: base(rows)
{
Expand All @@ -259,6 +263,7 @@ public PrimitivePipe(RowSet rows, PrimitiveDataViewType type, TryParseMapper<TRe
_conv = conv;
_values = new TResult[Rows.Count];
HasNA = Conversions.DefaultInstance.TryGetIsNAPredicate(type, out var del);
IsReal = typeof(TResult) == typeof(Single) || typeof(TResult) == typeof(Double);
}

public override void Reset(int irow, int size)
Expand Down Expand Up @@ -295,6 +300,8 @@ private sealed class VectorPipe<TItem> : ColumnPipe

public override bool HasNA { get; }

public override bool IsReal { get; }

private class VectorValue
{
private readonly VectorPipe<TItem> _pipe;
Expand Down Expand Up @@ -441,6 +448,7 @@ public VectorPipe(RowSet rows, PrimitiveDataViewType type, TryParseMapper<TItem>
for (int i = 0; i < _values.Length; i++)
_values[i] = new VectorValue(this);
HasNA = Conversions.DefaultInstance.TryGetIsNAPredicate(type, out var del);
IsReal = typeof(TItem) == typeof(Single) || typeof(TItem) == typeof(Double);
}

public override void Reset(int irow, int size)
Expand Down Expand Up @@ -649,6 +657,7 @@ public void Clear()

private readonly char[] _separators;
private readonly OptionFlags _flags;
private readonly bool _missingRealsAsNaNs;
private readonly char _escapeChar;
private readonly int _inputSize;
private readonly ColInfo[] _infos;
Expand All @@ -659,6 +668,8 @@ public void Clear()
private volatile int _csrc;
private volatile int _mismatchCount;

private ReadOnlyMemory<char> _blank;

public Parser(TextLoader parent)
{
Contracts.AssertValue(parent);
Expand All @@ -671,6 +682,8 @@ public Parser(TextLoader parent)
var doubleParserOptionFlags = DoubleParser.OptionFlags.Default;
if (parent._decimalMarker == ',')
doubleParserOptionFlags |= DoubleParser.OptionFlags.UseCommaAsDecimalMarker;
if ((parent._flags & OptionFlags.MissingRealsAsNaNs) != 0)
doubleParserOptionFlags |= DoubleParser.OptionFlags.EmptyAsNaN;

if (doubleParserOptionFlags == DoubleParser.OptionFlags.Default)
cache = ValueCreatorCache.DefaultInstance;
Expand Down Expand Up @@ -713,6 +726,8 @@ public Parser(TextLoader parent)
_flags = parent._flags;
_escapeChar = parent._escapeChar;
_inputSize = parent._inputSize;
_missingRealsAsNaNs = (parent._flags & OptionFlags.MissingRealsAsNaNs) != 0;
_blank = ReadOnlyMemory<char>.Empty;
Contracts.Assert(_inputSize >= 0);
}

Expand Down Expand Up @@ -900,6 +915,7 @@ private sealed class HelperImpl : Helper
private readonly int _srcNeeded;
private readonly bool _quoting;
private readonly bool _sparse;
private readonly bool _keepEmpty;
// This is a working buffer.
private readonly StringBuilder _sb;

Expand Down Expand Up @@ -930,6 +946,11 @@ public HelperImpl(ParseStats stats, OptionFlags flags, char[] seps, char escapeC
_sb = new StringBuilder();
_blank = ReadOnlyMemory<char>.Empty;
Fields = new FieldSet();

// If we want to impute empty real fields as NaNs, then we must keep
// all empty field spans, as there's no way for the Parser.HelperImpl
// to know beforehand which fields belong to a float field
_keepEmpty = (flags & OptionFlags.MissingRealsAsNaNs) != 0;
}

/// <summary>
Expand Down Expand Up @@ -978,6 +999,13 @@ public int GatherFields(ReadOnlyMemory<char> lineSpan, ReadOnlySpan<char> span,
Fields.Spans[Fields.Count] = scan.Span;
Fields.Indices[Fields.Count++] = src;
}
else if(_keepEmpty)
{
Fields.EnsureSpace();
Fields.Spans[Fields.Count] = _blank;
Fields.Indices[Fields.Count++] = src;
}

if (++src > _srcNeeded || !more)
break;
}
Expand Down Expand Up @@ -1390,10 +1418,10 @@ private void ProcessVec(int srcLim, FieldSet fields, ColInfo info, ColumnPipe v,
int sizeSeg = lim - min;
Contracts.Assert(ivDst <= size - sizeSeg);

int indexBase = ivDst - min;
int isrc = fields.Indices.FindIndexSorted(0, fields.Count, min);
if (isrc < fields.Count && fields.Indices[isrc] < lim)
{
int indexBase = ivDst - min;
int isrcLim = fields.Indices.FindIndexSorted(isrc, fields.Count, lim);
Contracts.Assert(isrc < isrcLim);
for (; isrc < isrcLim; isrc++)
Expand All @@ -1408,6 +1436,19 @@ private void ProcessVec(int srcLim, FieldSet fields, ColInfo info, ColumnPipe v,
}
}
}

if(_missingRealsAsNaNs && isrc >= fields.Count && v.IsReal)
{
// When MissingRealsAsNaNs is set and the row has run out of fields
// (isrc >= fields.Count), feed the absent trailing slots of this segment
// to the pipe as empty (_blank) fields so that they are parsed as NaN.
// The v.IsReal check restricts this to Single/Double columns; columns of
// other types are not touched by this branch.
for (int srcCur = Math.Max(min, fields.Count); srcCur < lim; srcCur++)
{
v.Consume(irow, indexBase + srcCur, ref _blank);
}
}
ivDst += sizeSeg;
}
Contracts.Assert(ivDst == size);
Expand All @@ -1430,6 +1471,15 @@ private void ProcessOne(FieldSet vs, ColInfo info, ColumnPipe v, int irow, long
v.Rows.Stats.LogBadValue(line, info.Name);
}
}
else if(_missingRealsAsNaNs && v.IsReal)
{
// When MissingRealsAsNaNs is set and the row is missing the field for
// this Single/Double column, feed the pipe an empty (_blank) field so
// that the value is parsed as NaN. Columns of other types fall through
// to the Reset call below.
v.Consume(irow, 0, ref _blank);
}
else
v.Reset(irow, 0);
}
Expand Down
Loading