Skip to content

IDataView to DataFrame #5712

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Mar 22, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Remove unused param
Docs
maxRows
More unit tests
Fixed ArrowStringDataFrameColumn construction in the unit test
  • Loading branch information
Prashanth Govindarajan committed Mar 19, 2021
commit 120349523c70cd666ed72c60d333018fb93861ad
29 changes: 8 additions & 21 deletions src/Microsoft.Data.Analysis/DataFrame.IDataView.cs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ public partial class DataFrame : IDataView
bool IDataView.CanShuffle => false;

private DataViewSchema _schema;
internal DataViewSchema DataViewSchema
private DataViewSchema DataViewSchema
{
get
{
Expand Down Expand Up @@ -70,29 +70,22 @@ private sealed class RowCursor : DataViewRowCursor
private bool _disposed;
private long _position;
private readonly DataFrame _dataFrame;
private readonly List<Delegate> _getters;
private Dictionary<int, int> _columnIndexToGetterIndex;
private readonly Delegate[] _getters;

public RowCursor(DataFrame dataFrame, bool[] activeColumns)
{
Debug.Assert(dataFrame != null);
Debug.Assert(activeColumns != null);

_columnIndexToGetterIndex = new Dictionary<int, int>();
_position = -1;
_dataFrame = dataFrame;
_getters = new List<Delegate>();
for (int i = 0; i < Schema.Count; i++)
_getters = new Delegate[Schema.Count];
for (int i = 0; i < _getters.Length; i++)
{
if (!activeColumns[i])
{
continue;
}

Delegate getter = CreateGetterDelegate(i);
_getters.Add(getter);
Debug.Assert(getter != null);
_columnIndexToGetterIndex[i] = _getters.Count - 1;
_getters[i] = CreateGetterDelegate(i);
Debug.Assert(_getters[i] != null);
}
}

Expand All @@ -103,15 +96,11 @@ public RowCursor(DataFrame dataFrame, bool[] activeColumns)
protected override void Dispose(bool disposing)
{
if (_disposed)
{
return;
}

if (disposing)
{
_position = -1;
}

_disposed = true;
base.Dispose(disposing);
}
Expand All @@ -127,7 +116,7 @@ public override ValueGetter<TValue> GetGetter<TValue>(DataViewSchema.Column colu
if (!IsColumnActive(column))
throw new ArgumentOutOfRangeException(nameof(column));

return (ValueGetter<TValue>)_getters[_columnIndexToGetterIndex[column.Index]];
return (ValueGetter<TValue>)_getters[column.Index];
}

public override ValueGetter<DataViewRowId> GetIdGetter()
Expand All @@ -137,15 +126,13 @@ public override ValueGetter<DataViewRowId> GetIdGetter()

public override bool IsColumnActive(DataViewSchema.Column column)
{
return _getters[_columnIndexToGetterIndex[column.Index]] != null;
return _getters[column.Index] != null;
}

public override bool MoveNext()
{
if (_disposed)
{
return false;
}
_position++;
return _position < _dataFrame.Rows.Count;
}
Expand Down
5 changes: 2 additions & 3 deletions src/Microsoft.Data.Analysis/DataFrameColumn.cs
Original file line number Diff line number Diff line change
Expand Up @@ -251,15 +251,14 @@ public virtual DataFrameColumn Sort(bool ascending = true)
/// Appends a value to this <see cref="DataFrameColumn"/> using <paramref name="cursor"/>
/// </summary>
/// <param name="cursor">The row cursor which has the current position</param>
/// <param name="schemaColumn">The <see cref="DataViewSchema.Column"/> in <see cref="DataViewSchema"/></param>
/// <param name="ValueGetter">The cached ValueGetter for this column.</param>
protected internal virtual void AddValueUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn, Delegate ValueGetter) => throw new NotImplementedException();
protected internal virtual void AddValueUsingCursor(DataViewRowCursor cursor, Delegate ValueGetter) => throw new NotImplementedException();

/// <summary>
/// Returns the ValueGetter for each active column in <paramref name="cursor"/> as a delegate to be cached.
/// </summary>
/// <param name="cursor">The row cursor which has the current position</param>
/// <param name="schemaColumn">The <see cref="DataViewSchema.Column"/> in <see cref="DataViewSchema"/></param>
/// <param name="schemaColumn">The <see cref="DataViewSchema.Column"/> to return the ValueGetter for.</param>
protected internal virtual Delegate GetValueGetterUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn) => throw new NotImplementedException();

/// <summary>
Expand Down
72 changes: 45 additions & 27 deletions src/Microsoft.Data.Analysis/IDataView.Extension.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,113 +13,131 @@ public static class IDataViewExtensions
{
private const int defaultMaxRows = 100;

/// <summary>
/// Builds a <see cref="Microsoft.Data.Analysis.DataFrame"/> from the rows of this <paramref name="dataView"/> without restricting the columns by name.
/// </summary>
/// <param name="dataView">The <see cref="IDataView"/> to read rows from.</param>
/// <param name="maxRows">The maximum number of rows to copy into the <see cref="Microsoft.Data.Analysis.DataFrame"/>. Defaults to 100. Pass -1 to copy all the rows in <paramref name="dataView"/>.</param>
/// <returns>A <see cref="Microsoft.Data.Analysis.DataFrame"/> holding at most <paramref name="maxRows"/> rows.</returns>
public static DataFrame ToDataFrame(this IDataView dataView, long maxRows = defaultMaxRows)
    // A null column selection means "no name filter" in the three-argument overload.
    => ToDataFrame(dataView, maxRows, null);

/// <summary>
/// Builds a <see cref="Microsoft.Data.Analysis.DataFrame"/> from this <paramref name="dataView"/>, limited to the default row count (100) and to the requested columns.
/// </summary>
/// <param name="dataView">The <see cref="IDataView"/> to read rows from.</param>
/// <param name="selectColumns">Names of the columns to include in the resulting DataFrame. When none are given, no name filter is applied.</param>
/// <returns>A <see cref="Microsoft.Data.Analysis.DataFrame"/> with the selected columns and at most 100 rows.</returns>
public static DataFrame ToDataFrame(this IDataView dataView, params string[] selectColumns)
    // Delegates to the three-argument overload using the default row cap.
    => ToDataFrame(dataView, defaultMaxRows, selectColumns);

/// <summary>
/// Returns a <see cref="Microsoft.Data.Analysis.DataFrame"/> with the first <paramref name="maxRows"/> rows of this <paramref name="dataView"/>.
/// </summary>
/// <param name="dataView">The current <see cref="IDataView"/>.</param>
/// <param name="maxRows">The max number of rows in the <see cref="Microsoft.Data.Analysis.DataFrame"/>. Use -1 to construct a DataFrame using all the rows in <paramref name="dataView"/>.</param>
/// <param name="selectColumns">The columns selected for the resultant DataFrame</param>
/// <returns>A <see cref="Microsoft.Data.Analysis.DataFrame"/> with the selected columns and <paramref name="maxRows"/> rows.</returns>
public static DataFrame ToDataFrame(this IDataView dataView, long maxRows, params string[] selectColumns)
{
DataViewSchema schema = dataView.Schema;
List<DataFrameColumn> columns = new List<DataFrameColumn>(schema.Count);
List<DataFrameColumn> dataFrameColumns = new List<DataFrameColumn>(schema.Count);
maxRows = maxRows == -1 ? long.MaxValue : maxRows;

HashSet<string> selectColumnsSet = null;
if (selectColumns != null && selectColumns.Length > 0)
{
selectColumnsSet = new HashSet<string>(selectColumns);
}

List<DataViewSchema.Column> activeColumns = new List<DataViewSchema.Column>();
foreach (DataViewSchema.Column column in schema)
List<DataViewSchema.Column> activeDataViewColumns = new List<DataViewSchema.Column>();
foreach (DataViewSchema.Column dataViewColumn in schema)
{
if (column.IsHidden || (selectColumnsSet != null && !selectColumnsSet.Contains(column.Name)))
if (dataViewColumn.IsHidden || (selectColumnsSet != null && !selectColumnsSet.Contains(dataViewColumn.Name)))
{
continue;
}

activeColumns.Add(column);
DataViewType type = column.Type;
activeDataViewColumns.Add(dataViewColumn);
DataViewType type = dataViewColumn.Type;
if (type == BooleanDataViewType.Instance)
{
columns.Add(new BooleanDataFrameColumn(column.Name));
dataFrameColumns.Add(new BooleanDataFrameColumn(dataViewColumn.Name));
}
else if (type == NumberDataViewType.Byte)
{
columns.Add(new ByteDataFrameColumn(column.Name));
dataFrameColumns.Add(new ByteDataFrameColumn(dataViewColumn.Name));
}
else if (type == NumberDataViewType.Double)
{
columns.Add(new DoubleDataFrameColumn(column.Name));
dataFrameColumns.Add(new DoubleDataFrameColumn(dataViewColumn.Name));
}
else if (type == NumberDataViewType.Single)
{
columns.Add(new SingleDataFrameColumn(column.Name));
dataFrameColumns.Add(new SingleDataFrameColumn(dataViewColumn.Name));
}
else if (type == NumberDataViewType.Int32)
{
columns.Add(new Int32DataFrameColumn(column.Name));
dataFrameColumns.Add(new Int32DataFrameColumn(dataViewColumn.Name));
}
else if (type == NumberDataViewType.Int64)
{
columns.Add(new Int64DataFrameColumn(column.Name));
dataFrameColumns.Add(new Int64DataFrameColumn(dataViewColumn.Name));
}
else if (type == NumberDataViewType.SByte)
{
columns.Add(new SByteDataFrameColumn(column.Name));
dataFrameColumns.Add(new SByteDataFrameColumn(dataViewColumn.Name));
}
else if (type == NumberDataViewType.Int16)
{
columns.Add(new Int16DataFrameColumn(column.Name));
dataFrameColumns.Add(new Int16DataFrameColumn(dataViewColumn.Name));
}
else if (type == NumberDataViewType.UInt32)
{
columns.Add(new UInt32DataFrameColumn(column.Name));
dataFrameColumns.Add(new UInt32DataFrameColumn(dataViewColumn.Name));
}
else if (type == NumberDataViewType.UInt64)
{
columns.Add(new UInt64DataFrameColumn(column.Name));
dataFrameColumns.Add(new UInt64DataFrameColumn(dataViewColumn.Name));
}
else if (type == NumberDataViewType.UInt16)
{
columns.Add(new UInt16DataFrameColumn(column.Name));
dataFrameColumns.Add(new UInt16DataFrameColumn(dataViewColumn.Name));
}
else if (type == TextDataViewType.Instance)
{
columns.Add(new StringDataFrameColumn(column.Name));
dataFrameColumns.Add(new StringDataFrameColumn(dataViewColumn.Name));
}
else
{
throw new NotSupportedException(String.Format(Microsoft.Data.Strings.NotSupportedColumnType, type.RawType.Name));
Copy link
Author

@pgovind pgovind Mar 16, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this will cause a problem for vector types in IDataView. We'd need to add support for vector columns in DataFrame to fix this. I'll open a bug.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

}
}

using (DataViewRowCursor cursor = dataView.GetRowCursor(activeColumns))
using (DataViewRowCursor cursor = dataView.GetRowCursor(activeDataViewColumns))
{
Delegate[] activeColumnDelegates = new Delegate[activeColumns.Count];
Delegate[] activeColumnDelegates = new Delegate[activeDataViewColumns.Count];
int columnIndex = 0;
foreach (DataViewSchema.Column column in activeColumns)
foreach (DataViewSchema.Column activeDataViewColumn in activeDataViewColumns)
{
Delegate valueGetter = columns[columnIndex].GetValueGetterUsingCursor(cursor, column);
Delegate valueGetter = dataFrameColumns[columnIndex].GetValueGetterUsingCursor(cursor, activeDataViewColumn);
activeColumnDelegates[columnIndex] = valueGetter;
columnIndex++;
}
while (cursor.MoveNext() && cursor.Position < maxRows)
{
columnIndex = 0;
foreach (DataViewSchema.Column column in activeColumns)
for (int i = 0; i < activeColumnDelegates.Length; i++)
{
columns[columnIndex].AddValueUsingCursor(cursor, column, activeColumnDelegates[columnIndex]);
columnIndex++;
dataFrameColumns[i].AddValueUsingCursor(cursor, activeColumnDelegates[i]);
}
}
}

return new DataFrame(columns);
return new DataFrame(dataFrameColumns);
}
}

Expand Down
2 changes: 1 addition & 1 deletion src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.cs
Original file line number Diff line number Diff line change
Expand Up @@ -776,7 +776,7 @@ private static ValueGetter<ushort> CreateCharValueGetterDelegate(DataViewRowCurs
private static ValueGetter<double> CreateDecimalValueGetterDelegate(DataViewRowCursor cursor, PrimitiveDataFrameColumn<decimal> column) =>
(ref double value) => value = (double?)column[cursor.Position] ?? double.NaN;

protected internal override void AddValueUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column column, Delegate getter)
protected internal override void AddValueUsingCursor(DataViewRowCursor cursor, Delegate getter)
{
long row = cursor.Position;
T value = default;
Expand Down
3 changes: 2 additions & 1 deletion src/Microsoft.Data.Analysis/StringDataFrameColumn.cs
Original file line number Diff line number Diff line change
Expand Up @@ -468,7 +468,7 @@ protected internal override Delegate GetDataViewGetter(DataViewRowCursor cursor)
private ValueGetter<ReadOnlyMemory<char>> CreateValueGetterDelegate(DataViewRowCursor cursor) =>
(ref ReadOnlyMemory<char> value) => value = this[cursor.Position].AsMemory();

protected internal override void AddValueUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn, Delegate getter)
protected internal override void AddValueUsingCursor(DataViewRowCursor cursor, Delegate getter)
{
long row = cursor.Position;
ReadOnlyMemory<char> value = default;
Expand All @@ -489,6 +489,7 @@ protected internal override void AddValueUsingCursor(DataViewRowCursor cursor, D
throw new IndexOutOfRangeException(nameof(row));
}
}

protected internal override Delegate GetValueGetterUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn)
{
return cursor.GetGetter<ReadOnlyMemory<char>>(schemaColumn);
Expand Down
30 changes: 23 additions & 7 deletions test/Microsoft.Data.Analysis.Tests/DataFrameIDataViewTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -252,25 +252,41 @@ public void TestDataFrameFromIDataView_SelectColumns()
Assert.True(df.Columns["Double"].ElementwiseEquals(newDf.Columns["Double"]).All());
}

[Fact]
public void TestDataFrameFromIDataView_SelectRows()
[Theory]
[InlineData(10, 5)]
[InlineData(110, 100)]
[InlineData(110, -1)]
public void TestDataFrameFromIDataView_SelectRows(int dataFrameSize, int rowSize)
{
DataFrame df = DataFrameTests.MakeDataFrameWithAllColumnTypes(10, withNulls: false);
DataFrame df = DataFrameTests.MakeDataFrameWithAllColumnTypes(dataFrameSize, withNulls: false);
df.Columns.Remove("Char"); // Because chars are returned as uint16 by DataViewSchema, so end up comparing CharDataFrameColumn to UInt16DataFrameColumn and fail asserts
df.Columns.Remove("Decimal"); // Because decimal is returned as double by DataViewSchema, so end up comparing DecimalDataFrameColumn to DoubleDataFrameColumn and fail asserts
IDataView dfAsIDataView = df;
DataFrame newDf = dfAsIDataView.ToDataFrame(5);
Assert.Equal(5, newDf.Rows.Count);
DataFrame newDf;
if (rowSize == 100)
{
// Test default
newDf = dfAsIDataView.ToDataFrame();
}
else
{
newDf = dfAsIDataView.ToDataFrame(rowSize);
}
if (rowSize == -1)
{
rowSize = dataFrameSize;
}
Assert.Equal(rowSize, newDf.Rows.Count);
Assert.Equal(df.Columns.Count, newDf.Columns.Count);
for (int i = 0; i < newDf.Columns.Count; i++)
{
Assert.Equal(5, newDf.Columns[i].Length);
Assert.Equal(rowSize, newDf.Columns[i].Length);
Assert.Equal(df.Columns[i].Name, newDf.Columns[i].Name);
}
Assert.Equal(dfAsIDataView.Schema.Count, newDf.Columns.Count);
for (int c = 0; c < df.Columns.Count; c++)
{
for (int r = 0; r < 5; r++)
for (int r = 0; r < rowSize; r++)
{
Assert.Equal(df.Columns[c][r], newDf.Columns[c][r]);
}
Expand Down
10 changes: 6 additions & 4 deletions test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,12 @@ public static ArrowStringDataFrameColumn CreateArrowStringColumn(int length, boo

// write the current length to (index + 1)
int offsetIndex = (i + 1) * 4;
offsetMemory[offsetIndex++] = (byte)(3 * validStringsIndex);
Copy link
Author

@pgovind pgovind Mar 19, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was overflowing when length was big enough (exercised in TestDataFrameFromIDataView_SelectRows). Hence the change.

offsetMemory[offsetIndex++] = 0;
offsetMemory[offsetIndex++] = 0;
offsetMemory[offsetIndex++] = 0;
int offsetValue = 3 * validStringsIndex;
byte[] offsetValueBytes = BitConverter.GetBytes(offsetValue);
offsetMemory[offsetIndex++] = offsetValueBytes[0];
offsetMemory[offsetIndex++] = offsetValueBytes[1];
offsetMemory[offsetIndex++] = offsetValueBytes[2];
offsetMemory[offsetIndex++] = offsetValueBytes[3];
}

int nullCount = withNulls ? 1 : 0;
Expand Down