Skip to content

Commit 31bafc5

Browse files
author
Prashanth Govindarajan
authored
Merge 1203495 into 82b4838
2 parents 82b4838 + 1203495 commit 31bafc5

9 files changed

+431
-10
lines changed

src/Microsoft.Data.Analysis/DataFrame.IDataView.cs

+2-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
namespace Microsoft.Data.Analysis
1212
{
1313
public partial class DataFrame : IDataView
14-
{
14+
{
1515
// TODO: support shuffling
1616
bool IDataView.CanShuffle => false;
1717

@@ -53,6 +53,7 @@ private DataViewRowCursor GetRowCursorCore(IEnumerable<DataViewSchema.Column> co
5353

5454
return new RowCursor(this, activeColumns);
5555
}
56+
5657
DataViewRowCursor IDataView.GetRowCursor(IEnumerable<DataViewSchema.Column> columnsNeeded, Random rand)
5758
{
5859
return GetRowCursorCore(columnsNeeded);

src/Microsoft.Data.Analysis/DataFrameColumn.cs

+14
Original file line numberDiff line numberDiff line change
@@ -247,6 +247,20 @@ public virtual DataFrameColumn Sort(bool ascending = true)
247247
/// </param>
248248
protected internal virtual void AddDataViewColumn(DataViewSchema.Builder builder) => throw new NotImplementedException();
249249

250+
/// <summary>
/// Appends the value found at the current position of <paramref name="cursor"/> to this <see cref="DataFrameColumn"/>.
/// The base implementation always throws; derived column types supply the real behavior.
/// </summary>
/// <param name="cursor">The row cursor which has the current position</param>
/// <param name="ValueGetter">The cached ValueGetter for this column.</param>
/// <exception cref="NotImplementedException">Always thrown by this base implementation.</exception>
protected internal virtual void AddValueUsingCursor(DataViewRowCursor cursor, Delegate ValueGetter)
{
    throw new NotImplementedException();
}
256+
257+
/// <summary>
/// Returns the ValueGetter that <paramref name="cursor"/> exposes for <paramref name="schemaColumn"/>, as a delegate to be cached by the caller.
/// The base implementation always throws; derived column types supply the real behavior.
/// </summary>
/// <param name="cursor">The row cursor which has the current position</param>
/// <param name="schemaColumn">The <see cref="DataViewSchema.Column"/> to return the ValueGetter for.</param>
/// <exception cref="NotImplementedException">Always thrown by this base implementation.</exception>
protected internal virtual Delegate GetValueGetterUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn)
{
    throw new NotImplementedException();
}
263+
250264
/// <summary>
251265
/// Clamps values beyond the specified thresholds
252266
/// </summary>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
// Licensed to the .NET Foundation under one or more agreements.
2+
// The .NET Foundation licenses this file to you under the MIT license.
3+
// See the LICENSE file in the project root for more information.
4+
5+
using System;
6+
using System.Collections.Generic;
7+
using Microsoft.Data.Analysis;
8+
using Microsoft.ML.Data;
9+
10+
namespace Microsoft.ML
11+
{
12+
public static class IDataViewExtensions
{
    private const int defaultMaxRows = 100;

    /// <summary>
    /// Returns a <see cref="Microsoft.Data.Analysis.DataFrame"/> from this <paramref name="dataView"/>.
    /// </summary>
    /// <param name="dataView">The current <see cref="IDataView"/>.</param>
    /// <param name="maxRows">The max number of rows in the <see cref="Microsoft.Data.Analysis.DataFrame"/>. Defaults to 100. Use -1 to construct a DataFrame using all the rows in <paramref name="dataView"/>.</param>
    /// <returns>A <see cref="Microsoft.Data.Analysis.DataFrame"/> with at most <paramref name="maxRows"/> rows.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="dataView"/> is null.</exception>
    /// <exception cref="NotSupportedException">A visible column has a type that has no matching DataFrame column.</exception>
    public static DataFrame ToDataFrame(this IDataView dataView, long maxRows = defaultMaxRows)
    {
        return ToDataFrame(dataView, maxRows, null);
    }

    /// <summary>
    /// Returns a <see cref="Microsoft.Data.Analysis.DataFrame"/> with the first 100 rows of this <paramref name="dataView"/>.
    /// </summary>
    /// <param name="dataView">The current <see cref="IDataView"/>.</param>
    /// <param name="selectColumns">The columns selected for the resultant DataFrame. Null or empty selects every visible column.</param>
    /// <returns>A <see cref="Microsoft.Data.Analysis.DataFrame"/> with the selected columns and at most 100 rows.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="dataView"/> is null.</exception>
    /// <exception cref="NotSupportedException">A selected column has a type that has no matching DataFrame column.</exception>
    public static DataFrame ToDataFrame(this IDataView dataView, params string[] selectColumns)
    {
        return ToDataFrame(dataView, defaultMaxRows, selectColumns);
    }

    /// <summary>
    /// Returns a <see cref="Microsoft.Data.Analysis.DataFrame"/> with the first <paramref name="maxRows"/> rows of this <paramref name="dataView"/>.
    /// </summary>
    /// <param name="dataView">The current <see cref="IDataView"/>.</param>
    /// <param name="maxRows">The max number of rows in the <see cref="Microsoft.Data.Analysis.DataFrame"/>. Use -1 to construct a DataFrame using all the rows in <paramref name="dataView"/>. Zero or any other negative value produces a DataFrame with no rows.</param>
    /// <param name="selectColumns">The columns selected for the resultant DataFrame. Null or empty selects every visible column; names that match no schema column are ignored.</param>
    /// <returns>A <see cref="Microsoft.Data.Analysis.DataFrame"/> with the selected columns and at most <paramref name="maxRows"/> rows.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="dataView"/> is null.</exception>
    /// <exception cref="NotSupportedException">A selected column has a type that has no matching DataFrame column.</exception>
    public static DataFrame ToDataFrame(this IDataView dataView, long maxRows, params string[] selectColumns)
    {
        if (dataView == null)
        {
            throw new ArgumentNullException(nameof(dataView));
        }

        DataViewSchema schema = dataView.Schema;

        // -1 is the "no limit" sentinel.
        long rowLimit = maxRows == -1 ? long.MaxValue : maxRows;

        HashSet<string> selectColumnsSet = null;
        if (selectColumns != null && selectColumns.Length > 0)
        {
            selectColumnsSet = new HashSet<string>(selectColumns);
        }

        // The two lists are kept index-aligned: dataFrameColumns[i] receives the values of activeDataViewColumns[i].
        List<DataFrameColumn> dataFrameColumns = new List<DataFrameColumn>(schema.Count);
        List<DataViewSchema.Column> activeDataViewColumns = new List<DataViewSchema.Column>(schema.Count);
        foreach (DataViewSchema.Column dataViewColumn in schema)
        {
            if (dataViewColumn.IsHidden || (selectColumnsSet != null && !selectColumnsSet.Contains(dataViewColumn.Name)))
            {
                continue;
            }

            dataFrameColumns.Add(CreateDataFrameColumn(dataViewColumn));
            activeDataViewColumns.Add(dataViewColumn);
        }

        using (DataViewRowCursor cursor = dataView.GetRowCursor(activeDataViewColumns))
        {
            // Resolve each getter once up front so the per-row loop only invokes cached delegates.
            Delegate[] activeColumnDelegates = new Delegate[activeDataViewColumns.Count];
            for (int columnIndex = 0; columnIndex < activeColumnDelegates.Length; columnIndex++)
            {
                activeColumnDelegates[columnIndex] = dataFrameColumns[columnIndex].GetValueGetterUsingCursor(cursor, activeDataViewColumns[columnIndex]);
            }

            // Check the limit BEFORE advancing so the cursor is never asked for a row that would be discarded.
            long rowsAdded = 0;
            while (rowsAdded < rowLimit && cursor.MoveNext())
            {
                for (int i = 0; i < activeColumnDelegates.Length; i++)
                {
                    dataFrameColumns[i].AddValueUsingCursor(cursor, activeColumnDelegates[i]);
                }

                rowsAdded++;
            }
        }

        return new DataFrame(dataFrameColumns);
    }

    /// <summary>
    /// Creates the empty <see cref="DataFrameColumn"/> whose element type matches the type of <paramref name="dataViewColumn"/>.
    /// </summary>
    /// <param name="dataViewColumn">The schema column to mirror; its name becomes the new column's name.</param>
    /// <returns>A new, empty column named after <paramref name="dataViewColumn"/>.</returns>
    /// <exception cref="NotSupportedException">The column type is not one of the handled boolean, numeric or text types.</exception>
    private static DataFrameColumn CreateDataFrameColumn(DataViewSchema.Column dataViewColumn)
    {
        DataViewType type = dataViewColumn.Type;
        string name = dataViewColumn.Name;

        if (type == BooleanDataViewType.Instance)
        {
            return new BooleanDataFrameColumn(name);
        }

        if (type == NumberDataViewType.Byte)
        {
            return new ByteDataFrameColumn(name);
        }

        if (type == NumberDataViewType.Double)
        {
            return new DoubleDataFrameColumn(name);
        }

        if (type == NumberDataViewType.Single)
        {
            return new SingleDataFrameColumn(name);
        }

        if (type == NumberDataViewType.Int32)
        {
            return new Int32DataFrameColumn(name);
        }

        if (type == NumberDataViewType.Int64)
        {
            return new Int64DataFrameColumn(name);
        }

        if (type == NumberDataViewType.SByte)
        {
            return new SByteDataFrameColumn(name);
        }

        if (type == NumberDataViewType.Int16)
        {
            return new Int16DataFrameColumn(name);
        }

        if (type == NumberDataViewType.UInt32)
        {
            return new UInt32DataFrameColumn(name);
        }

        if (type == NumberDataViewType.UInt64)
        {
            return new UInt64DataFrameColumn(name);
        }

        if (type == NumberDataViewType.UInt16)
        {
            return new UInt16DataFrameColumn(name);
        }

        if (type == TextDataViewType.Instance)
        {
            return new StringDataFrameColumn(name);
        }

        throw new NotSupportedException(String.Format(Microsoft.Data.Strings.NotSupportedColumnType, type.RawType.Name));
    }
}
143+
144+
}

src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.cs

+26
Original file line numberDiff line numberDiff line change
@@ -775,5 +775,31 @@ private static ValueGetter<ushort> CreateCharValueGetterDelegate(DataViewRowCurs
775775

776776
/// <summary>
/// Builds a getter that reads the decimal at the cursor's current position in <paramref name="column"/>
/// and yields it as a double, substituting <see cref="double.NaN"/> when the cell holds no value.
/// </summary>
private static ValueGetter<double> CreateDecimalValueGetterDelegate(DataViewRowCursor cursor, PrimitiveDataFrameColumn<decimal> column)
{
    return (ref double value) =>
    {
        value = (double?)column[cursor.Position] ?? double.NaN;
    };
}
778+
779+
/// <summary>
/// Reads the value at the current position of <paramref name="cursor"/> through <paramref name="getter"/>
/// and stores it in this column: an existing row is overwritten, and a row exactly one past the end is appended.
/// </summary>
/// <param name="cursor">The row cursor which has the current position.</param>
/// <param name="getter">The cached getter for this column; must be a <see cref="ValueGetter{T}"/> of this column's element type.</param>
/// <exception cref="InvalidCastException"><paramref name="getter"/> is not a <see cref="ValueGetter{T}"/> of this column's element type.</exception>
/// <exception cref="IndexOutOfRangeException">The cursor position is more than one row past the end of this column.</exception>
protected internal override void AddValueUsingCursor(DataViewRowCursor cursor, Delegate getter)
{
    long row = cursor.Position;
    Debug.Assert(getter != null, "Expected getter to be valid");

    // A direct cast reports a mismatched delegate type as an InvalidCastException,
    // rather than the NullReferenceException produced by invoking the result of a failed `as` cast.
    ValueGetter<T> typedGetter = (ValueGetter<T>)getter;

    T value = default;
    typedGetter(ref value);

    if (Length > row)
    {
        this[row] = value;
    }
    else if (Length == row)
    {
        Append(value);
    }
    else
    {
        // Appending here would leave a gap between the current end of the column and the cursor's row.
        throw new IndexOutOfRangeException(nameof(row));
    }
}
799+
800+
/// <summary>
/// Asks <paramref name="cursor"/> for the getter of <paramref name="schemaColumn"/>, typed to this column's element type.
/// </summary>
/// <param name="cursor">The row cursor to obtain the getter from.</param>
/// <param name="schemaColumn">The <see cref="DataViewSchema.Column"/> to return the getter for.</param>
/// <returns>The getter returned by the cursor, as an untyped <see cref="Delegate"/>.</returns>
protected internal override Delegate GetValueGetterUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn) =>
    cursor.GetGetter<T>(schemaColumn);
778804
}
779805
}

src/Microsoft.Data.Analysis/StringDataFrameColumn.cs

+27
Original file line numberDiff line numberDiff line change
@@ -467,5 +467,32 @@ protected internal override Delegate GetDataViewGetter(DataViewRowCursor cursor)
467467

468468
/// <summary>
/// Builds a getter that reads this column's string at the cursor's current position
/// and exposes it as a <see cref="ReadOnlyMemory{T}"/> of characters.
/// </summary>
private ValueGetter<ReadOnlyMemory<char>> CreateValueGetterDelegate(DataViewRowCursor cursor)
{
    return (ref ReadOnlyMemory<char> value) =>
    {
        value = this[cursor.Position].AsMemory();
    };
}
470+
471+
/// <summary>
/// Reads the text at the current position of <paramref name="cursor"/> through <paramref name="getter"/>
/// and stores it in this column as a string: an existing row is overwritten, and a row exactly one past the end is appended.
/// </summary>
/// <param name="cursor">The row cursor which has the current position.</param>
/// <param name="getter">The cached getter for this column; must be a <see cref="ValueGetter{T}"/> of <see cref="ReadOnlyMemory{T}"/> of characters.</param>
/// <exception cref="InvalidCastException"><paramref name="getter"/> is not a text getter.</exception>
/// <exception cref="IndexOutOfRangeException">The cursor position is more than one row past the end of this column.</exception>
protected internal override void AddValueUsingCursor(DataViewRowCursor cursor, Delegate getter)
{
    long row = cursor.Position;
    Debug.Assert(getter != null, "Expected getter to be valid");

    // A direct cast reports a mismatched delegate type as an InvalidCastException,
    // rather than the NullReferenceException produced by invoking the result of a failed `as` cast.
    ValueGetter<ReadOnlyMemory<char>> typedGetter = (ValueGetter<ReadOnlyMemory<char>>)getter;

    ReadOnlyMemory<char> value = default;
    typedGetter(ref value);

    // Materialize the string once; both storing branches need it.
    string text = value.ToString();

    if (Length > row)
    {
        this[row] = text;
    }
    else if (Length == row)
    {
        Append(text);
    }
    else
    {
        // Appending here would leave a gap between the current end of the column and the cursor's row.
        throw new IndexOutOfRangeException(nameof(row));
    }
}
492+
493+
/// <summary>
/// Asks <paramref name="cursor"/> for the text getter of <paramref name="schemaColumn"/>.
/// </summary>
/// <param name="cursor">The row cursor to obtain the getter from.</param>
/// <param name="schemaColumn">The <see cref="DataViewSchema.Column"/> to return the getter for.</param>
/// <returns>The getter returned by the cursor, as an untyped <see cref="Delegate"/>.</returns>
protected internal override Delegate GetValueGetterUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn) =>
    cursor.GetGetter<ReadOnlyMemory<char>>(schemaColumn);
470497
}
471498
}

src/Microsoft.Data.Analysis/strings.Designer.cs

+9
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/Microsoft.Data.Analysis/strings.resx

+4-1
Original file line numberDiff line numberDiff line change
@@ -183,10 +183,13 @@
183183
<data name="NonSeekableStream" xml:space="preserve">
184184
<value>Expected a seekable stream</value>
185185
</data>
186+
<data name="NotSupportedColumnType" xml:space="preserve">
187+
<value>{0} is not a supported column type.</value>
188+
</data>
186189
<data name="NumericColumnType" xml:space="preserve">
187190
<value>numeric column</value>
188191
</data>
189192
<data name="SpansMultipleBuffers" xml:space="preserve">
190193
<value>Cannot span multiple buffers</value>
191194
</data>
192-
</root>
195+
</root>

0 commit comments

Comments
 (0)