Skip to content

Allow to define CultureInfo for parsing values on reading DataFrame from csv #6782

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Sep 1, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Merge remote-tracking branch 'origin/main' into 5652_support_cultureinfo_dataframe_loadcsv

# Conflicts:
#	src/Microsoft.Data.Analysis/DataFrame.IO.cs
#	test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs
  • Loading branch information
asmirnov82 committed Aug 31, 2023
commit 69d2275a6d0f7063411f2ae49e43f582ce9f4be9
6 changes: 4 additions & 2 deletions src/Microsoft.Data.Analysis/DataFrame.IO.cs
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,7 @@ private static DataFrame ReadCsvLinesIntoDataFrame(WrappedStreamReaderOrStringRe
char separator = ',', bool header = true,
string[] columnNames = null, Type[] dataTypes = null,
long numberOfRowsToRead = -1, int guessRows = 10, bool addIndexColumn = false,
bool renameDuplicatedColumns = false,
CultureInfo cultureInfo = null)
{
if (cultureInfo == null)
Expand Down Expand Up @@ -551,13 +552,14 @@ public static DataFrame LoadCsvFromString(string csvString,
/// <param name="guessRows">number of rows used to guess types</param>
/// <param name="addIndexColumn">add one column with the row index</param>
/// <param name="encoding">The character encoding. Defaults to UTF8 if not specified</param>
/// <param name="renameDuplicatedColumns">If set to true, columns with repeated names are auto-renamed.</param>
/// <param name="cultureInfo">culture info used when parsing values</param>
/// <returns><see cref="DataFrame"/></returns>
public static DataFrame LoadCsv(Stream csvStream,
char separator = ',', bool header = true,
string[] columnNames = null, Type[] dataTypes = null,
long numberOfRowsToRead = -1, int guessRows = 10, bool addIndexColumn = false,
Encoding encoding = null, CultureInfo cultureInfo = null)
Encoding encoding = null, bool renameDuplicatedColumns = false, CultureInfo cultureInfo = null)
{
if (!csvStream.CanSeek)
{
Expand All @@ -570,7 +572,7 @@ public static DataFrame LoadCsv(Stream csvStream,
}

WrappedStreamReaderOrStringReader wrappedStreamReaderOrStringReader = new WrappedStreamReaderOrStringReader(csvStream, encoding ?? Encoding.UTF8);
return ReadCsvLinesIntoDataFrame(wrappedStreamReaderOrStringReader, separator, header, columnNames, dataTypes, numberOfRowsToRead, guessRows, addIndexColumn, cultureInfo);
return ReadCsvLinesIntoDataFrame(wrappedStreamReaderOrStringReader, separator, header, columnNames, dataTypes, numberOfRowsToRead, guessRows, addIndexColumn, renameDuplicatedColumns, cultureInfo);
}

/// <summary>
Expand Down
40 changes: 40 additions & 0 deletions test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,46 @@ void RegularTest(DataFrame df)
RegularTest(csvDf);
}

[Fact]
public void TestReadCsvWithHeaderAndDuplicatedColumns_WithoutRenaming()
{
    // The header lists "payment_type" twice. The literal is a plain verbatim string:
    // it has no interpolation holes, and the continuation lines start at column 0
    // because a verbatim string keeps leading whitespace as part of the data.
    string data = @"vendor_id,rate_code,passenger_count,trip_time_in_secs,trip_distance,payment_type,payment_type,fare_amount
CMT,1,1,1271,3.8,CRD,CRD,17.5
CMT,1,1,474,1.5,CRD,CRD,8
CMT,1,1,637,1.4,CRD,CRD,8.5
CMT,1,1,181,0.6,CSH,CSH,4.5";

    // renameDuplicatedColumns is not passed, so the call must fail on the repeated column name.
    Assert.Throws<System.ArgumentException>(() => DataFrame.LoadCsv(GetStream(data)));
}

[Fact]
public void TestReadCsvWithHeaderAndDuplicatedColumns_WithDuplicateColumnRenaming()
{
    // The header lists "payment_type" three times (zero-based columns 5, 6 and 7).
    // The literal is a plain verbatim string: it has no interpolation holes, and the
    // continuation lines start at column 0 because leading whitespace would become data.
    string data = @"vendor_id,rate_code,passenger_count,trip_time_in_secs,trip_distance,payment_type,payment_type,payment_type,fare_amount
CMT,1,1,1271,3.8,CRD,CRD_1,Test,17.5
CMT,1,1,474,1.5,CRD,CRD,Test,8
CMT,1,1,637,1.4,CRD,CRD,Test,8.5
CMT,1,1,181,0.6,CSH,CSH,Test,4.5";

    DataFrame df = DataFrame.LoadCsv(GetStream(data), renameDuplicatedColumns: true);

    // Overall shape: 4 data rows, 9 columns (no column dropped or merged).
    Assert.Equal(4, df.Rows.Count);
    Assert.Equal(9, df.Columns.Count);
    Assert.Equal("CMT", df.Columns["vendor_id"][3]);

    // The first occurrence keeps its name; later occurrences get ".1", ".2" suffixes.
    Assert.Equal("payment_type", df.Columns[5].Name);
    Assert.Equal("payment_type.1", df.Columns[6].Name);
    Assert.Equal("payment_type.2", df.Columns[7].Name);

    // Each renamed column still holds the values from its own position in the first data row.
    Assert.Equal("CRD", df.Columns["payment_type"][0]);
    Assert.Equal("CRD_1", df.Columns["payment_type.1"][0]);
    Assert.Equal("Test", df.Columns["payment_type.2"][0]);

    VerifyColumnTypes(df);
}

[Fact]
public void TestReadCsvSplitAcrossMultipleLines()
{
Expand Down
You are viewing a condensed version of this merge commit. You can view the full changes here.