Skip to content

Add ability to merge using multiple columns in JOIN condition #5838

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 27 commits into from
Sep 28, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
c527c21
fix #5767 issue with DataFrame Merge method
Apr 26, 2021
a38fdf5
Merge pull request #1 from dotnet/main
Apr 29, 2021
e5f5eeb
Merge pull request #2 from dotnet/main
May 27, 2021
0580710
#5820 Extend DataFrame GroupBy operations
May 28, 2021
f7658b2
#5820 fix code review findings
May 29, 2021
dc8daf0
Avoid code duplication in Merge DataFrame method (#5657)
Jun 2, 2021
b36fbee
Merge pull request #4 from dotnet/main
Jun 4, 2021
a4735ce
Add non generic DataFrame Merge method (#5657)
Jun 4, 2021
7893bfd
Add support for multi columns join in DataFrame (#5657)
Jun 4, 2021
28fcda3
Fix failing tests for inner, left and right joins with nulls
Jun 5, 2021
5da5a37
#5657 fix DataFrame outer join failing tests
Jun 7, 2021
040f6bb
rebase to the latest main
May 29, 2021
c1b7969
Avoid code duplication in Merge DataFrame method (#5657)
Jun 2, 2021
bd22036
Add non generic DataFrame Merge method (#5657)
Jun 4, 2021
d2d5f36
Add support for multi columns join in DataFrame (#5657)
Jun 4, 2021
0494f7c
Fix failing tests for inner, left and right joins with nulls
Jun 5, 2021
976351c
#5657 fix DataFrame outer join failing tests
Jun 7, 2021
b5bff6c
#5657 fix merge issues
Jun 7, 2021
cbd02dc
Merge branch 'feature/5657_dataframe_merge_multiple_columns' of https…
Jun 7, 2021
1940f3a
Minor changes #5657
Jun 7, 2021
4232fee
Add self explanatory exception text (#5657)
Jun 7, 2021
33a99c3
Add Asserts to new unit tests (#5657)
Jun 7, 2021
12a5833
Minor changes (#5657)
Jun 7, 2021
22fc3e2
Fix right merge by 3-columns test fails
Jun 8, 2021
989711b
Merge pull request #5 from asmirnov82/feature/5657_dataframe_merge_mu…
Jun 8, 2021
2f22cd7
fixed typos (#5657)
Sep 4, 2021
3396554
Merge pull request #6 from dotnet/main
Sep 19, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Minor changes #5657
  • Loading branch information
Alexey Smirnov committed Jun 7, 2021
commit 1940f3a234722aa8a6c7a29302c00be6060a3ebc
1 change: 0 additions & 1 deletion src/Microsoft.Data.Analysis/DataFrame.Join.cs
Original file line number Diff line number Diff line change
Expand Up @@ -344,7 +344,6 @@ public DataFrame Merge(DataFrame other, string[] leftJoinColumns, string[] right

//Step 2
//Do RIGHT JOIN to retain all data from supplementary DataFrame too (take into account data intersection from the first step to avoid duplicates)

for (long i = 0; i < supplementaryDataFrame.Columns.RowCount; i++)
{
var columns = supplementaryJoinColumns.Select(name => supplementaryDataFrame.Columns[name]).ToArray();
Expand Down
1 change: 0 additions & 1 deletion src/Microsoft.Data.Analysis/DataFrame.cs
Original file line number Diff line number Diff line change
Expand Up @@ -384,7 +384,6 @@ public GroupBy<TKey> GroupBy<TKey>(string columnName)
throw new InvalidCastException(String.Format(Strings.BadColumnCastDuringGrouping, columnName, column.DataType, typeof(TKey)));
}


return group;
}

Expand Down
17 changes: 8 additions & 9 deletions test/Microsoft.Data.Analysis.Tests/DataFrameGroupByTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@ public class DataFrameGroupByTests
[Fact]
public void TestGroupingWithTKeyTypeofString()
{
const int lenght = 11;
const int length = 11;

//Create test dataframe (numbers starting from 0 up to length)
DataFrame df = MakeTestDataFrameWithParityAndTensColumns(lenght);
DataFrame df = MakeTestDataFrameWithParityAndTensColumns(length);

var grouping = df.GroupBy<string>("Parity").Groupings;

Expand All @@ -28,11 +28,11 @@ public void TestGroupingWithTKeyTypeofString()
//Check number of elements in each group
var oddGroup = grouping.Where(gr => gr.Key == "odd").FirstOrDefault();
Assert.NotNull(oddGroup);
Assert.Equal(lenght/2, oddGroup.Count());
Assert.Equal(length/2, oddGroup.Count());

var evenGroup = grouping.Where(gr => gr.Key == "even").FirstOrDefault();
Assert.NotNull(evenGroup);
Assert.Equal(lenght / 2 + lenght % 2, evenGroup.Count());
Assert.Equal(length / 2 + length % 2, evenGroup.Count());


}
Expand All @@ -56,10 +56,10 @@ public void TestGroupingWithTKey_CornerCases()
[Fact]
public void TestGroupingWithTKeyPrimitiveType()
{
const int lenght = 55;
const int length = 55;

//Create test dataframe (numbers starting from 0 up to length)
DataFrame df = MakeTestDataFrameWithParityAndTensColumns(lenght);
DataFrame df = MakeTestDataFrameWithParityAndTensColumns(length);

//Group elements by the int column that contains the number of full tens in each int
var groupings = df.GroupBy<int>("Tens").Groupings.ToDictionary(g => g.Key, g => g.ToList());
Expand All @@ -68,7 +68,7 @@ public void TestGroupingWithTKeyPrimitiveType()
int numberColumnsCount = df.Columns.Count - 2; //except "Parity" and "Tens" columns

//Check each group
for (int i = 0; i < lenght / 10; i++)
for (int i = 0; i < length / 10; i++)
{
Assert.Equal(10, groupings[i].Count());

Expand All @@ -85,13 +85,12 @@ public void TestGroupingWithTKeyPrimitiveType()
}

//Last group should contain a smaller number of items
Assert.Equal(lenght % 10, groupings[lenght / 10].Count());
Assert.Equal(length % 10, groupings[length / 10].Count());
}

[Fact]
public void TestGroupingWithTKeyOfWrongType()
{

var message = string.Empty;

//Create test dataframe (numbers starting from 0 up to length)
Expand Down
4 changes: 1 addition & 3 deletions test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1705,7 +1705,7 @@ public void TestMerge()
Assert.Equal(merge.Columns["Int_right"][2], right.Columns["Int"][2]);
VerifyMerge(merge, left, right, JoinAlgorithm.Inner);
}

private void MatchRowsOnMergedDataFrame(DataFrame merge, DataFrame left, DataFrame right, long mergeRow, long? leftRow, long? rightRow)
{
Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count);
Expand Down Expand Up @@ -1775,7 +1775,6 @@ public void TestMergeEdgeCases_LeftOrRight(int leftLength, int rightLength, Join
}
}


[Fact]
public void TestMergeEdgeCases_Inner()
{
Expand Down Expand Up @@ -2237,7 +2236,6 @@ public void TestClone(int dfLength, int intDfLength)
}
}
}


[Fact]
public void TestColumnCreationFromExisitingColumn()
Expand Down