zinggAI · sonalgoyal · May 29, 2025 · May 26, 2025 · May 26, 2025 · May 27, 2025
diff --git a/common/core/src/main/java/zingg/common/core/match/output/LinkOutputBuilder.java b/common/core/src/main/java/zingg/common/core/match/output/LinkOutputBuilder.java
@@ -9,7 +9,6 @@
 import zingg.common.client.FieldDefinition;
 import zingg.common.client.IArguments;
 import zingg.common.client.ZFrame;
-import zingg.common.client.ZinggClientException;
 import zingg.common.client.util.ColName;
 import zingg.common.client.util.DSUtil;
 
@@ -22,12 +21,10 @@ public LinkOutputBuilder(DSUtil<S, D, R, C> dsUtil, IArguments args) {
     }
 
     @Override
-    public ZFrame<D,R,C> getOutput(ZFrame<D, R, C> sampleOriginal, ZFrame<D, R, C> dupesActual) 
-    throws ZinggClientException, Exception{
+    public ZFrame<D,R,C> getOutput(ZFrame<D, R, C> sampleOriginal, ZFrame<D, R, C> dupesActual) {
         dupesActual = dupesActual.withColumn(ColName.CLUSTER_COLUMN, dupesActual.col(ColName.ID_COL));
 		dupesActual = getDSUtil().addUniqueCol(dupesActual, ColName.CLUSTER_COLUMN);
 		ZFrame<D,R,C>dupes2 =  alignLinked(dupesActual, args);
-		dupes2 =  postprocessLinked(dupes2, sampleOriginal);
 		LOG.debug("uncertain output schema is " + dupes2.showSchema());
         return dupes2;
 
@@ -68,24 +65,4 @@ public  ZFrame<D, R, C> alignLinked(ZFrame<D, R, C> dupesActual, IArguments args
 		return dupes1;
 	}
 
-    public ZFrame<D,R,C> getSelectedCols(ZFrame<D,R,C> actual){
-        List<C> cols = new ArrayList<C>();
-        cols.add(actual.col(ColName.CLUSTER_COLUMN));	
-    	cols.add(actual.col(ColName.ID_COL));
-    	cols.add(actual.col(ColName.SCORE_COL));
-    	cols.add(actual.col(ColName.SOURCE_COL));	
-
-    	ZFrame<D,R,C> zFieldsFromActual = actual.select(cols);
-        return zFieldsFromActual;
-
-    }
-
-    public ZFrame<D,R,C> postprocessLinked(ZFrame<D,R,C> actual, ZFrame<D,R,C> orig) {
-    	ZFrame<D,R,C> zFieldsFromActual = getSelectedCols(actual);
-    	ZFrame<D,R,C> joined = zFieldsFromActual.join(orig,ColName.ID_COL,ColName.SOURCE_COL)
-    					.drop(zFieldsFromActual.col(ColName.SOURCE_COL))
-    					.drop(ColName.ID_COL);
-
-    	return joined;
-    }
 }
diff --git a/common/core/src/test/java/zingg/common/core/executor/TestExecutorsGeneric.java b/common/core/src/test/java/zingg/common/core/executor/TestExecutorsGeneric.java
@@ -35,7 +35,6 @@ public void init(S s) throws ZinggClientException, IOException {
 
 	public abstract List<ExecutorTester<S, D, R, C, T>> getExecutors() throws ZinggClientException, IOException;
 
-	//public abstract void tearDown();	
 
 	 @Test
 	public void testExecutors() throws ZinggClientException, IOException {

diff --git a/common/core/src/test/java/zingg/common/core/executor/validate/LinkerValidator.java b/common/core/src/test/java/zingg/common/core/executor/validate/LinkerValidator.java
@@ -3,6 +3,7 @@
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 
+import org.junit.jupiter.api.Assertions;
 import zingg.common.client.ZFrame;
 import zingg.common.client.ZinggClientException;
 import zingg.common.core.executor.Matcher;
@@ -22,26 +23,31 @@ public void validateResults() throws ZinggClientException {
 
     @Override
 	protected void assessAccuracy() throws ZinggClientException {
-		ZFrame<D, R, C> df1  = getOutputData().withColumn("z_zsource", "test1");
-		df1 = df1.select("fname", "id", getClusterColName());
-		df1 = df1.withColumn("dupeRecIdFuzzyMatch",df1.substr(df1.col("id"),0,PREFIX_MATCH_LENGTH)).cache();
-
-		ZFrame<D, R, C> df2 = getOutputData().withColumn("z_zsource", "test2");
-        df2 = df2.select("fname", "id", getClusterColName());
-		df2 = df2.withColumn("dupeRecIdFuzzyMatch",df2.substr(df2.col("id"),0,PREFIX_MATCH_LENGTH)).cache();
-
-		ZFrame<D, R, C> gold = joinAndFilter("dupeRecIdFuzzyMatch", df1, df2).cache();
-		ZFrame<D, R, C> result = joinAndFilter(getClusterColName(), df1, df2).cache();
-
-        testAccuracy(gold, result);	
-	}
+		ZFrame<D, R, C> linkOutput = getOutputData();
+
+		Assertions.assertEquals(11, linkOutput.count());
+		/*
+			Expected clusters (each candidate plus the source records linked to it):
+			  candidate blake      -> one source record  -> 2 records in cluster1
+			  candidate thomas     -> one source record  -> 2 records in cluster2
+			  candidate jackson    -> two source records -> 3 records in cluster3
+			  candidate gianni 1st -> one source record  -> 2 records in cluster4
+			  candidate gianni 2nd -> one source record  -> 2 records in cluster5 (4 gianni rows in total)
+			  candidate takeisha   -> no matching source record, so it is absent from the output
+			Total: 2 + 2 + 3 + 2 + 2 = 11 records.
+		 */
+		ZFrame<D, R, C> blakeCluster = linkOutput.filter(linkOutput.equalTo("fname", "blake"));
+		ZFrame<D, R, C> thomasCluster = linkOutput.filter(linkOutput.equalTo("fname", "thomas"));
+		ZFrame<D, R, C> jacksonCluster = linkOutput.filter(linkOutput.equalTo("fname", "jackson"));
+		ZFrame<D, R, C> gianniCluster = linkOutput.filter(linkOutput.equalTo("fname", "gianni"));
+		ZFrame<D, R, C> takeishaCluster = linkOutput.filter(linkOutput.equalTo("fname", "takeisha"));
+
+		Assertions.assertEquals(2, blakeCluster.count());
+		Assertions.assertEquals(2, thomasCluster.count());
+		Assertions.assertEquals(3, jacksonCluster.count());
+		Assertions.assertEquals(4, gianniCluster.count());
+		Assertions.assertEquals(0, takeishaCluster.count());
 
-	@Override
-	protected ZFrame<D, R, C> joinAndFilter(String colName, ZFrame<D, R, C> df, ZFrame<D, R, C> df1){
-		C col1 = df.col(colName);
-		C col2 = df1.col(colName);
-		ZFrame<D, R, C> joined = df.joinOnCol(df1, df.equalTo(col1, col2));
-		return joined;
 	}
-    
+
 }
diff --git a/config/zingg.conf b/config/zingg.conf
@@ -15,6 +15,7 @@ spark.default.parallelism=8
 spark.debug.maxToStringFields=200
 spark.driver.memory=8g
 spark.executor.memory=8g
+spark.sql.adaptive.enabled=false
 #spark.jars=/home/zingg/pathto.jar
 # Additional Jars could be passed to spark through below configuration. Jars list should be comma(,) separated. 
 #spark.jars=

diff --git a/perf_test/perf_test_report/testReport_febrl120K.json b/perf_test/perf_test_report/testReport_febrl120K.json
@@ -1,9 +1,9 @@
 {
     "date": "2025-05-28",
-    "time": "00:38:05",
+    "time": "01:24:05",
     "test": "febrl_120K",
     "results": {
-        "train": 1.57,
-        "match": 5.86
+        "train": 1.48,
+        "match": 6.82
     }
 }
diff --git a/perf_test/perf_test_report/testReport_ncVoters5M.json b/perf_test/perf_test_report/testReport_ncVoters5M.json
@@ -1,9 +1,9 @@
 {
     "date": "2025-05-28",
-    "time": "03:27:29",
+    "time": "03:46:47",
     "test": "ncVoters_5M",
     "results": {
-        "train": 2.07,
-        "match": 47.12
+        "train": 2.08,
+        "match": 48.45
     }
 }
diff --git a/spark/core/src/main/java/zingg/spark/core/util/SparkGraphUtil.java b/spark/core/src/main/java/zingg/spark/core/util/SparkGraphUtil.java
@@ -37,6 +37,7 @@ public ZFrame<Dataset<Row>, Row, Column> buildGraph(ZFrame<Dataset<Row>, Row, Co
 		GraphFrame gf = new GraphFrame(v, e);
 		//gf = gf.dropIsolatedVertices();
 		//Dataset<Row> returnGraph = gf.connectedComponents().setAlgorithm("graphx").run().cache();
+
 		Dataset<Row> returnGraph = gf.connectedComponents().run().cache();
 		//reverse back o avoid graphframes id :-()
 		returnGraph = returnGraph.join(vertices, returnGraph.col("id").equalTo(vertices.col(ColName.ID_COL)));

diff --git a/spark/core/src/test/java/zingg/spark/core/session/SparkSessionProvider.java b/spark/core/src/test/java/zingg/spark/core/session/SparkSessionProvider.java
@@ -27,11 +27,17 @@ private void initializeSession() {
                 if (sparkDriverMemory == null) {
                     sparkDriverMemory = "1g";
                 }
+                String aqeFlag = System.getenv("ZINGG_AQE_ENABLED");
+                if (aqeFlag == null) {
+                    // env var not set: keep AQE (adaptive query execution) disabled by default
+                    aqeFlag = "false";
+                }
                 sparkSession = SparkSession
                         .builder()
                         .master("local[*]")
                         .appName("ZinggJunit")
                         .config("spark.debug.maxToStringFields", 100)
+                        .config("spark.sql.adaptive.enabled", Boolean.parseBoolean(aqeFlag))
                         .config("spark.driver.memory", sparkDriverMemory)
                         .getOrCreate();
                 SparkContext sparkContext = sparkSession.sparkContext();

diff --git a/spark/core/src/test/resources/zingg/spark/core/executor/linkerCandidate.csv b/spark/core/src/test/resources/zingg/spark/core/executor/linkerCandidate.csv
@@ -0,0 +1,6 @@
+rec-1-candidate, blake, ryan,4, starling place, berkeley vlge, marsden,5412, nsw,19271027,2402765
+rec-2-candidate, thomas, george,1, mcmanus place, stoney creek, north turramurra,3130, sa,19630225,5460534
+rec-3-candidate, jackson, eglinton,840, fowles street, mountview, burleigh heads,2830, sa,19830807,2932837
+rec-4-candidate, gianni, matson,701, willis street, boonooloo, clifton,3101, vic,19410111,2540080
+rec-44-candidate, gianni, matson,701, willis street, boonooloo, clifton,3101, vic,19410111,2540080
+rec-5-candidate, takeisha, freeborn,6, suttor street, the groves street, wentworth falls,4615, vic,19620206,8111362
diff --git a/spark/core/src/test/resources/zingg/spark/core/executor/linkerSource.csv b/spark/core/src/test/resources/zingg/spark/core/executor/linkerSource.csv
@@ -0,0 +1,6 @@
+rec-1-source, blake, ryan,4, starling place, berkeley vlge, marsden,5412, nsw,19271027,2402765
+rec-2-source, thomas, george,1, mcmanus place, stoney creek, north turramurra,3130, sa,19630225,5460534
+rec-3-source, jackson, eglinton,840, fowles street, mountview, burleigh heads,2830, sa,19830807,2932837
+rec-33-source, jackson, eglinton,840, fowles street, mountview, burleigh heads,2830, sa,19830807,2932837
+rec-4-source, gianni, matson,701, willis street, boonooloo, clifton,3101, vic,19410111,2540080
+rec-6-source, dummy, dummy,dummy, dummy, dummy, dummy,dummy, dummy,dummy,dummy
diff --git a/spark/core/src/test/resources/zingg/spark/core/executor/single/configSparkLinkTest.json b/spark/core/src/test/resources/zingg/spark/core/executor/single/configSparkLinkTest.json
@@ -65,7 +65,7 @@
 			"name":"output", 
 			"format":"csv", 
 			"props": {
-				"location": "/tmp/junit_integration_spark/single/zinggOutput",
+				"location": "/tmp/junit_integration_spark/single/zinggLinkOutput",
 				"delimiter": ",",
 				"header":true
 			}
@@ -74,7 +74,7 @@
 			"name":"test1", 
 			"format":"csv", 
 			"props": {
-				"location": "./zingg/spark/core/executor/test1.csv",
+				"location": "./zingg/spark/core/executor/linkerCandidate.csv",
 				"delimiter": ",",
 				"header":false					
 			},
@@ -84,7 +84,7 @@
 			"name":"test2", 
 			"format":"csv", 
 			"props": {
-				"location": "./zingg/spark/core/executor/test2.csv",
+				"location": "./zingg/spark/core/executor/linkerSource.csv",
 				"delimiter": ",",
 				"header":false					
 			},
-Original file line number
+Diff line change
@@ Expand Up @@
     	public abstract List<ExecutorTester<S, D, R, C, T>> getExecutors() throws ZinggClientException, IOException;
-    	//public abstract void tearDown();
     	 @Test
     	public void testExecutors() throws ZinggClientException, IOException {
@@ Expand Down @@