Skip to content

Commit 69abf14

Browse files
ueshin authored and zhengruifeng committed
[SPARK-43115][CONNECT][PS][TESTS] Split pyspark-pandas-connect from pyspark-connect module
### What changes were proposed in this pull request?

Splits `pyspark-pandas-connect` from the `pyspark-connect` module.

### Why are the changes needed?

Now that we have the pandas API on Spark Connect, the tests for the `pyspark-connect` module take a long time:

- before the pandas API: about 40 mins
- after the pandas API: about 2-3 hours

so we should split `pyspark-pandas-connect` from the `pyspark-connect` module.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Existing tests.

Closes apache#40764 from ueshin/issues/SPARK-43115/pyspark-pandas-connect.

Authored-by: Takuya UESHIN <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
1 parent a45affe commit 69abf14

File tree

3 files changed

+25
-5
lines changed

3 files changed

+25
-5
lines changed

.github/workflows/build_and_test.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,8 @@ jobs:
339339
pyspark-pandas-slow
340340
- >-
341341
pyspark-connect, pyspark-errors
342+
- >-
343+
pyspark-pandas-connect
342344
env:
343345
MODULES_TO_TEST: ${{ matrix.modules }}
344346
HADOOP_PROFILE: ${{ inputs.hadoop }}

dev/sparktestsupport/modules.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -783,6 +783,21 @@ def __hash__(self):
783783
# ml unittests
784784
"pyspark.ml.tests.connect.test_connect_function",
785785
"pyspark.ml.tests.connect.test_parity_torch_distributor",
786+
],
787+
excluded_python_implementations=[
788+
"PyPy" # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and
789+
# they aren't available there
790+
],
791+
)
792+
793+
794+
pyspark_pandas_connect = Module(
795+
name="pyspark-pandas-connect",
796+
dependencies=[pyspark_connect, pyspark_pandas],
797+
source_file_regexes=[
798+
"python/pyspark/pandas",
799+
],
800+
python_test_goals=[
786801
# pandas-on-Spark unittests
787802
"pyspark.pandas.tests.connect.data_type_ops.test_parity_base",
788803
"pyspark.pandas.tests.connect.data_type_ops.test_parity_binary_ops",

dev/sparktestsupport/utils.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -113,21 +113,24 @@ def determine_modules_to_test(changed_modules, deduplicated=True):
113113
... # doctest: +NORMALIZE_WHITESPACE
114114
['avro', 'connect', 'docker-integration-tests', 'examples', 'hive', 'hive-thriftserver',
115115
'mllib', 'protobuf', 'pyspark-connect', 'pyspark-ml', 'pyspark-mllib', 'pyspark-pandas',
116-
'pyspark-pandas-slow', 'pyspark-sql', 'repl', 'sparkr', 'sql', 'sql-kafka-0-10']
116+
'pyspark-pandas-connect', 'pyspark-pandas-slow', 'pyspark-sql', 'repl', 'sparkr', 'sql',
117+
'sql-kafka-0-10']
117118
>>> sorted([x.name for x in determine_modules_to_test(
118119
... [modules.sparkr, modules.sql], deduplicated=False)])
119120
... # doctest: +NORMALIZE_WHITESPACE
120121
['avro', 'connect', 'docker-integration-tests', 'examples', 'hive', 'hive-thriftserver',
121122
'mllib', 'protobuf', 'pyspark-connect', 'pyspark-ml', 'pyspark-mllib', 'pyspark-pandas',
122-
'pyspark-pandas-slow', 'pyspark-sql', 'repl', 'sparkr', 'sql', 'sql-kafka-0-10']
123+
'pyspark-pandas-connect', 'pyspark-pandas-slow', 'pyspark-sql', 'repl', 'sparkr', 'sql',
124+
'sql-kafka-0-10']
123125
>>> sorted([x.name for x in determine_modules_to_test(
124126
... [modules.sql, modules.core], deduplicated=False)])
125127
... # doctest: +NORMALIZE_WHITESPACE
126128
['avro', 'catalyst', 'connect', 'core', 'docker-integration-tests', 'examples', 'graphx',
127129
'hive', 'hive-thriftserver', 'mllib', 'mllib-local', 'protobuf', 'pyspark-connect',
128-
'pyspark-core', 'pyspark-ml', 'pyspark-mllib', 'pyspark-pandas', 'pyspark-pandas-slow',
129-
'pyspark-resource', 'pyspark-sql', 'pyspark-streaming', 'repl', 'root', 'sparkr', 'sql',
130-
'sql-kafka-0-10', 'streaming', 'streaming-kafka-0-10', 'streaming-kinesis-asl']
130+
'pyspark-core', 'pyspark-ml', 'pyspark-mllib', 'pyspark-pandas', 'pyspark-pandas-connect',
131+
'pyspark-pandas-slow', 'pyspark-resource', 'pyspark-sql', 'pyspark-streaming', 'repl', 'root',
132+
'sparkr', 'sql', 'sql-kafka-0-10', 'streaming', 'streaming-kafka-0-10',
133+
'streaming-kinesis-asl']
131134
"""
132135
modules_to_test = set()
133136
for module in changed_modules:

0 commit comments

Comments (0)