Skip to content

Commit 69abf14

Browse files
ueshin authored and zhengruifeng committed
[SPARK-43115][CONNECT][PS][TESTS] Split pyspark-pandas-connect from pyspark-connect module
### What changes were proposed in this pull request?

Splits `pyspark-pandas-connect` from the `pyspark-connect` module.

### Why are the changes needed?

Now that we have the pandas API on Spark Connect, the tests for the `pyspark-connect` module take a long time:

- before the pandas API: about 40 mins
- after the pandas API: about 2-3 hours

so we should split `pyspark-pandas-connect` from the `pyspark-connect` module.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Existing tests.

Closes apache#40764 from ueshin/issues/SPARK-43115/pyspark-pandas-connect.

Authored-by: Takuya UESHIN <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
1 parent a45affe commit 69abf14

File tree

3 files changed

+25
-5
lines changed

3 files changed

+25
-5
lines changed

.github/workflows/build_and_test.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,8 @@ jobs:
339339
pyspark-pandas-slow
340340
- >-
341341
pyspark-connect, pyspark-errors
342+
- >-
343+
pyspark-pandas-connect
342344
env:
343345
MODULES_TO_TEST: ${{ matrix.modules }}
344346
HADOOP_PROFILE: ${{ inputs.hadoop }}

dev/sparktestsupport/modules.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -783,6 +783,21 @@ def __hash__(self):
783783
# ml unittests
784784
"pyspark.ml.tests.connect.test_connect_function",
785785
"pyspark.ml.tests.connect.test_parity_torch_distributor",
786+
],
787+
excluded_python_implementations=[
788+
"PyPy" # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and
789+
# they aren't available there
790+
],
791+
)
792+
793+
794+
pyspark_pandas_connect = Module(
795+
name="pyspark-pandas-connect",
796+
dependencies=[pyspark_connect, pyspark_pandas],
797+
source_file_regexes=[
798+
"python/pyspark/pandas",
799+
],
800+
python_test_goals=[
786801
# pandas-on-Spark unittests
787802
"pyspark.pandas.tests.connect.data_type_ops.test_parity_base",
788803
"pyspark.pandas.tests.connect.data_type_ops.test_parity_binary_ops",

dev/sparktestsupport/utils.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -113,21 +113,24 @@ def determine_modules_to_test(changed_modules, deduplicated=True):
113113
... # doctest: +NORMALIZE_WHITESPACE
114114
['avro', 'connect', 'docker-integration-tests', 'examples', 'hive', 'hive-thriftserver',
115115
'mllib', 'protobuf', 'pyspark-connect', 'pyspark-ml', 'pyspark-mllib', 'pyspark-pandas',
116-
'pyspark-pandas-slow', 'pyspark-sql', 'repl', 'sparkr', 'sql', 'sql-kafka-0-10']
116+
'pyspark-pandas-connect', 'pyspark-pandas-slow', 'pyspark-sql', 'repl', 'sparkr', 'sql',
117+
'sql-kafka-0-10']
117118
>>> sorted([x.name for x in determine_modules_to_test(
118119
... [modules.sparkr, modules.sql], deduplicated=False)])
119120
... # doctest: +NORMALIZE_WHITESPACE
120121
['avro', 'connect', 'docker-integration-tests', 'examples', 'hive', 'hive-thriftserver',
121122
'mllib', 'protobuf', 'pyspark-connect', 'pyspark-ml', 'pyspark-mllib', 'pyspark-pandas',
122-
'pyspark-pandas-slow', 'pyspark-sql', 'repl', 'sparkr', 'sql', 'sql-kafka-0-10']
123+
'pyspark-pandas-connect', 'pyspark-pandas-slow', 'pyspark-sql', 'repl', 'sparkr', 'sql',
124+
'sql-kafka-0-10']
123125
>>> sorted([x.name for x in determine_modules_to_test(
124126
... [modules.sql, modules.core], deduplicated=False)])
125127
... # doctest: +NORMALIZE_WHITESPACE
126128
['avro', 'catalyst', 'connect', 'core', 'docker-integration-tests', 'examples', 'graphx',
127129
'hive', 'hive-thriftserver', 'mllib', 'mllib-local', 'protobuf', 'pyspark-connect',
128-
'pyspark-core', 'pyspark-ml', 'pyspark-mllib', 'pyspark-pandas', 'pyspark-pandas-slow',
129-
'pyspark-resource', 'pyspark-sql', 'pyspark-streaming', 'repl', 'root', 'sparkr', 'sql',
130-
'sql-kafka-0-10', 'streaming', 'streaming-kafka-0-10', 'streaming-kinesis-asl']
130+
'pyspark-core', 'pyspark-ml', 'pyspark-mllib', 'pyspark-pandas', 'pyspark-pandas-connect',
131+
'pyspark-pandas-slow', 'pyspark-resource', 'pyspark-sql', 'pyspark-streaming', 'repl', 'root',
132+
'sparkr', 'sql', 'sql-kafka-0-10', 'streaming', 'streaming-kafka-0-10',
133+
'streaming-kinesis-asl']
131134
"""
132135
modules_to_test = set()
133136
for module in changed_modules:

0 commit comments

Comments (0)