
Commit 04125eb

ueshin authored and HyukjinKwon committed
[SPARK-41971][CONNECT][PYTHON][FOLLOWUP] Fix to_pandas to support the older Spark
### What changes were proposed in this pull request?

This is a follow-up of apache#40988. Fix `to_pandas` to support older Spark servers.

For the server:

```sh
% ./sbin/start-connect-server.sh --packages org.apache.spark:spark-connect_2.12:3.4.0
```

with a client that includes this change:

```py
>>> spark.sql("values (1, struct('x' as x)) as t(a, b)").toPandas()
   a           b
0  1  {'x': 'x'}
```

### Why are the changes needed?

The config `spark.sql.execution.pandas.structHandlingMode`, introduced in apache#40988, does not exist in older Spark (`< 3.5`):

```py
>>> spark.sql("values (1, struct('x' as x)) as t(a, b)").toPandas()
Traceback (most recent call last):
...
pyspark.errors.exceptions.connect.SparkConnectGrpcException: (java.util.NoSuchElementException) spark.sql.execution.pandas.structHandlingMode
```

### Does this PR introduce _any_ user-facing change?

The newer Spark Connect client will work with `Spark < 3.5`.

### How was this patch tested?

Manually.

Closes apache#41390 from ueshin/issues/SPARK-41971/config_with_default.

Authored-by: Takuya UESHIN <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
1 parent 11390c5 commit 04125eb
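How the fix works at the RPC level, as a minimal sketch: the old code fetched the key with a plain `Get` config operation, which a Spark < 3.5 server answers with `java.util.NoSuchElementException` for a key it does not know, while the fix sends `GetWithDefault`, which carries a client-side fallback per key. The import path below is an assumption; the message shapes mirror the diff further down.

```py
import pyspark.sql.connect.proto as pb2  # assumed import path for the Connect protos

# Plain Get, the pre-fix behavior: a Spark < 3.5 server raises
# java.util.NoSuchElementException because this key is not defined there.
get_op = pb2.ConfigRequest.Operation(
    get=pb2.ConfigRequest.Get(keys=["spark.sql.execution.pandas.structHandlingMode"])
)

# GetWithDefault, the post-fix behavior: each key travels with a fallback
# value, so an unknown key resolves to the default instead of erroring.
get_with_default_op = pb2.ConfigRequest.Operation(
    get_with_default=pb2.ConfigRequest.GetWithDefault(
        pairs=[
            pb2.KeyValue(
                key="spark.sql.execution.pandas.structHandlingMode",
                value="legacy",
            )
        ]
    )
)
```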

File tree

  • python/pyspark/sql/connect/client

1 file changed: +17 -3 lines


python/pyspark/sql/connect/client/core.py

Lines changed: 17 additions & 3 deletions
```diff
@@ -726,11 +726,14 @@ def to_pandas(self, plan: pb2.Plan) -> "pd.DataFrame":
 
         if len(pdf.columns) > 0:
             timezone: Optional[str] = None
+            if any(_has_type(f.dataType, TimestampType) for f in schema.fields):
+                (timezone,) = self.get_configs("spark.sql.session.timeZone")
+
             struct_in_pandas: Optional[str] = None
             error_on_duplicated_field_names: bool = False
-            if any(_has_type(f.dataType, (StructType, TimestampType)) for f in schema.fields):
-                timezone, struct_in_pandas = self.get_configs(
-                    "spark.sql.session.timeZone", "spark.sql.execution.pandas.structHandlingMode"
+            if any(_has_type(f.dataType, StructType) for f in schema.fields):
+                (struct_in_pandas,) = self.get_config_with_defaults(
+                    ("spark.sql.execution.pandas.structHandlingMode", "legacy"),
                 )
 
             if struct_in_pandas == "legacy":
@@ -1108,6 +1111,17 @@ def get_configs(self, *keys: str) -> Tuple[Optional[str], ...]:
         configs = dict(self.config(op).pairs)
         return tuple(configs.get(key) for key in keys)
 
+    def get_config_with_defaults(
+        self, *pairs: Tuple[str, Optional[str]]
+    ) -> Tuple[Optional[str], ...]:
+        op = pb2.ConfigRequest.Operation(
+            get_with_default=pb2.ConfigRequest.GetWithDefault(
+                pairs=[pb2.KeyValue(key=key, value=default) for key, default in pairs]
+            )
+        )
+        configs = dict(self.config(op).pairs)
+        return tuple(configs.get(key) for key, _ in pairs)
+
     def config(self, operation: pb2.ConfigRequest.Operation) -> ConfigResult:
         """
         Call the config RPC of Spark Connect.
```
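A hedged usage sketch of the new helper, where `client` stands for a `SparkConnectClient` instance (an illustrative name; the signature and fallback semantics come from the diff above):

```py
# Before: both keys were fetched with get_configs, and a Spark < 3.5 server
# raised NoSuchElementException for the structHandlingMode key:
#
#     timezone, struct_in_pandas = client.get_configs(
#         "spark.sql.session.timeZone", "spark.sql.execution.pandas.structHandlingMode"
#     )
#
# After: the key is requested with a client-side default, so an older server
# resolves it to "legacy" instead of raising:
(struct_in_pandas,) = client.get_config_with_defaults(
    ("spark.sql.execution.pandas.structHandlingMode", "legacy"),
)
```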
