Commit f1a161c
[SPARK-44561][PYTHON] Fix AssertionError when converting UDTF output to a complex type
### What changes were proposed in this pull request?

Fixes an `AssertionError` when converting UDTF output to a complex type, by ignoring the assertions in `_create_converter_from_pandas` so that Arrow raises the error instead.

### Why are the changes needed?

There is an assertion in `_create_converter_from_pandas`, but it should not be applied in the Python UDTF case.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Added/modified the related tests.

Closes apache#42310 from ueshin/issues/SPARK-44561/udtf_complex_types.

Authored-by: Takuya UESHIN <[email protected]>
Signed-off-by: Takuya UESHIN <[email protected]>
1 parent 2a23c7a commit f1a161c
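For context, a minimal sketch of the failure mode described above; the UDTF below is illustrative and not part of the patch. Before this fix, yielding a value that does not match a declared complex output type tripped a bare `AssertionError` inside the pandas-to-Arrow converter; after it, the value is passed through so Arrow can raise a descriptive conversion error.

```python
from pyspark.sql.functions import udtf

# Hypothetical UDTF: the schema declares an array column, but eval() yields
# a scalar for it. Pre-patch this failed with AssertionError in
# _create_converter_from_pandas; post-patch the mismatched value reaches
# Arrow, which reports a proper conversion error.
@udtf(returnType="x: array<int>")
class BadArrayUDTF:
    def eval(self):
        yield 42,  # 42 is not an Iterable, so it cannot become array<int>
```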

4 files changed: +314 −49 lines changed


python/pyspark/sql/pandas/serializers.py

Lines changed: 4 additions & 1 deletion
```diff
@@ -571,7 +571,10 @@ def _create_array(self, series, arrow_type, spark_type=None, arrow_cast=False):
         dt = spark_type or from_arrow_type(arrow_type, prefer_timestamp_ntz=True)
         # TODO(SPARK-43579): cache the converter for reuse
         conv = _create_converter_from_pandas(
-            dt, timezone=self._timezone, error_on_duplicated_field_names=False
+            dt,
+            timezone=self._timezone,
+            error_on_duplicated_field_names=False,
+            ignore_unexpected_complex_type_values=True,
         )
         series = conv(series)
```
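This serializer change simply opts the Arrow-optimized path into the new lenient mode. A rough sketch of the converter it now builds, using hypothetical values (and assuming the private helper keeps the signature shown in the diff):

```python
import pandas as pd

from pyspark.sql.pandas.types import _create_converter_from_pandas
from pyspark.sql.types import ArrayType, IntegerType

conv = _create_converter_from_pandas(
    ArrayType(IntegerType()),
    timezone=None,
    error_on_duplicated_field_names=False,
    ignore_unexpected_complex_type_values=True,
)

# Well-formed values convert as before, while the non-iterable 3 is returned
# untouched, leaving Arrow to raise an informative error downstream.
print(conv(pd.Series([[1, 2], None, 3])).tolist())  # [[1, 2], None, 3]
```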

python/pyspark/sql/pandas/types.py

Lines changed: 82 additions & 26 deletions
```diff
@@ -21,7 +21,7 @@
 """
 import datetime
 import itertools
-from typing import Any, Callable, List, Optional, Union, TYPE_CHECKING
+from typing import Any, Callable, Iterable, List, Optional, Union, TYPE_CHECKING

 from pyspark.sql.types import (
     cast,
@@ -750,6 +750,7 @@ def _create_converter_from_pandas(
     *,
     timezone: Optional[str],
     error_on_duplicated_field_names: bool = True,
+    ignore_unexpected_complex_type_values: bool = False,
 ) -> Callable[["pd.Series"], "pd.Series"]:
     """
     Create a converter of pandas Series to create Spark DataFrame with Arrow optimization.
@@ -763,6 +764,17 @@ def _create_converter_from_pandas(
     error_on_duplicated_field_names : bool, optional
         Whether raise an exception when there are duplicated field names.
         (default ``True``)
+    ignore_unexpected_complex_type_values : bool, optional
+        Whether to ignore the case where unexpected values are given for complex types.
+        If ``False``, each complex type expects:
+
+        * array type: :class:`Iterable`
+        * map type: :class:`dict`
+        * struct type: :class:`dict` or :class:`tuple`
+
+        and raises an AssertionError when the given value is not the expected type.
+        If ``True``, just ignore and return the given value.
+        (default ``False``)

     Returns
     -------
@@ -781,28 +793,51 @@ def correct_timestamp(pser: pd.Series) -> pd.Series:
     def _converter(dt: DataType) -> Optional[Callable[[Any], Any]]:

         if isinstance(dt, ArrayType):
-            _element_conv = _converter(dt.elementType)
-            if _element_conv is None:
-                return None
+            _element_conv = _converter(dt.elementType) or (lambda x: x)

-            def convert_array(value: Any) -> Any:
-                if value is None:
-                    return None
-                else:
-                    return [_element_conv(v) for v in value]  # type: ignore[misc]
+            if ignore_unexpected_complex_type_values:
+
+                def convert_array(value: Any) -> Any:
+                    if value is None:
+                        return None
+                    elif isinstance(value, Iterable):
+                        return [_element_conv(v) for v in value]
+                    else:
+                        return value
+
+            else:
+
+                def convert_array(value: Any) -> Any:
+                    if value is None:
+                        return None
+                    else:
+                        assert isinstance(value, Iterable)
+                        return [_element_conv(v) for v in value]

             return convert_array

         elif isinstance(dt, MapType):
             _key_conv = _converter(dt.keyType) or (lambda x: x)
             _value_conv = _converter(dt.valueType) or (lambda x: x)

-            def convert_map(value: Any) -> Any:
-                if value is None:
-                    return None
-                else:
-                    assert isinstance(value, dict)
-                    return [(_key_conv(k), _value_conv(v)) for k, v in value.items()]
+            if ignore_unexpected_complex_type_values:
+
+                def convert_map(value: Any) -> Any:
+                    if value is None:
+                        return None
+                    elif isinstance(value, dict):
+                        return [(_key_conv(k), _value_conv(v)) for k, v in value.items()]
+                    else:
+                        return value
+
+            else:
+
+                def convert_map(value: Any) -> Any:
+                    if value is None:
+                        return None
+                    else:
+                        assert isinstance(value, dict)
+                        return [(_key_conv(k), _value_conv(v)) for k, v in value.items()]

             return convert_map
@@ -820,17 +855,38 @@ def convert_map(value: Any) -> Any:

             field_convs = [_converter(f.dataType) or (lambda x: x) for f in dt.fields]

-            def convert_struct(value: Any) -> Any:
-                if value is None:
-                    return None
-                elif isinstance(value, dict):
-                    return {
-                        dedup_field_names[i]: field_convs[i](value.get(key, None))
-                        for i, key in enumerate(field_names)
-                    }
-                else:
-                    assert isinstance(value, tuple)
-                    return {dedup_field_names[i]: field_convs[i](v) for i, v in enumerate(value)}
+            if ignore_unexpected_complex_type_values:
+
+                def convert_struct(value: Any) -> Any:
+                    if value is None:
+                        return None
+                    elif isinstance(value, dict):
+                        return {
+                            dedup_field_names[i]: field_convs[i](value.get(key, None))
+                            for i, key in enumerate(field_names)
+                        }
+                    elif isinstance(value, tuple):
+                        return {
+                            dedup_field_names[i]: field_convs[i](v) for i, v in enumerate(value)
+                        }
+                    else:
+                        return value
+
+            else:
+
+                def convert_struct(value: Any) -> Any:
+                    if value is None:
+                        return None
+                    elif isinstance(value, dict):
+                        return {
+                            dedup_field_names[i]: field_convs[i](value.get(key, None))
+                            for i, key in enumerate(field_names)
+                        }
+                    else:
+                        assert isinstance(value, tuple)
+                        return {
+                            dedup_field_names[i]: field_convs[i](v) for i, v in enumerate(value)
+                        }

             return convert_struct
```
python/pyspark/sql/tests/connect/test_parity_udtf.py

Lines changed: 3 additions & 0 deletions
```diff
@@ -45,6 +45,9 @@ def tearDownClass(cls):

     # TODO: use PySpark error classes instead of SparkConnectGrpcException

+    def test_struct_output_type_casting_row(self):
+        self.check_struct_output_type_casting_row(SparkConnectGrpcException)
+
     def test_udtf_with_invalid_return_type(self):
         @udtf(returnType="int")
         class TestUDTF:
```