Commit 916b0d3

ueshin authored and zhengruifeng committed
[SPARK-43817][SPARK-43702][PYTHON] Support UserDefinedType in createDataFrame from pandas DataFrame and toPandas
### What changes were proposed in this pull request?

Support `UserDefinedType` in `createDataFrame` from pandas DataFrame and `toPandas`.

For the following schema and pandas DataFrame:

```py
schema = (
    StructType()
    .add("point", ExamplePointUDT())
    .add("struct", StructType().add("point", ExamplePointUDT()))
    .add("array", ArrayType(ExamplePointUDT()))
    .add("map", MapType(StringType(), ExamplePointUDT()))
)
data = [
    Row(
        ExamplePoint(1.0, 2.0),
        Row(ExamplePoint(3.0, 4.0)),
        [ExamplePoint(5.0, 6.0)],
        dict(point=ExamplePoint(7.0, 8.0)),
    )
]
df = spark.createDataFrame(data, schema)
pdf = pd.DataFrame.from_records(data, columns=schema.names)
```

##### `spark.createDataFrame()`

All cases now return the same results:

```py
>>> spark.createDataFrame(pdf, schema).show(truncate=False)
+----------+------------+------------+---------------------+
|point     |struct      |array       |map                  |
+----------+------------+------------+---------------------+
|(1.0, 2.0)|{(3.0, 4.0)}|[(5.0, 6.0)]|{point -> (7.0, 8.0)}|
+----------+------------+------------+---------------------+
```

##### `df.toPandas()`

```py
>>> spark.conf.set('spark.sql.execution.pandas.structHandlingMode', 'row')
>>> df.toPandas()
       point        struct        array                   map
0  (1.0,2.0)  ((3.0,4.0),)  [(5.0,6.0)]  {'point': (7.0,8.0)}
```

### Why are the changes needed?

Currently `UserDefinedType` in `spark.createDataFrame()` with pandas DataFrame and `df.toPandas()` is not supported with Arrow enabled or in Spark Connect.

##### `spark.createDataFrame()`

Works without Arrow:

```py
>>> spark.createDataFrame(pdf, schema).show(truncate=False)
+----------+------------+------------+---------------------+
|point     |struct      |array       |map                  |
+----------+------------+------------+---------------------+
|(1.0, 2.0)|{(3.0, 4.0)}|[(5.0, 6.0)]|{point -> (7.0, 8.0)}|
+----------+------------+------------+---------------------+
```

whereas:

- With Arrow, it works only via fallback:

```py
>>> spark.createDataFrame(pdf, schema).show(truncate=False)
/.../python/pyspark/sql/pandas/conversion.py:351: UserWarning: createDataFrame attempted Arrow optimization because 'spark.sql.execution.arrow.pyspark.enabled' is set to true; however, failed by the reason below:
  [UNSUPPORTED_DATA_TYPE_FOR_ARROW_CONVERSION] ExamplePointUDT() is not supported in conversion to Arrow.
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.
  warn(msg)
+----------+------------+------------+---------------------+
|point     |struct      |array       |map                  |
+----------+------------+------------+---------------------+
|(1.0, 2.0)|{(3.0, 4.0)}|[(5.0, 6.0)]|{point -> (7.0, 8.0)}|
+----------+------------+------------+---------------------+
```

- In Spark Connect, it fails:

```py
>>> spark.createDataFrame(pdf, schema).show(truncate=False)
Traceback (most recent call last):
...
pyspark.errors.exceptions.base.PySparkTypeError: [UNSUPPORTED_DATA_TYPE_FOR_ARROW_CONVERSION] ExamplePointUDT() is not supported in conversion to Arrow.
```

##### `df.toPandas()`

Works without Arrow:

```py
>>> spark.conf.set('spark.sql.execution.pandas.structHandlingMode', 'row')
>>> df.toPandas()
       point        struct        array                   map
0  (1.0,2.0)  ((3.0,4.0),)  [(5.0,6.0)]  {'point': (7.0,8.0)}
```

whereas:

- With Arrow, it works only via fallback:

```py
>>> spark.conf.set('spark.sql.execution.pandas.structHandlingMode', 'row')
>>> df.toPandas()
/.../python/pyspark/sql/pandas/conversion.py:111: UserWarning: toPandas attempted Arrow optimization because 'spark.sql.execution.arrow.pyspark.enabled' is set to true; however, failed by the reason below:
  [UNSUPPORTED_DATA_TYPE_FOR_ARROW_CONVERSION] ExamplePointUDT() is not supported in conversion to Arrow.
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.
  warn(msg)
       point        struct        array                   map
0  (1.0,2.0)  ((3.0,4.0),)  [(5.0,6.0)]  {'point': (7.0,8.0)}
```

- In Spark Connect, it returns the internal (serialized) type instead of the UDT values:

```py
>>> spark.conf.set('spark.sql.execution.pandas.structHandlingMode', 'row')
>>> df.toPandas()
        point         struct         array                    map
0  [1.0, 2.0]  ([3.0, 4.0],)  [[5.0, 6.0]]  {'point': [7.0, 8.0]}
```

### Does this PR introduce _any_ user-facing change?

Users will be able to use `UserDefinedType` in `createDataFrame` from pandas DataFrame and `toPandas`, with Arrow enabled and in Spark Connect.

### How was this patch tested?

Added the related tests.

Closes apache#41333 from ueshin/issues/SPARK-43817/udt.

Authored-by: Takuya UESHIN <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
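The `ExamplePoint` / `ExamplePointUDT` pair used above comes from PySpark's test helpers: a `UserDefinedType` maps a Python object onto a plain SQL storage type via `sqlType()`, `serialize()`, and `deserialize()`. A minimal sketch of such a pair, roughly modeled on those helpers (an illustrative stand-in, not code changed by this commit):

```py
from pyspark.sql.types import ArrayType, DoubleType, UserDefinedType


class ExamplePointUDT(UserDefinedType):
    """Sketch of a UDT that stores a 2-D point as array<double>."""

    @classmethod
    def sqlType(cls):
        # Underlying SQL type the UDT values are serialized to.
        return ArrayType(DoubleType(), False)

    @classmethod
    def module(cls):
        # Module from which the UDT can be imported; '__main__' for this sketch.
        return "__main__"

    def serialize(self, obj):
        # Python object -> SQL value.
        return [obj.x, obj.y]

    def deserialize(self, datum):
        # SQL value -> Python object.
        return ExamplePoint(datum[0], datum[1])


class ExamplePoint:
    """Plain Python value carried by ExamplePointUDT."""

    __UDT__ = ExamplePointUDT()

    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __repr__(self):
        return "(%s, %s)" % (self.x, self.y)
```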
1 parent 001da5d commit 916b0d3

File tree

11 files changed (+171, -234 lines)

python/pyspark/pandas/tests/connect/data_type_ops/test_parity_udt_ops.py

Lines changed: 1 addition & 47 deletions
@@ -25,53 +25,7 @@
 class UDTOpsParityTests(
     UDTOpsTestsMixin, PandasOnSparkTestUtils, OpsTestBase, ReusedConnectTestCase
 ):
-    @unittest.skip(
-        "TODO(SPARK-43702): Fix pyspark.sql.pandas.types.to_arrow_type to work with Spark Connect."
-    )
-    def test_eq(self):
-        super().test_eq()
-
-    @unittest.skip(
-        "TODO(SPARK-43702): Fix pyspark.sql.pandas.types.to_arrow_type to work with Spark Connect."
-    )
-    def test_from_to_pandas(self):
-        super().test_from_to_pandas()
-
-    @unittest.skip(
-        "TODO(SPARK-43702): Fix pyspark.sql.pandas.types.to_arrow_type to work with Spark Connect."
-    )
-    def test_ge(self):
-        super().test_ge()
-
-    @unittest.skip(
-        "TODO(SPARK-43702): Fix pyspark.sql.pandas.types.to_arrow_type to work with Spark Connect."
-    )
-    def test_gt(self):
-        super().test_gt()
-
-    @unittest.skip(
-        "TODO(SPARK-43702): Fix pyspark.sql.pandas.types.to_arrow_type to work with Spark Connect."
-    )
-    def test_isnull(self):
-        super().test_isnull()
-
-    @unittest.skip(
-        "TODO(SPARK-43702): Fix pyspark.sql.pandas.types.to_arrow_type to work with Spark Connect."
-    )
-    def test_le(self):
-        super().test_le()
-
-    @unittest.skip(
-        "TODO(SPARK-43702): Fix pyspark.sql.pandas.types.to_arrow_type to work with Spark Connect."
-    )
-    def test_lt(self):
-        super().test_lt()
-
-    @unittest.skip(
-        "TODO(SPARK-43702): Fix pyspark.sql.pandas.types.to_arrow_type to work with Spark Connect."
-    )
-    def test_ne(self):
-        super().test_ne()
+    pass


 if __name__ == "__main__":

python/pyspark/sql/connect/client/core.py

Lines changed: 2 additions & 2 deletions
@@ -75,7 +75,7 @@
     CommonInlineUserDefinedFunction,
     JavaUDF,
 )
-from pyspark.sql.pandas.types import _create_converter_to_pandas
+from pyspark.sql.pandas.types import _create_converter_to_pandas, from_arrow_schema
 from pyspark.sql.types import DataType, StructType, TimestampType, _has_type
 from pyspark.rdd import PythonEvalType
 from pyspark.storagelevel import StorageLevel
@@ -717,7 +717,7 @@ def to_pandas(self, plan: pb2.Plan) -> "pd.DataFrame":
         table, schema, metrics, observed_metrics, _ = self._execute_and_fetch(req)
         assert table is not None

-        schema = schema or types.from_arrow_schema(table.schema, prefer_timestamp_ntz=True)
+        schema = schema or from_arrow_schema(table.schema, prefer_timestamp_ntz=True)
         assert schema is not None and isinstance(schema, StructType)

         # Rename columns to avoid duplicated column names.
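`from_arrow_schema` is only consulted when the response carries no Spark schema; it rebuilds a `StructType` from the Arrow schema of the fetched table. A small, self-contained illustration of what that call produces (the field names here are made up for the example):

```py
import pyarrow as pa

from pyspark.sql.pandas.types import from_arrow_schema

arrow_schema = pa.schema(
    [
        pa.field("id", pa.int64(), nullable=False),
        pa.field("ts", pa.timestamp("us"), nullable=True),  # no timezone
    ]
)

# With prefer_timestamp_ntz=True, a timezone-less Arrow timestamp maps to TimestampNTZType.
struct_type = from_arrow_schema(arrow_schema, prefer_timestamp_ntz=True)
print(struct_type)
# Roughly: StructType([StructField('id', LongType(), False), StructField('ts', TimestampNTZType(), True)])
```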

python/pyspark/sql/connect/conversion.py

Lines changed: 3 additions & 4 deletions
@@ -42,9 +42,8 @@
 )

 from pyspark.storagelevel import StorageLevel
-from pyspark.sql.connect.types import to_arrow_schema
 import pyspark.sql.connect.proto as pb2
-from pyspark.sql.pandas.types import _dedup_names, _deduplicate_field_names
+from pyspark.sql.pandas.types import to_arrow_schema, _dedup_names, _deduplicate_field_names

 from typing import (
     Any,
@@ -246,7 +245,7 @@ def convert_string(value: Any) -> Any:
         elif isinstance(dataType, UserDefinedType):
             udt: UserDefinedType = dataType

-            conv = LocalDataToArrowConversion._create_converter(dataType.sqlType())
+            conv = LocalDataToArrowConversion._create_converter(udt.sqlType())

             def convert_udt(value: Any) -> Any:
                 if value is None:
@@ -428,7 +427,7 @@ def convert_timestample_ntz(value: Any) -> Any:
         elif isinstance(dataType, UserDefinedType):
             udt: UserDefinedType = dataType

-            conv = ArrowTableToRowsConversion._create_converter(dataType.sqlType())
+            conv = ArrowTableToRowsConversion._create_converter(udt.sqlType())

             def convert_udt(value: Any) -> Any:
                 if value is None:

python/pyspark/sql/connect/dataframe.py

Lines changed: 1 addition & 1 deletion
@@ -76,7 +76,7 @@
     lit,
     expr as sql_expression,
 )
-from pyspark.sql.connect.types import from_arrow_schema
+from pyspark.sql.pandas.types import from_arrow_schema


 if TYPE_CHECKING:

python/pyspark/sql/connect/session.py

Lines changed: 17 additions & 5 deletions
@@ -331,8 +331,12 @@ def createDataFrame(

         # Determine arrow types to coerce data when creating batches
         arrow_schema: Optional[pa.Schema] = None
+        spark_types: List[Optional[DataType]]
+        arrow_types: List[Optional[pa.DataType]]
         if isinstance(schema, StructType):
-            arrow_schema = to_arrow_schema(cast(StructType, _deduplicate_field_names(schema)))
+            deduped_schema = cast(StructType, _deduplicate_field_names(schema))
+            spark_types = [field.dataType for field in deduped_schema.fields]
+            arrow_schema = to_arrow_schema(deduped_schema)
             arrow_types = [field.type for field in arrow_schema]
             _cols = [str(x) if not isinstance(x, str) else x for x in schema.fieldNames()]
         elif isinstance(schema, DataType):
@@ -342,14 +346,15 @@ def createDataFrame(
             )
         else:
             # Any timestamps must be coerced to be compatible with Spark
-            arrow_types = [
-                to_arrow_type(TimestampType())
+            spark_types = [
+                TimestampType()
                 if is_datetime64_dtype(t) or is_datetime64tz_dtype(t)
-                else to_arrow_type(DayTimeIntervalType())
+                else DayTimeIntervalType()
                 if is_timedelta64_dtype(t)
                 else None
                 for t in data.dtypes
             ]
+            arrow_types = [to_arrow_type(dt) if dt is not None else None for dt in spark_types]

         timezone, safecheck = self._client.get_configs(
             "spark.sql.session.timeZone", "spark.sql.execution.pandas.convertToArrowArraySafely"
@@ -358,7 +363,14 @@ def createDataFrame(
         ser = ArrowStreamPandasSerializer(cast(str, timezone), safecheck == "true")

         _table = pa.Table.from_batches(
-            [ser._create_batch([(c, t) for (_, c), t in zip(data.items(), arrow_types)])]
+            [
+                ser._create_batch(
+                    [
+                        (c, at, st)
+                        for (_, c), at, st in zip(data.items(), arrow_types, spark_types)
+                    ]
+                )
+            ]
         )

         if isinstance(schema, StructType):
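The serializer batch now receives a `(pandas_column, arrow_type, spark_type)` triple per column, so the original Spark type (including any UDT) is available alongside the Arrow type when the record batch is built. A rough, standalone illustration of how those triples line up; the internal serializer call itself is omitted here:

```py
import pandas as pd

from pyspark.sql.pandas.types import to_arrow_type
from pyspark.sql.types import DoubleType, StringType, StructType

schema = StructType().add("name", StringType()).add("score", DoubleType())
pdf = pd.DataFrame({"name": ["a", "b"], "score": [1.0, 2.0]})

spark_types = [field.dataType for field in schema.fields]
arrow_types = [to_arrow_type(dt) for dt in spark_types]

# One (column, arrow_type, spark_type) triple per column.
columns = [
    (col, at, st) for (_, col), at, st in zip(pdf.items(), arrow_types, spark_types)
]

for col, at, st in columns:
    print(col.name, at, st)  # e.g. "name string StringType()"
```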

python/pyspark/sql/connect/types.py

Lines changed: 0 additions & 146 deletions
@@ -20,8 +20,6 @@

 import json

-import pyarrow as pa
-
 from typing import Any, Dict, Optional

 from pyspark.sql.types import (
@@ -299,147 +297,3 @@ def proto_schema_to_pyspark_data_type(schema: pb2.DataType) -> DataType:
         return UserDefinedType.fromJson(json_value)
     else:
         raise Exception(f"Unsupported data type {schema}")
-
-
-def to_arrow_type(dt: DataType) -> "pa.DataType":
-    """
-    Convert Spark data type to pyarrow type.
-
-    This function refers to 'pyspark.sql.pandas.types.to_arrow_type' but relax the restriction,
-    e.g. it supports nested StructType.
-    """
-    if type(dt) == BooleanType:
-        arrow_type = pa.bool_()
-    elif type(dt) == ByteType:
-        arrow_type = pa.int8()
-    elif type(dt) == ShortType:
-        arrow_type = pa.int16()
-    elif type(dt) == IntegerType:
-        arrow_type = pa.int32()
-    elif type(dt) == LongType:
-        arrow_type = pa.int64()
-    elif type(dt) == FloatType:
-        arrow_type = pa.float32()
-    elif type(dt) == DoubleType:
-        arrow_type = pa.float64()
-    elif type(dt) == DecimalType:
-        arrow_type = pa.decimal128(dt.precision, dt.scale)
-    elif type(dt) == StringType:
-        arrow_type = pa.string()
-    elif type(dt) == BinaryType:
-        arrow_type = pa.binary()
-    elif type(dt) == DateType:
-        arrow_type = pa.date32()
-    elif type(dt) == TimestampType:
-        # Timestamps should be in UTC, JVM Arrow timestamps require a timezone to be read
-        arrow_type = pa.timestamp("us", tz="UTC")
-    elif type(dt) == TimestampNTZType:
-        arrow_type = pa.timestamp("us", tz=None)
-    elif type(dt) == DayTimeIntervalType:
-        arrow_type = pa.duration("us")
-    elif type(dt) == ArrayType:
-        field = pa.field("element", to_arrow_type(dt.elementType), nullable=dt.containsNull)
-        arrow_type = pa.list_(field)
-    elif type(dt) == MapType:
-        key_field = pa.field("key", to_arrow_type(dt.keyType), nullable=False)
-        value_field = pa.field("value", to_arrow_type(dt.valueType), nullable=dt.valueContainsNull)
-        arrow_type = pa.map_(key_field, value_field)
-    elif type(dt) == StructType:
-        fields = [
-            pa.field(field.name, to_arrow_type(field.dataType), nullable=field.nullable)
-            for field in dt
-        ]
-        arrow_type = pa.struct(fields)
-    elif type(dt) == NullType:
-        arrow_type = pa.null()
-    elif isinstance(dt, UserDefinedType):
-        arrow_type = to_arrow_type(dt.sqlType())
-    else:
-        raise TypeError("Unsupported type in conversion to Arrow: " + str(dt))
-    return arrow_type
-
-
-def to_arrow_schema(schema: StructType) -> "pa.Schema":
-    """Convert a schema from Spark to Arrow"""
-    fields = [
-        pa.field(field.name, to_arrow_type(field.dataType), nullable=field.nullable)
-        for field in schema
-    ]
-    return pa.schema(fields)
-
-
-def from_arrow_type(at: "pa.DataType", prefer_timestamp_ntz: bool = False) -> DataType:
-    """Convert pyarrow type to Spark data type.
-
-    This function refers to 'pyspark.sql.pandas.types.from_arrow_type' but relax the restriction,
-    e.g. it supports nested StructType, Array of TimestampType. However, Arrow DictionaryType is
-    not allowed.
-    """
-    import pyarrow.types as types
-
-    spark_type: DataType
-    if types.is_boolean(at):
-        spark_type = BooleanType()
-    elif types.is_int8(at):
-        spark_type = ByteType()
-    elif types.is_int16(at):
-        spark_type = ShortType()
-    elif types.is_int32(at):
-        spark_type = IntegerType()
-    elif types.is_int64(at):
-        spark_type = LongType()
-    elif types.is_float32(at):
-        spark_type = FloatType()
-    elif types.is_float64(at):
-        spark_type = DoubleType()
-    elif types.is_decimal(at):
-        spark_type = DecimalType(precision=at.precision, scale=at.scale)
-    elif types.is_string(at):
-        spark_type = StringType()
-    elif types.is_binary(at):
-        spark_type = BinaryType()
-    elif types.is_date32(at):
-        spark_type = DateType()
-    elif types.is_timestamp(at) and prefer_timestamp_ntz and at.tz is None:
-        spark_type = TimestampNTZType()
-    elif types.is_timestamp(at):
-        spark_type = TimestampType()
-    elif types.is_duration(at):
-        spark_type = DayTimeIntervalType()
-    elif types.is_list(at):
-        spark_type = ArrayType(from_arrow_type(at.value_type, prefer_timestamp_ntz))
-    elif types.is_map(at):
-        spark_type = MapType(
-            from_arrow_type(at.key_type, prefer_timestamp_ntz),
-            from_arrow_type(at.item_type, prefer_timestamp_ntz),
-        )
-    elif types.is_struct(at):
-        return StructType(
-            [
-                StructField(
-                    field.name,
-                    from_arrow_type(field.type, prefer_timestamp_ntz),
-                    nullable=field.nullable,
-                )
-                for field in at
-            ]
-        )
-    elif types.is_null(at):
-        spark_type = NullType()
-    else:
-        raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
-    return spark_type
-
-
-def from_arrow_schema(arrow_schema: "pa.Schema", prefer_timestamp_ntz: bool = False) -> StructType:
-    """Convert schema from Arrow to Spark."""
-    return StructType(
-        [
-            StructField(
-                field.name,
-                from_arrow_type(field.type, prefer_timestamp_ntz),
-                nullable=field.nullable,
-            )
-            for field in arrow_schema
-        ]
-    )

python/pyspark/sql/pandas/conversion.py

Lines changed: 8 additions & 9 deletions
@@ -598,30 +598,29 @@ def _create_from_pandas_with_arrow(

         # Determine arrow types to coerce data when creating batches
         if isinstance(schema, StructType):
-            arrow_types = [
-                to_arrow_type(_deduplicate_field_names(f.dataType)) for f in schema.fields
-            ]
+            spark_types = [_deduplicate_field_names(f.dataType) for f in schema.fields]
         elif isinstance(schema, DataType):
             raise PySparkTypeError(
                 error_class="UNSUPPORTED_DATA_TYPE_FOR_ARROW",
                 message_parameters={"data_type": str(schema)},
             )
         else:
             # Any timestamps must be coerced to be compatible with Spark
-            arrow_types = [
-                to_arrow_type(TimestampType())
-                if is_datetime64_dtype(t) or is_datetime64tz_dtype(t)
-                else None
+            spark_types = [
+                TimestampType() if is_datetime64_dtype(t) or is_datetime64tz_dtype(t) else None
                 for t in pdf.dtypes
            ]

         # Slice the DataFrame to be batched
         step = self._jconf.arrowMaxRecordsPerBatch()
         pdf_slices = (pdf.iloc[start : start + step] for start in range(0, len(pdf), step))

-        # Create list of Arrow (columns, type) for serializer dump_stream
+        # Create list of Arrow (columns, arrow_type, spark_type) for serializer dump_stream
         arrow_data = [
-            [(c, t) for (_, c), t in zip(pdf_slice.items(), arrow_types)]
+            [
+                (c, to_arrow_type(t) if t is not None else None, t)
+                for (_, c), t in zip(pdf_slice.items(), spark_types)
+            ]
             for pdf_slice in pdf_slices
         ]
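With the Spark type carried next to each column, the Arrow path can serialize UDT values before the record batches are built, so the fallback warning from the PR description goes away. A condensed end-to-end sketch, assuming a local Spark session and the `ExamplePoint` / `ExamplePointUDT` helpers sketched earlier on this page:

```py
import pandas as pd

from pyspark.sql import Row, SparkSession
from pyspark.sql.types import StructType

spark = SparkSession.builder.getOrCreate()
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

# ExamplePointUDT / ExamplePoint as sketched near the top of this page.
schema = StructType().add("point", ExamplePointUDT())
pdf = pd.DataFrame.from_records([Row(ExamplePoint(1.0, 2.0))], columns=schema.names)

# Arrow-optimized path: the UDT column is serialized to array<double> per batch.
df = spark.createDataFrame(pdf, schema)
df.show(truncate=False)

# Round trip back to pandas; the values come back as ExamplePoint objects.
print(df.toPandas())
```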
