add unit tests for extreme values

googleapis · gcf-merge-on-green · Oct 26, 2023 · Oct 23, 2023 · Oct 24, 2023 · Oct 24, 2023
commit 8bdfd79756da0319c6b861e1705d28b184d429cc
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Dict
+from typing import Dict, Union
 
 import geopandas  # type: ignore
 import pandas
@@ -23,16 +23,18 @@
 import bigframes.constants
 
 
-def arrow_to_pandas(arrow_table: pyarrow.Table, dtypes: Dict):
+def arrow_to_pandas(
+    arrow_table: Union[pyarrow.Table, pyarrow.RecordBatch], dtypes: Dict
+):
     if len(dtypes) != arrow_table.num_columns:
         raise ValueError(
             f"Number of types {len(dtypes)} doesn't match number of columns "
             f"{arrow_table.num_columns}. {bigframes.constants.FEEDBACK_LINK}"
         )
 
     serieses = {}
-    for column_name, column in zip(arrow_table.column_names, arrow_table):
-        dtype = dtypes[column_name]
+    for field, column in zip(arrow_table.schema, arrow_table):
+        dtype = dtypes[field.name]
 
         if dtype == geopandas.array.GeometryDtype():
             series = geopandas.GeoSeries.from_wkt(
@@ -52,9 +54,24 @@ def arrow_to_pandas(arrow_table: pyarrow.Table, dtypes: Dict):
                 pyarrow.compute.is_null(column).to_numpy(),
             )
             series = pandas.Series(pd_array, dtype=dtype)
+        elif dtype == pandas.Int64Dtype():
+            # Avoid out-of-bounds errors in pandas 1.5.x, which incorrectly
+            # casts to float64 in an intermediate step.
+            pd_array = pandas.arrays.IntegerArray(
+                pyarrow.compute.fill_null(column, 0).to_numpy(),
+                pyarrow.compute.is_null(column).to_numpy(),
+            )
+            series = pandas.Series(pd_array, dtype=dtype)
+        elif isinstance(dtype, pandas.ArrowDtype):
+            # Skip the conversion logic when the pandas Series is backed directly
+            # by the Arrow array.
+            series = pandas.Series(
+                pandas.arrays.ArrowExtensionArray(column),  # type: ignore
+                dtype=dtype,
+            )
         else:
             series = column.to_pandas(types_mapper=lambda _: dtype)
 
-        serieses[column_name] = series
+        serieses[field.name] = series
 
     return pandas.DataFrame(serieses)
@@ -0,0 +1,216 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import datetime
+from typing import Dict, Union
+
+import geopandas  # type: ignore
+import numpy
+import pandas
+import pandas.arrays
+import pandas.testing
+import pyarrow  # type: ignore
+import pytest
+
+import bigframes.session._io.pandas
+
+
+@pytest.mark.parametrize(
+    ("arrow_table", "dtypes", "expected"),
+    (
+        pytest.param(
+            pyarrow.Table.from_pydict({}),
+            {},
+            pandas.DataFrame(),
+            id="empty-df",
+        ),
+        pytest.param(
+            pyarrow.Table.from_pydict(
+                {
+                    "bool": [True, None, True, False],
+                    "bytes": [b"123", None, b"abc", b"xyz"],
+                    "date": pyarrow.array(
+                        [
+                            datetime.date(2023, 8, 29),
+                            None,
+                            datetime.date(2024, 4, 9),
+                            datetime.date(1, 1, 1),
+                        ],
+                        type=pyarrow.date32(),
+                    ),
+                    "datetime": pyarrow.array(
+                        [
+                            datetime.datetime(2023, 8, 29),
+                            None,
+                            datetime.datetime(2024, 4, 9, 23, 59, 59),
+                            datetime.datetime(1, 1, 1, 0, 0, 0, 1),
+                        ],
+                        type=pyarrow.timestamp("us"),
+                    ),
+                    "float": pyarrow.array(
+                        [1.0, None, float("nan"), -1.0],
+                        type=pyarrow.float64(),
+                    ),
+                    "int": pyarrow.array(
+                        [1, None, -1, 2**63 - 1],
+                        type=pyarrow.int64(),
+                    ),
+                    "string": ["123", None, "abc", "xyz"],
+                    "time": pyarrow.array(
+                        [
+                            datetime.time(0, 0, 0, 1),
+                            datetime.time(12, 0, 0),
+                            None,
+                            datetime.time(23, 59, 59, 999999),
+                        ],
+                        type=pyarrow.time64("us"),
+                    ),
+                    "timestamp": pyarrow.array(
+                        [
+                            datetime.datetime(2023, 8, 29),
+                            datetime.datetime(1, 1, 1, 0, 0, 0, 1),
+                            None,
+                            datetime.datetime(2024, 4, 9, 23, 59, 59),
+                        ],
+                        type=pyarrow.timestamp("us", datetime.timezone.utc),
+                    ),
+                }
+            ),
+            {
+                "bool": "boolean",
+                "bytes": "object",
+                "date": pandas.ArrowDtype(pyarrow.date32()),
+                "datetime": pandas.ArrowDtype(pyarrow.timestamp("us")),
+                "float": pandas.Float64Dtype(),
+                "int": pandas.Int64Dtype(),
+                "string": "string[pyarrow]",
+                "time": pandas.ArrowDtype(pyarrow.time64("us")),
+                "timestamp": pandas.ArrowDtype(
+                    pyarrow.timestamp("us", datetime.timezone.utc)
+                ),
+            },
+            pandas.DataFrame(
+                {
+                    "bool": pandas.Series([True, None, True, False], dtype="boolean"),
+                    "bytes": [b"123", None, b"abc", b"xyz"],
+                    "date": pandas.Series(
+                        [
+                            datetime.date(2023, 8, 29),
+                            None,
+                            datetime.date(2024, 4, 9),
+                            datetime.date(1, 1, 1),
+                        ],
+                        dtype=pandas.ArrowDtype(pyarrow.date32()),
+                    ),
+                    "datetime": pandas.Series(
+                        [
+                            datetime.datetime(2023, 8, 29),
+                            None,
+                            datetime.datetime(2024, 4, 9, 23, 59, 59),
+                            datetime.datetime(1, 1, 1, 0, 0, 0, 1),
+                        ],
+                        dtype=pandas.ArrowDtype(pyarrow.timestamp("us")),
+                    ),
+                    "float": pandas.Series(
+                        pandas.arrays.FloatingArray(
+                            numpy.array(
+                                [1.0, float("nan"), float("nan"), -1.0], dtype="float64"
+                            ),
+                            numpy.array([False, True, False, False], dtype="bool"),
+                        ),
+                        dtype=pandas.Float64Dtype(),
+                    ),
+                    "int": pandas.Series(
+                        [1, None, -1, 2**63 - 1],
+                        dtype=pandas.Int64Dtype(),
+                    ),
+                    "string": pandas.Series(
+                        ["123", None, "abc", "xyz"], dtype="string[pyarrow]"
+                    ),
+                    "time": pandas.Series(
+                        [
+                            datetime.time(0, 0, 0, 1),
+                            datetime.time(12, 0, 0),
+                            None,
+                            datetime.time(23, 59, 59, 999999),
+                        ],
+                        dtype=pandas.ArrowDtype(pyarrow.time64("us")),
+                    ),
+                    "timestamp": pandas.Series(
+                        [
+                            datetime.datetime(2023, 8, 29),
+                            datetime.datetime(1, 1, 1, 0, 0, 0, 1),
+                            None,
+                            datetime.datetime(2024, 4, 9, 23, 59, 59),
+                        ],
+                        dtype=pandas.ArrowDtype(
+                            pyarrow.timestamp("us", datetime.timezone.utc)
+                        ),
+                    ),
+                }
+            ),
+            id="scalar-dtypes",
+        ),
+        pytest.param(
+            pyarrow.Table.from_pydict(
+                {
+                    "geocol": [
+                        "POINT(32 210)",
+                        None,
+                        "LINESTRING(1 1, 2 1, 3.1 2.88, 3 -3)",
+                    ]
+                }
+            ),
+            {"geocol": geopandas.array.GeometryDtype()},
+            pandas.DataFrame(
+                {
+                    "geocol": geopandas.GeoSeries.from_wkt(
+                        ["POINT(32 210)", None, "LINESTRING(1 1, 2 1, 3.1 2.88, 3 -3)"],
+                        crs="EPSG:4326",
+                    ),
+                }
+            ),
+            id="geography-dtype",
+        ),
+    ),
+)
+def test_arrow_to_pandas(
+    arrow_table: Union[pyarrow.Table, pyarrow.RecordBatch],
+    dtypes: Dict,
+    expected: pandas.DataFrame,
+):
+    actual = bigframes.session._io.pandas.arrow_to_pandas(arrow_table, dtypes)
+    pandas.testing.assert_frame_equal(actual, expected)
+
+
+@pytest.mark.parametrize(
+    ("arrow_table", "dtypes"),
+    (
+        pytest.param(
+            pyarrow.Table.from_pydict({"col1": [1], "col2": [2]}),
+            {"col1": "Int64"},
+            id="too-few-dtypes",
+        ),
+        pytest.param(
+            pyarrow.RecordBatch.from_pydict({"col1": [1]}),
+            {"col1": "Int64", "col2": "string[pyarrow]"},
+            id="too-many-dtypes",
+        ),
+    ),
+)
+def test_arrow_to_pandas_wrong_size_dtypes(
+    arrow_table: Union[pyarrow.Table, pyarrow.RecordBatch], dtypes: Dict
+):
+    with pytest.raises(ValueError, match=f"Number of types {len(dtypes)}"):
+        bigframes.session._io.pandas.arrow_to_pandas(arrow_table, dtypes)