Skip to content

refactor: make to_pandas() call to_arrow() and use local dtypes in DataFrame construction #132

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Oct 26, 2023
Merged
Prev Previous commit
Next Next commit
add unit tests for extreme values
  • Loading branch information
tswast committed Oct 25, 2023
commit 8bdfd79756da0319c6b861e1705d28b184d429cc
27 changes: 22 additions & 5 deletions bigframes/session/_io/pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Dict
from typing import Dict, Union

import geopandas # type: ignore
import pandas
Expand All @@ -23,16 +23,18 @@
import bigframes.constants


def arrow_to_pandas(arrow_table: pyarrow.Table, dtypes: Dict):
def arrow_to_pandas(
arrow_table: Union[pyarrow.Table, pyarrow.RecordBatch], dtypes: Dict
):
if len(dtypes) != arrow_table.num_columns:
raise ValueError(
f"Number of types {len(dtypes)} doesn't match number of columns "
f"{arrow_table.num_columns}. {bigframes.constants.FEEDBACK_LINK}"
)

serieses = {}
for column_name, column in zip(arrow_table.column_names, arrow_table):
dtype = dtypes[column_name]
for field, column in zip(arrow_table.schema, arrow_table):
dtype = dtypes[field.name]

if dtype == geopandas.array.GeometryDtype():
series = geopandas.GeoSeries.from_wkt(
Expand All @@ -52,9 +54,24 @@ def arrow_to_pandas(arrow_table: pyarrow.Table, dtypes: Dict):
pyarrow.compute.is_null(column).to_numpy(),
)
series = pandas.Series(pd_array, dtype=dtype)
elif dtype == pandas.Int64Dtype():
# Avoid out-of-bounds errors in Pandas 1.5.x, which incorrectly
# casts to float64 in an intermediate step.
pd_array = pandas.arrays.IntegerArray(
pyarrow.compute.fill_null(column, 0).to_numpy(),
pyarrow.compute.is_null(column).to_numpy(),
)
series = pandas.Series(pd_array, dtype=dtype)
elif isinstance(dtype, pandas.ArrowDtype):
# Avoid conversion logic if we are backing the pandas Series by the
# arrow array.
series = pandas.Series(
pandas.arrays.ArrowExtensionArray(column), # type: ignore
dtype=dtype,
)
else:
series = column.to_pandas(types_mapper=lambda _: dtype)

serieses[column_name] = series
serieses[field.name] = series

return pandas.DataFrame(serieses)
216 changes: 216 additions & 0 deletions tests/unit/session/test_io_pandas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,216 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import datetime
from typing import Dict, Union

import geopandas # type: ignore
import numpy
import pandas
import pandas.arrays
import pandas.testing
import pyarrow # type: ignore
import pytest

import bigframes.session._io.pandas


def _scalar_dtypes_case():
    """Build the ``scalar-dtypes`` parameter set.

    The Arrow table has one nullable column per scalar type. Each column
    contains a null, and several contain boundary values (year-1 dates and
    timestamps, the largest signed 64-bit integer, the last microsecond of
    the day).
    """
    date32 = pyarrow.date32()
    naive_us = pyarrow.timestamp("us")
    time_us = pyarrow.time64("us")
    utc_us = pyarrow.timestamp("us", datetime.timezone.utc)

    bool_values = [True, None, True, False]
    bytes_values = [b"123", None, b"abc", b"xyz"]
    date_values = [
        datetime.date(2023, 8, 29),
        None,
        datetime.date(2024, 4, 9),
        datetime.date(1, 1, 1),
    ]
    datetime_values = [
        datetime.datetime(2023, 8, 29),
        None,
        datetime.datetime(2024, 4, 9, 23, 59, 59),
        datetime.datetime(1, 1, 1, 0, 0, 0, 1),
    ]
    int_values = [1, None, -1, 2**63 - 1]
    string_values = ["123", None, "abc", "xyz"]
    time_values = [
        datetime.time(0, 0, 0, 1),
        datetime.time(12, 0, 0),
        None,
        datetime.time(23, 59, 59, 999999),
    ]
    timestamp_values = [
        datetime.datetime(2023, 8, 29),
        datetime.datetime(1, 1, 1, 0, 0, 0, 1),
        None,
        datetime.datetime(2024, 4, 9, 23, 59, 59),
    ]

    arrow_table = pyarrow.Table.from_pydict(
        {
            "bool": bool_values,
            "bytes": bytes_values,
            "date": pyarrow.array(date_values, type=date32),
            "datetime": pyarrow.array(datetime_values, type=naive_us),
            "float": pyarrow.array(
                [1.0, None, float("nan"), -1.0],
                type=pyarrow.float64(),
            ),
            "int": pyarrow.array(int_values, type=pyarrow.int64()),
            "string": string_values,
            "time": pyarrow.array(time_values, type=time_us),
            "timestamp": pyarrow.array(timestamp_values, type=utc_us),
        }
    )

    dtypes = {
        "bool": "boolean",
        "bytes": "object",
        "date": pandas.ArrowDtype(date32),
        "datetime": pandas.ArrowDtype(naive_us),
        "float": pandas.Float64Dtype(),
        "int": pandas.Int64Dtype(),
        "string": "string[pyarrow]",
        "time": pandas.ArrowDtype(time_us),
        "timestamp": pandas.ArrowDtype(utc_us),
    }

    # The float column holds both a null and a genuine NaN. The expected array
    # is therefore spelled out as data plus mask: only the null is masked.
    expected_float = pandas.arrays.FloatingArray(
        numpy.array([1.0, float("nan"), float("nan"), -1.0], dtype="float64"),
        numpy.array([False, True, False, False], dtype="bool"),
    )

    expected = pandas.DataFrame(
        {
            "bool": pandas.Series(bool_values, dtype="boolean"),
            "bytes": bytes_values,
            "date": pandas.Series(date_values, dtype=pandas.ArrowDtype(date32)),
            "datetime": pandas.Series(
                datetime_values, dtype=pandas.ArrowDtype(naive_us)
            ),
            "float": pandas.Series(expected_float, dtype=pandas.Float64Dtype()),
            "int": pandas.Series(int_values, dtype=pandas.Int64Dtype()),
            "string": pandas.Series(string_values, dtype="string[pyarrow]"),
            "time": pandas.Series(time_values, dtype=pandas.ArrowDtype(time_us)),
            "timestamp": pandas.Series(
                timestamp_values, dtype=pandas.ArrowDtype(utc_us)
            ),
        }
    )

    return pytest.param(arrow_table, dtypes, expected, id="scalar-dtypes")


def _geography_dtype_case():
    """Build the ``geography-dtype`` parameter set from WKT strings."""
    wkt_values = ["POINT(32 210)", None, "LINESTRING(1 1, 2 1, 3.1 2.88, 3 -3)"]

    arrow_table = pyarrow.Table.from_pydict({"geocol": wkt_values})
    dtypes = {"geocol": geopandas.array.GeometryDtype()}
    expected = pandas.DataFrame(
        {
            "geocol": geopandas.GeoSeries.from_wkt(wkt_values, crs="EPSG:4326"),
        }
    )

    return pytest.param(arrow_table, dtypes, expected, id="geography-dtype")


@pytest.mark.parametrize(
    ("arrow_table", "dtypes", "expected"),
    [
        pytest.param(
            pyarrow.Table.from_pydict({}),
            {},
            pandas.DataFrame(),
            id="empty-df",
        ),
        _scalar_dtypes_case(),
        _geography_dtype_case(),
    ],
)
def test_arrow_to_pandas(
    arrow_table: Union[pyarrow.Table, pyarrow.RecordBatch],
    dtypes: Dict,
    expected: pandas.DataFrame,
):
    """Check that ``arrow_to_pandas`` builds the expected DataFrame.

    Each case supplies an Arrow table, the mapping of column name to the
    requested pandas dtype, and the DataFrame the conversion should yield.
    """
    result = bigframes.session._io.pandas.arrow_to_pandas(arrow_table, dtypes)
    pandas.testing.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    ("arrow_table", "dtypes"),
    [
        pytest.param(
            pyarrow.Table.from_pydict({"col1": [1], "col2": [2]}),
            {"col1": "Int64"},
            id="too-few-dtypes",
        ),
        pytest.param(
            pyarrow.RecordBatch.from_pydict({"col1": [1]}),
            {"col1": "Int64", "col2": "string[pyarrow]"},
            id="too-many-dtypes",
        ),
    ],
)
def test_arrow_to_pandas_wrong_size_dtypes(
    arrow_table: Union[pyarrow.Table, pyarrow.RecordBatch],
    dtypes: Dict,
):
    """Check that a dtype/column count mismatch raises ``ValueError``.

    The error message is expected to report how many dtypes were supplied.
    """
    expected_message = f"Number of types {len(dtypes)}"

    with pytest.raises(ValueError, match=expected_message):
        bigframes.session._io.pandas.arrow_to_pandas(arrow_table, dtypes)