Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 18 additions & 24 deletions google/cloud/bigquery/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -2648,31 +2648,25 @@ def to_dataframe(
if pyarrow.types.is_timestamp(col.type)
)

if len(record_batch) > 0:
df = record_batch.to_pandas(
df = record_batch.to_pandas(
date_as_object=date_as_object,
timestamp_as_object=timestamp_as_object,
integer_object_nulls=True,
types_mapper=_pandas_helpers.default_types_mapper(
date_as_object=date_as_object,
timestamp_as_object=timestamp_as_object,
integer_object_nulls=True,
types_mapper=_pandas_helpers.default_types_mapper(
date_as_object=date_as_object,
bool_dtype=bool_dtype,
int_dtype=int_dtype,
float_dtype=float_dtype,
string_dtype=string_dtype,
date_dtype=date_dtype,
datetime_dtype=datetime_dtype,
time_dtype=time_dtype,
timestamp_dtype=timestamp_dtype,
range_date_dtype=range_date_dtype,
range_datetime_dtype=range_datetime_dtype,
range_timestamp_dtype=range_timestamp_dtype,
),
)
else:
# Avoid "ValueError: need at least one array to concatenate" on
# older versions of pandas when converting empty RecordBatch to
# DataFrame. See: https://github.com/pandas-dev/pandas/issues/41241
Comment on lines -2672 to -2674
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pandas-dev/pandas#41052 was released with pandas 1.3.0 (as mentioned in the issue above: pandas-dev/pandas#41241 (comment))

So if we increase the minimum pandas version to 1.3.0, we no longer need this workaround.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed, moving to a monorepo might help make the dependencies more consistent, ideally we might want any overlapped dependencies among the handwritten BigQuery projects to be consistent.

But just for the purpose of this PR, I think it's reasonable to increase the minimum pandas version to 1.3.0, as long as it's not breaking anything.

df = pandas.DataFrame([], columns=record_batch.schema.names)
bool_dtype=bool_dtype,
int_dtype=int_dtype,
float_dtype=float_dtype,
string_dtype=string_dtype,
date_dtype=date_dtype,
datetime_dtype=datetime_dtype,
time_dtype=time_dtype,
timestamp_dtype=timestamp_dtype,
range_date_dtype=range_date_dtype,
range_datetime_dtype=range_datetime_dtype,
range_timestamp_dtype=range_timestamp_dtype,
),
)

for column in dtypes:
df[column] = pandas.Series(df[column], dtype=dtypes[column], copy=False)
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ bqstorage = [
"pyarrow >= 4.0.0",
]
pandas = [
"pandas >= 1.1.4",
"pandas >= 1.3.0",
"pandas-gbq >= 0.26.1",
"grpcio >= 1.47.0, < 2.0.0",
"grpcio >= 1.49.1, < 2.0.0; python_version >= '3.11'",
Expand Down
2 changes: 1 addition & 1 deletion testing/constraints-3.9.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ opentelemetry-api==1.1.0
opentelemetry-instrumentation==0.20b0
opentelemetry-sdk==1.1.0
packaging==24.2.0
pandas==1.1.4
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I confirmed that pandas==1.2.5 reproduces the failure:

FAILED tests/unit/test_table.py::TestRowIterator::test_to_dataframe_w_bqstorage_no_streams - ValueError: need at least one array to concatenate

and pandas==1.3.0 can pass

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for updating the constraint file!

pandas==1.3.0
pandas-gbq==0.26.1
proto-plus==1.22.3
protobuf==3.20.2
Expand Down
7 changes: 1 addition & 6 deletions tests/system/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -1222,12 +1222,7 @@ def test_list_rows_nullable_scalars_extreme_dtypes_w_custom_dtype(

# These pandas dtypes are handled by the custom dtypes.
assert df.dtypes["bool_col"].name == "boolean"
# Result is dependent upon which version of pandas is being used.
# Float64 was not introduced until pandas version 1.4.
if PANDAS_INSTALLED_VERSION >= "1.4":
assert df.dtypes["float64_col"].name == "Float64"
else:
assert df.dtypes["float64_col"].name == "string"
Comment on lines -1225 to -1230
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Float64 was introduced in pandas 1.2.0, not 1.4, so we can drop this special handling now that the minimum is pandas >= 1.3.0.

https://btx.cloud.google.com/invocations/feb3ac69-f2db-449c-ab07-2122420f069b/targets/cloud-devrel%2Fclient-libraries%2Fpython%2Fgoogleapis%2Fpython-bigquery%2Fpresubmit%2Fsystem-3.9/log

nox > python -m pip freeze
  .
  .
  .
pandas==1.3.0
  .
  .
  .

        if PANDAS_INSTALLED_VERSION >= "1.4":
            assert df.dtypes["float64_col"].name == "Float64"
        else:
>           assert df.dtypes["float64_col"].name == "string"
E           AssertionError: assert 'Float64' == 'string'
E             
E             - string
E             + Float64

https://pandas.pydata.org/pandas-docs/stable/whatsnew/v1.2.0.html#experimental-nullable-data-types-for-float-data

assert df.dtypes["float64_col"].name == "Float64"
assert df.dtypes["int64_col"].name == "Int64"
assert df.dtypes["string_col"].name == "string"

Expand Down
10 changes: 2 additions & 8 deletions tests/unit/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -4143,14 +4143,8 @@ def test_to_dataframe_w_dtypes_mapper(self):
)
self.assertEqual(df.name.dtype.name, "string")

# While pyproject.toml lists pandas 1.1 as the lowest supported version of
# pandas, the pip resolver is not able to resolve pandas 1.1 and numpy
if hasattr(pandas, "Float64Dtype"):
self.assertEqual(list(df.miles), [1.77, 6.66, 2.0])
self.assertEqual(df.miles.dtype.name, "Float64")
else:
self.assertEqual(list(df.miles), ["1.77", "6.66", "2.0"])
self.assertEqual(df.miles.dtype.name, "string")
self.assertEqual(list(df.miles), [1.77, 6.66, 2.0])
self.assertEqual(df.miles.dtype.name, "Float64")

if hasattr(pandas, "ArrowDtype"):
self.assertEqual(
Expand Down