From f26e390432ae5a78ea2e2c8a12801eada019e7b6 Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Fri, 25 Jan 2019 15:15:39 -0800
Subject: [PATCH 01/19] CLN: Use `to_dataframe` to download query results.

This allows us to remove logic for parsing the schema and align with
google-cloud-bigquery.
---
 benchmark/README.md                 | 16 +++++++++++++
 benchmark/read_gbq_large_results.py |  8 +++++++
 benchmark/read_gbq_small_results.py |  7 ++++++
 pandas_gbq/gbq.py                   | 37 ++++------------------------
 4 files changed, 36 insertions(+), 32 deletions(-)
 create mode 100644 benchmark/README.md
 create mode 100644 benchmark/read_gbq_large_results.py
 create mode 100644 benchmark/read_gbq_small_results.py

diff --git a/benchmark/README.md b/benchmark/README.md
new file mode 100644
index 00000000..5ede71d7
--- /dev/null
+++ b/benchmark/README.md
@@ -0,0 +1,16 @@
+# pandas-gbq benchmarks
+
+This directory contains a few scripts which are useful for performance
+testing the pandas-gbq library. Use cProfile to time the script and see
+details about where time is spent. To avoid timing how long BigQuery takes to
+execute a query, run the benchmark twice to ensure the results are cached.
+
+## `read_gbq`
+
+Read a small table (a few KB).
+
+    python -m cProfile --sort=cumtime read_gbq_small_results.py
+
+Read a large-ish table (100+ MB).
+
+    python -m cProfile --sort=cumtime read_gbq_large_results.py
diff --git a/benchmark/read_gbq_large_results.py b/benchmark/read_gbq_large_results.py
new file mode 100644
index 00000000..5a8bf268
--- /dev/null
+++ b/benchmark/read_gbq_large_results.py
@@ -0,0 +1,8 @@
+
+import pandas_gbq
+
+# Select 163 MB worth of data, to time how long it takes to download large
+# result sets.
+df = pandas_gbq.read_gbq(
+    "SELECT * FROM `bigquery-public-data.usa_names.usa_1910_2013`",
+    dialect="standard")
diff --git a/benchmark/read_gbq_small_results.py b/benchmark/read_gbq_small_results.py
new file mode 100644
index 00000000..cfff10b3
--- /dev/null
+++ b/benchmark/read_gbq_small_results.py
@@ -0,0 +1,7 @@
+
+import pandas_gbq
+
+# Select a few KB worth of data, to time downloading small result sets.
+df = pandas_gbq.read_gbq(
+    "SELECT * FROM `bigquery-public-data.utility_us.country_code_iso`",
+    dialect="standard")
diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py
index 948fd980..8cf2cf3b 100644
--- a/pandas_gbq/gbq.py
+++ b/pandas_gbq/gbq.py
@@ -1,11 +1,9 @@
 import logging
 import time
 import warnings
-from collections import OrderedDict
 from datetime import datetime
 
 import numpy as np
-from pandas import DataFrame
 
 from pandas_gbq.exceptions import AccessDenied
 
@@ -482,15 +480,9 @@ def run_query(self, query, **kwargs):
             rows_iter = query_reply.result()
         except self.http_error as ex:
             self.process_http_error(ex)
-        result_rows = list(rows_iter)
-        total_rows = rows_iter.total_rows
-        schema = {
-            "fields": [field.to_api_repr() for field in rows_iter.schema]
-        }
-
-        logger.debug("Got {} rows.\n".format(total_rows))
-
-        return schema, result_rows
+        df = rows_iter.to_dataframe()
+        logger.debug("Got {} rows.\n".format(rows_iter.total_rows))
+        return df
 
     def load_data(
         self,
@@ -661,25 +653,6 @@ def _parse_schema(schema_fields):
         yield name, dtype
 
 
-def _parse_data(schema, rows):
-
-    column_dtypes = OrderedDict(_parse_schema(schema["fields"]))
-    df = DataFrame(data=(iter(r) for r in rows), columns=column_dtypes.keys())
-
-    for column in df:
-        dtype = column_dtypes[column]
-        null_safe = (
-            df[column].notnull().all()
-            or dtype == float
-            or dtype == "datetime64[ns]"
-        )
-        if dtype and null_safe:
-            df[column] = df[column].astype(
-                column_dtypes[column], errors="ignore"
-            )
-    return df
-
-
 def read_gbq(
     query,
     project_id=None,
@@ -825,8 +798,8 @@ def read_gbq(
         credentials=credentials,
         private_key=private_key,
     )
-    schema, rows = connector.run_query(query, configuration=configuration)
-    final_df = _parse_data(schema, rows)
+
+    final_df = connector.run_query(query, configuration=configuration)
 
     # Reindex the DataFrame on the provided column
     if index_col is not None:

From 70435019d8d7eb5ac9352b9c7996715feb218bbd Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Fri, 8 Feb 2019 14:06:02 -0800
Subject: [PATCH 02/19] Supply expected dtypes to to_dataframe()
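
The BigQuery schema is available before any rows are downloaded, so pass
explicit dtypes to to_dataframe() for the types that can safely represent
NULL values. A rough sketch of the mapping this adds (illustrative input,
mirroring the dtype_map below):

    schema_fields = [
        {"name": "created_at", "type": "TIMESTAMP", "mode": "NULLABLE"},
        {"name": "row_count", "type": "INTEGER", "mode": "NULLABLE"},
    ]
    _bqschema_to_dtypes(schema_fields)
    # {'created_at': 'datetime64[ns, UTC]'}
    # INTEGER is omitted because int64 cannot represent NULL; pandas is
    # left to choose the dtype (float64 with NaN when NULLs are present).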
---
 benchmark/read_gbq_large_results.py |  3 +-
 benchmark/read_gbq_small_results.py |  3 +-
 pandas_gbq/gbq.py                   | 26 ++++++++++------
 tests/system/test_gbq.py            |  8 -----
 tests/unit/test_gbq.py              | 47 +++++++++--------------------
 5 files changed, 36 insertions(+), 51 deletions(-)

diff --git a/benchmark/read_gbq_large_results.py b/benchmark/read_gbq_large_results.py
index 5a8bf268..7dedbb13 100644
--- a/benchmark/read_gbq_large_results.py
+++ b/benchmark/read_gbq_large_results.py
@@ -5,4 +5,5 @@
 # result sets.
 df = pandas_gbq.read_gbq(
     "SELECT * FROM `bigquery-public-data.usa_names.usa_1910_2013`",
-    dialect="standard")
+    dialect="standard",
+)
diff --git a/benchmark/read_gbq_small_results.py b/benchmark/read_gbq_small_results.py
index cfff10b3..68621194 100644
--- a/benchmark/read_gbq_small_results.py
+++ b/benchmark/read_gbq_small_results.py
@@ -4,4 +4,5 @@
 # Select a few KB worth of data, to time downloading small result sets.
 df = pandas_gbq.read_gbq(
     "SELECT * FROM `bigquery-public-data.utility_us.country_code_iso`",
-    dialect="standard")
+    dialect="standard",
+)
diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py
index 8cf2cf3b..fef73ca6 100644
--- a/pandas_gbq/gbq.py
+++ b/pandas_gbq/gbq.py
@@ -480,7 +480,10 @@ def run_query(self, query, **kwargs):
             rows_iter = query_reply.result()
         except self.http_error as ex:
             self.process_http_error(ex)
-        df = rows_iter.to_dataframe()
+
+        schema_fields = [field.to_api_repr() for field in rows_iter.schema]
+        dtypes = _bqschema_to_dtypes(schema_fields)
+        df = rows_iter.to_dataframe(dtypes=dtypes)
         logger.debug("Got {} rows.\n".format(rows_iter.total_rows))
         return df
 
@@ -630,27 +633,32 @@ def delete_and_recreate_table(self, dataset_id, table_id, table_schema):
         table.create(table_id, table_schema)
 
 
-def _parse_schema(schema_fields):
+def _bqschema_to_dtypes(schema_fields):
+    # Only specify dtype when the dtype allows nulls. Otherwise, use pandas's
+    # default dtype choice.
+    #
     # see:
     # http://pandas.pydata.org/pandas-docs/dev/missing_data.html
     # #missing-data-casting-rules-and-indexing
     dtype_map = {
         "FLOAT": np.dtype(float),
-        "TIMESTAMP": "datetime64[ns]",
+        "TIMESTAMP": "datetime64[ns, UTC]",
         "TIME": "datetime64[ns]",
         "DATE": "datetime64[ns]",
         "DATETIME": "datetime64[ns]",
-        "BOOLEAN": bool,
-        "INTEGER": np.int64,
     }
 
+    dtypes = {}
     for field in schema_fields:
         name = str(field["name"])
         if field["mode"].upper() == "REPEATED":
-            yield name, object
-        else:
-            dtype = dtype_map.get(field["type"].upper())
-            yield name, dtype
+            continue
+
+        dtype = dtype_map.get(field["type"].upper())
+        if dtype:
+            dtypes[name] = dtype
+
+    return dtypes
 
 
 def read_gbq(
diff --git a/tests/system/test_gbq.py b/tests/system/test_gbq.py
index dde34cb1..765fb8e2 100644
--- a/tests/system/test_gbq.py
+++ b/tests/system/test_gbq.py
@@ -138,14 +138,6 @@ def test_should_be_able_to_get_a_bigquery_client(self, gbq_connector):
         bigquery_client = gbq_connector.get_client()
         assert bigquery_client is not None
 
-    def test_should_be_able_to_get_schema_from_query(self, gbq_connector):
-        schema, pages = gbq_connector.run_query("SELECT 1")
-        assert schema is not None
-
-    def test_should_be_able_to_get_results_from_query(self, gbq_connector):
-        schema, pages = gbq_connector.run_query("SELECT 1")
-        assert pages is not None
-
 
 def test_should_read(project, credentials):
     query = 'SELECT "PI" AS valid_string'
diff --git a/tests/unit/test_gbq.py b/tests/unit/test_gbq.py
index 4f1d18ad..4427a07e 100644
--- a/tests/unit/test_gbq.py
+++ b/tests/unit/test_gbq.py
@@ -2,6 +2,7 @@
 
 import pandas.util.testing as tm
 import pytest
+import numpy
 from pandas import DataFrame
 from pandas.compat.numpy import np_datetime64_compat
 
@@ -64,26 +65,23 @@ def no_auth(monkeypatch):
 
 
 @pytest.mark.parametrize(
-    ("input", "type_", "expected"),
+    ("type_", "expected"),
     [
-        (1, "INTEGER", int(1)),
-        (1, "FLOAT", float(1)),
-        pytest.param("false", "BOOLEAN", False, marks=pytest.mark.xfail),
-        pytest.param(
-            "0e9",
-            "TIMESTAMP",
-            np_datetime64_compat("1970-01-01T00:00:00Z"),
-            marks=pytest.mark.xfail,
-        ),
-        ("STRING", "STRING", "STRING"),
+        ("INTEGER", None),  # Can't handle NULL
+        ("BOOLEAN", None),  # Can't handle NULL
+        ("FLOAT", numpy.dtype(float)),
+        ("TIMESTAMP", "datetime64[ns, UTC]"),
+        ("DATETIME", "datetime64[ns]"),
     ],
 )
-def test_should_return_bigquery_correctly_typed(input, type_, expected):
-    result = gbq._parse_data(
-        dict(fields=[dict(name="x", type=type_, mode="NULLABLE")]),
-        rows=[[input]],
-    ).iloc[0, 0]
-    assert result == expected
+def test_should_return_bigquery_correctly_typed(type_, expected):
+    result = gbq._bqschema_to_dtypes(
+        [dict(name="x", type=type_, mode="NULLABLE")]
+    )
+    if not expected:
+        assert result == {}
+    else:
+        assert result == {"x": expected}
 
 
 def test_to_gbq_should_fail_if_invalid_table_name_passed():

From b9f931d7d106219bc8ac626c65723c3ca16d33cf Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Fri, 8 Feb 2019 14:08:25 -0800
Subject: [PATCH 03/19] Bump minimum google-cloud-bigquery version for dtypes
 argument.

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index e53d43f5..e5e40505 100644
--- a/setup.py
+++ b/setup.py
@@ -22,7 +22,7 @@ def readme():
         "pydata-google-auth",
         "google-auth",
         "google-auth-oauthlib",
-        "google-cloud-bigquery>=0.32.0",
+        "google-cloud-bigquery>=1.9.0",
     ]
 
 extras = {"tqdm": "tqdm>=4.23.0"}

From f805dba186fc3b850bb49f028b4de8c796f11a7c Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Fri, 8 Feb 2019 16:51:39 -0800
Subject: [PATCH 04/19] Update tests to match dtypes from to_dataframe().

---
 tests/system/test_gbq.py | 76 +++++++++++++++++++++++++++++++---------
 1 file changed, 59 insertions(+), 17 deletions(-)

diff --git a/tests/system/test_gbq.py b/tests/system/test_gbq.py
index 765fb8e2..82835e56 100644
--- a/tests/system/test_gbq.py
+++ b/tests/system/test_gbq.py
@@ -6,11 +6,12 @@
 
 import google.oauth2.service_account
 import numpy as np
+import pandas
 import pandas.util.testing as tm
-import pytest
-import pytz
 from pandas import DataFrame, NaT, compat
 from pandas.compat import range, u
+import pytest
+import pytz
 
 from pandas_gbq import gbq
@@ -311,7 +312,8 @@ def test_should_properly_handle_timestamp_unix_epoch(self, project_id):
         tm.assert_frame_equal(
             df,
             DataFrame(
-                {"unix_epoch": [np.datetime64("1970-01-01T00:00:00.000000Z")]}
+                {"unix_epoch": ["1970-01-01T00:00:00.000000Z"]},
+                dtype="datetime64[ns, UTC]",
             ),
         )
 
@@ -323,6 +325,38 @@ def test_should_properly_handle_arbitrary_timestamp(self, project_id):
             credentials=self.credentials,
             dialect="legacy",
         )
+        tm.assert_frame_equal(
+            df,
+            DataFrame(
+                {"valid_timestamp": ["2004-09-15T05:00:00.000000Z"]},
+                dtype="datetime64[ns, UTC]",
+            ),
+        )
+
+    def test_should_properly_handle_datetime_unix_epoch(self, project_id):
+        query = 'SELECT DATETIME("1970-01-01 00:00:00") AS unix_epoch'
+        df = gbq.read_gbq(
+            query,
+            project_id=project_id,
+            credentials=self.credentials,
+            dialect="legacy",
+        )
+        tm.assert_frame_equal(
+            df,
+            DataFrame(
+                {"unix_epoch": ["1970-01-01T00:00:00.000000Z"]},
+                dtype="datetime64[ns]",
+            ),
+        )
+
+    def test_should_properly_handle_arbitrary_datetime(self, project_id):
+        query = 'SELECT DATETIME("2004-09-15 05:00:00") AS valid_timestamp'
+        df = gbq.read_gbq(
+            query,
+            project_id=project_id,
+            credentials=self.credentials,
+            dialect="legacy",
+        )
         tm.assert_frame_equal(
             df,
             DataFrame(
@@ -338,7 +372,7 @@ def test_should_properly_handle_arbitrary_timestamp(self, project_id):
         "expression, type_",
         [
             ("current_date()", "<M8[ns]"),
-            ("current_timestamp()", "<M8[ns]"),
+            ("current_timestamp()", "datetime64[ns, UTC]"),
             ("current_time()", "<M8[ns]"),
         ],
     )

From d8e3b99e6599e174b3305402d019f62179bc669c Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Tue, 12 Feb 2019 09:53:25 -0800
Subject: [PATCH 05/19] Cast to correct dtype in empty dataframes.
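
With zero rows, pandas has no values from which to infer a dtype, so the
nullsafe dtypes from the previous commit leave empty INTEGER and BOOLEAN
columns as ``object``. Cast those columns explicitly when the frame is
empty. A rough illustration (not part of the diff):

    import pandas

    df = pandas.DataFrame({"is_bot": []})
    df["is_bot"].dtype  # object
    df["is_bot"] = df["is_bot"].astype(bool)
    df["is_bot"].dtype  # bool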
---
 pandas_gbq/gbq.py        | 43 ++++++++++++++++++++++++++++++++++++----
 tests/system/test_gbq.py |  4 ++--
 tests/unit/test_gbq.py   |  2 +-
 3 files changed, 42 insertions(+), 7 deletions(-)

diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py
index fef73ca6..75dc735c 100644
--- a/pandas_gbq/gbq.py
+++ b/pandas_gbq/gbq.py
@@ -482,8 +482,12 @@ def run_query(self, query, **kwargs):
             rows_iter = query_reply.result()
         except self.http_error as ex:
             self.process_http_error(ex)
 
         schema_fields = [field.to_api_repr() for field in rows_iter.schema]
-        dtypes = _bqschema_to_dtypes(schema_fields)
-        df = rows_iter.to_dataframe(dtypes=dtypes)
+        nullsafe_dtypes = _bqschema_to_nullsafe_dtypes(schema_fields)
+        df = rows_iter.to_dataframe(dtypes=nullsafe_dtypes)
+
+        if df.empty:
+            df = _cast_empty_df_dtypes(schema_fields, df)
+
         logger.debug("Got {} rows.\n".format(rows_iter.total_rows))
         return df
 
@@ -633,11 +637,11 @@ def delete_and_recreate_table(self, dataset_id, table_id, table_schema):
         table.create(table_id, table_schema)
 
 
-def _bqschema_to_dtypes(schema_fields):
+def _bqschema_to_nullsafe_dtypes(schema_fields):
     # Only specify dtype when the dtype allows nulls. Otherwise, use pandas's
     # default dtype choice.
     #
-    # see:
+    # See:
     # http://pandas.pydata.org/pandas-docs/dev/missing_data.html
     # #missing-data-casting-rules-and-indexing
     dtype_map = {
@@ -661,6 +665,37 @@ def _bqschema_to_dtypes(schema_fields):
     return dtypes
 
 
+def _cast_empty_df_dtypes(schema_fields, df):
+    """Cast any columns in an empty dataframe to correct type.
+
+    In an empty dataframe, pandas cannot choose a dtype unless one is
+    explicitly provided. The _bqschema_to_nullsafe_dtypes() function only
+    provides dtypes when the dtype safely handles null values. This means
+    that empty int64 and boolean columns are incorrectly classified as
+    ``object``.
+    """
+    if not df.empty:
+        raise ValueError(
+            "DataFrame must be empty in order to cast non-nullsafe dtypes"
+        )
+
+    dtype_map = {
+        "BOOLEAN": bool,
+        "INTEGER": np.int64,
+    }
+
+    for field in schema_fields:
+        column = str(field["name"])
+        if field["mode"].upper() == "REPEATED":
+            continue
+
+        dtype = dtype_map.get(field["type"].upper())
+        if dtype:
+            df[column] = df[column].astype(dtype)
+
+    return df
+
+
 def read_gbq(
     query,
     project_id=None,
diff --git a/tests/system/test_gbq.py b/tests/system/test_gbq.py
index 82835e56..de243e14 100644
--- a/tests/system/test_gbq.py
+++ b/tests/system/test_gbq.py
@@ -590,8 +590,8 @@ def test_zero_rows(self, project_id):
         )
         empty_columns = {
             "title": pandas.Series([], dtype=object),
-            "id": pandas.Series([], dtype=object),
-            "is_bot": pandas.Series([], dtype=object),
+            "id": pandas.Series([], dtype=np.dtype(int)),
+            "is_bot": pandas.Series([], dtype=np.dtype(bool)),
             "ts": pandas.Series([], dtype="datetime64[ns, UTC]"),
         }
         expected_result = DataFrame(empty_columns)
diff --git a/tests/unit/test_gbq.py b/tests/unit/test_gbq.py
index 4427a07e..fd5fa493 100644
--- a/tests/unit/test_gbq.py
+++ b/tests/unit/test_gbq.py
@@ -75,7 +75,7 @@ def no_auth(monkeypatch):
     ],
 )
 def test_should_return_bigquery_correctly_typed(type_, expected):
-    result = gbq._bqschema_to_dtypes(
+    result = gbq._bqschema_to_nullsafe_dtypes(
         [dict(name="x", type=type_, mode="NULLABLE")]
     )
     if not expected:

From 90eb9fe097eb149d9168f752f3867aeadf06343b Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Tue, 12 Feb 2019 09:54:22 -0800
Subject: [PATCH 06/19] Blacken

---
 pandas_gbq/gbq.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py
index 75dc735c..22e1b0b0 100644
--- a/pandas_gbq/gbq.py
+++ b/pandas_gbq/gbq.py
@@ -679,10 +679,7 @@ def _cast_empty_df_dtypes(schema_fields, df):
             "DataFrame must be empty in order to cast non-nullsafe dtypes"
         )
 
-    dtype_map = {
-        "BOOLEAN": bool,
-        "INTEGER": np.int64,
-    }
+    dtype_map = {"BOOLEAN": bool, "INTEGER": np.int64}
 
     for field in schema_fields:
         column = str(field["name"])

From 013b00fb7164e4a14640e66d3bea79c684384e37 Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Tue, 12 Feb 2019 10:36:20 -0800
Subject: [PATCH 07/19] Blacken benchmark.

---
 benchmark/read_gbq_large_results.py | 1 -
 benchmark/read_gbq_small_results.py | 1 -
 2 files changed, 2 deletions(-)

diff --git a/benchmark/read_gbq_large_results.py b/benchmark/read_gbq_large_results.py
index 7dedbb13..98d9ff53 100644
--- a/benchmark/read_gbq_large_results.py
+++ b/benchmark/read_gbq_large_results.py
@@ -1,4 +1,3 @@
-
 import pandas_gbq
 
 # Select 163 MB worth of data, to time how long it takes to download large
diff --git a/benchmark/read_gbq_small_results.py b/benchmark/read_gbq_small_results.py
index 68621194..8e91b0a0 100644
--- a/benchmark/read_gbq_small_results.py
+++ b/benchmark/read_gbq_small_results.py
@@ -1,4 +1,3 @@
-
 import pandas_gbq
 
 # Select a few KB worth of data, to time downloading small result sets.

From 5a526c0699989a936ea4685ccc17462ed50124ed Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Tue, 12 Feb 2019 10:57:35 -0800
Subject: [PATCH 08/19] Remove timezone from datetime tests.
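
BigQuery DATETIME values are timezone-naive, so the expected values in the
DATETIME tests should not carry a UTC ("Z") suffix; that suffix belongs
only to the timezone-aware TIMESTAMP expectations. An illustrative sketch
of the distinction (not part of the diff):

    import numpy as np

    np.datetime64("2004-09-15T05:00:00")  # DATETIME: wall-clock, naive
    # TIMESTAMP keeps the Z suffix and a timezone-aware pandas dtype:
    # "2004-09-15T05:00:00.000000Z" with dtype "datetime64[ns, UTC]"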
---
 tests/system/test_gbq.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/system/test_gbq.py b/tests/system/test_gbq.py
index de243e14..1f887fa3 100644
--- a/tests/system/test_gbq.py
+++ b/tests/system/test_gbq.py
@@ -344,7 +344,7 @@ def test_should_properly_handle_datetime_unix_epoch(self, project_id):
         tm.assert_frame_equal(
             df,
             DataFrame(
-                {"unix_epoch": ["1970-01-01T00:00:00.000000Z"]},
+                {"unix_epoch": ["1970-01-01T00:00:00"]},
                 dtype="datetime64[ns]",
             ),
         )
@@ -362,7 +362,7 @@ def test_should_properly_handle_arbitrary_datetime(self, project_id):
             DataFrame(
                 {
                     "valid_timestamp": [
-                        np.datetime64("2004-09-15T05:00:00.000000Z")
+                        np.datetime64("2004-09-15T05:00:00")
                     ]
                 }
             ),

From d8e3b99e6599e174b3305402d019f62179bc669c Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Tue, 12 Feb 2019 12:47:43 -0800
Subject: [PATCH 09/19] Blacken tests.

---
 tests/system/test_gbq.py | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/tests/system/test_gbq.py b/tests/system/test_gbq.py
index 1f887fa3..976a2bda 100644
--- a/tests/system/test_gbq.py
+++ b/tests/system/test_gbq.py
@@ -344,8 +344,7 @@ def test_should_properly_handle_datetime_unix_epoch(self, project_id):
         tm.assert_frame_equal(
             df,
             DataFrame(
-                {"unix_epoch": ["1970-01-01T00:00:00"]},
-                dtype="datetime64[ns]",
+                {"unix_epoch": ["1970-01-01T00:00:00"]}, dtype="datetime64[ns]"
             ),
         )
 
@@ -360,11 +359,7 @@ def test_should_properly_handle_arbitrary_datetime(self, project_id):
         tm.assert_frame_equal(
             df,
             DataFrame(
-                {
-                    "valid_timestamp": [
-                        np.datetime64("2004-09-15T05:00:00")
-                    ]
-                }
+                {"valid_timestamp": [np.datetime64("2004-09-15T05:00:00")]}
             ),
         )

From 1e4009a2dce3534150d6336dbaa87138e120dfda Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Tue, 12 Feb 2019 13:32:27 -0800
Subject: [PATCH 10/19] Update docs for minimum google-cloud-bigquery version.

---
 ci/requirements-3.6-0.20.1.conda |  2 +-
 docs/source/changelog.rst        | 18 ++++++++++++++++++
 pandas_gbq/gbq.py                |  2 +-
 3 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/ci/requirements-3.6-0.20.1.conda b/ci/requirements-3.6-0.20.1.conda
index a057399d..2f8d0d83 100644
--- a/ci/requirements-3.6-0.20.1.conda
+++ b/ci/requirements-3.6-0.20.1.conda
@@ -1,6 +1,6 @@
 google-auth
 google-auth-oauthlib
-google-cloud-bigquery==0.32.0
+google-cloud-bigquery==1.9.0
 pytest
 pytest-cov
 codecov
diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
index 6f3aa5cd..60861dea 100644
--- a/docs/source/changelog.rst
+++ b/docs/source/changelog.rst
@@ -1,6 +1,24 @@
 Changelog
 =========
 
+.. _changelog-0.10.0:
+
+0.10.0 / TBD
+------------
+
+Dependency updates
+~~~~~~~~~~~~~~~~~~
+
+- Update the minimum version of ``google-cloud-bigquery`` to 1.9.0.
+  (:issue:`247`)
+
+Internal changes
+~~~~~~~~~~~~~~~~
+
+- Use ``to_dataframe()`` from ``google-cloud-bigquery`` in the ``read_gbq()``
+  function. (:issue:`247`)
+
+
 .. _changelog-0.9.0:
 
 0.9.0 / 2019-01-11
diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py
index 22e1b0b0..e99eda2c 100644
--- a/pandas_gbq/gbq.py
+++ b/pandas_gbq/gbq.py
@@ -35,7 +35,7 @@ def _check_google_client_version():
         raise ImportError("Could not import pkg_resources (setuptools).")
 
     # https://github.com/GoogleCloudPlatform/google-cloud-python/blob/master/bigquery/CHANGELOG.md
-    bigquery_minimum_version = pkg_resources.parse_version("0.32.0")
+    bigquery_minimum_version = pkg_resources.parse_version("1.9.0")
     BIGQUERY_INSTALLED_VERSION = pkg_resources.get_distribution(
         "google-cloud-bigquery"
     ).parsed_version

From 827a0650a141df7cc8ddec37ae8f6c5b6467c567 Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Fri, 15 Feb 2019 10:17:59 -0800
Subject: [PATCH 11/19] Update version number in unit tests.

---
 tests/unit/test_gbq.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/test_gbq.py b/tests/unit/test_gbq.py
index fd5fa493..c868d853 100644
--- a/tests/unit/test_gbq.py
+++ b/tests/unit/test_gbq.py
@@ -23,7 +23,7 @@ def min_bq_version():
     import pkg_resources
 
-    return pkg_resources.parse_version("0.32.0")
+    return pkg_resources.parse_version("1.9.0")
 
 
 def mock_none_credentials(*args, **kwargs):

From 159bda0d02c3035dfd06bbc92f9d22d3b4317515 Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Fri, 15 Feb 2019 15:40:02 -0800
Subject: [PATCH 12/19] Update dependencies for tests.

---
 ci/requirements-2.7.pip | 2 +-
 ci/requirements-3.5.pip | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/ci/requirements-2.7.pip b/ci/requirements-2.7.pip
index 48ceb439..10300f12 100644
--- a/ci/requirements-2.7.pip
+++ b/ci/requirements-2.7.pip
@@ -2,5 +2,5 @@ mock
 pandas==0.17.1
 google-auth==1.4.1
 google-auth-oauthlib==0.0.1
-google-cloud-bigquery==0.32.0
+google-cloud-bigquery==1.9.0
 pydata-google-auth==0.1.2
diff --git a/ci/requirements-3.5.pip b/ci/requirements-3.5.pip
index 980d0700..41a41891 100644
--- a/ci/requirements-3.5.pip
+++ b/ci/requirements-3.5.pip
@@ -1,5 +1,5 @@
 pandas==0.19.0
 google-auth==1.4.1
 google-auth-oauthlib==0.0.1
-google-cloud-bigquery==0.32.0
+google-cloud-bigquery==1.9.0
 pydata-google-auth==0.1.2
\ No newline at end of file

From 0ecce9bde9157fdd9c141e23131c27596eebbfdd Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Fri, 15 Feb 2019 16:50:03 -0800
Subject: [PATCH 13/19] Fix lint error.

---
 tests/unit/test_gbq.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/unit/test_gbq.py b/tests/unit/test_gbq.py
index c868d853..a1317887 100644
--- a/tests/unit/test_gbq.py
+++ b/tests/unit/test_gbq.py
@@ -4,7 +4,6 @@
 import pytest
 import numpy
 from pandas import DataFrame
-from pandas.compat.numpy import np_datetime64_compat
 
 import pandas_gbq.exceptions
 from pandas_gbq import gbq

From b80cf5c769b3c621a9c68e03a7331d6189bf5864 Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Fri, 15 Feb 2019 16:50:29 -0800
Subject: [PATCH 14/19] Specify column order on empty DataFrame.
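
With no rows to infer from, a dict-based DataFrame constructor is not
guaranteed to preserve column order on the older Python and pandas
versions we test against, so pass the expected order explicitly. A sketch
of the idea (illustrative):

    import pandas

    empty_columns = {"title": [], "id": [], "is_bot": [], "ts": []}
    expected = pandas.DataFrame(
        empty_columns, columns=["title", "id", "is_bot", "ts"]
    )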
---
 tests/system/test_gbq.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/system/test_gbq.py b/tests/system/test_gbq.py
index 976a2bda..8781fac9 100644
--- a/tests/system/test_gbq.py
+++ b/tests/system/test_gbq.py
@@ -589,7 +589,9 @@ def test_zero_rows(self, project_id):
             "is_bot": pandas.Series([], dtype=np.dtype(bool)),
             "ts": pandas.Series([], dtype="datetime64[ns, UTC]"),
         }
-        expected_result = DataFrame(empty_columns)
+        expected_result = DataFrame(
+            empty_columns, columns=["title", "id", "is_bot", "ts"]
+        )
         tm.assert_frame_equal(df, expected_result, check_index_type=False)
 
     def test_one_row_one_column(self, project_id):

From 20dd01a3e16b0279e6d829adacf6d4cd8adee9cd Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Fri, 15 Feb 2019 16:54:35 -0800
Subject: [PATCH 15/19] Don't wipe out conda dependencies.

---
 ci/run_conda.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/run_conda.sh b/ci/run_conda.sh
index 59769328..60ae6ff0 100755
--- a/ci/run_conda.sh
+++ b/ci/run_conda.sh
@@ -21,7 +21,7 @@ fi
 
 REQ="ci/requirements-${PYTHON}-${PANDAS}"
 conda install -q --file "$REQ.conda";
-python setup.py develop
+python setup.py develop --no-deps
 
 # Run the tests
 $DIR/run_tests.sh

From 59a93282efa1f81abbd37890bed40d328805eff5 Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Fri, 15 Feb 2019 17:19:05 -0800
Subject: [PATCH 16/19] Add pydata-google-auth to conda deps.

---
 ci/requirements-3.6-0.20.1.conda | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/ci/requirements-3.6-0.20.1.conda b/ci/requirements-3.6-0.20.1.conda
index 2f8d0d83..1c7eb3f2 100644
--- a/ci/requirements-3.6-0.20.1.conda
+++ b/ci/requirements-3.6-0.20.1.conda
@@ -1,5 +1,4 @@
-google-auth
-google-auth-oauthlib
+pydata-google-auth
 google-cloud-bigquery==1.9.0
 pytest
 pytest-cov

From d7c1ca5fd675cb44b4b905e302d6d4b1875e0eba Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Tue, 19 Feb 2019 09:54:23 -0800
Subject: [PATCH 17/19] Document change in behavior of TIMESTAMP columns.

---
 docs/source/changelog.rst | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
index 60861dea..98c913b9 100644
--- a/docs/source/changelog.rst
+++ b/docs/source/changelog.rst
@@ -6,6 +6,9 @@ Changelog
 0.10.0 / TBD
 ------------
 
+- ``read_gbq()`` converts BigQuery ``TIMESTAMP`` columns to the
+  timezone-aware ``datetime64`` ``dtype``. (:issue:`247`)
+
 Dependency updates
 ~~~~~~~~~~~~~~~~~~
 

From 5b1bd0a78330f8dcd25df6f33db8554a65ea32b7 Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Fri, 22 Feb 2019 11:27:39 -0800
Subject: [PATCH 18/19] Use timezone-naive datetime64 dtype for TIMESTAMP
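
Even though BigQuery TIMESTAMP values are timezone-aware, pandas does not
accept datetime64[ns, UTC] as a dtype argument in DataFrame constructors
(see pandas-dev/pandas#12513), which breaks the empty-result code path.
Keep downloaded TIMESTAMP columns timezone-naive for now. Roughly, the
failure this avoids (illustrative):

    import pandas

    # Raises on the pandas versions we support; a tz-aware dtype cannot
    # be passed directly to the constructor.
    pandas.DataFrame({"ts": []}, dtype="datetime64[ns, UTC]")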
---
 docs/source/changelog.rst |  3 ---
 pandas_gbq/gbq.py         |  5 ++++-
 tests/system/test_gbq.py  | 10 +++++-----
 tests/unit/test_gbq.py    |  2 +-
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
index 98c913b9..60861dea 100644
--- a/docs/source/changelog.rst
+++ b/docs/source/changelog.rst
@@ -6,9 +6,6 @@ Changelog
 0.10.0 / TBD
 ------------
 
-- ``read_gbq()`` converts BigQuery ``TIMESTAMP`` columns to the
-  timezone-aware ``datetime64`` ``dtype``. (:issue:`247`)
-
 Dependency updates
 ~~~~~~~~~~~~~~~~~~
 
diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py
index e99eda2c..13d65669 100644
--- a/pandas_gbq/gbq.py
+++ b/pandas_gbq/gbq.py
@@ -646,7 +646,10 @@ def _bqschema_to_nullsafe_dtypes(schema_fields):
     # #missing-data-casting-rules-and-indexing
     dtype_map = {
         "FLOAT": np.dtype(float),
-        "TIMESTAMP": "datetime64[ns, UTC]",
+        # Even though TIMESTAMPs are timezone-aware in BigQuery, pandas doesn't
+        # support datetime64[ns, UTC] as dtype in DataFrame constructors. See:
+        # https://github.com/pandas-dev/pandas/issues/12513
+        "TIMESTAMP": "datetime64[ns]",
         "TIME": "datetime64[ns]",
         "DATE": "datetime64[ns]",
         "DATETIME": "datetime64[ns]",
diff --git a/tests/system/test_gbq.py b/tests/system/test_gbq.py
index 8781fac9..fa068642 100644
--- a/tests/system/test_gbq.py
+++ b/tests/system/test_gbq.py
@@ -313,7 +313,7 @@ def test_should_properly_handle_timestamp_unix_epoch(self, project_id):
             df,
             DataFrame(
                 {"unix_epoch": ["1970-01-01T00:00:00.000000Z"]},
-                dtype="datetime64[ns, UTC]",
+                dtype="datetime64[ns]",
             ),
         )
 
@@ -329,7 +329,7 @@ def test_should_properly_handle_arbitrary_timestamp(self, project_id):
             df,
             DataFrame(
                 {"valid_timestamp": ["2004-09-15T05:00:00.000000Z"]},
-                dtype="datetime64[ns, UTC]",
+                dtype="datetime64[ns]",
             ),
         )
 
@@ -367,7 +367,7 @@ def test_should_properly_handle_arbitrary_datetime(self, project_id):
         "expression, type_",
         [
             ("current_date()", "<M8[ns]"),
-            ("current_timestamp()", "datetime64[ns, UTC]"),
+            ("current_timestamp()", "<M8[ns]"),
             ("current_time()", "<M8[ns]"),
         ],
     )
diff --git a/tests/unit/test_gbq.py b/tests/unit/test_gbq.py
--- a/tests/unit/test_gbq.py
+++ b/tests/unit/test_gbq.py
@@ -70,7 +70,7 @@ def no_auth(monkeypatch):
         ("INTEGER", None),  # Can't handle NULL
         ("BOOLEAN", None),  # Can't handle NULL
         ("FLOAT", numpy.dtype(float)),
-        ("TIMESTAMP", "datetime64[ns, UTC]"),
+        ("TIMESTAMP", "datetime64[ns]"),
         ("DATETIME", "datetime64[ns]"),
     ],
 )

From 5b1bd0a78330f8dcd25df6f33db8554a65ea32b7 Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Fri, 22 Feb 2019 11:32:59 -0800
Subject: [PATCH 19/19] Blacken

---
 tests/system/test_gbq.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/system/test_gbq.py b/tests/system/test_gbq.py
index fa068642..82753a38 100644
--- a/tests/system/test_gbq.py
+++ b/tests/system/test_gbq.py
@@ -400,8 +400,7 @@ def test_should_properly_handle_null_timestamp(self, project_id):
             dialect="legacy",
         )
         tm.assert_frame_equal(
-            df,
-            DataFrame({"null_timestamp": [NaT]}, dtype="datetime64[ns]"),
+            df, DataFrame({"null_timestamp": [NaT]}, dtype="datetime64[ns]")
         )
 
     def test_should_properly_handle_null_datetime(self, project_id):