Skip to content

feat: add 'columns' as an alias for 'col_order' #298

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jan 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 12 additions & 6 deletions bigframes/pandas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -486,20 +486,22 @@ def read_gbq(
query_or_table: str,
*,
index_col: Iterable[str] | str = (),
col_order: Iterable[str] = (),
columns: Iterable[str] = (),
max_results: Optional[int] = None,
filters: vendored_pandas_gbq.FiltersType = (),
use_cache: bool = True,
col_order: Iterable[str] = (),
) -> bigframes.dataframe.DataFrame:
_set_default_session_location_if_possible(query_or_table)
return global_session.with_default_session(
bigframes.session.Session.read_gbq,
query_or_table,
index_col=index_col,
col_order=col_order,
columns=columns,
max_results=max_results,
filters=filters,
use_cache=use_cache,
col_order=col_order,
)


Expand All @@ -520,18 +522,20 @@ def read_gbq_query(
query: str,
*,
index_col: Iterable[str] | str = (),
col_order: Iterable[str] = (),
columns: Iterable[str] = (),
max_results: Optional[int] = None,
use_cache: bool = True,
col_order: Iterable[str] = (),
) -> bigframes.dataframe.DataFrame:
_set_default_session_location_if_possible(query)
return global_session.with_default_session(
bigframes.session.Session.read_gbq_query,
query,
index_col=index_col,
col_order=col_order,
columns=columns,
max_results=max_results,
use_cache=use_cache,
col_order=col_order,
)


Expand All @@ -542,18 +546,20 @@ def read_gbq_table(
query: str,
*,
index_col: Iterable[str] | str = (),
col_order: Iterable[str] = (),
columns: Iterable[str] = (),
max_results: Optional[int] = None,
use_cache: bool = True,
col_order: Iterable[str] = (),
) -> bigframes.dataframe.DataFrame:
_set_default_session_location_if_possible(query)
return global_session.with_default_session(
bigframes.session.Session.read_gbq_table,
query,
index_col=index_col,
col_order=col_order,
columns=columns,
max_results=max_results,
use_cache=use_cache,
col_order=col_order,
)


Expand Down
66 changes: 45 additions & 21 deletions bigframes/session/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,20 +232,28 @@ def read_gbq(
query_or_table: str,
*,
index_col: Iterable[str] | str = (),
col_order: Iterable[str] = (),
columns: Iterable[str] = (),
max_results: Optional[int] = None,
filters: third_party_pandas_gbq.FiltersType = (),
use_cache: bool = True,
col_order: Iterable[str] = (),
# TODO: Add an index-verification argument that fails if the index is not unique.
) -> dataframe.DataFrame:
# TODO(b/281571214): Generate prompt to show the progress of read_gbq.
query_or_table = self._filters_to_query(query_or_table, col_order, filters)
if columns and col_order:
raise ValueError(
"Must specify either columns (preferred) or col_order, not both"
)
elif col_order:
columns = col_order

query_or_table = self._filters_to_query(query_or_table, columns, filters)

if _is_query(query_or_table):
return self._read_gbq_query(
query_or_table,
index_col=index_col,
col_order=col_order,
columns=columns,
max_results=max_results,
api_name="read_gbq",
use_cache=use_cache,
Expand All @@ -257,7 +265,7 @@ def read_gbq(
return self._read_gbq_table(
query_or_table,
index_col=index_col,
col_order=col_order,
columns=columns,
max_results=max_results,
api_name="read_gbq",
use_cache=use_cache,
Expand Down Expand Up @@ -388,9 +396,10 @@ def read_gbq_query(
query: str,
*,
index_col: Iterable[str] | str = (),
col_order: Iterable[str] = (),
columns: Iterable[str] = (),
max_results: Optional[int] = None,
use_cache: bool = True,
col_order: Iterable[str] = (),
) -> dataframe.DataFrame:
"""Turn a SQL query into a DataFrame.

Expand Down Expand Up @@ -442,10 +451,17 @@ def read_gbq_query(
"""
# NOTE: This method doesn't (yet) exist in pandas or pandas-gbq, so
# these docstrings are inline.
if columns and col_order:
raise ValueError(
"Must specify either columns (preferred) or col_order, not both"
)
elif col_order:
columns = col_order

return self._read_gbq_query(
query=query,
index_col=index_col,
col_order=col_order,
columns=columns,
max_results=max_results,
api_name="read_gbq_query",
use_cache=use_cache,
Expand All @@ -456,7 +472,7 @@ def _read_gbq_query(
query: str,
*,
index_col: Iterable[str] | str = (),
col_order: Iterable[str] = (),
columns: Iterable[str] = (),
max_results: Optional[int] = None,
api_name: str = "read_gbq_query",
use_cache: bool = True,
Expand Down Expand Up @@ -492,7 +508,7 @@ def _read_gbq_query(
return self.read_gbq_table(
f"{destination.project}.{destination.dataset_id}.{destination.table_id}",
index_col=index_cols,
col_order=col_order,
columns=columns,
max_results=max_results,
use_cache=use_cache,
)
Expand All @@ -502,9 +518,10 @@ def read_gbq_table(
query: str,
*,
index_col: Iterable[str] | str = (),
col_order: Iterable[str] = (),
columns: Iterable[str] = (),
max_results: Optional[int] = None,
use_cache: bool = True,
col_order: Iterable[str] = (),
) -> dataframe.DataFrame:
"""Turn a BigQuery table into a DataFrame.

Expand All @@ -521,10 +538,17 @@ def read_gbq_table(
"""
# NOTE: This method doesn't (yet) exist in pandas or pandas-gbq, so
# these docstrings are inline.
if columns and col_order:
raise ValueError(
"Must specify either columns (preferred) or col_order, not both"
)
elif col_order:
columns = col_order

return self._read_gbq_table(
query=query,
index_col=index_col,
col_order=col_order,
columns=columns,
max_results=max_results,
api_name="read_gbq_table",
use_cache=use_cache,
Expand Down Expand Up @@ -583,7 +607,7 @@ def _read_gbq_table(
query: str,
*,
index_col: Iterable[str] | str = (),
col_order: Iterable[str] = (),
columns: Iterable[str] = (),
max_results: Optional[int] = None,
api_name: str,
use_cache: bool = True,
Expand All @@ -602,10 +626,10 @@ def _read_gbq_table(
table_ref, api_name=api_name, use_cache=use_cache
)

for key in col_order:
for key in columns:
if key not in table_expression.columns:
raise ValueError(
f"Column '{key}' of `col_order` not found in this table."
f"Column '{key}' of `columns` not found in this table."
)

if isinstance(index_col, str):
Expand All @@ -619,8 +643,8 @@ def _read_gbq_table(
f"Column `{key}` of `index_col` not found in this table."
)

if col_order:
table_expression = table_expression.select([*index_cols, *col_order])
if columns:
table_expression = table_expression.select([*index_cols, *columns])

# If the index is unique and sortable, then we don't need to generate
# an ordering column.
Expand Down Expand Up @@ -719,7 +743,7 @@ def _read_bigquery_load_job(
*,
job_config: bigquery.LoadJobConfig,
index_col: Iterable[str] | str = (),
col_order: Iterable[str] = (),
columns: Iterable[str] = (),
) -> dataframe.DataFrame:
if isinstance(index_col, str):
index_cols = [index_col]
Expand Down Expand Up @@ -760,7 +784,7 @@ def _read_bigquery_load_job(
return self.read_gbq_table(
table_id,
index_col=index_col,
col_order=col_order,
columns=columns,
)

def read_gbq_model(self, model_name: str):
Expand Down Expand Up @@ -959,13 +983,13 @@ def read_csv(
if index_col is None:
index_col = ()

# usecols should only be an iterable of strings (column names) for use as col_order in read_gbq.
col_order: Tuple[Any, ...] = tuple()
# usecols should only be an iterable of strings (column names) for use as columns in read_gbq.
columns: Tuple[Any, ...] = tuple()
if usecols is not None:
if isinstance(usecols, Iterable) and all(
isinstance(col, str) for col in usecols
):
col_order = tuple(col for col in usecols)
columns = tuple(col for col in usecols)
else:
raise NotImplementedError(
"BigQuery engine only supports an iterable of strings for `usecols`. "
Expand Down Expand Up @@ -1000,7 +1024,7 @@ def read_csv(
table,
job_config=job_config,
index_col=index_col,
col_order=col_order,
columns=columns,
)
else:
if any(arg in kwargs for arg in ("chunksize", "iterator")):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -613,7 +613,7 @@
"source": [
"# Query 3 columns of interest from drug label dataset\n",
"df = bpd.read_gbq(\"bigquery-public-data.fda_drug.drug_label\",\n",
" col_order=[\"openfda_generic_name\", \"openfda_brand_name\", \"indications_and_usage\"])\n",
" columns=[\"openfda_generic_name\", \"openfda_brand_name\", \"indications_and_usage\"])\n",
"\n",
"# Exclude any rows with missing data\n",
"df = df.dropna()\n",
Expand Down Expand Up @@ -825,7 +825,7 @@
"source": [
"# Query 3 columns of interest from drug label dataset\n",
"df_missing = bpd.read_gbq(\"bigquery-public-data.fda_drug.drug_label\",\n",
" col_order=[\"openfda_generic_name\", \"openfda_brand_name\", \"indications_and_usage\"])\n",
" columns=[\"openfda_generic_name\", \"openfda_brand_name\", \"indications_and_usage\"])\n",
"\n",
"# Exclude any rows with missing data\n",
"df_missing = df_missing.dropna()\n",
Expand Down
10 changes: 5 additions & 5 deletions tests/system/small/test_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def test_read_gbq_tokyo(


@pytest.mark.parametrize(
("query_or_table", "col_order"),
("query_or_table", "columns"),
[
pytest.param(
"{scalars_table_id}", ["bool_col", "int64_col"], id="two_cols_in_table"
Expand All @@ -79,16 +79,16 @@ def test_read_gbq_tokyo(
),
],
)
def test_read_gbq_w_col_order(
def test_read_gbq_w_columns(
session: bigframes.Session,
scalars_table_id: str,
query_or_table: str,
col_order: List[str],
columns: List[str],
):
df = session.read_gbq(
query_or_table.format(scalars_table_id=scalars_table_id), col_order=col_order
query_or_table.format(scalars_table_id=scalars_table_id), columns=columns
)
assert df.columns.tolist() == col_order
assert df.columns.tolist() == columns


@pytest.mark.parametrize(
Expand Down
11 changes: 7 additions & 4 deletions third_party/bigframes_vendored/pandas/io/gbq.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,11 @@ def read_gbq(
query_or_table: str,
*,
index_col: Iterable[str] | str = (),
col_order: Iterable[str] = (),
columns: Iterable[str] = (),
max_results: Optional[int] = None,
filters: FiltersType = (),
use_cache: bool = True,
col_order: Iterable[str] = (),
):
"""Loads a DataFrame from BigQuery.

Expand Down Expand Up @@ -77,11 +78,11 @@ def read_gbq(

Reading data with `columns` and `filters` parameters:

>>> col_order = ['pitcherFirstName', 'pitcherLastName', 'year', 'pitchSpeed']
>>> columns = ['pitcherFirstName', 'pitcherLastName', 'year', 'pitchSpeed']
>>> filters = [('year', '==', 2016), ('pitcherFirstName', 'in', ['John', 'Doe']), ('pitcherLastName', 'in', ['Gant'])]
>>> df = bpd.read_gbq(
... "bigquery-public-data.baseball.games_wide",
... col_order=col_order,
... columns=columns,
... filters=filters,
... )
>>> df.head(1)
Expand All @@ -97,7 +98,7 @@ def read_gbq(
`project.dataset.tablename` or `dataset.tablename`.
index_col (Iterable[str] or str):
Name of result column(s) to use for index in results DataFrame.
col_order (Iterable[str]):
columns (Iterable[str]):
List of BigQuery column names in the desired order for results
DataFrame.
max_results (Optional[int], default None):
Expand All @@ -113,6 +114,8 @@ def read_gbq(
is to be conducted.
use_cache (bool, default True):
Whether to cache the query inputs. Defaults to True.
col_order (Iterable[str]):
Alias for `columns`, retained for backwards compatibility.

Returns:
bigframes.dataframe.DataFrame: A DataFrame representing results of the query or table.
Expand Down