Skip to content

perf: defer query in read_gbq with wildcard tables #1661

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 33 commits into main from b405773140-wildcard
May 6, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
a9edb2a
perf: defer query in `read_gbq` with wildcard tables
tswast Apr 27, 2025
df795b1
remove obsolete comments
tswast Apr 27, 2025
f81fe4e
Merge remote-tracking branch 'origin/main' into b405773140-wildcard
tswast Apr 28, 2025
79f4c58
use sql node instead of ibis table node to keep select * from omittin…
tswast Apr 28, 2025
5b0d0a0
test with cache and to_gbq
tswast Apr 29, 2025
118964b
rename columns before caching
tswast Apr 29, 2025
ca33463
remove unnecessary comment
tswast Apr 29, 2025
e546745
Merge remote-tracking branch 'origin/main' into b405773140-wildcard
tswast Apr 29, 2025
4897ca4
add missing import
tswast Apr 29, 2025
e1a7341
do not materialize _TABLE_SUFFIX
tswast Apr 29, 2025
af06200
fix unit tests
tswast Apr 29, 2025
af5c036
Merge branch 'main' into b405773140-wildcard
tswast Apr 29, 2025
f26574b
correct number of columns in cache with offsets
tswast Apr 29, 2025
dd05c2d
Merge branch 'main' into b405773140-wildcard
tswast Apr 29, 2025
ab0e50a
fix formatting
tswast Apr 29, 2025
89535e2
🦉 Updates from OwlBot post-processor
gcf-owl-bot[bot] Apr 29, 2025
8bb09d5
Merge branch 'b405773140-wildcard' of https://github.com/googleapis/p…
gcf-owl-bot[bot] Apr 29, 2025
40e2e77
Merge branch 'main' into b405773140-wildcard
tswast Apr 29, 2025
d37bf5e
revert datetime change, max_results change
tswast Apr 29, 2025
2f25f8d
Merge remote-tracking branch 'origin/b405773140-wildcard' into b40577…
tswast Apr 29, 2025
4bf66b6
add pseudocolumns to node
tswast Apr 29, 2025
8c96498
fix unit tests
tswast Apr 29, 2025
e1780a6
actually fix unit tests
tswast Apr 29, 2025
b027b51
try to rename as part of compile
tswast Apr 29, 2025
00fbd91
add renames to as cached table
tswast Apr 30, 2025
9a778db
use correct node for table schema
tswast Apr 30, 2025
d076cd3
Merge branch 'main' into b405773140-wildcard
tswast Apr 30, 2025
f3d5b7b
Merge branch 'main' into b405773140-wildcard
tswast May 5, 2025
7d8ddcc
Merge remote-tracking branch 'origin/main' into b405773140-pseudocolumns
tswast May 5, 2025
0722229
revert pseudocolumn addition
tswast May 5, 2025
80ce9c6
revert pseudocolumn fix
tswast May 5, 2025
c2ffc02
Merge remote-tracking branch 'origin/b405773140-wildcard' into b40577…
tswast May 5, 2025
2f2dcd6
add test for warning
tswast May 5, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion bigframes/session/_io/bigquery/read_gbq_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,16 @@ def validate_table(
# Anonymous dataset, does not support snapshot ever
if table.dataset_id.startswith("_"):
pass

# Only true tables support time travel
elif table.table_id.endswith("*"):
msg = bfe.format_message(
"Wildcard tables do not support FOR SYSTEM_TIME AS OF queries. "
"Attempting query without time travel. Be aware that "
"modifications to the underlying data may result in errors or "
"unexpected behavior."
)
warnings.warn(msg, category=bfe.TimeTravelDisabledWarning)
elif table.table_type != "TABLE":
if table.table_type == "MATERIALIZED_VIEW":
msg = bfe.format_message(
Expand Down Expand Up @@ -137,7 +146,7 @@ def validate_table(
sql_predicate=filter_str,
time_travel_timestamp=None,
)
# Any erorrs here should just be raised to user
# Any errors here should just be raised to user
bqclient.query_and_wait(
snapshot_sql, job_config=bigquery.QueryJobConfig(dry_run=True)
)
Expand Down
9 changes: 3 additions & 6 deletions bigframes/session/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -518,11 +518,7 @@ def read_gbq_table(
# clustered tables, so fallback to a query. We do this here so that
# the index is consistent with tables that have primary keys, even
# when max_results is set.
# TODO(b/338419730): We don't need to fallback to a query for wildcard
# tables if we allow some non-determinism when time travel isn't supported.
if max_results is not None or bf_io_bigquery.is_table_with_wildcard_suffix(
table_id
):
if max_results is not None:
# TODO(b/338111344): If we are running a query anyway, we might as
# well generate ROW_NUMBER() at the same time.
all_columns: Iterable[str] = (
Expand All @@ -540,14 +536,15 @@ def read_gbq_table(
time_travel_timestamp=None,
)

return self.read_gbq_query( # type: ignore # for dry_run overload
df = self.read_gbq_query( # type: ignore # for dry_run overload
query,
index_col=index_cols,
columns=columns,
api_name=api_name,
use_cache=use_cache,
dry_run=dry_run,
)
return df

if dry_run:
return dry_runs.get_table_stats(table)
Expand Down
6 changes: 5 additions & 1 deletion tests/system/small/test_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -449,11 +449,15 @@ def test_read_gbq_twice_with_same_timestamp(session, penguins_table_id):
@pytest.mark.parametrize(
"source_table",
[
# Wildcard tables
"bigquery-public-data.noaa_gsod.gsod194*",
# Linked datasets
"bigframes-dev.thelook_ecommerce.orders",
# Materialized views
"bigframes-dev.bigframes_tests_sys.base_table_mat_view",
],
)
def test_read_gbq_on_linked_dataset_warns(session, source_table):
def test_read_gbq_warns_time_travel_disabled(session, source_table):
with warnings.catch_warnings(record=True) as warned:
session.read_gbq(source_table, use_cache=False)
assert len(warned) == 1
Expand Down