Skip to content

feat: add DefaultIndexKind.NULL to use as index_col in read_gbq*, creating an indexless DataFrame/Series #662

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 24 commits into main from null_index
May 20, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
aaa545b
feat: Support indexless dataframe/series
TrevorBergeron May 6, 2024
7f11946
Merge remote-tracking branch 'github/main' into null_index
TrevorBergeron May 7, 2024
9a5b212
fixes for kurt, skew, median
TrevorBergeron May 8, 2024
0248150
fix unit tests
TrevorBergeron May 8, 2024
26e2d4f
Merge remote-tracking branch 'github/main' into null_index
TrevorBergeron May 8, 2024
16e292b
fix more issues
TrevorBergeron May 8, 2024
5611a86
fix defaulting to primary key logic
TrevorBergeron May 8, 2024
8caa068
Merge remote-tracking branch 'github/main' into null_index
TrevorBergeron May 9, 2024
ea9b120
fix tests
TrevorBergeron May 9, 2024
88fc037
Merge remote-tracking branch 'github/main' into null_index
TrevorBergeron May 15, 2024
27d6f47
many small changes
TrevorBergeron May 15, 2024
75b1fd1
fix accidental null indexes and raising warning
TrevorBergeron May 16, 2024
0b26bbb
Merge remote-tracking branch 'github/main' into null_index
TrevorBergeron May 16, 2024
7142078
fix df quantile index
TrevorBergeron May 16, 2024
7b5f4f6
Merge remote-tracking branch 'github/main' into null_index
TrevorBergeron May 17, 2024
bc28bd4
disable legacy pandas for some tests, add concat test
TrevorBergeron May 17, 2024
bd0aa12
fix series repr
TrevorBergeron May 17, 2024
5efcc27
Update bigframes/session/__init__.py
TrevorBergeron May 17, 2024
4b487e7
Update bigframes/core/rewrite.py
TrevorBergeron May 17, 2024
3892241
Update bigframes/core/rewrite.py
TrevorBergeron May 17, 2024
09af424
🦉 Updates from OwlBot post-processor
gcf-owl-bot[bot] May 17, 2024
1164faf
🦉 Updates from OwlBot post-processor
gcf-owl-bot[bot] May 17, 2024
8844f27
Merge branch 'null_index' of https://github.com/googleapis/python-big…
gcf-owl-bot[bot] May 17, 2024
600d500
pr comments addressed
TrevorBergeron May 17, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Merge remote-tracking branch 'github/main' into null_index
  • Loading branch information
TrevorBergeron committed May 15, 2024
commit 88fc03714afe39f866fb77c4600e338fbcb7f431
89 changes: 89 additions & 0 deletions bigframes/core/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -2076,6 +2076,95 @@ def _null_index_guard(self):
"Cannot perform this operation without an index. Set an index using set_index."
)

def _get_rows_as_json_values(self) -> Block:
    """Build a block whose single value column holds each row as JSON.

    The current expression is rendered to SQL and wrapped in a query that
    adds a ``JSON_OBJECT`` column with the keys ``names``, ``types``,
    ``values``, ``indexlength`` and ``dtype``. The index columns and index
    labels of this block are carried over to the returned block.

    Returns:
        Block: A block with this block's index columns and one value
        column (named by a generated guid) containing the JSON object.

    Raises:
        NameError: If ``repr()`` of an index column or column label cannot
            be parsed back by ``ast.literal_eval``.
    """
    # We want to preserve any ordering currently present before turning to
    # direct SQL manipulation. We will restore the ordering when we rebuild
    # expression.
    # TODO(shobs): Replace direct SQL manipulation by structured expression
    # manipulation
    ordering_column_name = guid.generate_guid()
    expr = self.session._cache_with_offsets(self.expr)
    expr = expr.promote_offsets(ordering_column_name)
    expr_sql = self.session._to_sql(expr)

    # Names of the columns to serialize for the row.
    # We will use the repr-eval pattern to serialize a value here and
    # deserialize in the cloud function. Let's make sure that would work.
    column_names = []
    for col in list(self.index_columns) + list(self.column_labels):
        serialized_column_name = repr(col)
        try:
            ast.literal_eval(serialized_column_name)
        except Exception as err:
            # Chain explicitly so the underlying parse failure is reported
            # as the direct cause of the NameError.
            raise NameError(
                f"Column name type '{type(col).__name__}' is not supported for row serialization."
                " Please consider using a name for which literal_eval(repr(name)) works."
            ) from err

        column_names.append(serialized_column_name)
    column_names_csv = sql.csv(column_names, quoted=True)

    # index columns count
    index_columns_count = len(self.index_columns)

    # column references to form the array of values for the row
    column_references_csv = sql.csv(
        [sql.cast_as_string(col) for col in self.expr.column_ids]
    )

    # types of the columns to serialize for the row
    column_types = list(self.index.dtypes) + list(self.dtypes)
    column_types_csv = sql.csv([str(typ) for typ in column_types], quoted=True)

    # row dtype to use for deserializing the row as pandas series;
    # fall back to "object" when no common type is reported.
    pandas_row_dtype = bigframes.dtypes.lcd_type(*column_types)
    if pandas_row_dtype is None:
        pandas_row_dtype = "object"
    pandas_row_dtype = sql.quote(str(pandas_row_dtype))

    # create a json column representing row through SQL manipulation
    row_json_column_name = guid.generate_guid()
    select_columns = (
        [ordering_column_name] + list(self.index_columns) + [row_json_column_name]
    )
    select_columns_csv = sql.csv(
        [sql.column_reference(col) for col in select_columns]
    )
    # NOTE: the SQL text below is part of the string literal, so it is not
    # indented along with the surrounding code.
    json_sql = f"""\
With T0 AS (
{textwrap.indent(expr_sql, " ")}
),
T1 AS (
SELECT *,
JSON_OBJECT(
"names", [{column_names_csv}],
"types", [{column_types_csv}],
"values", [{column_references_csv}],
"indexlength", {index_columns_count},
"dtype", {pandas_row_dtype}
) AS {row_json_column_name} FROM T0
)
SELECT {select_columns_csv} FROM T1
"""
    ibis_table = self.session.ibis_client.sql(json_sql)
    order_for_ibis_table = ordering.ExpressionOrdering.from_offset_col(
        ordering_column_name
    )
    # The offsets column is used only for ordering: it is kept as a hidden
    # ordering column and left out of the visible columns.
    expr = core.ArrayValue.from_ibis(
        self.session,
        ibis_table,
        [ibis_table[col] for col in select_columns if col != ordering_column_name],
        hidden_ordering_columns=[ibis_table[ordering_column_name]],
        ordering=order_for_ibis_table,
    )
    block = Block(
        expr,
        index_columns=self.index_columns,
        column_labels=[row_json_column_name],
        index_labels=self._index_labels,
    )
    return block


class BlockIndexProperties:
"""Accessor for the index-related block properties."""
Expand Down
4 changes: 4 additions & 0 deletions bigframes/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,5 +35,9 @@ class DefaultIndexWarning(Warning):
"""Default index may cause unexpected costs."""


class PreviewWarning(Warning):
    """Warning category signalling that a feature is in preview."""


class NullIndexError(ValueError):
    """Error indicating that an object has no index.

    Subclasses ``ValueError``, so handlers written for ``ValueError`` also
    catch it.
    """
You are viewing a condensed version of this merge commit. You can view the full changes here.