Skip to content

fix: Fix bug converting non-string labels to sql ids #296

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Jan 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions bigframes/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import bigframes.core.nodes as nodes
from bigframes.core.ordering import OrderingColumnReference
import bigframes.core.ordering as orderings
import bigframes.core.utils
from bigframes.core.window_spec import WindowSpec
import bigframes.dtypes
import bigframes.operations as ops
Expand Down Expand Up @@ -69,10 +70,14 @@ def from_ibis(
@classmethod
def from_pandas(cls, pd_df: pandas.DataFrame):
iobytes = io.BytesIO()
# Discard row labels and use simple string ids for columns
column_ids = tuple(str(label) for label in pd_df.columns)
pd_df.reset_index(drop=True).set_axis(column_ids, axis=1).to_feather(iobytes)
node = nodes.ReadLocalNode(iobytes.getvalue(), column_ids=column_ids)
# Use alphanumeric identifiers, to avoid downstream problems with escaping.
as_ids = [
bigframes.core.utils.label_to_identifier(label, strict=True)
for label in pd_df.columns
]
unique_ids = tuple(bigframes.core.utils.disambiguate_ids(as_ids))
pd_df.reset_index(drop=True).set_axis(unique_ids, axis=1).to_feather(iobytes)
node = nodes.ReadLocalNode(iobytes.getvalue())
return cls(node)

@property
Expand Down
1 change: 0 additions & 1 deletion bigframes/core/nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,6 @@ def __hash__(self):
@dataclass(frozen=True)
class ReadLocalNode(BigFrameNode):
feather_bytes: bytes
column_ids: typing.Tuple[str, ...]

def __hash__(self):
return self._node_hash
Expand Down
33 changes: 25 additions & 8 deletions bigframes/core/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
import typing
from typing import Hashable, Iterable, List

Expand Down Expand Up @@ -84,26 +85,42 @@ def get_standardized_ids(
Tuple of (standardized_column_ids, standardized_index_ids)
"""
col_ids = [
UNNAMED_COLUMN_ID if col_label is None else str(col_label)
UNNAMED_COLUMN_ID if col_label is None else label_to_identifier(col_label)
for col_label in col_labels
]
idx_ids = [
UNNAMED_INDEX_ID if idx_label is None else str(idx_label)
UNNAMED_INDEX_ID if idx_label is None else label_to_identifier(idx_label)
for idx_label in idx_labels
]

ids = idx_ids + col_ids
ids = disambiguate_ids(idx_ids + col_ids)

idx_ids, col_ids = ids[: len(idx_ids)], ids[len(idx_ids) :]

return col_ids, idx_ids


def label_to_identifier(label: typing.Hashable, strict: bool = False) -> str:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

would be nice to have unit tests on these utils methods

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since utils are not part of the public API, I would prefer to test them via the public APIs that use them. I tried to include some unusual labels in the test cases.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good, let's add these two kinds of labels to test the strict mode:

  1. Leading spaces followed by alpha-numeric
  2. Leading spaces followed by _

"""
Convert pandas label to make legal bigquery identifier. May create collisions (should deduplicate after).
Strict mode might not be necessary, but ibis seems to escape non-alphanumeric characters inconsistently.
"""
# Column values will be loaded as null if the column name has spaces.
# https://github.com/googleapis/python-bigquery/issues/1566
ids = [id.replace(" ", "_") for id in ids]
identifier = str(label).replace(" ", "_")
if strict:
identifier = re.sub(r"[^a-zA-Z0-9_]", "", identifier)
if not identifier:
identifier = "id"
return identifier


ids = typing.cast(
def disambiguate_ids(ids: typing.Sequence[str]) -> typing.List[str]:
"""Disambiguate list of ids by adding suffixes where needed. If inputs are legal sql ids, outputs should be as well."""
return typing.cast(
List[str],
vendored_pandas_io_common.dedup_names(ids, is_potential_multiindex=False),
)
idx_ids, col_ids = ids[: len(idx_ids)], ids[len(idx_ids) :]

return col_ids, idx_ids


def merge_column_labels(
Expand Down
32 changes: 31 additions & 1 deletion tests/system/small/test_multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,37 @@
import pytest

import bigframes.pandas as bpd
from tests.system.utils import assert_pandas_df_equal
from tests.system.utils import assert_pandas_df_equal, skip_legacy_pandas


@skip_legacy_pandas
def test_read_pandas_multi_index_axes():
index = pandas.MultiIndex.from_arrays(
[
pandas.Index([4, 99], dtype=pandas.Int64Dtype()),
pandas.Index(
[" Hello, World!", "_some_new_string"],
dtype=pandas.StringDtype(storage="pyarrow"),
),
],
names=[" 1index 1", "_1index 2"],
)
columns = pandas.MultiIndex.from_arrays(
[
pandas.Index([6, 87], dtype=pandas.Int64Dtype()),
pandas.Index(
[" Bonjour le monde!", "_une_chaîne_de_caractères"],
dtype=pandas.StringDtype(storage="pyarrow"),
),
],
names=[" 1columns 1", "_1new_index 2"],
)
pandas_df = pandas.DataFrame(
[[1, 2], [3, 4]], index=index, columns=columns, dtype=pandas.Int64Dtype()
)
bf_df = bpd.DataFrame(pandas_df)

pandas.testing.assert_frame_equal(bf_df.to_pandas(), pandas_df)


# Row Multi-index tests
Expand Down