diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index 7c409839b1..fb9503dc72 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -224,7 +224,7 @@ def json_extract( >>> s = bpd.Series(['{"class": {"students": [{"id": 5}, {"id": 12}]}}']) >>> bbq.json_extract(s, json_path="$.class") - 0 "{\\\"students\\\":[{\\\"id\\\":5},{\\\"id\\\":12}]}" + 0 {"students":[{"id":5},{"id":12}]} dtype: string Args: diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index cae527931c..5492502f21 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -152,12 +152,7 @@ def _get_ibis_column(self, key: str) -> ibis_types.Value: raise ValueError( "Column name {} not in set of values: {}".format(key, self.column_ids) ) - return typing.cast( - ibis_types.Value, - bigframes.core.compile.ibis_types.ibis_value_to_canonical_type( - self._column_names[key] - ), - ) + return typing.cast(ibis_types.Value, self._column_names[key]) def get_column_type(self, key: str) -> bigframes.dtypes.Dtype: ibis_type = typing.cast( @@ -327,12 +322,7 @@ def _to_ibis_expr( if not columns: return ibis.memtable([]) - # Make sure all dtypes are the "canonical" ones for BigFrames. This is - # important for operations like UNION where the schema must match. - table = self._table.select( - bigframes.core.compile.ibis_types.ibis_value_to_canonical_type(column) - for column in columns - ) + table = self._table.select(columns) base_table = table if self._reduced_predicate is not None: table = table.filter(base_table[PREDICATE_COLUMN]) @@ -1039,14 +1029,7 @@ def _to_ibis_expr( # Make sure we don't have any unbound (deferred) columns. table = self._table.select(columns) - # Make sure all dtypes are the "canonical" ones for BigFrames. This is - # important for operations like UNION where the schema must match. 
- table = table.select( - bigframes.core.compile.ibis_types.ibis_value_to_canonical_type( - table[column] - ) - for column in table.columns - ) + table = table.select(table[column] for column in table.columns) base_table = table if self._reduced_predicate is not None: table = table.filter(base_table[PREDICATE_COLUMN]) diff --git a/bigframes/core/compile/ibis_types.py b/bigframes/core/compile/ibis_types.py index f3221f605f..0b3038c9c7 100644 --- a/bigframes/core/compile/ibis_types.py +++ b/bigframes/core/compile/ibis_types.py @@ -208,6 +208,7 @@ def ibis_value_to_canonical_type(value: ibis_types.Value) -> ibis_types.Value: name = value.get_name() if ibis_type.is_json(): value = vendored_ibis_ops.ToJsonString(value).to_expr() + value = value.case().when("null", ibis.null()).else_(value).end() return value.name(name) # Allow REQUIRED fields to be joined with NULLABLE fields. nullable_type = ibis_type.copy(nullable=True) diff --git a/tests/system/small/bigquery/test_json.py b/tests/system/small/bigquery/test_json.py index 059b8eea87..18ccadd9f5 100644 --- a/tests/system/small/bigquery/test_json.py +++ b/tests/system/small/bigquery/test_json.py @@ -23,11 +23,13 @@ def _get_series_from_json(json_data): + # Note: converts Python None to SQL NULL, not to a JSON null. 
+ values = [ + f"JSON '{json.dumps(data)}'" if data is not None else "NULL" + for data in json_data + ] sql = " UNION ALL ".join( - [ - f"SELECT {id} AS id, JSON '{json.dumps(data)}' AS data" - for id, data in enumerate(json_data) - ] + [f"SELECT {id} AS id, {value} AS data" for id, value in enumerate(values)] ) df = bpd.read_gbq(sql).set_index("id").sort_index() return df["data"] @@ -114,19 +116,19 @@ def test_json_set_w_invalid_series_type(): def test_json_extract_from_json(): s = _get_series_from_json([{"a": {"b": [1, 2]}}, {"a": {"c": 1}}, {"a": {"b": 0}}]) - actual = bbq.json_extract(s, "$.a.b") + actual = bbq.json_extract(s, "$.a.b").to_pandas() # After the introduction of the JSON type, the output should be a JSON-formatted series. - expected = _get_series_from_json(["[1,2]", None, "0"]) + expected = _get_series_from_json([[1, 2], None, 0]).to_pandas() pd.testing.assert_series_equal( - actual.to_pandas(), - expected.to_pandas(), + actual, + expected, ) def test_json_extract_from_string(): s = bpd.Series(['{"a": {"b": [1, 2]}}', '{"a": {"c": 1}}', '{"a": {"b": 0}}']) actual = bbq.json_extract(s, "$.a.b") - expected = _get_series_from_json(["[1,2]", None, "0"]) + expected = _get_series_from_json([[1, 2], None, 0]) pd.testing.assert_series_equal( actual.to_pandas(), expected.to_pandas(),