Skip to content

fix: Improve escaping of literals and identifiers #682

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
May 15, 2024
14 changes: 7 additions & 7 deletions bigframes/core/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -2096,7 +2096,7 @@ def _get_rows_as_json_values(self) -> Block:
)

column_names.append(serialized_column_name)
column_names_csv = sql.csv(column_names, quoted=True)
column_names_csv = sql.csv(map(sql.simple_literal, column_names))

# index columns count
index_columns_count = len(self.index_columns)
Expand All @@ -2108,22 +2108,22 @@ def _get_rows_as_json_values(self) -> Block:

# types of the columns to serialize for the row
column_types = list(self.index.dtypes) + list(self.dtypes)
column_types_csv = sql.csv([str(typ) for typ in column_types], quoted=True)
column_types_csv = sql.csv(
[sql.simple_literal(str(typ)) for typ in column_types]
)

# row dtype to use for deserializing the row as pandas series
pandas_row_dtype = bigframes.dtypes.lcd_type(*column_types)
if pandas_row_dtype is None:
pandas_row_dtype = "object"
pandas_row_dtype = sql.quote(str(pandas_row_dtype))
pandas_row_dtype = sql.simple_literal(str(pandas_row_dtype))

# create a json column representing row through SQL manipulation
row_json_column_name = guid.generate_guid()
select_columns = (
[ordering_column_name] + list(self.index_columns) + [row_json_column_name]
)
select_columns_csv = sql.csv(
[sql.column_reference(col) for col in select_columns]
)
select_columns_csv = sql.csv([sql.identifier(col) for col in select_columns])
json_sql = f"""\
With T0 AS (
{textwrap.indent(expr_sql, " ")}
Expand All @@ -2136,7 +2136,7 @@ def _get_rows_as_json_values(self) -> Block:
"values", [{column_references_csv}],
"indexlength", {index_columns_count},
"dtype", {pandas_row_dtype}
) AS {row_json_column_name} FROM T0
) AS {sql.identifier(row_json_column_name)} FROM T0
)
SELECT {select_columns_csv} FROM T1
"""
Expand Down
28 changes: 5 additions & 23 deletions bigframes/core/compile/compiled.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,8 @@
import abc
import functools
import itertools
import textwrap
import typing
from typing import Collection, Iterable, Literal, Optional, Sequence
from typing import Collection, Literal, Optional, Sequence

import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops
import ibis
Expand All @@ -40,6 +39,7 @@
OrderingExpression,
)
import bigframes.core.schema as schemata
import bigframes.core.sql
from bigframes.core.window_spec import RangeWindowBounds, RowsWindowBounds, WindowSpec
import bigframes.dtypes
import bigframes.operations.aggregations as agg_ops
Expand Down Expand Up @@ -821,15 +821,13 @@ def to_sql(
)
)
output_columns = [
col_id_overrides.get(col) if (col in col_id_overrides) else col
for col in baked_ir.column_ids
col_id_overrides.get(col, col) for col in baked_ir.column_ids
]
selection = ", ".join(map(lambda col_id: f"`{col_id}`", output_columns))
sql = bigframes.core.sql.select_from(output_columns, sql)

sql = textwrap.dedent(f"SELECT {selection}\n" "FROM (\n" f"{sql}\n" ")\n")
# Single row frames may not have any ordering columns
if len(baked_ir._ordering.all_ordering_columns) > 0:
order_by_clause = baked_ir._ordering_clause(
order_by_clause = bigframes.core.sql.ordering_clause(
baked_ir._ordering.all_ordering_columns
)
sql += f"{order_by_clause}\n"
Expand All @@ -843,22 +841,6 @@ def to_sql(
)
return typing.cast(str, sql)

def _ordering_clause(self, ordering: Iterable[OrderingExpression]) -> str:
parts = []
for col_ref in ordering:
asc_desc = "ASC" if col_ref.direction.is_ascending else "DESC"
null_clause = "NULLS LAST" if col_ref.na_last else "NULLS FIRST"
ordering_expr = col_ref.scalar_expression
# We don't know how to compile scalar expressions in isolation
if ordering_expr.is_const:
# Probably shouldn't have constants in ordering definition, but best to ignore if somehow they end up here.
continue
if not isinstance(ordering_expr, ex.UnboundVariableExpression):
raise ValueError("Expected direct column reference.")
part = f"`{ordering_expr.id}` {asc_desc} {null_clause}"
parts.append(part)
return f"ORDER BY {' ,'.join(parts)}"

def _to_ibis_expr(
self,
*,
Expand Down
155 changes: 129 additions & 26 deletions bigframes/core/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,49 +11,152 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations

"""
Utility functions for SQL construction.
"""

from typing import Iterable
import datetime
import math
import textwrap
from typing import Iterable, TYPE_CHECKING

# Literals and identifiers matching this pattern can be unquoted
unquoted = r"^[A-Za-z_][A-Za-z_0-9]*$"

def quote(value: str):
"""Return quoted input string."""

# Let's use repr which also escapes any special characters
#
# >>> for val in [
# ... "123",
# ... "str with no special chars",
# ... "str with special chars.,'\"/\\"
# ... ]:
# ... print(f"{val} -> {repr(val)}")
# ...
# 123 -> '123'
# str with no special chars -> 'str with no special chars'
# str with special chars.,'"/\ -> 'str with special chars.,\'"/\\'
if TYPE_CHECKING:
import google.cloud.bigquery as bigquery

return repr(value)
import bigframes.core.ordering


def column_reference(column_name: str):
### Writing SQL Values (literals, column references, table references, etc.)
def simple_literal(value: str | int | bool | float):
"""Return quoted input string."""
# https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#literals
if isinstance(value, str):
# Single quoting seems to work nicer with ibis than double quoting
return f"'{escape_special_characters(value)}'"
elif isinstance(value, (bool, int)):
return str(value)
elif isinstance(value, float):
# https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#floating_point_literals
if math.isnan(value):
return 'CAST("nan" as FLOAT)'
if value == math.inf:
return 'CAST("+inf" as FLOAT)'
if value == -math.inf:
return 'CAST("-inf" as FLOAT)'
return str(value)
else:
raise ValueError(f"Cannot produce literal for {value}")


def multi_literal(*values: str):
literal_strings = [simple_literal(i) for i in values]
return "(" + ", ".join(literal_strings) + ")"


def identifier(id: str) -> str:
"""Return a string representing column reference in a SQL."""
# https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#identifiers
# Just always escape, otherwise need to check against every reserved sql keyword
return f"`{escape_special_characters(id)}`"


def escape_special_characters(value: str):
"""Escapes all special charactesrs"""
# https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#string_and_bytes_literals
trans_table = str.maketrans(
{
"\a": r"\a",
"\b": r"\b",
"\f": r"\f",
"\n": r"\n",
"\r": r"\r",
"\t": r"\t",
"\v": r"\v",
"\\": r"\\",
"?": r"\?",
'"': r"\"",
"'": r"\'",
"`": r"\`",
}
)
return value.translate(trans_table)


def cast_as_string(column_name: str) -> str:
"""Return a string representing string casting of a column."""

return f"`{column_name}`"
return f"CAST({identifier(column_name)} AS STRING)"


def cast_as_string(column_name: str):
"""Return a string representing string casting of a column."""
def csv(values: Iterable[str]) -> str:
"""Return a string of comma separated values."""
return ", ".join(values)

return f"CAST({column_reference(column_name)} AS STRING)"

def table_reference(table_ref: bigquery.TableReference) -> str:
return f"`{escape_special_characters(table_ref.project)}`.`{escape_special_characters(table_ref.dataset_id)}`.`{escape_special_characters(table_ref.table_id)}`"

def csv(values: Iterable[str], quoted=False):
"""Return a string of comma separated values."""

if quoted:
values = [quote(val) for val in values]
def infix_op(opname: str, left_arg: str, right_arg: str):
# Maybe should add parentheses??
return f"{left_arg} {opname} {right_arg}"

return ", ".join(values)

### Writing SELECT expressions
def select_from(columns: Iterable[str], subquery: str, distinct: bool = False):
selection = ", ".join(map(identifier, columns))
distinct_clause = "DISTINCT " if distinct else ""

return textwrap.dedent(
f"SELECT {distinct_clause}{selection}\nFROM (\n" f"{subquery}\n" ")\n"
)


def select_table(table_ref: bigquery.TableReference):
return textwrap.dedent(f"SELECT * FROM {table_reference(table_ref)}")


def is_distinct_sql(columns: Iterable[str], table_sql: str) -> str:
is_unique_sql = f"""WITH full_table AS (
{select_from(columns, table_sql)}
),
distinct_table AS (
{select_from(columns, table_sql, distinct=True)}
)

SELECT (SELECT COUNT(*) FROM full_table) AS `total_count`,
(SELECT COUNT(*) FROM distinct_table) AS `distinct_count`
"""
return is_unique_sql


def ordering_clause(
ordering: Iterable[bigframes.core.ordering.OrderingExpression],
) -> str:
import bigframes.core.expression

parts = []
for col_ref in ordering:
asc_desc = "ASC" if col_ref.direction.is_ascending else "DESC"
null_clause = "NULLS LAST" if col_ref.na_last else "NULLS FIRST"
ordering_expr = col_ref.scalar_expression
# We don't know how to compile scalar expressions in isolation
if ordering_expr.is_const:
# Probably shouldn't have constants in ordering definition, but best to ignore if somehow they end up here.
continue
assert isinstance(
ordering_expr, bigframes.core.expression.UnboundVariableExpression
)
part = f"`{ordering_expr.id}` {asc_desc} {null_clause}"
parts.append(part)
return f"ORDER BY {' ,'.join(parts)}"


def snapshot_clause(time_travel_timestamp: datetime.datetime):
return f"FOR SYSTEM_TIME AS OF TIMESTAMP({repr(time_travel_timestamp.isoformat())})"
38 changes: 25 additions & 13 deletions bigframes/session/_io/bigquery/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@

import bigframes
from bigframes.core import log_adapter
import bigframes.core.sql
import bigframes.formatting_helpers as formatting_helpers

IO_ORDERING_ID = "bqdf_row_nums"
Expand Down Expand Up @@ -353,7 +354,7 @@ def to_query(
else:
select_clause = "SELECT *"

where_clause = ""
filter_string = ""
if filters:
valid_operators: Mapping[third_party_pandas_gbq.FilterOps, str] = {
"in": "IN",
Expand All @@ -373,12 +374,11 @@ def to_query(
):
filters = typing.cast(third_party_pandas_gbq.FiltersType, [filters])

or_expressions = []
for group in filters:
if not isinstance(group, Iterable):
group = [group]

and_expressions = []
and_expression = ""
for filter_item in group:
if not isinstance(filter_item, tuple) or (len(filter_item) != 3):
raise ValueError(
Expand All @@ -397,17 +397,29 @@ def to_query(

operator_str = valid_operators[operator]

column_ref = bigframes.core.sql.identifier(column)
if operator_str in ["IN", "NOT IN"]:
value_list = ", ".join([repr(v) for v in value])
expression = f"`{column}` {operator_str} ({value_list})"
value_literal = bigframes.core.sql.multi_literal(*value)
else:
expression = f"`{column}` {operator_str} {repr(value)}"
and_expressions.append(expression)

or_expressions.append(" AND ".join(and_expressions))
value_literal = bigframes.core.sql.simple_literal(value)
expression = bigframes.core.sql.infix_op(
operator_str, column_ref, value_literal
)
if and_expression:
and_expression = bigframes.core.sql.infix_op(
"AND", and_expression, expression
)
else:
and_expression = expression

if or_expressions:
where_clause = " WHERE " + " OR ".join(or_expressions)
if filter_string:
filter_string = bigframes.core.sql.infix_op(
"OR", filter_string, and_expression
)
else:
filter_string = and_expression

full_query = f"{select_clause} FROM {sub_query} AS sub{where_clause}"
return full_query
if filter_string:
return f"{select_clause} FROM {sub_query} AS sub WHERE {filter_string}"
else:
return f"{select_clause} FROM {sub_query} AS sub"
Loading