151 changes: 151 additions & 0 deletions bigframes/core/agg_expressions.py
@@ -0,0 +1,151 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import abc
import dataclasses
import functools
import itertools
import typing
from typing import Callable, Mapping, TypeVar

from bigframes import dtypes
from bigframes.core import expression
import bigframes.core.identifiers as ids
import bigframes.operations.aggregations as agg_ops

TExpression = TypeVar("TExpression", bound="Aggregation")


@dataclasses.dataclass(frozen=True)
class Aggregation(expression.Expression):
"""Represents windowing or aggregation over a column."""

op: agg_ops.WindowOp = dataclasses.field()

@property
def column_references(self) -> typing.Tuple[ids.ColumnId, ...]:
return tuple(
itertools.chain.from_iterable(
map(lambda x: x.column_references, self.inputs)
)
)

@functools.cached_property
def is_resolved(self) -> bool:
return all(input.is_resolved for input in self.inputs)

@functools.cached_property
def output_type(self) -> dtypes.ExpressionType:
if not self.is_resolved:
raise ValueError(f"Type of expression {self.op} has not been fixed.")

input_types = [input.output_type for input in self.inputs]

return self.op.output_type(*input_types)

@property
@abc.abstractmethod
def inputs(
self,
) -> typing.Tuple[expression.Expression, ...]:
...

@property
def free_variables(self) -> typing.Tuple[str, ...]:
return tuple(
itertools.chain.from_iterable(map(lambda x: x.free_variables, self.inputs))
)

@property
def is_const(self) -> bool:
return all(child.is_const for child in self.inputs)

@abc.abstractmethod
def replace_args(self: TExpression, *arg) -> TExpression:
...

def transform_children(
self: TExpression, t: Callable[[expression.Expression], expression.Expression]
) -> TExpression:
return self.replace_args(*(t(arg) for arg in self.inputs))

def bind_variables(
self: TExpression,
bindings: Mapping[str, expression.Expression],
allow_partial_bindings: bool = False,
) -> TExpression:
return self.transform_children(
lambda x: x.bind_variables(bindings, allow_partial_bindings)
)

def bind_refs(
self: TExpression,
bindings: Mapping[ids.ColumnId, expression.Expression],
allow_partial_bindings: bool = False,
) -> TExpression:
return self.transform_children(
lambda x: x.bind_refs(bindings, allow_partial_bindings)
)


@dataclasses.dataclass(frozen=True)
class NullaryAggregation(Aggregation):
op: agg_ops.NullaryWindowOp = dataclasses.field()

@property
def inputs(
self,
) -> typing.Tuple[expression.Expression, ...]:
return ()

def replace_args(self, *arg) -> NullaryAggregation:
return self


@dataclasses.dataclass(frozen=True)
class UnaryAggregation(Aggregation):
op: agg_ops.UnaryWindowOp
arg: expression.Expression

@property
def inputs(
self,
) -> typing.Tuple[expression.Expression, ...]:
return (self.arg,)

def replace_args(self, arg: expression.Expression) -> UnaryAggregation:
return UnaryAggregation(
self.op,
arg,
)


@dataclasses.dataclass(frozen=True)
class BinaryAggregation(Aggregation):
op: agg_ops.BinaryAggregateOp = dataclasses.field()
left: expression.Expression = dataclasses.field()
right: expression.Expression = dataclasses.field()

@property
def inputs(
self,
) -> typing.Tuple[expression.Expression, ...]:
return (self.left, self.right)

def replace_args(
self, larg: expression.Expression, rarg: expression.Expression
) -> BinaryAggregation:
return BinaryAggregation(self.op, larg, rarg)
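For orientation, here is a minimal sketch (not part of the diff) of how these classes compose, using the ops and the `ex.deref` helper imported elsewhere in this PR; the column ids `col_a` and `col_b` are hypothetical.

```python
# Illustrative sketch only; "col_a" / "col_b" are hypothetical column ids.
import bigframes.core.expression as ex
import bigframes.operations.aggregations as agg_ops
from bigframes.core import agg_expressions

# An input-free aggregation such as a row count:
size_agg = agg_expressions.NullaryAggregation(agg_ops.size_op)

# A single-input aggregation over a column reference:
mean_agg = agg_expressions.UnaryAggregation(agg_ops.mean_op, ex.deref("col_a"))

# The expression tree exposes the columns it references...
cols = mean_agg.column_references  # tuple of ColumnId, here for "col_a"

# ...and, since the dataclasses are frozen, rebinding yields a new object:
rebound = mean_agg.replace_args(ex.deref("col_b"))
```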
9 changes: 5 additions & 4 deletions bigframes/core/array_value.py
@@ -24,6 +24,7 @@
import pandas
import pyarrow as pa

from bigframes.core import agg_expressions
import bigframes.core.expression as ex
import bigframes.core.guid
import bigframes.core.identifiers as ids
@@ -190,7 +191,7 @@ def row_count(self) -> ArrayValue:
child=self.node,
aggregations=(
(
ex.NullaryAggregation(agg_ops.size_op),
agg_expressions.NullaryAggregation(agg_ops.size_op),
ids.ColumnId(bigframes.core.guid.generate_guid()),
),
),
@@ -379,7 +380,7 @@ def drop_columns(self, columns: Iterable[str]) -> ArrayValue:

def aggregate(
self,
aggregations: typing.Sequence[typing.Tuple[ex.Aggregation, str]],
aggregations: typing.Sequence[typing.Tuple[agg_expressions.Aggregation, str]],
by_column_ids: typing.Sequence[str] = (),
dropna: bool = True,
) -> ArrayValue:
@@ -420,15 +421,15 @@ def project_window_op(
"""

return self.project_window_expr(
ex.UnaryAggregation(op, ex.deref(column_name)),
agg_expressions.UnaryAggregation(op, ex.deref(column_name)),
window_spec,
never_skip_nulls,
skip_reproject_unsafe,
)

def project_window_expr(
self,
expression: ex.Aggregation,
expression: agg_expressions.Aggregation,
window: WindowSpec,
never_skip_nulls=False,
skip_reproject_unsafe: bool = False,
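A short usage sketch (hypothetical helper, not part of the diff) of the updated `aggregate` signature, which now pairs each `agg_expressions.Aggregation` with the id of its output column; the column names are assumptions.

```python
import bigframes.core.expression as ex
import bigframes.operations.aggregations as agg_ops
from bigframes.core import agg_expressions
from bigframes.core.array_value import ArrayValue

def count_per_group(value: ArrayValue, by_col: str, data_col: str, out_col: str) -> ArrayValue:
    # Each entry pairs an Aggregation expression with the name of its output column.
    return value.aggregate(
        aggregations=[
            (agg_expressions.UnaryAggregation(agg_ops.count_op, ex.deref(data_col)), out_col),
        ],
        by_column_ids=[by_col],
        dropna=True,
    )
```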
9 changes: 3 additions & 6 deletions bigframes/core/bigframe_node.py
@@ -20,15 +20,12 @@
import functools
import itertools
import typing
from typing import Callable, Dict, Generator, Iterable, Mapping, Sequence, Tuple, Union
from typing import Callable, Dict, Generator, Iterable, Mapping, Sequence, Tuple

from bigframes.core import expression, field, identifiers
import bigframes.core.schema as schemata
import bigframes.dtypes

if typing.TYPE_CHECKING:
import bigframes.session

COLUMN_SET = frozenset[identifiers.ColumnId]

T = typing.TypeVar("T")
@@ -281,8 +278,8 @@ def field_by_id(self) -> Mapping[identifiers.ColumnId, field.Field]:
@property
def _node_expressions(
self,
) -> Sequence[Union[expression.Expression, expression.Aggregation]]:
"""List of scalar expressions. Intended for checking engine compatibility with used ops."""
) -> Sequence[expression.Expression]:
"""List of expressions. Intended for checking engine compatibility with used ops."""
return ()

# Plan algorithms
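The narrowed return type works because `Aggregation` now subclasses `expression.Expression` (see `agg_expressions.py` above), so scalar expressions and aggregations can share one sequence type. A hypothetical override, for illustration only:

```python
from typing import Sequence

import bigframes.operations.aggregations as agg_ops
from bigframes.core import agg_expressions, expression

# Both entries satisfy Sequence[expression.Expression]; no Union is needed.
exprs: Sequence[expression.Expression] = (
    expression.deref("col_a"),  # plain scalar expression ("col_a" is hypothetical)
    agg_expressions.UnaryAggregation(agg_ops.mean_op, expression.deref("col_a")),
)
```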
22 changes: 13 additions & 9 deletions bigframes/core/block_transforms.py
@@ -21,12 +21,12 @@
import pandas as pd

import bigframes.constants
from bigframes.core import agg_expressions
import bigframes.core as core
import bigframes.core.blocks as blocks
import bigframes.core.expression as ex
import bigframes.core.ordering as ordering
import bigframes.core.window_spec as windows
import bigframes.dtypes
import bigframes.dtypes as dtypes
import bigframes.operations as ops
import bigframes.operations.aggregations as agg_ops
@@ -133,7 +133,7 @@ def quantile(
block, _ = block.aggregate(
grouping_column_ids,
tuple(
ex.UnaryAggregation(agg_ops.AnyValueOp(), ex.deref(col))
agg_expressions.UnaryAggregation(agg_ops.AnyValueOp(), ex.deref(col))
for col in quantile_cols
),
column_labels=pd.Index(labels),
@@ -363,7 +363,7 @@ def value_counts(
block = dropna(block, columns, how="any")
block, agg_ids = block.aggregate(
by_column_ids=(*grouping_keys, *columns),
aggregations=[ex.NullaryAggregation(agg_ops.size_op)],
aggregations=[agg_expressions.NullaryAggregation(agg_ops.size_op)],
dropna=drop_na and not grouping_keys,
)
count_id = agg_ids[0]
@@ -647,15 +647,15 @@ def skew(
# counts, moment3 for each column
aggregations = []
for i, col in enumerate(original_columns):
count_agg = ex.UnaryAggregation(
count_agg = agg_expressions.UnaryAggregation(
agg_ops.count_op,
ex.deref(col),
)
moment3_agg = ex.UnaryAggregation(
moment3_agg = agg_expressions.UnaryAggregation(
agg_ops.mean_op,
ex.deref(delta3_ids[i]),
)
variance_agg = ex.UnaryAggregation(
variance_agg = agg_expressions.UnaryAggregation(
agg_ops.PopVarOp(),
ex.deref(col),
)
@@ -698,9 +698,13 @@ def kurt(
# counts, moment4 for each column
aggregations = []
for i, col in enumerate(original_columns):
count_agg = ex.UnaryAggregation(agg_ops.count_op, ex.deref(col))
moment4_agg = ex.UnaryAggregation(agg_ops.mean_op, ex.deref(delta4_ids[i]))
variance_agg = ex.UnaryAggregation(agg_ops.PopVarOp(), ex.deref(col))
count_agg = agg_expressions.UnaryAggregation(agg_ops.count_op, ex.deref(col))
moment4_agg = agg_expressions.UnaryAggregation(
agg_ops.mean_op, ex.deref(delta4_ids[i])
)
variance_agg = agg_expressions.UnaryAggregation(
agg_ops.PopVarOp(), ex.deref(col)
)
aggregations.extend([count_agg, moment4_agg, variance_agg])

block, agg_ids = block.aggregate(
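The same construction pattern repeats throughout this file: build the `Aggregation` expressions, pass them to the block's `aggregate`, then read the returned aggregate column ids. A condensed sketch (hypothetical helper, column ids assumed) based on the `value_counts` change above:

```python
import bigframes.operations.aggregations as agg_ops
from bigframes.core import agg_expressions

def count_rows(block, grouping_keys, columns):
    # size_op needs no input column, so a NullaryAggregation is enough.
    block, agg_ids = block.aggregate(
        by_column_ids=(*grouping_keys, *columns),
        aggregations=[agg_expressions.NullaryAggregation(agg_ops.size_op)],
        dropna=False,
    )
    return block, agg_ids[0]  # id of the new count column
```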