Skip to content

feat: add pandas.qcut #104

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Oct 26, 2023
Next Next commit
feat: add pandas.qcut
  • Loading branch information
TrevorBergeron committed Oct 12, 2023
commit a618415d6e9536edb185a539b37a85b8735f2fca
33 changes: 33 additions & 0 deletions bigframes/core/reshape/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import bigframes.core as core
import bigframes.core.utils as utils
import bigframes.dataframe
import bigframes.operations as ops
import bigframes.operations.aggregations as agg_ops
import bigframes.series

Expand Down Expand Up @@ -118,3 +119,35 @@ def cut(
f"Only labels=False is supported in BigQuery DataFrames so far. {constants.FEEDBACK_LINK}"
)
return x._apply_window_op(agg_ops.CutOp(bins), window_spec=core.WindowSpec())


def qcut(
    x: bigframes.series.Series,
    q: typing.Union[int, typing.Sequence[float]],
    *,
    labels: Optional[bool] = None,
    duplicates: typing.Literal["drop", "error"] = "error",
) -> bigframes.series.Series:
    """Bucket the values of ``x`` by quantile and return the bucket indices.

    Args:
        x: Series whose values are bucketed.
        q: Either a positive number of quantile buckets, or a sequence of
            quantile edges. It is forwarded unchanged to ``agg_ops.QcutOp``.
        labels: Must be ``False``; every other value (including the default
            ``None``) is rejected.
        duplicates: Must be ``"drop"``; every other value (including the
            default ``"error"``) is rejected.

    Returns:
        A new Series built from the computed column, carrying the label of
        the input value column.

    Raises:
        ValueError: If ``q`` is an integer that is zero or negative.
        NotImplementedError: If ``labels`` is not ``False`` or ``duplicates``
            is not ``"drop"``.
    """
    non_positive_bucket_count = isinstance(q, int) and q <= 0
    if non_positive_bucket_count:
        raise ValueError("`q` should be a positive integer.")

    if labels is not False:
        raise NotImplementedError(
            f"Only labels=False is supported in BigQuery DataFrames so far. {constants.FEEDBACK_LINK}"
        )
    if duplicates != "drop":
        raise NotImplementedError(
            f"Only duplicates='drop' is supported in BigQuery DataFrames so far. {constants.FEEDBACK_LINK}"
        )

    source_block = x._block
    # Remember the input column's label so the output column can reuse it.
    output_label = source_block.col_id_to_label[x._value_column]

    # Column produced by `notnull_op` on the input values; it serves both as
    # the window grouping key and as the condition of the final `where`.
    with_nullity, notnull_col = source_block.apply_unary_op(
        x._value_column, ops.notnull_op
    )

    # Grouping by the not-null column means the window op is evaluated
    # separately for the two groups that column defines.
    grouped_window = core.WindowSpec(grouping_keys=(notnull_col,))
    with_buckets, bucket_col = with_nullity.apply_window_op(
        x._value_column,
        agg_ops.QcutOp(q),
        window_spec=grouped_window,
    )

    # NOTE(review): presumably `where(bucket, notnull, None)` replaces the
    # bucket with null wherever the input was null -- confirm against
    # `ops.partial_arg3` / `ops.where_op`.
    null_masking_op = ops.partial_arg3(ops.where_op, None)
    final_block, final_col = with_buckets.apply_binary_op(
        bucket_col, notnull_col, null_masking_op, result_label=output_label
    )
    return bigframes.series.Series(final_block.select_column(final_col))
42 changes: 42 additions & 0 deletions bigframes/operations/aggregations.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,48 @@ def handles_ties(self):
return True


class QcutOp(WindowOp):
    """Window operation that maps each value to a quantile bucket index.

    The operation is configured with either an integer number of buckets or
    a sequence of quantile edges; the bucket is derived from each value's
    ``percent_rank()`` within the window.
    """

    def __init__(self, quantiles: typing.Union[int, typing.Sequence[float]]):
        """Store the bucket specification and derive the op name from it.

        Args:
            quantiles: Number of buckets (``int``) or a sequence of quantile
                edges to bucket between.
        """
        self.name = f"qcut-{quantiles}"
        self._quantiles = quantiles

    @numeric_op
    def _as_ibis(
        self, column: ibis_types.Column, window=None
    ) -> ibis_types.IntegerValue:
        """Build the ibis expression that yields the bucket index.

        Args:
            column: Input column to bucket.
            window: Optional window; passed to ``_apply_window_if_present``
                together with the percent-rank expression.

        Returns:
            For an integer spec: ``ceil(percent_rank * n)``, clipped to be at
            least 1, minus 1 (a zero-based bucket index).
            For a sequence spec: a CASE expression that yields null when the
            percent rank is below the first edge, otherwise the index of the
            first edge pair whose upper edge is ``>=`` the percent rank, and
            null when no edge matches.
        """

        def as_numeric(value) -> ibis_types.NumericValue:
            # Plain Python numbers mixed into ibis expressions are rejected
            # by the type checker (operators and `clip` are typed for ibis
            # values), so wrap them as explicit literals first.
            return typing.cast(ibis_types.NumericValue, ibis.literal(value))

        # Both bucket specifications start from the same percent-rank column.
        percent_ranks = typing.cast(
            ibis_types.FloatingColumn,
            _apply_window_if_present(column.percent_rank(), window),
        )

        if isinstance(self._quantiles, int):
            float_bucket = typing.cast(
                ibis_types.FloatingColumn,
                percent_ranks * as_numeric(self._quantiles),
            )
            one = as_numeric(1)
            # A percent rank of 0 would ceil to 0, hence the clip to 1 before
            # shifting to a zero-based index.
            return typing.cast(
                ibis_types.IntegerValue,
                float_bucket.ceil().clip(lower=one) - one,
            )

        out = ibis.case()
        # Ranks below the first edge fall outside every bucket.
        out = out.when(percent_ranks < as_numeric(self._quantiles[0]), None)
        # Bucket i is closed by edge i + 1; CASE picks the first match.
        for bucket_n, upper_edge in enumerate(self._quantiles[1:]):
            out = out.when(
                percent_ranks <= as_numeric(upper_edge),
                dtypes.literal_to_ibis_scalar(bucket_n, force_dtype=Int64Dtype()),
            )
        # Ranks above the last edge also get no bucket.
        out = out.else_(None)
        return out.end()

    @property
    def skips_nulls(self):
        """Always ``False`` for this op."""
        return False

    @property
    def handles_ties(self):
        """Always ``True`` for this op."""
        return True


class NuniqueOp(AggregateOp):
name = "nunique"

Expand Down
13 changes: 13 additions & 0 deletions bigframes/pandas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,19 @@ def cut(
cut.__doc__ = vendored_pandas_tile.cut.__doc__


def qcut(
    x: bigframes.series.Series,
    q: typing.Union[int, typing.Sequence[float]],
    *,
    labels: Optional[bool] = None,
    duplicates: typing.Literal["drop", "error"] = "error",
) -> bigframes.series.Series:
    # Thin public entry point: every argument is forwarded unchanged to the
    # core implementation. `q` is annotated to match that implementation,
    # which accepts either a bucket count or a sequence of quantile edges.
    # No docstring is written here because `__doc__` is assigned right below.
    return bigframes.core.reshape.qcut(x, q, labels=labels, duplicates=duplicates)


# Reuse the vendored pandas documentation for the public function.
qcut.__doc__ = vendored_pandas_tile.qcut.__doc__


def merge(
left: DataFrame,
right: DataFrame,
Expand Down
25 changes: 25 additions & 0 deletions tests/system/small/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,3 +223,28 @@ def test_cut(scalars_dfs):
bf_result = bf_result.to_pandas()
pd_result = pd_result.astype("Int64")
pd.testing.assert_series_equal(bf_result, pd_result)


@pytest.mark.parametrize(
    "q",
    [
        1,
        2,
        7,
        32,
        [0, 0.1, 0.3, 0.4, 0.9, 1.0],
        [0.5, 0.9],
    ],
)
def test_qcut(scalars_dfs, q):
    """Check that ``bpd.qcut`` matches ``pd.qcut`` on ``float64_col``.

    Covers integer bucket counts as well as explicit quantile edge lists.
    """
    bf_df, pandas_df = scalars_dfs
    column_name = "float64_col"

    # pandas result is cast to the nullable Int64 dtype before comparing.
    expected = pd.qcut(
        pandas_df[column_name], q, labels=False, duplicates="drop"
    ).astype("Int64")
    actual = bpd.qcut(
        bf_df[column_name], q, labels=False, duplicates="drop"
    ).to_pandas()

    pd.testing.assert_series_equal(actual, expected)
30 changes: 30 additions & 0 deletions third_party/bigframes_vendored/pandas/core/reshape/tile.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,3 +65,33 @@ def cut(
False : returns an ndarray of integers.
"""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)


def qcut(x, q, *, labels=None, duplicates="error"):
    """
    Quantile-based discretization function.

    Discretize variable into equal-sized buckets based on rank or based
    on sample quantiles. For example 1000 values for 10 quantiles would
    produce a Categorical object indicating quantile membership for each data point.

    Args:
        x (Series):
            The input Series to be binned. Must be 1-dimensional.
        q (int or list-like of float):
            Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately
            array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles.
        labels (None):
            Used as labels for the resulting bins. Must be of the same length as
            the resulting bins. If False, return only integer indicators of the
            bins. If True, raises an error.
        duplicates ({default 'error', 'drop'}, optional):
            If bin edges are not unique, raise an error ('error') or drop
            the non-unique edges ('drop').

    Returns:
        Series: Categorical or Series of integers if labels is False.
            The return type (Categorical or Series) depends on the input: a Series
            of type category if input is a Series else Categorical. Bins are
            represented as categories when categorical data is returned.
    """
    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)