Skip to content

fix: all sort operation are now stable #195

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Nov 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 2 additions & 6 deletions bigframes/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,12 +200,8 @@ def filter(self, predicate_id: str, keep_null: bool = False) -> ArrayValue:
)
)

def order_by(
self, by: Sequence[OrderingColumnReference], stable: bool = False
) -> ArrayValue:
return ArrayValue(
nodes.OrderByNode(child=self.node, by=tuple(by), stable=stable)
)
def order_by(self, by: Sequence[OrderingColumnReference]) -> ArrayValue:
return ArrayValue(nodes.OrderByNode(child=self.node, by=tuple(by)))

def reversed(self) -> ArrayValue:
return ArrayValue(nodes.ReversedNode(child=self.node))
Expand Down
4 changes: 2 additions & 2 deletions bigframes/core/block_transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -509,7 +509,7 @@ def nsmallest(
)
for col_id in column_ids
]
block = block.order_by(order_refs, stable=True)
block = block.order_by(order_refs)
if keep in ("first", "last"):
return block.slice(0, n)
else: # keep == "all":
Expand Down Expand Up @@ -541,7 +541,7 @@ def nlargest(
)
for col_id in column_ids
]
block = block.order_by(order_refs, stable=True)
block = block.order_by(order_refs)
if keep in ("first", "last"):
return block.slice(0, n)
else: # keep == "all":
Expand Down
4 changes: 1 addition & 3 deletions bigframes/core/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,10 +235,9 @@ def cols_matching_label(self, partial_label: Label) -> typing.Sequence[str]:
def order_by(
self,
by: typing.Sequence[ordering.OrderingColumnReference],
stable: bool = False,
) -> Block:
return Block(
self._expr.order_by(by, stable=stable),
self._expr.order_by(by),
index_columns=self.index_columns,
column_labels=self.column_labels,
index_labels=self.index.names,
Expand Down Expand Up @@ -1596,7 +1595,6 @@ def merge(
# sort uses coalesced join keys always
joined_expr = joined_expr.order_by(
[ordering.OrderingColumnReference(col_id) for col_id in coalesced_ids],
stable=True,
)

joined_expr = joined_expr.select_columns(result_columns)
Expand Down
6 changes: 2 additions & 4 deletions bigframes/core/compile/compiled.py
Original file line number Diff line number Diff line change
Expand Up @@ -753,11 +753,9 @@ def builder(self) -> OrderedIR.Builder:
predicates=self._predicates,
)

def order_by(
self, by: Sequence[OrderingColumnReference], stable: bool = False
) -> OrderedIR:
def order_by(self, by: Sequence[OrderingColumnReference]) -> OrderedIR:
expr_builder = self.builder()
expr_builder.ordering = self._ordering.with_ordering_columns(by, stable=stable)
expr_builder.ordering = self._ordering.with_ordering_columns(by)
return expr_builder.build()

def reversed(self) -> OrderedIR:
Expand Down
2 changes: 1 addition & 1 deletion bigframes/core/compile/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ def compile_filter(node: nodes.FilterNode, ordered: bool = True):
@_compile_node.register
def compile_orderby(node: nodes.OrderByNode, ordered: bool = True):
if ordered:
return compile_ordered(node.child).order_by(node.by, node.stable)
return compile_ordered(node.child).order_by(node.by)
else:
return compile_unordered(node.child)

Expand Down
4 changes: 0 additions & 4 deletions bigframes/core/groupby/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,6 @@ def rolling(self, window: int, min_periods=None) -> windows.Window:
)
block = self._block.order_by(
[order.OrderingColumnReference(col) for col in self._by_col_ids],
stable=True,
)
return windows.Window(
block, window_spec, self._selected_cols, drop_null_groups=self._dropna
Expand All @@ -231,7 +230,6 @@ def expanding(self, min_periods: int = 1) -> windows.Window:
)
block = self._block.order_by(
[order.OrderingColumnReference(col) for col in self._by_col_ids],
stable=True,
)
return windows.Window(
block, window_spec, self._selected_cols, drop_null_groups=self._dropna
Expand Down Expand Up @@ -552,7 +550,6 @@ def rolling(self, window: int, min_periods=None) -> windows.Window:
)
block = self._block.order_by(
[order.OrderingColumnReference(col) for col in self._by_col_ids],
stable=True,
)
return windows.Window(
block,
Expand All @@ -570,7 +567,6 @@ def expanding(self, min_periods: int = 1) -> windows.Window:
)
block = self._block.order_by(
[order.OrderingColumnReference(col) for col in self._by_col_ids],
stable=True,
)
return windows.Window(
block,
Expand Down
1 change: 0 additions & 1 deletion bigframes/core/nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,6 @@ class FilterNode(UnaryNode):
@dataclass(frozen=True)
class OrderByNode(UnaryNode):
by: Tuple[OrderingColumnReference, ...]
stable: bool = False


@dataclass(frozen=True)
Expand Down
47 changes: 22 additions & 25 deletions bigframes/core/ordering.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,6 @@
# Sufficient to store any value up to 2^63
DEFAULT_ORDERING_ID_LENGTH: int = math.ceil(63 * math.log(2, ORDERING_ID_STRING_BASE))

STABLE_SORTS = ["mergesort", "stable"]


class OrderingDirection(Enum):
ASC = 1
Expand Down Expand Up @@ -113,47 +111,46 @@ def with_non_sequential(self):
def with_ordering_columns(
self,
ordering_value_columns: Sequence[OrderingColumnReference] = (),
stable: bool = False,
) -> ExpressionOrdering:
"""Creates a new ordering that reorders by the given columns.

Args:
ordering_value_columns:
In decreasing precedence order, the values used to sort the ordering
stable:
If True, will use apply a stable sorting, using the old ordering where
the new ordering produces ties. Otherwise, ties will be resolved in
a performance maximizing way,

Returns:
Modified ExpressionOrdering
"""
col_ids_new = [
ordering_ref.column_id for ordering_ref in ordering_value_columns
]
if stable:
# Only reference each column once, so discard old referenc if there is a new reference
old_ordering_keep = [
ordering_ref
for ordering_ref in self.ordering_value_columns
if ordering_ref.column_id not in col_ids_new
]
else:
# New ordering needs to keep all total ordering columns no matter what.
# All other old ordering references can be discarded as does not need
# to be a stable sort.
old_ordering_keep = [
ordering_ref
for ordering_ref in self.ordering_value_columns
if (ordering_ref.column_id not in col_ids_new)
and (ordering_ref.column_id in self.total_ordering_columns)
]
new_ordering = (*ordering_value_columns, *old_ordering_keep)
old_ordering_keep = [
ordering_ref
for ordering_ref in self.ordering_value_columns
if ordering_ref.column_id not in col_ids_new
]

# Truncate to remove any unneded col references after all total order cols included
new_ordering = self._truncate_ordering(
(*ordering_value_columns, *old_ordering_keep)
)
return ExpressionOrdering(
new_ordering,
total_ordering_columns=self.total_ordering_columns,
)

def _truncate_ordering(
self, order_refs: tuple[OrderingColumnReference, ...]
) -> tuple[OrderingColumnReference, ...]:
total_order_cols_remaining = set(self.total_ordering_columns)
for i in range(len(order_refs)):
column = order_refs[i].column_id
if column in total_order_cols_remaining:
total_order_cols_remaining.remove(column)
if len(total_order_cols_remaining) == 0:
return order_refs[: i + 1]
raise ValueError("Ordering did not contain all total_order_cols")

def with_reverse(self):
"""Reverses the ordering."""
return ExpressionOrdering(
Expand Down
4 changes: 1 addition & 3 deletions bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1262,9 +1262,7 @@ def sort_values(
column_id, direction=direction, na_last=na_last
)
)
return DataFrame(
self._block.order_by(ordering, stable=kind in order.STABLE_SORTS)
)
return DataFrame(self._block.order_by(ordering))

def value_counts(
self,
Expand Down
7 changes: 1 addition & 6 deletions bigframes/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,7 @@
import bigframes.core.groupby as groupby
import bigframes.core.indexers
import bigframes.core.indexes as indexes
from bigframes.core.ordering import (
OrderingColumnReference,
OrderingDirection,
STABLE_SORTS,
)
from bigframes.core.ordering import OrderingColumnReference, OrderingDirection
import bigframes.core.scalar as scalars
import bigframes.core.utils as utils
import bigframes.core.window
Expand Down Expand Up @@ -1067,7 +1063,6 @@ def sort_values(
na_last=(na_position == "last"),
)
],
stable=kind in STABLE_SORTS,
)
return Series(block)

Expand Down