Skip to content

feat: add DataFrame.select_dtypes method #242

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Dec 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -434,6 +434,19 @@ def info(
# TODO: Convert to different units (kb, mb, etc.)
obuf.write(f"memory usage: {self.memory_usage().sum()} bytes\n")

def select_dtypes(self, include=None, exclude=None) -> DataFrame:
    """Return the columns whose dtypes match ``include`` / ``exclude``.

    The dtype-matching rules are not re-implemented here. A zero-row pandas
    DataFrame carrying the same column ids and dtypes as this frame's block
    is built, and ``pandas.DataFrame.select_dtypes`` decides which columns
    survive.

    Args:
        include: Forwarded unchanged to ``pandas.DataFrame.select_dtypes``.
        exclude: Forwarded unchanged to ``pandas.DataFrame.select_dtypes``.

    Returns:
        DataFrame: A new DataFrame wrapping the block restricted to the
        columns pandas selected.
    """
    block = self._block

    # Schema-only stand-in: one empty Series per value column, typed like
    # the real column, so pandas can apply its own selection logic.
    schema_only = pandas.DataFrame(
        {
            column_id: pandas.Series([], dtype=column_dtype)
            for column_id, column_dtype in zip(block.value_columns, block.dtypes)
        }
    )

    matching = schema_only.select_dtypes(include=include, exclude=exclude)
    kept_column_ids = tuple(matching.columns)
    return DataFrame(block.select_columns(kept_column_ids))

def _set_internal_query_job(self, query_job: bigquery.QueryJob):
self._query_job = query_job

Expand Down
20 changes: 20 additions & 0 deletions tests/system/small/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,26 @@ def test_df_info(scalars_dfs):
assert expected == bf_result.getvalue()


@pytest.mark.parametrize(
    ("include", "exclude"),
    [
        ("Int64", None),
        (["int"], None),
        ("number", None),
        ([pd.Int64Dtype(), pd.BooleanDtype()], None),
        (None, [pd.Int64Dtype(), pd.BooleanDtype()]),
        ("Int64", ["boolean"]),
    ],
)
def test_select_dtypes(scalars_dfs, include, exclude):
    """select_dtypes must return the same frame as pandas for each filter."""
    df_under_test, reference_df = scalars_dfs
    dtype_filter = {"include": include, "exclude": exclude}

    # pandas is evaluated first and serves as the expected value.
    expected = reference_df.select_dtypes(**dtype_filter)
    actual = df_under_test.select_dtypes(**dtype_filter).to_pandas()

    pd.testing.assert_frame_equal(expected, actual)


def test_drop_index(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs

Expand Down
36 changes: 36 additions & 0 deletions third_party/bigframes_vendored/pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,42 @@ def memory_usage(self, index: bool = True):
"""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

def select_dtypes(self, include=None, exclude=None) -> DataFrame:
    """
    Return a subset of the DataFrame's columns based on the column dtypes.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> bpd.options.display.progress_bar = None

        >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': ["hello", "world"], 'col3': [True, False]})
        >>> df.select_dtypes(include=['Int64'])
           col1
        0     1
        1     2
        <BLANKLINE>
        [2 rows x 1 columns]

        >>> df.select_dtypes(exclude=['Int64'])
            col2   col3
        0  hello   True
        1  world  False
        <BLANKLINE>
        [2 rows x 2 columns]

    Args:
        include (scalar or list-like, default None):
            A selection of dtypes or strings to be included.
        exclude (scalar or list-like, default None):
            A selection of dtypes or strings to be excluded.

    Returns:
        DataFrame: The subset of the frame including the dtypes in ``include`` and excluding the dtypes in ``exclude``.
    """
    # Placeholder body: this definition only carries the signature and the
    # docstring. NOTE(review): the constant's name suggests a concrete
    # DataFrame class is expected to override this method -- confirm.
    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

# ----------------------------------------------------------------------
# IO methods (to / from other formats)
def to_numpy(self, dtype=None, copy=False, na_value=None, **kwargs) -> np.ndarray:
Expand Down