googleapis · gcf-merge-on-green · Dec 1, 2023 · Nov 29, 2023 · Nov 30, 2023 · Dec 1, 2023
@@ -434,6 +434,19 @@ def info(
             # TODO: Convert to different units (kb, mb, etc.)
             obuf.write(f"memory usage: {self.memory_usage().sum()} bytes\n")
 
+    def select_dtypes(self, include=None, exclude=None) -> DataFrame:
+        # Delegate dtype matching to pandas: build a zero-row frame that mirrors
+        # this frame's column ids and dtypes, then keep whichever columns it picks.
+        block = self._block
+        schema_only = pandas.DataFrame(
+            {
+                column_id: pandas.Series([], dtype=column_dtype)
+                for column_id, column_dtype in zip(block.value_columns, block.dtypes)
+            }
+        )
+        matching = schema_only.select_dtypes(include=include, exclude=exclude)
+        return DataFrame(block.select_columns(tuple(matching.columns)))
+
     def _set_internal_query_job(self, query_job: bigquery.QueryJob):
         self._query_job = query_job
 

@@ -297,6 +297,26 @@ def test_df_info(scalars_dfs):
     assert expected == bf_result.getvalue()
 
 
+@pytest.mark.parametrize(
+    ("include", "exclude"),
+    [
+        ("Int64", None),
+        (["int"], None),
+        ("number", None),
+        ([pd.Int64Dtype(), pd.BooleanDtype()], None),
+        (None, [pd.Int64Dtype(), pd.BooleanDtype()]),
+        ("Int64", ["boolean"]),
+    ],
+)
+def test_select_dtypes(scalars_dfs, include, exclude):
+    # The frame obtained through to_pandas() must equal pandas' own selection.
+    bf_df, pd_df = scalars_dfs
+    selection = {"include": include, "exclude": exclude}
+
+    expected = pd_df.select_dtypes(**selection)
+    actual = bf_df.select_dtypes(**selection).to_pandas()
+    pd.testing.assert_frame_equal(expected, actual)
+
+
 def test_drop_index(scalars_dfs):
     scalars_df, scalars_pandas_df = scalars_dfs
 

@@ -158,6 +158,42 @@ def memory_usage(self, index: bool = True):
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
 
+    def select_dtypes(self, include=None, exclude=None) -> DataFrame:
+        """
+        Return a subset of the DataFrame's columns based on the column dtypes.
+
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+            >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': ["hello", "world"], 'col3': [True, False]})
+            >>> df.select_dtypes(include=['Int64'])
+               col1
+            0     1
+            1     2
+            <BLANKLINE>
+            [2 rows x 1 columns]
+
+            >>> df.select_dtypes(exclude=['Int64'])
+                col2   col3
+            0  hello   True
+            1  world  False
+            <BLANKLINE>
+            [2 rows x 2 columns]
+
+        Args:
+            include (scalar or list-like, default None):
+                A selection of dtypes or strings to be included.
+            exclude (scalar or list-like, default None):
+                A selection of dtypes or strings to be excluded.
+
+        Returns:
+            DataFrame: The subset of the frame including the dtypes in
+                ``include`` and excluding the dtypes in ``exclude``.
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
     # ----------------------------------------------------------------------
     # IO methods (to / from other formats)
     def to_numpy(self, dtype=None, copy=False, na_value=None, **kwargs) -> np.ndarray: