Add support for Bodo DataFrame #2167

Open · wants to merge 10 commits into base: main

46 changes: 46 additions & 0 deletions mkdocs/docs/api.md
@@ -1523,6 +1523,52 @@ print(ray_dataset.take(2))
]
```

### Bodo

PyIceberg interfaces closely with Bodo DataFrames (see [Bodo Iceberg Quick Start](https://docs.bodo.ai/latest/quick_start/quickstart_local_iceberg/)),
which provide a drop-in replacement for Pandas that applies query, compiler, and HPC optimizations automatically.
Bodo accelerates and scales Python code from a single laptop to large clusters without code rewrites.

<!-- prettier-ignore-start -->

!!! note "Requirements"
This requires [`bodo` to be installed](index.md).

```python
pip install pyiceberg['bodo']
```
<!-- prettier-ignore-end -->

A table can be easily read into a Bodo DataFrame to perform Pandas operations:

```python
df = table.to_bodo() # equivalent to `bodo.pandas.read_iceberg_table(table)`
df = df[df["trip_distance"] >= 10.0]
df = df[["VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime"]]
print(df)
```

This creates a lazy query, optimizes it, and runs it on all available cores (print triggers execution):

```python
VendorID tpep_pickup_datetime tpep_dropoff_datetime
0 2 2023-01-01 00:27:12 2023-01-01 00:49:56
1 2 2023-01-01 00:09:29 2023-01-01 00:29:23
2 1 2023-01-01 00:13:30 2023-01-01 00:44:00
3 2 2023-01-01 00:41:41 2023-01-01 01:19:32
4 2 2023-01-01 00:22:39 2023-01-01 01:30:45
... ... ... ...
245478 2 2023-01-31 22:32:57 2023-01-31 23:01:48
245479 2 2023-01-31 22:03:26 2023-01-31 22:46:13
245480 2 2023-01-31 23:25:56 2023-02-01 00:05:42
245481 2 2023-01-31 23:18:00 2023-01-31 23:46:00
245482 2 2023-01-31 23:18:00 2023-01-31 23:41:00

[245483 rows x 3 columns]
```

Bodo is optimized to take advantage of Iceberg features such as hidden partitioning and various statistics for efficient reads.
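
As a further sketch (reusing the taxi table from the example above, so the column names are only illustrative), standard Pandas operations such as group-bys stay lazy and compose with the scan, while filters can be pushed into the Iceberg read so that files ruled out by partition values or column statistics are skipped:

```python
df = table.to_bodo()
# This filter can be pushed into the Iceberg scan, skipping data files whose
# partition values or column statistics show they cannot contain matching rows.
df = df[df["trip_distance"] >= 10.0]

# Further Pandas-style operations remain lazy until a result is materialized.
per_vendor = df.groupby("VendorID")["trip_distance"].mean()
print(per_vendor)  # triggers execution of the whole pipeline
```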

### Daft

PyIceberg interfaces closely with Daft Dataframes (see also: [Daft integration with Iceberg](https://docs.daft.ai/en/stable/io/iceberg/)) which provides a full lazily optimized query engine interface on top of PyIceberg tables.
1 change: 1 addition & 0 deletions mkdocs/docs/index.md
@@ -52,6 +52,7 @@ You can mix and match optional dependencies depending on your needs:
| pandas | Installs both PyArrow and Pandas |
| duckdb | Installs both PyArrow and DuckDB |
| ray | Installs PyArrow, Pandas, and Ray |
| bodo | Installs Bodo |
| daft | Installs Daft |
| polars | Installs Polars |
| s3fs | S3FS as a FileIO implementation to interact with the object store |
1,174 changes: 694 additions & 480 deletions poetry.lock

Large diffs are not rendered by default.

11 changes: 11 additions & 0 deletions pyiceberg/table/__init__.py
@@ -137,6 +137,7 @@
from pyiceberg.utils.properties import property_as_bool

if TYPE_CHECKING:
    import bodo.pandas as bd
    import daft
    import pandas as pd
    import polars as pl
@@ -1485,6 +1486,16 @@ def to_daft(self) -> daft.DataFrame:

        return daft.read_iceberg(self)

    def to_bodo(self) -> bd.DataFrame:
        """Read a bodo DataFrame lazily from this Iceberg table.

        Returns:
            bd.DataFrame: Unmaterialized Bodo Dataframe created from the Iceberg table
        """
        import bodo.pandas as bd

        return bd.read_iceberg_table(self)

    def to_polars(self) -> pl.LazyFrame:
        """Lazily read from this Apache Iceberg table.

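For context, a minimal sketch of how the `to_bodo` method added above is typically reached from a catalog (the catalog and table names here are hypothetical):

```python
from pyiceberg.catalog import load_catalog

catalog = load_catalog("default")                   # hypothetical catalog name
table = catalog.load_table("default.taxi_dataset")  # hypothetical table identifier

df = table.to_bodo()  # lazy Bodo DataFrame; no data is read yet
print(df.head())      # printing (or any materializing operation) triggers execution
```
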
6 changes: 6 additions & 0 deletions pyproject.toml
@@ -78,6 +78,7 @@ gcsfs = { version = ">=2023.1.0", optional = true }
huggingface-hub = { version = ">=0.24.0", optional = true }
psycopg2-binary = { version = ">=2.9.6", optional = true }
sqlalchemy = { version = "^2.0.18", optional = true }
bodo = { version = ">=2025.7.4", optional = true }
daft = { version = ">=0.5.0", optional = true }
cachetools = ">=5.5,<7.0"
pyiceberg-core = { version = "^0.5.1", optional = true }
@@ -298,6 +299,7 @@ pyarrow = ["pyarrow", "pyiceberg-core"]
pandas = ["pandas", "pyarrow"]
duckdb = ["duckdb", "pyarrow"]
ray = ["ray", "pyarrow", "pandas"]
bodo = ["bodo"]
daft = ["daft"]
polars = ["polars"]
snappy = ["python-snappy"]
@@ -483,6 +485,10 @@ ignore_missing_imports = true
module = "daft.*"
ignore_missing_imports = true

[[tool.mypy.overrides]]
module = "bodo.*"
ignore_missing_imports = true

[[tool.mypy.overrides]]
module = "pyparsing.*"
ignore_missing_imports = true
10 changes: 10 additions & 0 deletions tests/integration/test_reads.py
@@ -339,6 +339,16 @@ def test_daft_nan_rewritten(catalog: Catalog) -> None:
    assert math.isnan(df.to_pydict()["col_numeric"][0])


@pytest.mark.integration
@pytest.mark.filterwarnings("ignore")
@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")])
def test_bodo_nan(catalog: Catalog) -> None:
    table_test_null_nan_rewritten = catalog.load_table("default.test_null_nan_rewritten")
    df = table_test_null_nan_rewritten.to_bodo()
    assert len(df) == 3
    assert math.isnan(df.col_numeric.iloc[0])


@pytest.mark.integration
@pytest.mark.filterwarnings("ignore")
@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")])
13 changes: 9 additions & 4 deletions tests/integration/test_writes/test_partitioned_writes.py
@@ -451,6 +451,11 @@ def test_dynamic_partition_overwrite_unpartitioned_evolve_to_identity_transform(

@pytest.mark.integration
def test_summaries_with_null(spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table) -> None:
import pyarrow
from packaging import version

under_20_arrow = version.parse(pyarrow.__version__) < version.parse("20.0.0")

Comment on lines +457 to +458

Contributor: we should find another way to make these tests pass instead of branching on pyarrow version

Author (@ehsantn, Jul 8, 2025): Any ideas? Maybe use a range of "safe" values instead of a single file size value? I'd be happy to open another PR if there is more work for this.

Bodo is currently pinned to Arrow 19 since the current release version of PyIceberg supports up to Arrow 19. Bodo uses Arrow C++, which currently requires pinning to a single version for pip wheels to work (conda-forge builds against the 4 latest Arrow versions in this case but pip doesn't support this yet). It'd be great if PyIceberg wouldn't set an upper version for Arrow if possible.
identifier = "default.arrow_table_summaries"

try:
@@ -547,14 +552,14 @@ def test_summaries_with_null(spark: SparkSession, session_catalog: Catalog, arro
"total-records": "6",
}
assert summaries[5] == {
"removed-files-size": "16174",
"removed-files-size": "15774" if under_20_arrow else "16174",
"changed-partition-count": "2",
"total-equality-deletes": "0",
"deleted-data-files": "4",
"total-position-deletes": "0",
"total-delete-files": "0",
"deleted-records": "4",
"total-files-size": "8884",
"total-files-size": "8684" if under_20_arrow else "8884",
"total-data-files": "2",
"total-records": "2",
}
@@ -564,9 +569,9 @@ def test_summaries_with_null(spark: SparkSession, session_catalog: Catalog, arro
"total-equality-deletes": "0",
"added-records": "2",
"total-position-deletes": "0",
"added-files-size": "8087",
"added-files-size": "7887" if under_20_arrow else "8087",
"total-delete-files": "0",
"total-files-size": "16971",
"total-files-size": "16571" if under_20_arrow else "16971",
"total-data-files": "4",
"total-records": "4",
}
15 changes: 10 additions & 5 deletions tests/integration/test_writes/test_writes.py
@@ -271,6 +271,11 @@ def test_summaries(spark: SparkSession, session_catalog: Catalog, arrow_table_wi

@pytest.mark.integration
def test_summaries_partial_overwrite(spark: SparkSession, session_catalog: Catalog) -> None:
import pyarrow
from packaging import version

under_20_arrow = version.parse(pyarrow.__version__) < version.parse("20.0.0")

identifier = "default.test_summaries_partial_overwrite"
TEST_DATA = {
"id": [1, 2, 3, 1, 1],
@@ -311,13 +316,13 @@ def test_summaries_partial_overwrite(spark: SparkSession, session_catalog: Catal
# APPEND
assert summaries[0] == {
"added-data-files": "3",
"added-files-size": "2618",
"added-files-size": "2570" if under_20_arrow else "2618",
"added-records": "5",
"changed-partition-count": "3",
"total-data-files": "3",
"total-delete-files": "0",
"total-equality-deletes": "0",
"total-files-size": "2618",
"total-files-size": "2570" if under_20_arrow else "2618",
"total-position-deletes": "0",
"total-records": "5",
}
@@ -346,16 +351,16 @@ def test_summaries_partial_overwrite(spark: SparkSession, session_catalog: Catal
assert len(files) == 3
assert summaries[1] == {
"added-data-files": "1",
"added-files-size": "875",
"added-files-size": "859" if under_20_arrow else "875",
"added-records": "2",
"changed-partition-count": "1",
"deleted-data-files": "1",
"deleted-records": "3",
"removed-files-size": "882",
"removed-files-size": "866" if under_20_arrow else "882",
"total-data-files": "3",
"total-delete-files": "0",
"total-equality-deletes": "0",
"total-files-size": "2611",
"total-files-size": "2563" if under_20_arrow else "2611",
"total-position-deletes": "0",
"total-records": "4",
}