refactor: move LazyTableProvider into python crate #3509

Merged · 1 commit · Jun 1, 2025

crates/core/src/delta_datafusion/mod.rs (108 changes: 2 additions & 106 deletions)
@@ -67,12 +67,10 @@ use datafusion_expr::{
     col, BinaryExpr, Expr, Extension, LogicalPlan, Operator, TableProviderFilterPushDown,
     Volatility,
 };
-use datafusion_physical_expr::{create_physical_expr, PhysicalExpr};
+use datafusion_physical_expr::PhysicalExpr;
 use datafusion_physical_plan::filter::FilterExec;
-use datafusion_physical_plan::limit::{GlobalLimitExec, LocalLimitExec};
-use datafusion_physical_plan::memory::{LazyBatchGenerator, LazyMemoryExec};
+use datafusion_physical_plan::limit::LocalLimitExec;
 use datafusion_physical_plan::metrics::{ExecutionPlanMetricsSet, MetricBuilder, MetricsSet};
-use datafusion_physical_plan::projection::ProjectionExec;
 use datafusion_physical_plan::{
     DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties, SendableRecordBatchStream,
     Statistics,
@@ -85,7 +83,6 @@ use either::Either;
 use futures::TryStreamExt;
 use itertools::Itertools;
 use object_store::ObjectMeta;
-use parking_lot::RwLock;
 use serde::{Deserialize, Serialize};
 
 use url::Url;
@@ -1036,107 +1033,6 @@ impl TableProvider for DeltaTableProvider {
     }
 }
 
-#[derive(Debug)]
-pub struct LazyTableProvider {
-    schema: Arc<ArrowSchema>,
-    batches: Vec<Arc<RwLock<dyn LazyBatchGenerator>>>,
-}
-
-impl LazyTableProvider {
-    /// Build a DeltaTableProvider
-    pub fn try_new(
-        schema: Arc<ArrowSchema>,
-        batches: Vec<Arc<RwLock<dyn LazyBatchGenerator>>>,
-    ) -> DeltaResult<Self> {
-        Ok(LazyTableProvider { schema, batches })
-    }
-}
-
-#[async_trait]
-impl TableProvider for LazyTableProvider {
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn schema(&self) -> Arc<ArrowSchema> {
-        self.schema.clone()
-    }
-
-    fn table_type(&self) -> TableType {
-        TableType::Base
-    }
-
-    fn get_table_definition(&self) -> Option<&str> {
-        None
-    }
-
-    fn get_logical_plan(&self) -> Option<Cow<'_, LogicalPlan>> {
-        None
-    }
-
-    async fn scan(
-        &self,
-        _session: &dyn Session,
-        projection: Option<&Vec<usize>>,
-        filters: &[Expr],
-        limit: Option<usize>,
-    ) -> DataFusionResult<Arc<dyn ExecutionPlan>> {
-        let mut plan: Arc<dyn ExecutionPlan> = Arc::new(LazyMemoryExec::try_new(
-            self.schema(),
-            self.batches.clone(),
-        )?);
-
-        let df_schema: DFSchema = plan.schema().try_into()?;
-
-        if let Some(filter_expr) = conjunction(filters.iter().cloned()) {
-            let physical_expr =
-                create_physical_expr(&filter_expr, &df_schema, &ExecutionProps::new())?;
-            plan = Arc::new(FilterExec::try_new(physical_expr, plan)?);
-        }
-
-        if let Some(projection) = projection {
-            let current_projection = (0..plan.schema().fields().len()).collect::<Vec<usize>>();
-            if projection != &current_projection {
-                let execution_props = &ExecutionProps::new();
-                let fields: DeltaResult<Vec<(Arc<dyn PhysicalExpr>, String)>> = projection
-                    .iter()
-                    .map(|i| {
-                        let (table_ref, field) = df_schema.qualified_field(*i);
-                        create_physical_expr(
-                            &Expr::Column(Column::from((table_ref, field))),
-                            &df_schema,
-                            execution_props,
-                        )
-                        .map(|expr| (expr, field.name().clone()))
-                        .map_err(DeltaTableError::from)
-                    })
-                    .collect();
-                plan = Arc::new(ProjectionExec::try_new(fields?, plan)?);
-            }
-        }
-
-        if let Some(limit) = limit {
-            plan = Arc::new(GlobalLimitExec::new(plan, 0, Some(limit)))
-        };
-
-        Ok(plan)
-    }
-
-    fn supports_filters_pushdown(
-        &self,
-        filter: &[&Expr],
-    ) -> DataFusionResult<Vec<TableProviderFilterPushDown>> {
-        Ok(filter
-            .iter()
-            .map(|_| TableProviderFilterPushDown::Inexact)
-            .collect())
-    }
-
-    fn statistics(&self) -> Option<Statistics> {
-        None
-    }
-}
-
 // TODO: this will likely also need to perform column mapping later when we support reader protocol v2
 /// A wrapper for parquet scans
 #[derive(Debug)]
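
The block removed above is the whole of LazyTableProvider: a TableProvider that serves Arrow batches from LazyBatchGenerators instead of files, and whose scan builds a LazyMemoryExec and then layers FilterExec, ProjectionExec, and GlobalLimitExec on top as needed. To make the moved code concrete, here is a minimal sketch of the generator side, using only the DataFusion APIs the removed scan itself calls; CountingGenerator and the tokio wiring are illustrative assumptions, not code from this PR.

```rust
use std::fmt;
use std::sync::Arc;

use arrow_array::{Int64Array, RecordBatch};
use arrow_schema::{DataType, Field, Schema};
use datafusion_common::Result;
use datafusion_execution::TaskContext;
use datafusion_physical_plan::memory::{LazyBatchGenerator, LazyMemoryExec};
use datafusion_physical_plan::{common, ExecutionPlan};
use parking_lot::RwLock;

/// Hypothetical generator: lazily emits `remaining` one-row batches.
#[derive(Debug)]
struct CountingGenerator {
    schema: Arc<Schema>,
    remaining: i64,
}

// `LazyBatchGenerator` requires `Display`; it is used when the plan is explained.
impl fmt::Display for CountingGenerator {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "CountingGenerator(remaining={})", self.remaining)
    }
}

impl LazyBatchGenerator for CountingGenerator {
    // Called each time the stream is polled; `Ok(None)` ends the partition.
    fn generate_next_batch(&mut self) -> Result<Option<RecordBatch>> {
        if self.remaining == 0 {
            return Ok(None);
        }
        self.remaining -= 1;
        let batch = RecordBatch::try_new(
            self.schema.clone(),
            vec![Arc::new(Int64Array::from(vec![self.remaining]))],
        )?;
        Ok(Some(batch))
    }
}

#[tokio::main]
async fn main() -> Result<()> {
    let schema = Arc::new(Schema::new(vec![Field::new("n", DataType::Int64, false)]));
    let generator: Arc<RwLock<dyn LazyBatchGenerator>> =
        Arc::new(RwLock::new(CountingGenerator {
            schema: schema.clone(),
            remaining: 3,
        }));

    // LazyTableProvider::scan starts from exactly this plan (one partition
    // per generator) before layering filter/projection/limit on top.
    let exec = LazyMemoryExec::try_new(schema, vec![generator])?;
    let stream = exec.execute(0, Arc::new(TaskContext::default()))?;
    let batches = common::collect(stream).await?;
    assert_eq!(batches.len(), 3);
    Ok(())
}
```

Because supports_filters_pushdown answers Inexact for every predicate, DataFusion keeps its own filter above the scan, so generators may ignore the pushed-down expressions without affecting correctness.
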
python/Cargo.toml (34 changes: 29 additions & 5 deletions)
@@ -1,7 +1,10 @@
 [package]
 name = "deltalake-python"
 version = "1.0.2"
-authors = ["Qingping Hou <[email protected]>", "Will Jones <[email protected]>"]
+authors = [
+    "Qingping Hou <[email protected]>",
+    "Will Jones <[email protected]>",
+]
 homepage = "https://github.com/delta-io/delta-rs"
 license = "Apache-2.0"
 description = "Native Delta Lake Python binding based on delta-rs with Pandas integration"
@@ -17,19 +20,24 @@ doc = false
 [dependencies]
 delta_kernel.workspace = true
 
-pyo3-arrow = { version = "0.9.0", default-features = false}
+pyo3-arrow = { version = "0.9.0", default-features = false }
 
 # arrow
 arrow-schema = { workspace = true, features = ["serde"] }
 
 # datafusion
+# datafusion-catalog = { workspace = true }
+datafusion-expr = { workspace = true }
 datafusion-ffi = { workspace = true }
+datafusion-physical-expr = { workspace = true }
+datafusion-physical-plan = { workspace = true }
 
 # serde
 serde = { workspace = true }
 serde_json = { workspace = true }
 
 # "stdlib"
+async-trait = { workspace = true }
 chrono = { workspace = true }
 env_logger = "0"
 regex = { workspace = true }
@@ -46,15 +54,23 @@ tokio = { workspace = true, features = ["rt-multi-thread"] }
 deltalake-mount = { path = "../crates/mount" }
 
 # catalog-unity
-deltalake-catalog-unity = { path = "../crates/catalog-unity", features = ["aws", "azure", "gcp", "r2"] }
+deltalake-catalog-unity = { path = "../crates/catalog-unity", features = [
+    "aws",
+    "azure",
+    "gcp",
+    "r2",
+] }
 
 # Non-unix or emscripten os
 [target.'cfg(any(not(target_family = "unix"), target_os = "emscripten"))'.dependencies]
 mimalloc = { version = "0.1", default-features = false }
 
 # Unix (excluding macOS & emscripten) → jemalloc
 [target.'cfg(all(target_family = "unix", not(target_os = "macos"), not(target_os = "emscripten")))'.dependencies]
-jemallocator = { version = "0.5", features = ["disable_initial_exec_tls", "background_threads"] }
+jemallocator = { version = "0.5", features = [
+    "disable_initial_exec_tls",
+    "background_threads",
+] }
 
 # macOS → jemalloc (without background_threads) (https://github.com/jemalloc/jemalloc/issues/843)
 [target.'cfg(all(target_family = "unix", target_os = "macos"))'.dependencies]
@@ -67,7 +83,15 @@ features = ["extension-module", "abi3", "abi3-py39"]
 [dependencies.deltalake]
 path = "../crates/deltalake"
 version = "0"
-features = ["azure", "gcs", "python", "datafusion", "unity-experimental", "hdfs", "lakefs"]
+features = [
+    "azure",
+    "gcs",
+    "python",
+    "datafusion",
+    "unity-experimental",
+    "hdfs",
+    "lakefs",
+]
 
 [features]
 default = ["rustls"]
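
The new datafusion-expr, datafusion-physical-expr, and datafusion-physical-plan dependencies (plus async-trait for the #[async_trait] TableProvider impl) exist so the relocated provider can keep assembling its plan inside the python crate. Below is a condensed sketch of that assembly under stated assumptions: wrap_plan is a hypothetical helper, EmptyExec stands in for the LazyMemoryExec the real provider builds, and the projection step is omitted for brevity.

```rust
use std::sync::Arc;

use arrow_schema::{DataType, Field, Schema};
use datafusion_common::{DFSchema, Result};
use datafusion_expr::execution_props::ExecutionProps;
use datafusion_expr::{col, lit, Expr};
use datafusion_physical_expr::create_physical_expr;
use datafusion_physical_plan::empty::EmptyExec;
use datafusion_physical_plan::filter::FilterExec;
use datafusion_physical_plan::limit::GlobalLimitExec;
use datafusion_physical_plan::{displayable, ExecutionPlan};

/// Illustrative helper mirroring the filter/limit wrapping in `scan`.
fn wrap_plan(
    base: Arc<dyn ExecutionPlan>,
    predicate: Expr,
    limit: Option<usize>,
) -> Result<Arc<dyn ExecutionPlan>> {
    // Translate the logical predicate into a physical expression against
    // the plan's schema, as the provider does for pushed-down filters.
    let df_schema: DFSchema = base.schema().try_into()?;
    let physical = create_physical_expr(&predicate, &df_schema, &ExecutionProps::new())?;
    let mut plan: Arc<dyn ExecutionPlan> = Arc::new(FilterExec::try_new(physical, base)?);
    if let Some(limit) = limit {
        // GlobalLimitExec(skip = 0, fetch = limit), as in the removed code.
        plan = Arc::new(GlobalLimitExec::new(plan, 0, Some(limit)));
    }
    Ok(plan)
}

fn main() -> Result<()> {
    let schema = Arc::new(Schema::new(vec![Field::new("n", DataType::Int64, false)]));
    // EmptyExec is a stand-in base plan so the sketch runs without a table.
    let base: Arc<dyn ExecutionPlan> = Arc::new(EmptyExec::new(schema));
    let plan = wrap_plan(base, col("n").gt(lit(0i64)), Some(10))?;
    println!("{}", displayable(plan.as_ref()).indent(true));
    Ok(())
}
```

The real scan additionally inserts a ProjectionExec between the filter and the limit whenever the requested column indices differ from the schema's natural order.
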