Skip to content

Commit 1154bdb

Browse files
authored
feat: support limit push down in datafusion (#177)
1 parent aa2cb75 commit 1154bdb

7 files changed

Lines changed: 317 additions & 12 deletions

File tree

crates/integration_tests/tests/read_tables.rs

Lines changed: 52 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -947,3 +947,55 @@ async fn test_read_data_evolution_table_with_projection() {
947947
"Projected data evolution read should return correct values"
948948
);
949949
}
950+
951+
// ---------------------------------------------------------------------------
952+
// Limit pushdown integration tests
953+
// ---------------------------------------------------------------------------
954+
955+
/// Helper function to scan and read with limit pushdown.
956+
async fn plan_table(table: &paimon::Table, limit: Option<usize>) -> Plan {
957+
let mut read_builder = table.new_read_builder();
958+
if let Some(limit) = limit {
959+
read_builder.with_limit(limit);
960+
}
961+
let scan = read_builder.new_scan();
962+
scan.plan().await.expect("Failed to plan scan")
963+
}
964+
965+
/// Test limit pushdown: when limit is smaller than total rows, fewer data files may be generated.
966+
#[tokio::test]
967+
async fn test_limit_pushdown() {
968+
let catalog = create_file_system_catalog();
969+
970+
// Test limit pushdown for data evolution table
971+
let table = get_table_from_catalog(&catalog, "data_evolution_table").await;
972+
973+
// Get full plan without limit
974+
let full_plan = plan_table(&table, None).await;
975+
let full_data_split_count: usize = full_plan.splits().iter().count();
976+
977+
// Get the plan with limit = 2
978+
let limited_plan = plan_table(&table, Some(2)).await;
979+
let limited_data_split_count: usize = limited_plan.splits().iter().count();
980+
981+
// For data evolution tables, limit pushdown at split level uses merged_row_count
982+
// The limited data split count should be < full data split count
983+
assert!(
984+
limited_data_split_count < full_data_split_count,
985+
"Limit pushdown should reduce data split count for data evolution table: limited={limited_data_split_count}, full={full_data_split_count}"
986+
);
987+
988+
// Verify data evolution splits have merged_row_count
989+
for split in full_plan.splits() {
990+
let merged_count = split.merged_row_count().expect(
991+
"Data evolution table should have merged_row_count (all files should have first_row_id)",
992+
);
993+
// merged_row_count should be < row_count (overlapping ranges reduce count)
994+
assert!(
995+
merged_count < split.row_count(),
996+
"merged_row_count ({}) should be < row_count ({})",
997+
merged_count,
998+
split.row_count()
999+
);
1000+
}
1001+
}

crates/integrations/datafusion/src/physical_plan/scan.rs

Lines changed: 47 additions & 5 deletions
Original file line number · Diff line number · Diff line change
@@ -16,16 +16,18 @@
1616
// under the License.
1717

1818
use std::any::Any;
19+
use std::pin::Pin;
1920
use std::sync::Arc;
2021

22+
use datafusion::arrow::array::RecordBatch;
2123
use datafusion::arrow::datatypes::SchemaRef as ArrowSchemaRef;
2224
use datafusion::error::Result as DFResult;
2325
use datafusion::execution::{SendableRecordBatchStream, TaskContext};
2426
use datafusion::physical_expr::EquivalenceProperties;
2527
use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType};
2628
use datafusion::physical_plan::stream::RecordBatchStreamAdapter;
2729
use datafusion::physical_plan::{DisplayAs, ExecutionPlan, Partitioning, PlanProperties};
28-
use futures::{StreamExt, TryStreamExt};
30+
use futures::{Stream, StreamExt, TryStreamExt};
2931
use paimon::table::Table;
3032
use paimon::DataSplit;
3133

@@ -46,6 +48,8 @@ pub struct PaimonTableScan {
4648
/// Wrapped in `Arc` to avoid deep-cloning `DataSplit` metadata in `execute()`.
4749
planned_partitions: Vec<Arc<[DataSplit]>>,
4850
plan_properties: PlanProperties,
51+
/// Optional limit on the number of rows to return.
52+
limit: Option<usize>,
4953
}
5054

5155
impl PaimonTableScan {
@@ -54,6 +58,7 @@ impl PaimonTableScan {
5458
table: Table,
5559
projected_columns: Option<Vec<String>>,
5660
planned_partitions: Vec<Arc<[DataSplit]>>,
61+
limit: Option<usize>,
5762
) -> Self {
5863
let plan_properties = PlanProperties::new(
5964
EquivalenceProperties::new(schema.clone()),
@@ -66,6 +71,7 @@ impl PaimonTableScan {
6671
projected_columns,
6772
planned_partitions,
6873
plan_properties,
74+
limit,
6975
}
7076
}
7177

@@ -77,6 +83,10 @@ impl PaimonTableScan {
7783
pub(crate) fn planned_partitions(&self) -> &[Arc<[DataSplit]>] {
7884
&self.planned_partitions
7985
}
86+
87+
pub fn limit(&self) -> Option<usize> {
88+
self.limit
89+
}
8090
}
8191

8292
impl ExecutionPlan for PaimonTableScan {
@@ -118,6 +128,7 @@ impl ExecutionPlan for PaimonTableScan {
118128
let table = self.table.clone();
119129
let schema = self.schema();
120130
let projected_columns = self.projected_columns.clone();
131+
let limit = self.limit;
121132

122133
let fut = async move {
123134
let mut read_builder = table.new_read_builder();
@@ -138,9 +149,30 @@ impl ExecutionPlan for PaimonTableScan {
138149
};
139150

140151
let stream = futures::stream::once(fut).try_flatten();
152+
153+
// Enforce the final LIMIT at the DataFusion execution layer.
154+
let limited_stream: Pin<Box<dyn Stream<Item = DFResult<RecordBatch>> + Send>> =
155+
if let Some(limit) = limit {
156+
let mut remaining = limit;
157+
Box::pin(stream.try_filter_map(move |batch| {
158+
futures::future::ready(if remaining == 0 {
159+
Ok(None)
160+
} else if batch.num_rows() <= remaining {
161+
remaining -= batch.num_rows();
162+
Ok(Some(batch))
163+
} else {
164+
let limited_batch = batch.slice(0, remaining);
165+
remaining = 0;
166+
Ok(Some(limited_batch))
167+
})
168+
}))
169+
} else {
170+
Box::pin(stream)
171+
};
172+
141173
Ok(Box::pin(RecordBatchStreamAdapter::new(
142174
self.schema(),
143-
stream,
175+
limited_stream,
144176
)))
145177
}
146178
}
@@ -155,7 +187,11 @@ impl DisplayAs for PaimonTableScan {
155187
f,
156188
"PaimonTableScan: partitions={}",
157189
self.planned_partitions.len()
158-
)
190+
)?;
191+
if let Some(limit) = self.limit {
192+
write!(f, ", limit={limit}")?;
193+
}
194+
Ok(())
159195
}
160196
}
161197

@@ -176,7 +212,13 @@ mod tests {
176212
#[test]
177213
fn test_partition_count_empty_plan() {
178214
let schema = test_schema();
179-
let scan = PaimonTableScan::new(schema, dummy_table(), None, vec![Arc::from(Vec::new())]);
215+
let scan = PaimonTableScan::new(
216+
schema,
217+
dummy_table(),
218+
None,
219+
vec![Arc::from(Vec::new())],
220+
None,
221+
);
180222
assert_eq!(scan.properties().output_partitioning().partition_count(), 1);
181223
}
182224

@@ -188,7 +230,7 @@ mod tests {
188230
Arc::from(Vec::new()),
189231
Arc::from(Vec::new()),
190232
];
191-
let scan = PaimonTableScan::new(schema, dummy_table(), None, planned_partitions);
233+
let scan = PaimonTableScan::new(schema, dummy_table(), None, planned_partitions, None);
192234
assert_eq!(scan.properties().output_partitioning().partition_count(), 3);
193235
}
194236

crates/integrations/datafusion/src/table/mod.rs

Lines changed: 7 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -100,7 +100,7 @@ impl TableProvider for PaimonTableProvider {
100100
state: &dyn Session,
101101
projection: Option<&Vec<usize>>,
102102
filters: &[Expr],
103-
_limit: Option<usize>,
103+
limit: Option<usize>,
104104
) -> DFResult<Arc<dyn ExecutionPlan>> {
105105
// Convert projection indices to column names and compute projected schema
106106
let (projected_schema, projected_columns) = if let Some(indices) = projection {
@@ -119,6 +119,11 @@ impl TableProvider for PaimonTableProvider {
119119
if let Some(filter) = build_pushed_predicate(filters, self.table.schema().fields()) {
120120
read_builder.with_filter(filter);
121121
}
122+
// Push the limit hint to paimon-core planning to reduce splits when possible.
123+
// DataFusion still enforces the final LIMIT semantics.
124+
if let Some(limit) = limit {
125+
read_builder.with_limit(limit);
126+
}
122127
let scan = read_builder.new_scan();
123128
let plan = scan.plan().await.map_err(to_datafusion_error)?;
124129

@@ -143,6 +148,7 @@ impl TableProvider for PaimonTableProvider {
143148
self.table.clone(),
144149
projected_columns,
145150
planned_partitions,
151+
limit,
146152
)))
147153
}
148154
}

crates/integrations/datafusion/tests/read_tables.rs

Lines changed: 48 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -286,6 +286,54 @@ async fn test_mixed_and_filter_keeps_residual_datafusion_filter() {
286286
assert_eq!(actual_rows, vec![(2, "bob".to_string())]);
287287
}
288288

289+
/// Test limit pushdown: ensures that LIMIT queries return the correct number of rows.
290+
#[tokio::test]
291+
async fn test_limit_pushdown() {
292+
// Test append-only table (simple_log_table)
293+
{
294+
let batches = collect_query(
295+
"simple_log_table",
296+
"SELECT id, name FROM simple_log_table LIMIT 2",
297+
)
298+
.await
299+
.expect("Limit query should succeed");
300+
301+
let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum();
302+
assert_eq!(total_rows, 2, "LIMIT 2 should return exactly 2 rows");
303+
}
304+
305+
// Test data evolution table
306+
{
307+
let batches = collect_query(
308+
"data_evolution_table",
309+
"SELECT id, name FROM data_evolution_table LIMIT 3",
310+
)
311+
.await
312+
.expect("Limit query on data evolution table should succeed");
313+
314+
let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum();
315+
assert_eq!(
316+
total_rows, 3,
317+
"LIMIT 3 should return exactly 3 rows for data evolution table"
318+
);
319+
320+
// Verify the data is from the merged result (not raw files)
321+
let mut rows = extract_id_name_rows(&batches);
322+
rows.sort_by_key(|(id, _)| *id);
323+
324+
// LIMIT 3 returns ids 1, 2, 3 with merged values
325+
assert_eq!(
326+
rows,
327+
vec![
328+
(1, "alice-v2".to_string()),
329+
(2, "bob".to_string()),
330+
(3, "carol-v2".to_string()),
331+
],
332+
"Data evolution table LIMIT 3 should return merged rows"
333+
);
334+
}
335+
}
336+
289337
// ======================= Catalog Provider Tests =======================
290338
#[tokio::test]
291339
async fn test_query_via_catalog_provider() {

crates/paimon/src/table/read_builder.rs

Lines changed: 16 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -36,6 +36,7 @@ pub struct ReadBuilder<'a> {
3636
table: &'a Table,
3737
projected_fields: Option<Vec<String>>,
3838
filter: Option<Predicate>,
39+
limit: Option<usize>,
3940
}
4041

4142
impl<'a> ReadBuilder<'a> {
@@ -44,6 +45,7 @@ impl<'a> ReadBuilder<'a> {
4445
table,
4546
projected_fields: None,
4647
filter: None,
48+
limit: None,
4749
}
4850
}
4951

@@ -72,9 +74,22 @@ impl<'a> ReadBuilder<'a> {
7274
self
7375
}
7476

77+
/// Push a row-limit hint down to scan planning.
78+
///
79+
/// This allows the scan to generate fewer splits when possible. The hint is
80+
/// applied based on the `merged_row_count()` of each split.
81+
///
82+
/// Note: This method does not guarantee that exactly `limit` rows will be
83+
/// returned by [`TableRead`]. It is only a pushdown hint for planning.
84+
/// Callers or query engines are responsible for enforcing the final LIMIT.
85+
pub fn with_limit(&mut self, limit: usize) -> &mut Self {
86+
self.limit = Some(limit);
87+
self
88+
}
89+
7590
/// Create a table scan. Call [TableScan::plan] to get splits.
7691
pub fn new_scan(&self) -> TableScan<'a> {
77-
TableScan::new(self.table, self.filter.clone())
92+
TableScan::new(self.table, self.filter.clone(), self.limit)
7893
}
7994

8095
/// Create a table read for consuming splits (e.g. from a scan plan).

0 commit comments

Comments (0)