From 945a8bac69d12c6c4c24e4ebb67fd455987420d4 Mon Sep 17 00:00:00 2001 From: nathanmetzger Date: Thu, 15 Jan 2026 14:22:36 +0100 Subject: [PATCH] feat(scan): implement size-based file scan task planning --- crates/iceberg/src/scan/bin_packing.rs | 523 +++++++++++++++++++++++++ crates/iceberg/src/scan/context.rs | 152 ++++++- crates/iceberg/src/scan/mod.rs | 141 ++++++- crates/iceberg/src/scan/task.rs | 93 +++++ 4 files changed, 893 insertions(+), 16 deletions(-) create mode 100644 crates/iceberg/src/scan/bin_packing.rs diff --git a/crates/iceberg/src/scan/bin_packing.rs b/crates/iceberg/src/scan/bin_packing.rs new file mode 100644 index 0000000000..d1d28d76eb --- /dev/null +++ b/crates/iceberg/src/scan/bin_packing.rs @@ -0,0 +1,523 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Bin-packing algorithm for combining file scan tasks. +//! +//! This module implements a bin-packing algorithm that combines small +//! [`FileScanTask`]s into [`CombinedScanTask`]s to reduce overhead from +//! opening many small files. + +use std::collections::VecDeque; +use std::pin::Pin; +use std::task::{Context, Poll}; + +use futures::Stream; + +use crate::Result; +use crate::scan::{CombinedScanTask, FileScanTask}; + +/// Configuration for the bin-packing algorithm. +#[derive(Debug, Clone, Copy)] +pub struct BinPackingConfig { + /// Target size in bytes for each combined task. + pub target_size: u64, + /// The number of bins to keep open for finding the best fit. + /// Higher values may produce better packing but use more memory. + pub lookback: usize, + /// The estimated cost in bytes to open a file. + /// This is added to the task weight to account for file open overhead. + pub open_file_cost: u64, +} + +impl Default for BinPackingConfig { + fn default() -> Self { + Self { + target_size: 128 * 1024 * 1024, // 128MB + lookback: 20, + open_file_cost: 4 * 1024 * 1024, // 4MB + } + } +} + +/// Calculates the weight of a file scan task for bin-packing purposes. +/// +/// The weight considers both the data size and the cost of opening files. +/// For tasks that reference delete files, the delete file sizes are also included. +pub fn calculate_task_weight(task: &FileScanTask, open_file_cost: u64) -> u64 { + // Base weight is the length of data to read + let data_weight = task.length; + + // Add delete file content size estimate (we don't have exact sizes, + // so we count each delete file as contributing open_file_cost) + let delete_weight = task.deletes.len() as u64 * open_file_cost; + + // The weight is the maximum of: + // 1. Data size + delete file overhead + // 2. 
Number of files to open * open_file_cost
    let file_count = 1 + task.deletes.len() as u64;
    let open_cost_weight = file_count * open_file_cost;

    std::cmp::max(data_weight + delete_weight, open_cost_weight)
}

/// A bin-packing iterator that combines file scan tasks into combined tasks.
///
/// This implements a greedy bin-packing algorithm with lookback:
/// - Tasks are added to open bins that can accommodate them
/// - When no bin fits, the largest bin is emitted and a new bin is created
/// - Lookback controls how many bins are kept open for better fitting
pub struct BinPackingIterator<I> {
    /// The source iterator of file scan tasks
    source: I,
    /// Currently open bins
    bins: Vec<CombinedScanTask>,
    /// Configuration for the packing algorithm
    config: BinPackingConfig,
    /// Whether the source iterator is exhausted
    source_exhausted: bool,
}

impl<I> BinPackingIterator<I>
where I: Iterator<Item = FileScanTask>
{
    /// Creates a new bin-packing iterator.
    pub fn new(source: I, config: BinPackingConfig) -> Self {
        Self {
            source,
            bins: Vec::with_capacity(config.lookback),
            config,
            source_exhausted: false,
        }
    }

    /// Finds the best bin for a task, or returns None if no bin fits.
    fn find_best_bin(&mut self, weight: u64) -> Option<usize> {
        // Best fit: among the bins that can still accommodate this task, pick
        // the one that would have the least space left over after adding it
        let mut best_idx = None;
        let mut best_remaining = u64::MAX;

        for (idx, bin) in self.bins.iter().enumerate() {
            if !bin.would_exceed(weight, self.config.target_size) {
                let remaining = self.config.target_size - bin.estimated_size() - weight;
                if remaining < best_remaining {
                    best_remaining = remaining;
                    best_idx = Some(idx);
                }
            }
        }

        best_idx
    }

    /// Removes and returns the largest bin (by estimated size).
    fn remove_largest_bin(&mut self) -> Option<CombinedScanTask> {
        if self.bins.is_empty() {
            return None;
        }

        let largest_idx = self
            .bins
            .iter()
            .enumerate()
            .max_by_key(|(_, bin)| bin.estimated_size())
            .map(|(idx, _)| idx)
            .unwrap();

        Some(self.bins.remove(largest_idx))
    }
}

impl<I> Iterator for BinPackingIterator<I>
where I: Iterator<Item = FileScanTask>
{
    type Item = CombinedScanTask;

    fn next(&mut self) -> Option<Self::Item> {
        loop {
            // If source is exhausted, drain remaining bins
            if self.source_exhausted {
                return self.bins.pop().filter(|bin| !bin.is_empty());
            }

            // Get next task from source
            let Some(task) = self.source.next() else {
                self.source_exhausted = true;
                // Start draining bins
                return self.bins.pop().filter(|bin| !bin.is_empty());
            };

            let weight = calculate_task_weight(&task, self.config.open_file_cost);

            // Try to find a bin that can fit this task
            if let Some(bin_idx) = self.find_best_bin(weight) {
                self.bins[bin_idx].add_task(task, weight);
            } else {
                // No bin fits - need to create a new one
                // If we've hit the lookback limit, emit the largest bin first
                if self.bins.len() >= self.config.lookback {
                    let largest = self.remove_largest_bin();

                    // Create new bin with this task
                    let mut new_bin = CombinedScanTask::new();
                    new_bin.add_task(task, weight);
                    self.bins.push(new_bin);

                    // Return the largest bin
                    if let Some(bin) = largest
                        && !bin.is_empty()
                    {
                        return Some(bin);
                    }
                } else {
                    // Still have room for more bins
                    let mut new_bin = CombinedScanTask::new();
                    new_bin.add_task(task, weight);
                    self.bins.push(new_bin);
                }
            }
        }
    }
}
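To make the weight formula above concrete, here is an illustrative calculation with plain numbers only (no real `FileScanTask` is constructed; this mirrors `calculate_task_weight`, it is not part of the patch):

```rust
// weight = max(data + deletes * open_cost, (1 + deletes) * open_cost)
fn main() {
    let open_file_cost: u64 = 4 * 1024 * 1024; // 4 MiB
    let data_len: u64 = 1024 * 1024; // a 1 MiB data file
    let num_deletes: u64 = 2; // two associated delete files

    let data_weight = data_len + num_deletes * open_file_cost; // 9 MiB
    let open_cost_weight = (1 + num_deletes) * open_file_cost; // 12 MiB

    // For small files the open-cost floor dominates: 12 MiB, not 9 MiB.
    assert_eq!(std::cmp::max(data_weight, open_cost_weight), 12 * 1024 * 1024);
}
```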
/// Combines file scan tasks into combined scan tasks using bin-packing.
///
/// This function takes an iterator of file scan tasks and returns an iterator
/// of combined scan tasks, where small tasks have been grouped together.
///
/// # Arguments
/// * `tasks` - Iterator of file scan tasks to combine
/// * `config` - Configuration for the bin-packing algorithm
pub fn plan_tasks<I>(tasks: I, config: BinPackingConfig) -> BinPackingIterator<I>
where I: Iterator<Item = FileScanTask> {
    BinPackingIterator::new(tasks, config)
}

/// A streaming bin-packing adapter that combines file scan tasks into combined tasks.
///
/// This wraps an async stream of `Result<FileScanTask>` and produces a stream of
/// `Result<CombinedScanTask>`, applying bin-packing on-the-fly without collecting
/// all tasks into memory.
pub struct BinPackingStream<S> {
    /// The source stream of file scan tasks
    source: S,
    /// Currently open bins
    bins: Vec<CombinedScanTask>,
    /// Queue of completed bins ready to emit
    ready_bins: VecDeque<CombinedScanTask>,
    /// Configuration for the packing algorithm
    config: BinPackingConfig,
    /// Whether the source stream is exhausted
    source_exhausted: bool,
}

impl<S> BinPackingStream<S>
where S: Stream<Item = Result<FileScanTask>> + Unpin
{
    /// Creates a new streaming bin-packing adapter.
    pub fn new(source: S, config: BinPackingConfig) -> Self {
        Self {
            source,
            bins: Vec::with_capacity(config.lookback),
            ready_bins: VecDeque::new(),
            config,
            source_exhausted: false,
        }
    }

    /// Finds the best bin for a task, or returns None if no bin fits.
    fn find_best_bin(&self, weight: u64) -> Option<usize> {
        let mut best_idx = None;
        let mut best_remaining = u64::MAX;

        for (idx, bin) in self.bins.iter().enumerate() {
            if !bin.would_exceed(weight, self.config.target_size) {
                let remaining = self.config.target_size - bin.estimated_size() - weight;
                if remaining < best_remaining {
                    best_remaining = remaining;
                    best_idx = Some(idx);
                }
            }
        }

        best_idx
    }

    /// Removes and returns the largest bin (by estimated size).
    fn remove_largest_bin(&mut self) -> Option<CombinedScanTask> {
        if self.bins.is_empty() {
            return None;
        }

        let largest_idx = self
            .bins
            .iter()
            .enumerate()
            .max_by_key(|(_, bin)| bin.estimated_size())
            .map(|(idx, _)| idx)
            .unwrap();

        Some(self.bins.remove(largest_idx))
    }

    /// Adds a task to an appropriate bin, possibly emitting a completed bin.
    fn add_task(&mut self, task: FileScanTask) {
        let weight = calculate_task_weight(&task, self.config.open_file_cost);

        if let Some(bin_idx) = self.find_best_bin(weight) {
            self.bins[bin_idx].add_task(task, weight);
        } else {
            // No bin fits - need to create a new one
            if self.bins.len() >= self.config.lookback {
                // Hit lookback limit, emit largest bin
                if let Some(bin) = self.remove_largest_bin()
                    && !bin.is_empty()
                {
                    self.ready_bins.push_back(bin);
                }
            }
            // Create new bin with this task
            let mut new_bin = CombinedScanTask::new();
            new_bin.add_task(task, weight);
            self.bins.push(new_bin);
        }
    }

    /// Drains all remaining bins into the ready queue.
    fn drain_remaining_bins(&mut self) {
        while let Some(bin) = self.bins.pop() {
            if !bin.is_empty() {
                self.ready_bins.push_back(bin);
            }
        }
    }
}
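The best-fit selection both adapters share is easiest to see with small numbers. A standalone sketch (not the module's code) of the same loop:

```rust
// Pick the open bin that leaves the least remaining space after adding the
// task, skipping bins that the task would push past the target.
fn main() {
    let target: u64 = 128;
    let bins = [100u64, 60, 90]; // current estimated sizes of three open bins
    let weight: u64 = 30;

    let mut best: Option<usize> = None;
    let mut best_remaining = u64::MAX;
    for (idx, &size) in bins.iter().enumerate() {
        if size + weight <= target {
            let remaining = target - size - weight;
            if remaining < best_remaining {
                best_remaining = remaining;
                best = Some(idx);
            }
        }
    }

    // Bin 0 (100 + 30 = 130) exceeds the target; of bins 1 and 2,
    // bin 2 (90 + 30 = 120) leaves only 8 bytes free, so it wins.
    assert_eq!(best, Some(2));
}
```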
impl<S> Stream for BinPackingStream<S>
where S: Stream<Item = Result<FileScanTask>> + Unpin
{
    type Item = Result<CombinedScanTask>;

    fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
        loop {
            // First, return any ready bins
            if let Some(bin) = self.ready_bins.pop_front() {
                return Poll::Ready(Some(Ok(bin)));
            }

            // If source is exhausted, drain remaining bins
            if self.source_exhausted {
                if let Some(bin) = self.bins.pop() {
                    if !bin.is_empty() {
                        return Poll::Ready(Some(Ok(bin)));
                    }
                } else {
                    return Poll::Ready(None);
                }
                continue;
            }

            // Poll the source stream
            match Pin::new(&mut self.source).poll_next(cx) {
                Poll::Ready(Some(Ok(task))) => {
                    self.add_task(task);
                    // Continue loop to check if we have ready bins
                }
                Poll::Ready(Some(Err(e))) => {
                    return Poll::Ready(Some(Err(e)));
                }
                Poll::Ready(None) => {
                    self.source_exhausted = true;
                    self.drain_remaining_bins();
                    // Continue loop to drain ready_bins
                }
                Poll::Pending => {
                    return Poll::Pending;
                }
            }
        }
    }
}

/// Creates a streaming bin-packing adapter from a stream of file scan tasks.
///
/// This function takes an async stream of file scan tasks and returns a stream
/// of combined scan tasks, applying bin-packing on-the-fly without collecting
/// all tasks into memory.
pub fn plan_tasks_stream<S>(source: S, config: BinPackingConfig) -> BinPackingStream<S>
where S: Stream<Item = Result<FileScanTask>> + Unpin {
    BinPackingStream::new(source, config)
}
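A hedged usage sketch of the streaming adapter (not part of the patch); `tasks` is assumed to be any fallible boxed stream of `FileScanTask`, e.g. the output of `TableScan::plan_files()`:

```rust
use futures::TryStreamExt;
use futures::stream::BoxStream;

// Assumed input: a boxed stream of crate::Result<FileScanTask>, as produced
// elsewhere in this crate. BoxStream is Unpin, so it satisfies the bound.
async fn combine(tasks: BoxStream<'static, crate::Result<FileScanTask>>) -> crate::Result<()> {
    let mut combined = plan_tasks_stream(tasks, BinPackingConfig::default());
    while let Some(task) = combined.try_next().await? {
        println!("{} task(s), ~{} bytes", task.len(), task.estimated_size());
    }
    Ok(())
}
```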
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use super::*;
    use crate::spec::{DataFileFormat, NestedField, PrimitiveType, Schema, Type};

    fn create_test_task(path: &str, length: u64) -> FileScanTask {
        let schema = Arc::new(
            Schema::builder()
                .with_fields(vec![Arc::new(NestedField::required(
                    1,
                    "id",
                    Type::Primitive(PrimitiveType::Int),
                ))])
                .build()
                .unwrap(),
        );

        FileScanTask {
            start: 0,
            length,
            record_count: None,
            data_file_path: path.to_string(),
            data_file_format: DataFileFormat::Parquet,
            schema,
            project_field_ids: vec![1],
            predicate: None,
            deletes: vec![],
            partition: None,
            partition_spec: None,
            name_mapping: None,
            case_sensitive: true,
        }
    }

    #[test]
    fn test_single_task_single_bin() {
        let tasks = vec![create_test_task("file1.parquet", 50 * 1024 * 1024)]; // 50MB

        let config = BinPackingConfig {
            target_size: 128 * 1024 * 1024,
            lookback: 20,
            open_file_cost: 4 * 1024 * 1024,
        };

        let combined: Vec<_> = plan_tasks(tasks.into_iter(), config).collect();

        assert_eq!(combined.len(), 1);
        assert_eq!(combined[0].len(), 1);
    }

    #[test]
    fn test_small_tasks_combined() {
        // Create 10 small tasks of 10MB each
        let tasks: Vec<_> = (0..10)
            .map(|i| create_test_task(&format!("file{i}.parquet"), 10 * 1024 * 1024))
            .collect();

        let config = BinPackingConfig {
            target_size: 128 * 1024 * 1024, // 128MB target
            lookback: 20,
            open_file_cost: 4 * 1024 * 1024, // 4MB per file
        };

        let combined: Vec<_> = plan_tasks(tasks.into_iter(), config).collect();

        // With 10MB data + 4MB open cost = 14MB per task
        // 128MB / 14MB ≈ 9 tasks per bin, so we expect ~2 bins
        assert!(
            combined.len() <= 2,
            "Expected at most 2 bins, got {}",
            combined.len()
        );

        // Total tasks should still be 10
        let total_tasks: usize = combined.iter().map(|c| c.len()).sum();
        assert_eq!(total_tasks, 10);
    }

    #[test]
    fn test_large_task_own_bin() {
        let tasks = vec![
            create_test_task("small.parquet", 10 * 1024 * 1024), // 10MB
            create_test_task("large.parquet", 200 * 1024 * 1024), // 200MB > target
            create_test_task("small2.parquet", 10 * 1024 * 1024), // 10MB
        ];

        let config = BinPackingConfig {
            target_size: 128 * 1024 * 1024, // 128MB target
            lookback: 20,
            open_file_cost: 4 * 1024 * 1024,
        };

        let combined: Vec<_> = plan_tasks(tasks.into_iter(), config).collect();

        // Large task exceeds target, should be in its own bin
        // Small tasks may be combined
        assert!(combined.len() >= 2);

        // Check that large task is in its own bin
        let has_large_alone = combined
            .iter()
            .any(|c| c.len() == 1 && c.tasks()[0].data_file_path == "large.parquet");
        assert!(has_large_alone, "Large task should be in its own bin");
    }

    #[test]
    fn test_calculate_task_weight_basic() {
        let task = create_test_task("file.parquet", 100 * 1024 * 1024); // 100MB
        let open_cost = 4 * 1024 * 1024; // 4MB

        let weight = calculate_task_weight(&task, open_cost);

        // Weight should be max(100MB + 0 deletes, 1 file * 4MB) = 100MB
        assert_eq!(weight, 100 * 1024 * 1024);
    }

    #[test]
    fn test_calculate_task_weight_small_file() {
        let task = create_test_task("small.parquet", 1024 * 1024); // 1MB
        let open_cost = 4 * 1024 * 1024; // 4MB

        let weight = calculate_task_weight(&task, open_cost);

        // Weight should be max(1MB, 4MB open cost) = 4MB
        assert_eq!(weight, 4 * 1024 * 1024);
    }

    #[test]
    fn test_empty_input() {
        let tasks: Vec<FileScanTask> = vec![];
        let config = BinPackingConfig::default();

        let combined: Vec<_> = plan_tasks(tasks.into_iter(), config).collect();

        assert!(combined.is_empty());
    }

    #[test]
    fn test_lookback_limit() {
        // Create many small tasks to test lookback behavior
        let tasks: Vec<_> = (0..50)
            .map(|i| create_test_task(&format!("file{i}.parquet"), 1024 * 1024))
            .collect();

        let config = BinPackingConfig {
            target_size: 128 * 1024 * 1024,
            lookback: 5, // Only keep 5 bins open
            open_file_cost: 4 * 1024 * 1024,
        };

        let combined: Vec<_> = plan_tasks(tasks.into_iter(), config).collect();

        // All tasks should be accounted for
        let total_tasks: usize = combined.iter().map(|c| c.len()).sum();
        assert_eq!(total_tasks, 50);
    }
}
diff --git a/crates/iceberg/src/scan/context.rs b/crates/iceberg/src/scan/context.rs
index 169d8e6405..6d5e77b646 100644
--- a/crates/iceberg/src/scan/context.rs
+++ b/crates/iceberg/src/scan/context.rs
@@ -24,8 +24,8 @@ use crate::delete_file_index::DeleteFileIndex;
 use crate::expr::{Bind, BoundPredicate, Predicate};
 use crate::io::object_cache::ObjectCache;
 use crate::scan::{
-    BoundPredicates, ExpressionEvaluatorCache, FileScanTask, ManifestEvaluatorCache,
-    PartitionFilterCache,
+    BoundPredicates, ExpressionEvaluatorCache, FileScanTask, FileScanTaskDeleteFile,
+    ManifestEvaluatorCache, PartitionFilterCache,
 };
 use crate::spec::{
     ManifestContentType, ManifestEntryRef, ManifestFile, ManifestList, SchemaRef, SnapshotRef,
@@ -47,6 +47,8 @@ pub(crate) struct ManifestFileContext {
     expression_evaluator_cache: Arc<ExpressionEvaluatorCache>,
     delete_file_index: DeleteFileIndex,
     case_sensitive: bool,
+    /// Target size in bytes for splitting large files (0 = disabled)
+    split_target_size: u64,
 }

 /// Wraps a [`ManifestEntryRef`] alongside the objects that are needed
@@ -61,6 +63,8 @@ pub(crate) struct ManifestEntryContext {
     pub snapshot_schema: SchemaRef,
     pub delete_file_index: DeleteFileIndex,
     pub case_sensitive: bool,
+    /// Target size in bytes for splitting large files (0 = disabled)
+    pub split_target_size: u64,
 }
 impl ManifestFileContext {
@@ -92,6 +96,7 @@ impl ManifestFileContext {
             snapshot_schema: snapshot_schema.clone(),
             delete_file_index: delete_file_index.clone(),
             case_sensitive: self.case_sensitive,
+            split_target_size: self.split_target_size,
         };

         sender
@@ -105,9 +110,20 @@ impl ManifestFileContext {
 }

 impl ManifestEntryContext {
-    /// consume this `ManifestEntryContext`, returning a `FileScanTask`
-    /// created from it
-    pub(crate) async fn into_file_scan_task(self) -> Result<FileScanTask> {
+    /// Consume this `ManifestEntryContext`, returning one or more `FileScanTask`s.
+    ///
+    /// If `split_target_size` is set and the file is larger than that threshold,
+    /// the file will be split into multiple tasks. Each split task will read a
+    /// portion of the file using byte-range filtering.
+    ///
+    /// If the file has `split_offsets` metadata (e.g., Parquet row group boundaries),
+    /// splits will align with those offsets for optimal performance.
+    pub(crate) async fn into_file_scan_tasks(self) -> Result<Vec<FileScanTask>> {
+        let file_size = self.manifest_entry.file_size_in_bytes();
+        let record_count = self.manifest_entry.record_count();
+        let split_target_size = self.split_target_size;
+
+        // Get delete files (shared across all splits from the same data file)
         let deletes = self
             .delete_file_index
             .get_deletes_for_data_file(
@@ -116,18 +132,128 @@ impl ManifestEntryContext {
             )
             .await;

-        Ok(FileScanTask {
-            start: 0,
-            length: self.manifest_entry.file_size_in_bytes(),
-            record_count: Some(self.manifest_entry.record_count()),
+        // If splitting is disabled or file is small enough, return a single task
+        if split_target_size == 0 || file_size <= split_target_size {
+            return Ok(vec![self.create_file_scan_task(
+                0,
+                file_size,
+                Some(record_count),
+                deletes,
+            )]);
+        }
+
+        // Try to split using split_offsets if available (aligns with row group boundaries)
+        if let Some(offsets) = self.manifest_entry.data_file().split_offsets()
+            && !offsets.is_empty()
+        {
+            return Ok(self.split_by_offsets(offsets, file_size, deletes));
+        }
+
+        // Fall back to splitting by byte ranges
+        Ok(self.split_by_size(file_size, split_target_size, deletes))
+    }
+
+    /// Split the file using pre-computed offsets (e.g., Parquet row group boundaries).
+    /// This produces optimal splits that align with the file's internal structure.
+    fn split_by_offsets(
+        &self,
+        offsets: &[i64],
+        file_size: u64,
+        mut deletes: Vec<FileScanTaskDeleteFile>,
+    ) -> Vec<FileScanTask> {
+        // Pre-compute valid splits to know total count
+        let splits: Vec<(u64, u64)> = offsets
+            .iter()
+            .enumerate()
+            .filter_map(|(i, &offset)| {
+                let start = offset as u64;
+                let length = if i + 1 < offsets.len() {
+                    (offsets[i + 1] - offset) as u64
+                } else {
+                    file_size.saturating_sub(start)
+                };
+                if length > 0 {
+                    Some((start, length))
+                } else {
+                    None
+                }
+            })
+            .collect();
+
+        let num_splits = splits.len();
+        if num_splits == 0 {
+            return vec![];
+        }
+
+        let mut tasks = Vec::with_capacity(num_splits);
+
+        for (i, (start, length)) in splits.into_iter().enumerate() {
+            // Last split takes ownership, others clone
+            let task_deletes = if i == num_splits - 1 {
+                std::mem::take(&mut deletes)
+            } else {
+                deletes.clone()
+            };
+            tasks.push(self.create_file_scan_task(start, length, None, task_deletes));
+        }
+
+        tasks
+    }
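The offset-based path is easiest to follow with small numbers. An illustrative, self-contained rendering of the same arithmetic (not the patch's code):

```rust
// A 100-byte file whose row groups start at offsets 0, 40 and 70 yields
// three splits: each split runs to the next offset, the last to end-of-file.
fn main() {
    let offsets: [i64; 3] = [0, 40, 70];
    let file_size: u64 = 100;

    let splits: Vec<(u64, u64)> = offsets
        .iter()
        .enumerate()
        .map(|(i, &off)| {
            let start = off as u64;
            let length = if i + 1 < offsets.len() {
                (offsets[i + 1] - off) as u64
            } else {
                file_size - start
            };
            (start, length)
        })
        .collect();

    assert_eq!(splits, vec![(0, 40), (40, 30), (70, 30)]);
}
```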
+    /// Split the file into chunks of approximately `target_size` bytes.
+    /// This is used when split_offsets are not available.
+    fn split_by_size(
+        &self,
+        file_size: u64,
+        target_size: u64,
+        mut deletes: Vec<FileScanTaskDeleteFile>,
+    ) -> Vec<FileScanTask> {
+        let num_splits = file_size.div_ceil(target_size) as usize;
+        if num_splits == 0 {
+            return vec![];
+        }
+
+        let mut tasks = Vec::with_capacity(num_splits);
+        let mut offset = 0u64;
+        let mut split_idx = 0usize;
+
+        while offset < file_size {
+            let length = std::cmp::min(target_size, file_size - offset);
+            // Last split takes ownership, others clone
+            let task_deletes = if split_idx == num_splits - 1 {
+                std::mem::take(&mut deletes)
+            } else {
+                deletes.clone()
+            };
+            tasks.push(self.create_file_scan_task(offset, length, None, task_deletes));
+            offset += length;
+            split_idx += 1;
+        }
+
+        tasks
+    }
+
+    /// Create a single FileScanTask with the given byte range.
+    fn create_file_scan_task(
+        &self,
+        start: u64,
+        length: u64,
+        record_count: Option<u64>,
+        deletes: Vec<FileScanTaskDeleteFile>,
+    ) -> FileScanTask {
+        FileScanTask {
+            start,
+            length,
+            record_count,
             data_file_path: self.manifest_entry.file_path().to_string(),
             data_file_format: self.manifest_entry.file_format(),
-            schema: self.snapshot_schema,
+            schema: self.snapshot_schema.clone(),
             project_field_ids: self.field_ids.to_vec(),
             predicate: self
                 .bound_predicates
+                .as_ref()
                 .map(|x| x.as_ref().snapshot_bound_predicate.clone()),
             deletes,
@@ -139,7 +265,7 @@
             // TODO: Extract name_mapping from table metadata property "schema.name-mapping.default"
             name_mapping: None,
             case_sensitive: self.case_sensitive,
-        })
+        }
     }
 }
@@ -160,6 +286,9 @@ pub(crate) struct PlanContext {
     pub partition_filter_cache: Arc<PartitionFilterCache>,
     pub manifest_evaluator_cache: Arc<ManifestEvaluatorCache>,
     pub expression_evaluator_cache: Arc<ExpressionEvaluatorCache>,
+
+    /// Target size in bytes for splitting large files (0 = disabled)
+    pub split_target_size: u64,
 }

 impl PlanContext {
@@ -282,6 +411,7 @@ impl PlanContext {
             expression_evaluator_cache: self.expression_evaluator_cache.clone(),
             delete_file_index,
             case_sensitive: self.case_sensitive,
+            split_target_size: self.split_target_size,
         }
     }
 }
diff --git a/crates/iceberg/src/scan/mod.rs b/crates/iceberg/src/scan/mod.rs
index c055c12c9a..cf394d8ea8 100644
--- a/crates/iceberg/src/scan/mod.rs
+++ b/crates/iceberg/src/scan/mod.rs
@@ -17,6 +17,7 @@

 //! Table scan api.

+mod bin_packing;
 mod cache;
 use cache::*;
 mod context;
@@ -26,6 +27,7 @@ mod task;
 use std::sync::Arc;

 use arrow_array::RecordBatch;
+pub use bin_packing::*;
 use futures::channel::mpsc::{Sender, channel};
 use futures::stream::BoxStream;
 use futures::{SinkExt, StreamExt, TryStreamExt};
@@ -60,6 +62,9 @@ pub struct TableScanBuilder<'a> {
     concurrency_limit_manifest_files: usize,
     row_group_filtering_enabled: bool,
     row_selection_enabled: bool,
+    split_target_size: u64,
+    split_open_file_cost: u64,
+    split_lookback: usize,
 }

 impl<'a> TableScanBuilder<'a> {
@@ -78,6 +83,9 @@ impl<'a> TableScanBuilder<'a> {
             concurrency_limit_manifest_files: num_cpus,
             row_group_filtering_enabled: true,
             row_selection_enabled: false,
+            split_target_size: 0, // Disabled by default for backwards compatibility
+            split_open_file_cost: 4 * 1024 * 1024, // 4MB default (matches Java)
+            split_lookback: 20, // 20 bins default (matches Java)
         }
     }

@@ -184,6 +192,45 @@ impl<'a> TableScanBuilder<'a> {
         self
     }
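For intuition about what a non-zero target enables via the builder options below, here is the size-based fallback chunking (used when a file carries no `split_offsets`) as illustrative arithmetic only, mirroring `split_by_size` in context.rs:

```rust
// Ceil-divide a 300 MiB file into <= 128 MiB byte ranges.
fn main() {
    const MIB: u64 = 1024 * 1024;
    let (file_size, target) = (300 * MIB, 128 * MIB);

    let mut ranges = Vec::new();
    let mut offset = 0u64;
    while offset < file_size {
        let len = std::cmp::min(target, file_size - offset);
        ranges.push((offset, len));
        offset += len;
    }

    // Three splits: 128 MiB + 128 MiB + a 44 MiB remainder.
    assert_eq!(ranges, vec![
        (0, 128 * MIB),
        (128 * MIB, 128 * MIB),
        (256 * MIB, 44 * MIB),
    ]);
}
```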
+    /// Sets the target size in bytes for splitting large files into smaller scan tasks.
+    ///
+    /// When a file is larger than this size, it will be split into multiple
+    /// `FileScanTask`s, allowing for more parallelism during reads. Each split
+    /// task will read a portion of the file using byte-range filtering.
+    ///
+    /// If the file has `split_offsets` metadata (e.g., Parquet row group boundaries),
+    /// splits will align with those offsets for optimal performance.
+    ///
+    /// Set to 0 to disable splitting (default). A typical value is 128MB (134217728 bytes).
+    pub fn with_split_target_size(mut self, split_target_size: u64) -> Self {
+        self.split_target_size = split_target_size;
+        self
+    }
+
+    /// Sets the estimated cost in bytes to open a file, used for bin-packing.
+    ///
+    /// When combining small scan tasks using `plan_tasks()`, this value is added
+    /// to each task's weight to account for the overhead of opening files.
+    /// This helps balance the tradeoff between parallel file access and file open costs.
+    ///
+    /// Default is 4MB. Higher values encourage more aggressive task combining.
+    pub fn with_split_open_file_cost(mut self, open_file_cost: u64) -> Self {
+        self.split_open_file_cost = open_file_cost;
+        self
+    }
+
+    /// Sets the number of bins to keep open during bin-packing.
+    ///
+    /// When combining small scan tasks using `plan_tasks()`, this controls how many
+    /// "bins" (partially-filled combined tasks) are kept open for finding the best fit.
+    /// Higher values may produce better packing but use more memory.
+    ///
+    /// Default is 20.
+    pub fn with_split_lookback(mut self, lookback: usize) -> Self {
+        self.split_lookback = lookback;
+        self
+    }
+
     /// Build the table scan.
     pub fn build(self) -> Result<TableScan> {
         let snapshot = match self.snapshot_id {
@@ -210,6 +257,9 @@ impl<'a> TableScanBuilder<'a> {
                     concurrency_limit_manifest_files: self.concurrency_limit_manifest_files,
                     row_group_filtering_enabled: self.row_group_filtering_enabled,
                     row_selection_enabled: self.row_selection_enabled,
+                    split_target_size: self.split_target_size,
+                    split_open_file_cost: self.split_open_file_cost,
+                    split_lookback: self.split_lookback,
                 });
             };
             current_snapshot_id.clone()
@@ -291,6 +341,7 @@ impl<'a> TableScanBuilder<'a> {
             partition_filter_cache: Arc::new(PartitionFilterCache::new()),
             manifest_evaluator_cache: Arc::new(ManifestEvaluatorCache::new()),
             expression_evaluator_cache: Arc::new(ExpressionEvaluatorCache::new()),
+            split_target_size: self.split_target_size,
         };

         Ok(TableScan {
@@ -303,6 +354,9 @@ impl<'a> TableScanBuilder<'a> {
             concurrency_limit_manifest_files: self.concurrency_limit_manifest_files,
             row_group_filtering_enabled: self.row_group_filtering_enabled,
             row_selection_enabled: self.row_selection_enabled,
+            split_target_size: self.split_target_size,
+            split_open_file_cost: self.split_open_file_cost,
+            split_lookback: self.split_lookback,
         })
     }
 }
@@ -331,6 +385,18 @@ pub struct TableScan {
     row_group_filtering_enabled: bool,
     row_selection_enabled: bool,
+
+    /// Target size in bytes for splitting large files into smaller tasks.
+    /// 0 means disabled.
+    /// Note: This is stored for introspection; actual splitting uses PlanContext.
+    #[allow(dead_code)]
+    split_target_size: u64,
+
+    /// The estimated cost in bytes to open a file, used for bin-packing.
+    split_open_file_cost: u64,
+
+    /// The number of bins to keep open during bin-packing.
+    split_lookback: usize,
 }

 impl TableScan {
@@ -430,6 +496,42 @@ impl TableScan {
         Ok(file_scan_task_rx.boxed())
     }

+    /// Returns a stream of [`CombinedScanTask`]s by combining small file scan tasks.
+    ///
+    /// This method first plans file scan tasks (splitting large files if configured),
+    /// then uses a bin-packing algorithm to combine small tasks into larger combined
+    /// tasks. This reduces the overhead of opening many small files while maintaining
+    /// parallelism.
+    ///
+    /// The bin-packing algorithm considers:
+    /// - Target size: Tasks are combined until reaching `split_target_size` (default: 128MB)
+    /// - Open file cost: The overhead of opening each file (default: 4MB) is added to task weight
+    /// - Lookback: Multiple bins are kept open for better packing (default: 20 bins)
+    ///
+    /// Use `with_split_target_size()`, `with_split_open_file_cost()`, and `with_split_lookback()`
+    /// on the `TableScanBuilder` to configure the bin-packing behavior.
+    pub async fn plan_tasks(&self) -> Result<CombinedScanTaskStream> {
+        let file_scan_tasks = self.plan_files().await?;
+
+        // If split_target_size is 0, file splitting is disabled, but tasks still
+        // need to be wrapped in CombinedScanTask, so fall back to a sensible
+        // packing target rather than a zero-sized bin
+        let target_size = if self.split_target_size == 0 {
+            128 * 1024 * 1024 // 128MB
+        } else {
+            self.split_target_size
+        };
+
+        let config = BinPackingConfig {
+            target_size,
+            lookback: self.split_lookback,
+            open_file_cost: self.split_open_file_cost,
+        };
+
+        // Stream bin-packing: processes tasks on-the-fly without collecting all into memory
+        Ok(Box::pin(plan_tasks_stream(file_scan_tasks, config)))
+    }
+
     /// Returns an [`ArrowRecordBatchStream`].
     pub async fn to_arrow(&self) -> Result<ArrowRecordBatchStream> {
         let mut arrow_reader_builder = ArrowReaderBuilder::new(self.file_io.clone())
@@ -502,11 +604,12 @@ impl TableScan {
         }

         // congratulations! the manifest entry has made its way through the
-        // entire plan without getting filtered out. Create a corresponding
-        // FileScanTask and push it to the result stream
-        file_scan_task_tx
-            .send(Ok(manifest_entry_context.into_file_scan_task().await?))
-            .await?;
+        // entire plan without getting filtered out. Create one or more
+        // FileScanTasks (depending on split_target_size) and push to result stream
+        let tasks = manifest_entry_context.into_file_scan_tasks().await?;
+        for task in tasks {
+            file_scan_task_tx.send(Ok(task)).await?;
+        }

         Ok(())
     }
@@ -2254,4 +2357,32 @@ pub mod tests {
         // Assert it finished (didn't timeout)
         assert!(result.is_ok(), "Scan timed out - deadlock detected");
     }
+
+    #[test]
+    fn test_with_split_target_size() {
+        let fixture = TableTestFixture::new();
+
+        // Test that split_target_size can be set via builder
+        let table_scan = fixture
+            .table
+            .scan()
+            .with_split_target_size(128 * 1024 * 1024) // 128MB
+            .build()
+            .unwrap();
+
+        // The scan should build successfully
+        // Note: Actual splitting is tested via the context unit tests
+        assert!(table_scan.plan_context.is_some());
+    }
+
+    #[test]
+    fn test_split_target_size_default_disabled() {
+        let fixture = TableTestFixture::new();
+
+        // Default split_target_size should be 0 (disabled)
+        let table_scan = fixture.table.scan().build().unwrap();
+
+        // The scan should build successfully with default (disabled) splitting
+        assert!(table_scan.plan_context.is_some());
+    }
 }
diff --git a/crates/iceberg/src/scan/task.rs b/crates/iceberg/src/scan/task.rs
index 5349a9bdd2..831ab5f3be 100644
--- a/crates/iceberg/src/scan/task.rs
+++ b/crates/iceberg/src/scan/task.rs
@@ -168,3 +168,96 @@ pub struct FileScanTaskDeleteFile {
     /// equality ids for equality deletes (null for anything other than equality-deletes)
     pub equality_ids: Option<Vec<i32>>,
 }
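Taken together, a hedged end-to-end sketch of the API this patch adds (not part of the patch; table setup is elided and `table` is assumed to be an existing `Table`):

```rust
use futures::TryStreamExt;
use iceberg::Result;
use iceberg::table::Table;

async fn scan_combined(table: &Table) -> Result<()> {
    // Opt in to splitting files larger than 128 MiB; small resulting tasks
    // are then bin-packed back together by plan_tasks().
    let scan = table
        .scan()
        .with_split_target_size(128 * 1024 * 1024)
        .build()?;

    let mut tasks = scan.plan_tasks().await?;
    while let Some(combined) = tasks.try_next().await? {
        println!(
            "{} task(s) over {} file(s)",
            combined.len(),
            combined.files_count()
        );
    }
    Ok(())
}
```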
+/// A combined scan task that groups multiple [`FileScanTask`]s together.
+///
+/// This is the result of the bin-packing algorithm that combines small file
+/// scan tasks into larger groups for more efficient execution. Each combined
+/// task represents a unit of work that can be processed together.
+///
+/// The grouping considers:
+/// - Target split size: Tasks are combined until reaching the target size
+/// - Open file cost: The overhead of opening each file is factored into sizing
+/// - Lookback: Multiple open bins are maintained for better packing
+#[derive(Debug, Clone)]
+pub struct CombinedScanTask {
+    /// The individual file scan tasks in this combined task
+    tasks: Vec<FileScanTask>,
+    /// Total estimated size in bytes (including open file costs)
+    estimated_size: u64,
+}
+
+impl CombinedScanTask {
+    /// Creates a new empty combined scan task.
+    pub fn new() -> Self {
+        Self {
+            tasks: Vec::new(),
+            estimated_size: 0,
+        }
+    }
+
+    /// Creates a new combined scan task with the given tasks and estimated size.
+    pub fn with_tasks(tasks: Vec<FileScanTask>, estimated_size: u64) -> Self {
+        Self {
+            tasks,
+            estimated_size,
+        }
+    }
+
+    /// Returns the file scan tasks in this combined task.
+    pub fn tasks(&self) -> &[FileScanTask] {
+        &self.tasks
+    }
+
+    /// Consumes this combined task and returns the underlying file scan tasks.
+    pub fn into_tasks(self) -> Vec<FileScanTask> {
+        self.tasks
+    }
+
+    /// Returns the number of file scan tasks in this combined task.
+    pub fn len(&self) -> usize {
+        self.tasks.len()
+    }
+
+    /// Returns true if this combined task has no file scan tasks.
+    pub fn is_empty(&self) -> bool {
+        self.tasks.is_empty()
+    }
+
+    /// Returns the total estimated size in bytes.
+    pub fn estimated_size(&self) -> u64 {
+        self.estimated_size
+    }
+
+    /// Returns the number of unique files referenced by tasks in this combined task.
+    /// Note: Multiple tasks may reference the same file (splits), so this counts
+    /// unique file paths rather than tasks.
+    pub fn files_count(&self) -> usize {
+        use std::collections::HashSet;
+        self.tasks
+            .iter()
+            .map(|t| &t.data_file_path)
+            .collect::<HashSet<_>>()
+            .len()
+    }
+
+    /// Adds a file scan task to this combined task.
+    pub(crate) fn add_task(&mut self, task: FileScanTask, weight: u64) {
+        self.tasks.push(task);
+        self.estimated_size += weight;
+    }
+
+    /// Returns true if adding a task with the given weight would exceed the target size.
+    pub(crate) fn would_exceed(&self, weight: u64, target_size: u64) -> bool {
+        self.estimated_size + weight > target_size
+    }
+}
+
+impl Default for CombinedScanTask {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+/// A stream of [`CombinedScanTask`].
+pub type CombinedScanTaskStream = BoxStream<'static, Result<CombinedScanTask>>;
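Downstream consumers can fan a combined task back out into per-file byte ranges. A minimal sketch using only the accessors defined above (reader wiring elided):

```rust
fn describe(combined: &CombinedScanTask) {
    for task in combined.tasks() {
        // Each FileScanTask covers the half-open byte range [start, start + length).
        println!(
            "{} [{}, {})",
            task.data_file_path,
            task.start,
            task.start + task.length
        );
    }
}
```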