From 90b9484f5544dacbb5c32b501d546656ee136eed Mon Sep 17 00:00:00 2001
From: Frank McSherry <fmcsherry@me.com>
Date: Wed, 6 May 2026 13:27:00 -0400
Subject: [PATCH 1/7] Bespoke columnar MergeBatcher

---
 .../src/columnar/arrangement/mod.rs           |   5 +-
 .../src/columnar/arrangement/trie_merger.rs   | 572 +++++++++---------
 differential-dataflow/src/columnar/batcher.rs | 120 ++++
 differential-dataflow/src/columnar/mod.rs     |   1 +
 4 files changed, 394 insertions(+), 304 deletions(-)
 create mode 100644 differential-dataflow/src/columnar/batcher.rs
diff --git a/differential-dataflow/src/columnar/arrangement/mod.rs b/differential-dataflow/src/columnar/arrangement/mod.rs
index be980fcdd..801ecbe0a 100644
--- a/differential-dataflow/src/columnar/arrangement/mod.rs
+++ b/differential-dataflow/src/columnar/arrangement/mod.rs
@@ -4,7 +4,6 @@
 //!   into DD's trace machinery.
 //! - `Coltainer<C>` wraps a columnar `C::Container` as a DD `BatchContainer`.
 //! - `TrieChunker` strips `RecordedUpdates` down to `UpdatesTyped` for the merge batcher.
-//! - `batcher` contains required trait stubs for `UpdatesTyped`.
 //! - `trie_merger` is the batch-at-a-time merging logic.
 //! - `builder::ValMirror` is the `trace::Builder` that seals melded chunks into
 //!   an `OrdValBatch`.
@@ -21,7 +20,7 @@ pub mod trie_merger;
 /// A trace implementation backed by columnar storage.
 pub type ValSpine<K, V, T, R> = Spine<Rc<OrdValBatch<ColumnarLayout<(K,V,T,R)>>>>;
 /// A batcher for columnar storage.
-pub type ValBatcher<K, V, T, R> = ValBatcher2<(K,V,T,R)>;
+pub type ValBatcher<K, V, T, R> = super::batcher::MergeBatcher<(K,V,T,R)>;
 /// A builder for columnar storage.
 pub type ValBuilder<K, V, T, R> = RcBuilder<builder::ValMirror<(K,V,T,R)>>;
 
@@ -124,8 +123,6 @@ pub mod batch_container {
 
 use super::updates::UpdatesTyped;
 use super::RecordedUpdates;
-use crate::trace::implementations::merge_batcher::MergeBatcher;
-type ValBatcher2<U> = MergeBatcher<RecordedUpdates<U>, TrieChunker<U>, trie_merger::TrieMerger<U>>;
 
 /// A chunker that unwraps `RecordedUpdates` into bare `UpdatesTyped` for the merge batcher.
 ///
diff --git a/differential-dataflow/src/columnar/arrangement/trie_merger.rs b/differential-dataflow/src/columnar/arrangement/trie_merger.rs
index fb88140ca..20d68cef9 100644
--- a/differential-dataflow/src/columnar/arrangement/trie_merger.rs
+++ b/differential-dataflow/src/columnar/arrangement/trie_merger.rs
@@ -1,6 +1,6 @@
 //! Batch-at-a-time merging of sorted, consolidated `UpdatesTyped` chains.
 //!
-//! The core is `TrieMerger::merge_batches`, which walks pairs of chunks via
+//! The core is `merge_batches`, which walks pairs of chunks via
 //! `merge_batch`, building a chain of merged outputs with `ChainBuilder`.
 //! `survey` maps the interleaving of the two inputs at each trie layer,
 //! `write_from_surveys` (via `write_layer` and `write_diffs`) copies the
@@ -8,20 +8,10 @@
 
 use columnar::{Columnar, Len};
 use timely::progress::frontier::{Antichain, AntichainRef};
-use crate::trace::implementations::merge_batcher::Merger;
 
 use super::super::layout::ColumnarUpdate as Update;
 use super::super::updates::UpdatesTyped;
 
-/// Merge-batcher merger that melds sorted, consolidated `UpdatesTyped` tries.
-pub struct TrieMerger<U: Update> {
-    _marker: std::marker::PhantomData<U>,
-}
-
-impl<U: Update> Default for TrieMerger<U> {
-    fn default() -> Self { Self { _marker: std::marker::PhantomData } }
-}
-
 /// A merging iterator over two sorted iterators.
 struct Merging<I1: Iterator, I2: Iterator> {
     iter1: std::iter::Peekable<I1>,
@@ -69,318 +59,300 @@ fn form_chunks<'a, U: Update>(
     }
 }
 
-impl<U: Update> Merger for TrieMerger<U>
+/// Partition `merged` into chunks ready to ship (times not greater than or equal to
+/// any element of `upper`) and chunks kept for future seals (times at-or-after some
+/// element of `upper`). Each kept time is inserted into the `frontier` antichain.
+pub fn extract<U: Update>(
+    mut merged: Vec<UpdatesTyped<U>>,
+    upper: AntichainRef<U::Time>,
+    frontier: &mut Antichain<U::Time>,
+    ship: &mut Vec<UpdatesTyped<U>>,
+    kept: &mut Vec<UpdatesTyped<U>>,
+)
 where
     U::Time: 'static,
 {
-    type Chunk = UpdatesTyped<U>;
-    type Time = U::Time;
-
-    fn merge(
-        &mut self,
-        list1: Vec<UpdatesTyped<U>>,
-        list2: Vec<UpdatesTyped<U>>,
-        output: &mut Vec<UpdatesTyped<U>>,
-        _stash: &mut Vec<UpdatesTyped<U>>,
-    ) {
-        Self::merge_batches(list1, list2, output, _stash);
-    }
-
-    fn extract(
-        &mut self,
-        mut merged: Vec<Self::Chunk>,
-        upper: AntichainRef<Self::Time>,
-        frontier: &mut Antichain<Self::Time>,
-        ship: &mut Vec<Self::Chunk>,
-        kept: &mut Vec<Self::Chunk>,
-        _stash: &mut Vec<Self::Chunk>,
-    ) {
-        use columnar::{Container, ContainerOf, Index, Push};
-        use columnar::primitive::offsets::Strides;
-        use crate::columnar::updates::{Lists, retain_items};
-
-        // TODO: rework to move from trie structure to trie structure.
-        let mut time_owned = U::Time::default();
-        let mut bitmap = Vec::new();    // update should be kept.
-        for chunk in merged.drain(..) {
-            bitmap.clear();
-            let view = chunk.view();
-            let times = view.times.values;
-            for idx in 0 .. times.len() {
-                Columnar::copy_from(&mut time_owned, times.get(idx));
-                if upper.less_equal(&time_owned) {
-                    frontier.insert_ref(&time_owned);
-                    bitmap.push(true);
-                }
-                else { bitmap.push(false); }
+    use columnar::{Container, ContainerOf, Index, Push};
+    use columnar::primitive::offsets::Strides;
+    use crate::columnar::updates::{Lists, retain_items};
+
+    // TODO: rework to move from trie structure to trie structure.
+    let mut time_owned = U::Time::default();
+    let mut bitmap = Vec::new();    // update should be kept.
+    for chunk in merged.drain(..) {
+        bitmap.clear();
+        let view = chunk.view();
+        let times = view.times.values;
+        for idx in 0 .. times.len() {
+            Columnar::copy_from(&mut time_owned, times.get(idx));
+            if upper.less_equal(&time_owned) {
+                frontier.insert_ref(&time_owned);
+                bitmap.push(true);
             }
-            if bitmap.iter().all(|x| *x) { kept.push(chunk); }
-            else if bitmap.iter().all(|x| !*x) { ship.push(chunk); }
-            else {
-
-                let (times, temp) = retain_items::<ContainerOf<U::Time>>(view.times, &bitmap[..]);
-                let (vals, temp) = retain_items::<ContainerOf<U::Val>>(view.vals, &temp[..]);
-                let (keys, _temp) = retain_items::<ContainerOf<U::Key>>(view.keys, &temp[..]);
-                let d_borrow = view.diffs;
-                let mut diffs = <Lists::<ContainerOf<U::Diff>> as Container>::with_capacity_for([d_borrow].into_iter());
-                for (index, bit) in bitmap.iter().enumerate() {
-                    if *bit { diffs.values.push(d_borrow.values.get(index)); }
-                }
-                diffs.bounds = Strides::new(1, times.values.len() as u64);
-                kept.push(UpdatesTyped {
-                    keys,
-                    vals,
-                    times,
-                    diffs,
-                });
-
-                for bit in bitmap.iter_mut() { *bit = !*bit; }
-
-                let (times, temp) = retain_items::<ContainerOf<U::Time>>(view.times, &bitmap[..]);
-                let (vals, temp) = retain_items::<ContainerOf<U::Val>>(view.vals, &temp[..]);
-                let (keys, _temp) = retain_items::<ContainerOf<U::Key>>(view.keys, &temp[..]);
-                let d_borrow = view.diffs;
-                let mut diffs = <Lists::<ContainerOf<U::Diff>> as Container>::with_capacity_for([d_borrow].into_iter());
-                for (index, bit) in bitmap.iter().enumerate() {
-                    if *bit { diffs.values.push(d_borrow.values.get(index)); }
-                }
-                diffs.bounds = Strides::new(1, times.values.len() as u64);
-                ship.push(UpdatesTyped {
-                    keys,
-                    vals,
-                    times,
-                    diffs,
-                });
+            else { bitmap.push(false); }
+        }
+        if bitmap.iter().all(|x| *x) { kept.push(chunk); }
+        else if bitmap.iter().all(|x| !*x) { ship.push(chunk); }
+        else {
+
+            let (times, temp) = retain_items::<ContainerOf<U::Time>>(view.times, &bitmap[..]);
+            let (vals, temp) = retain_items::<ContainerOf<U::Val>>(view.vals, &temp[..]);
+            let (keys, _temp) = retain_items::<ContainerOf<U::Key>>(view.keys, &temp[..]);
+            let d_borrow = view.diffs;
+            let mut diffs = <Lists::<ContainerOf<U::Diff>> as Container>::with_capacity_for([d_borrow].into_iter());
+            for (index, bit) in bitmap.iter().enumerate() {
+                if *bit { diffs.values.push(d_borrow.values.get(index)); }
             }
+            diffs.bounds = Strides::new(1, times.values.len() as u64);
+            kept.push(UpdatesTyped {
+                keys,
+                vals,
+                times,
+                diffs,
+            });
+
+            for bit in bitmap.iter_mut() { *bit = !*bit; }
+
+            let (times, temp) = retain_items::<ContainerOf<U::Time>>(view.times, &bitmap[..]);
+            let (vals, temp) = retain_items::<ContainerOf<U::Val>>(view.vals, &temp[..]);
+            let (keys, _temp) = retain_items::<ContainerOf<U::Key>>(view.keys, &temp[..]);
+            let d_borrow = view.diffs;
+            let mut diffs = <Lists::<ContainerOf<U::Diff>> as Container>::with_capacity_for([d_borrow].into_iter());
+            for (index, bit) in bitmap.iter().enumerate() {
+                if *bit { diffs.values.push(d_borrow.values.get(index)); }
+            }
+            diffs.bounds = Strides::new(1, times.values.len() as u64);
+            ship.push(UpdatesTyped {
+                keys,
+                vals,
+                times,
+                diffs,
+            });
         }
     }
-
-    fn account(chunk: &Self::Chunk) -> (usize, usize, usize, usize) {
-        use timely::Accountable;
-        (chunk.record_count() as usize, 0, 0, 0)
-    }
 }
 
-impl<U: Update> TrieMerger<U>
+/// Iterator-based merge: flatten, merge, consolidate, form.
+/// Correct but slow — used as fallback.
+#[allow(dead_code)]
+fn merge_iterator<U: Update>(
+    list1: &[UpdatesTyped<U>],
+    list2: &[UpdatesTyped<U>],
+    output: &mut Vec<UpdatesTyped<U>>,
+)
 where
     U::Time: 'static,
 {
-    /// Iterator-based merge: flatten, merge, consolidate, form.
-    /// Correct but slow — used as fallback.
-    #[allow(dead_code)]
-    fn merge_iterator(
-        list1: &[UpdatesTyped<U>],
-        list2: &[UpdatesTyped<U>],
-        output: &mut Vec<UpdatesTyped<U>>,
-    ) {
-        let iter1 = list1.iter().flat_map(|chunk| chunk.iter());
-        let iter2 = list2.iter().flat_map(|chunk| chunk.iter());
-
-        let merged = Merging {
-            iter1: iter1.peekable(),
-            iter2: iter2.peekable(),
-        };
-
-        form_chunks::<U>(merged, output);
-    }
+    let iter1 = list1.iter().flat_map(|chunk| chunk.iter());
+    let iter2 = list2.iter().flat_map(|chunk| chunk.iter());
 
-    /// A merge implementation that operates batch-at-a-time.
-    #[inline(never)]
-    fn merge_batches(
-        list1: Vec<UpdatesTyped<U>>,
-        list2: Vec<UpdatesTyped<U>>,
-        output: &mut Vec<UpdatesTyped<U>>,
-        stash: &mut Vec<UpdatesTyped<U>>,
-    ) {
-
-        // The design for efficient "batch" merginging of chains of links is:
-        // 0.   We choose a target link size, K, and will keep the average link size at least K and the max size at 2k.
-        //      K should be large enough to amortize some set-up, but not so large that one or two extra break the bank.
-        // 1.   We will repeatedly consider pairs of links, and fully merge one with a prefix of the other.
-        //      The last elements of each link will tell us which of the two suffixes must be held back.
-        // 2.   We then have a chain of as many links as we started with, with potential defects to correct:
-        //      a.  A link may contain some number of zeros: we can remove them if we are eager, based on size.
-        //      b.  A link may contain more than 2K updates; we can split it.
-        //      c.  Two adjacent links may contain fewer than 2K updates; we can meld (careful append) them.
-        // 3.   After a pass of the above, we should have restored the invariant.
-        //      We can try and me smarter and fuse some of the above work rather than explicitly stage results.
-        //
-        // The challenging moment is the merge that can start with a suffix of one link, involving a prefix of one link.
-        // These could be the same link, different links, and generally there is the potential for complexity here.
-
-        let mut builder = ChainBuilder::default();
-
-        let mut queue1: std::collections::VecDeque<_> = list1.into();
-        let mut queue2: std::collections::VecDeque<_> = list2.into();
-
-        // The first unconsumed update in each block, via (k_idx, v_idx, t_idx), or None if exhausted.
-        // These are (0,0,0) for a new block, and should become None once there are no remaining updates.
-        let mut cursor1 = queue1.pop_front().map(|b| ((0,0,0), b));
-        let mut cursor2 = queue2.pop_front().map(|b| ((0,0,0), b));
-
-        // For each pair of batches
-        while cursor1.is_some() && cursor2.is_some() {
-            Self::merge_batch(&mut cursor1, &mut cursor2, &mut builder, stash);
-            if cursor1.is_none() { cursor1 = queue1.pop_front().map(|b| ((0,0,0), b)); }
-            if cursor2.is_none() { cursor2 = queue2.pop_front().map(|b| ((0,0,0), b)); }
-        }
+    let merged = Merging {
+        iter1: iter1.peekable(),
+        iter2: iter2.peekable(),
+    };
 
-        // TODO: create batch for the non-empty cursor.
-        if let Some(((k,v,t),batch)) = cursor1 {
-            let mut out_batch = stash.pop().unwrap_or_default();
-            let empty: UpdatesTyped<U> = Default::default();
-            let view = batch.view();
-            write_from_surveys(
-                &batch,
-                &empty,
-                &[Report::This(0, 1)],
-                &[Report::This(k, view.keys.values.len())],
-                &[Report::This(v, view.vals.values.len())],
-                &[Report::This(t, view.times.values.len())],
-                &mut out_batch,
-            );
-            builder.push(out_batch);
-        }
-        if let Some(((k,v,t),batch)) = cursor2 {
-            let mut out_batch = stash.pop().unwrap_or_default();
-            let empty: UpdatesTyped<U> = Default::default();
-            let view = batch.view();
-            write_from_surveys(
-                &empty,
-                &batch,
-                &[Report::That(0, 1)],
-                &[Report::That(k, view.keys.values.len())],
-                &[Report::That(v, view.vals.values.len())],
-                &[Report::That(t, view.times.values.len())],
-                &mut out_batch,
-            );
-            builder.push(out_batch);
-        }
+    form_chunks::<U>(merged, output);
+}
+
+/// A merge implementation that operates batch-at-a-time.
+#[inline(never)]
+pub fn merge_batches<U: Update>(
+    list1: Vec<UpdatesTyped<U>>,
+    list2: Vec<UpdatesTyped<U>>,
+    output: &mut Vec<UpdatesTyped<U>>,
+)
+where
+    U::Time: 'static,
+{
 
-        builder.extend(queue1);
-        builder.extend(queue2);
-        *output = builder.done();
-        // TODO: Tidy output to satisfy structural invariants.
+    // The design for efficient "batch" merging of chains of links is:
+    // 0.   We choose a target link size, K, and will keep the average link size at least K and the max size at 2K.
+    //      K should be large enough to amortize some set-up, but not so large that one or two extra break the bank.
+    // 1.   We will repeatedly consider pairs of links, and fully merge one with a prefix of the other.
+    //      The last elements of each link will tell us which of the two suffixes must be held back.
+    // 2.   We then have a chain of as many links as we started with, with potential defects to correct:
+    //      a.  A link may contain some number of zeros: we can remove them if we are eager, based on size.
+    //      b.  A link may contain more than 2K updates; we can split it.
+    //      c.  Two adjacent links may contain fewer than 2K updates; we can meld (careful append) them.
+    // 3.   After a pass of the above, we should have restored the invariant.
+    //      We can try to be smarter and fuse some of the above work rather than explicitly stage results.
+    //
+    // The challenging moment is the merge that can start with a suffix of one link, involving a prefix of one link.
+    // These could be the same link, different links, and generally there is the potential for complexity here.
+
+    let mut builder = ChainBuilder::default();
+
+    let mut queue1: std::collections::VecDeque<_> = list1.into();
+    let mut queue2: std::collections::VecDeque<_> = list2.into();
+
+    // The first unconsumed update in each block, via (k_idx, v_idx, t_idx), or None if exhausted.
+    // These are (0,0,0) for a new block, and should become None once there are no remaining updates.
+    let mut cursor1 = queue1.pop_front().map(|b| ((0,0,0), b));
+    let mut cursor2 = queue2.pop_front().map(|b| ((0,0,0), b));
+
+    // For each pair of batches
+    while cursor1.is_some() && cursor2.is_some() {
+        merge_batch(&mut cursor1, &mut cursor2, &mut builder);
+        if cursor1.is_none() { cursor1 = queue1.pop_front().map(|b| ((0,0,0), b)); }
+        if cursor2.is_none() { cursor2 = queue2.pop_front().map(|b| ((0,0,0), b)); }
     }
 
-    /// Merge two batches, one completely and another through the corresponding prefix.
-    ///
-    /// Each invocation determines the maximum amount of both batches we can merge, determined
-    /// by comparing the elements at the tails of each batch, and locating the lesser in other.
-    /// We will merge the whole of the batch containing the lesser, and the prefix up through
-    /// the lesser element in the other batch, setting the cursor to the first element strictly
-    /// greater than that lesser element.
-    ///
-    /// The algorithm uses a list of `Report` findings to map the interleavings of the layers.
-    /// Each indicates either a range exclusive to one of the inputs, or a one element common
-    /// to the layers from both inputs, which must be further explored. This map would normally
-    /// allow the full merge to happen, but we need to carefully start at each cursor, and end
-    /// just before the first element greater than the lesser bound.
-    ///
-    /// The consumed prefix and disjoint suffix should be single report entries, and it seems
-    /// fine to first produce all reports and then reflect on the cursors, rather than use the
-    /// cursors as part of the mapping.
-    #[inline(never)]
-    fn merge_batch(
-        batch1: &mut Option<((usize, usize, usize), UpdatesTyped<U>)>,
-        batch2: &mut Option<((usize, usize, usize), UpdatesTyped<U>)>,
-        builder: &mut ChainBuilder<U>,
-        stash: &mut Vec<UpdatesTyped<U>>,
-    ) {
-        // TODO: Optimization for one batch exceeding the other.
-
-        let ((k0_idx, v0_idx, t0_idx), updates0) = batch1.take().unwrap();
-        let ((k1_idx, v1_idx, t1_idx), updates1) = batch2.take().unwrap();
-
-        let view0 = updates0.view();
-        let view1 = updates1.view();
-        let keys0 = view0.keys;
-        let keys1 = view1.keys;
-        let vals0 = view0.vals;
-        let vals1 = view1.vals;
-        let times0 = view0.times;
-        let times1 = view1.times;
-
-        // Survey the interleaving of the two inputs.
-        let mut key_survey = survey::<columnar::ContainerOf<U::Key>>(keys0, keys1, &[Report::Both(0,0)]);
-        let mut val_survey = survey::<columnar::ContainerOf<U::Val>>(vals0, vals1, &key_survey);
-        let mut time_survey = survey::<columnar::ContainerOf<U::Time>>(times0, times1, &val_survey);
-
-        // We now know enough to start writing into an output batch.
-        // We should update the input surveys to reflect the subset
-        // of data that we want.
-        //
-        // At most one cursor should be non-zero (assert!).
-        // A non-zero cursor must correspond to the first entry of the surveys,
-        // as there is at least one consumed update that precedes the other batch.
-        // We need to nudge that report forward to align with the cursor, potentially
-        // squeezing the report to nothing (to the upper bound).
-
-        // We start by updating the surveys to reflect the cursors.
-        // If either cursor is set, then its batch has an element strictly less than the other batch.
-        // We therefore expect to find a prefix of This/That at the start of the survey.
-        if (k0_idx, v0_idx, t0_idx) != (0,0,0) {
-            let mut done = false; while !done { if let Report::This(l,u) = &mut key_survey[0] { if *u <= k0_idx { key_survey.remove(0); } else { *l = k0_idx; done = true; } } else { done = true; } }
-            let mut done = false; while !done { if let Report::This(l,u) = &mut val_survey[0] { if *u <= v0_idx { val_survey.remove(0); } else { *l = v0_idx; done = true; } } else { done = true; } }
-            let mut done = false; while !done { if let Report::This(l,u) = &mut time_survey[0] { if *u <= t0_idx { time_survey.remove(0); } else { *l = t0_idx; done = true; } } else { done = true; } }
-        }
+    // TODO: create batch for the non-empty cursor.
+    if let Some(((k,v,t),batch)) = cursor1 {
+        let mut out_batch = UpdatesTyped::<U>::default();
+        let empty: UpdatesTyped<U> = Default::default();
+        let view = batch.view();
+        write_from_surveys(
+            &batch,
+            &empty,
+            &[Report::This(0, 1)],
+            &[Report::This(k, view.keys.values.len())],
+            &[Report::This(v, view.vals.values.len())],
+            &[Report::This(t, view.times.values.len())],
+            &mut out_batch,
+        );
+        builder.push(out_batch);
+    }
+    if let Some(((k,v,t),batch)) = cursor2 {
+        let mut out_batch = UpdatesTyped::<U>::default();
+        let empty: UpdatesTyped<U> = Default::default();
+        let view = batch.view();
+        write_from_surveys(
+            &empty,
+            &batch,
+            &[Report::That(0, 1)],
+            &[Report::That(k, view.keys.values.len())],
+            &[Report::That(v, view.vals.values.len())],
+            &[Report::That(t, view.times.values.len())],
+            &mut out_batch,
+        );
+        builder.push(out_batch);
+    }
 
-        if (k1_idx, v1_idx, t1_idx) != (0,0,0) {
-            let mut done = false; while !done { if let Report::That(l,u) = &mut key_survey[0] { if *u <= k1_idx { key_survey.remove(0); } else { *l = k1_idx; done = true; } } else { done = true; } }
-            let mut done = false; while !done { if let Report::That(l,u) = &mut val_survey[0] { if *u <= v1_idx { val_survey.remove(0); } else { *l = v1_idx; done = true; } } else { done = true; } }
-            let mut done = false; while !done { if let Report::That(l,u) = &mut time_survey[0] { if *u <= t1_idx { time_survey.remove(0); } else { *l = t1_idx; done = true; } } else { done = true; } }
-        }
+    builder.extend(queue1);
+    builder.extend(queue2);
+    *output = builder.done();
+    // TODO: Tidy output to satisfy structural invariants.
+}
 
-        // We want to trim the tails of the surveys to only cover ranges present in both inputs.
-        // We can determine which was "longer" by looking at the last entry of the bottom layer,
-        // which tells us which input (or both) contained the last element.
-        //
-        // From the bottom layer up, we'll identify the index of the last item, and then determine
-        // the index of the list it belongs to. We use that index in the next layer, to locate the
-        // index of the list it belongs to, on upward.
-        let next_cursor = match time_survey.last().unwrap() {
-            Report::This(_,_) => {
-                // Collect the last value indexes known to strictly exceed an entry in the other batch.
-                let mut t = times0.values.len();
-                while let Some(Report::This(l,_)) = time_survey.last() { t = *l; time_survey.pop(); }
-                let mut v = vals0.values.len();
-                while let Some(Report::This(l,_)) = val_survey.last() { v = *l; val_survey.pop(); }
-                let mut k = keys0.values.len();
-                while let Some(Report::This(l,_)) = key_survey.last() { k = *l; key_survey.pop(); }
-                // Now we may need to correct by nudging down.
-                if v == times0.len() || times0.bounds.bounds(v).0 > t { v -= 1; }
-                if k == vals0.len() || vals0.bounds.bounds(k).0 > v { k -= 1; }
-                Some(Ok((k,v,t)))
-            }
-            Report::Both(_,_) => { None }
-            Report::That(_,_) => {
-                // Collect the last value indexes known to strictly exceed an entry in the other batch.
-                let mut t = times1.values.len();
-                while let Some(Report::That(l,_)) = time_survey.last() { t = *l; time_survey.pop(); }
-                let mut v = vals1.values.len();
-                while let Some(Report::That(l,_)) = val_survey.last() { v = *l; val_survey.pop(); }
-                let mut k = keys1.values.len();
-                while let Some(Report::That(l,_)) = key_survey.last() { k = *l; key_survey.pop(); }
-                // Now we may need to correct by nudging down.
-                if v == times1.len() || times1.bounds.bounds(v).0 > t { v -= 1; }
-                if k == vals1.len() || vals1.bounds.bounds(k).0 > v { k -= 1; }
-                Some(Err((k,v,t)))
-            }
-        };
+/// Merge two batches, one completely and another through the corresponding prefix.
+///
+/// Each invocation determines the maximum amount of both batches we can merge, determined
+/// by comparing the elements at the tails of each batch, and locating the lesser in the other.
+/// We will merge the whole of the batch containing the lesser, and the prefix up through
+/// the lesser element in the other batch, setting the cursor to the first element strictly
+/// greater than that lesser element.
+///
+/// The algorithm uses a list of `Report` findings to map the interleavings of the layers.
+/// Each indicates either a range exclusive to one of the inputs, or a single element common
+/// to the layers from both inputs, which must be further explored. This map would normally
+/// allow the full merge to happen, but we need to carefully start at each cursor, and end
+/// just before the first element greater than the lesser bound.
+///
+/// The consumed prefix and disjoint suffix should be single report entries, and it seems
+/// fine to first produce all reports and then reflect on the cursors, rather than use the
+/// cursors as part of the mapping.
+#[inline(never)]
+fn merge_batch<U: Update>(
+    batch1: &mut Option<((usize, usize, usize), UpdatesTyped<U>)>,
+    batch2: &mut Option<((usize, usize, usize), UpdatesTyped<U>)>,
+    builder: &mut ChainBuilder<U>,
+)
+where
+    U::Time: 'static,
+{
+    // TODO: Optimization for one batch exceeding the other.
 
-        // Having updated the surveys, we now copy over the ranges they identify.
-        let mut out_batch = stash.pop().unwrap_or_default();
-        // TODO: We should be able to size `out_batch` pretty accurately from the survey.
-        write_from_surveys(&updates0, &updates1, &[Report::Both(0,0)], &key_survey, &val_survey, &time_survey, &mut out_batch);
-        builder.push(out_batch);
+    let ((k0_idx, v0_idx, t0_idx), updates0) = batch1.take().unwrap();
+    let ((k1_idx, v1_idx, t1_idx), updates1) = batch2.take().unwrap();
 
-        match next_cursor {
-            Some(Ok(kvt)) => { *batch1 = Some((kvt, updates0)); }
-            Some(Err(kvt)) => {*batch2 = Some((kvt, updates1)); }
-            None => { }
-        }
+    let view0 = updates0.view();
+    let view1 = updates1.view();
+    let keys0 = view0.keys;
+    let keys1 = view1.keys;
+    let vals0 = view0.vals;
+    let vals1 = view1.vals;
+    let times0 = view0.times;
+    let times1 = view1.times;
+
+    // Survey the interleaving of the two inputs.
+    let mut key_survey = survey::<columnar::ContainerOf<U::Key>>(keys0, keys1, &[Report::Both(0,0)]);
+    let mut val_survey = survey::<columnar::ContainerOf<U::Val>>(vals0, vals1, &key_survey);
+    let mut time_survey = survey::<columnar::ContainerOf<U::Time>>(times0, times1, &val_survey);
+
+    // We now know enough to start writing into an output batch.
+    // We should update the input surveys to reflect the subset
+    // of data that we want.
+    //
+    // At most one cursor should be non-zero (assert!).
+    // A non-zero cursor must correspond to the first entry of the surveys,
+    // as there is at least one consumed update that precedes the other batch.
+    // We need to nudge that report forward to align with the cursor, potentially
+    // squeezing the report to nothing (to the upper bound).
+
+    // We start by updating the surveys to reflect the cursors.
+    // If either cursor is set, then its batch has an element strictly less than the other batch.
+    // We therefore expect to find a prefix of This/That at the start of the survey.
+    if (k0_idx, v0_idx, t0_idx) != (0,0,0) {
+        let mut done = false; while !done { if let Report::This(l,u) = &mut key_survey[0] { if *u <= k0_idx { key_survey.remove(0); } else { *l = k0_idx; done = true; } } else { done = true; } }
+        let mut done = false; while !done { if let Report::This(l,u) = &mut val_survey[0] { if *u <= v0_idx { val_survey.remove(0); } else { *l = v0_idx; done = true; } } else { done = true; } }
+        let mut done = false; while !done { if let Report::This(l,u) = &mut time_survey[0] { if *u <= t0_idx { time_survey.remove(0); } else { *l = t0_idx; done = true; } } else { done = true; } }
+    }
+
+    if (k1_idx, v1_idx, t1_idx) != (0,0,0) {
+        let mut done = false; while !done { if let Report::That(l,u) = &mut key_survey[0] { if *u <= k1_idx { key_survey.remove(0); } else { *l = k1_idx; done = true; } } else { done = true; } }
+        let mut done = false; while !done { if let Report::That(l,u) = &mut val_survey[0] { if *u <= v1_idx { val_survey.remove(0); } else { *l = v1_idx; done = true; } } else { done = true; } }
+        let mut done = false; while !done { if let Report::That(l,u) = &mut time_survey[0] { if *u <= t1_idx { time_survey.remove(0); } else { *l = t1_idx; done = true; } } else { done = true; } }
     }
 
+    // We want to trim the tails of the surveys to only cover ranges present in both inputs.
+    // We can determine which was "longer" by looking at the last entry of the bottom layer,
+    // which tells us which input (or both) contained the last element.
+    //
+    // From the bottom layer up, we'll identify the index of the last item, and then determine
+    // the index of the list it belongs to. We use that index in the next layer, to locate the
+    // index of the list it belongs to, on upward.
+    let next_cursor = match time_survey.last().unwrap() {
+        Report::This(_,_) => {
+            // Collect the last value indexes known to strictly exceed an entry in the other batch.
+            let mut t = times0.values.len();
+            while let Some(Report::This(l,_)) = time_survey.last() { t = *l; time_survey.pop(); }
+            let mut v = vals0.values.len();
+            while let Some(Report::This(l,_)) = val_survey.last() { v = *l; val_survey.pop(); }
+            let mut k = keys0.values.len();
+            while let Some(Report::This(l,_)) = key_survey.last() { k = *l; key_survey.pop(); }
+            // Now we may need to correct by nudging down.
+            if v == times0.len() || times0.bounds.bounds(v).0 > t { v -= 1; }
+            if k == vals0.len() || vals0.bounds.bounds(k).0 > v { k -= 1; }
+            Some(Ok((k,v,t)))
+        }
+        Report::Both(_,_) => { None }
+        Report::That(_,_) => {
+            // Collect the last value indexes known to strictly exceed an entry in the other batch.
+            let mut t = times1.values.len();
+            while let Some(Report::That(l,_)) = time_survey.last() { t = *l; time_survey.pop(); }
+            let mut v = vals1.values.len();
+            while let Some(Report::That(l,_)) = val_survey.last() { v = *l; val_survey.pop(); }
+            let mut k = keys1.values.len();
+            while let Some(Report::That(l,_)) = key_survey.last() { k = *l; key_survey.pop(); }
+            // Now we may need to correct by nudging down.
+            if v == times1.len() || times1.bounds.bounds(v).0 > t { v -= 1; }
+            if k == vals1.len() || vals1.bounds.bounds(k).0 > v { k -= 1; }
+            Some(Err((k,v,t)))
+        }
+    };
+
+    // Having updated the surveys, we now copy over the ranges they identify.
+    let mut out_batch = UpdatesTyped::<U>::default();
+    // TODO: We should be able to size `out_batch` pretty accurately from the survey.
+    write_from_surveys(&updates0, &updates1, &[Report::Both(0,0)], &key_survey, &val_survey, &time_survey, &mut out_batch);
+    builder.push(out_batch);
+
+    match next_cursor {
+        Some(Ok(kvt)) => { *batch1 = Some((kvt, updates0)); }
+        Some(Err(kvt)) => {*batch2 = Some((kvt, updates1)); }
+        None => { }
+    }
 }
 
 /// Write merged output from four levels of survey reports.
diff --git a/differential-dataflow/src/columnar/batcher.rs b/differential-dataflow/src/columnar/batcher.rs
new file mode 100644
index 000000000..0c622b50a
--- /dev/null
+++ b/differential-dataflow/src/columnar/batcher.rs
@@ -0,0 +1,120 @@
+//! A `Batcher` for `RecordedUpdates<U>` streams that consolidates input via
+//! `TrieChunker` and merges sorted chains via the free functions in `trie_merger`.
+
+use timely::progress::frontier::AntichainRef;
+use timely::progress::{frontier::Antichain, Timestamp};
+use timely::container::{ContainerBuilder, PushInto};
+
+use crate::logging::Logger;
+use crate::trace::{Batcher, Builder, Description};
+
+use super::layout::ColumnarUpdate as Update;
+use super::updates::UpdatesTyped;
+use super::RecordedUpdates;
+use super::arrangement::TrieChunker;
+use super::arrangement::trie_merger;
+
+/// Creates batches from `RecordedUpdates<U>` streams.
+pub struct MergeBatcher<U: Update> {
+    /// Transforms input streams to chunks of sorted, consolidated data.
+    chunker: TrieChunker<U>,
+    /// A sequence of power-of-two length lists of sorted, consolidated containers.
+    chains: Vec<Vec<UpdatesTyped<U>>>,
+    /// Lower frontier for the next batch description: the `upper` of the most recent `seal`.
+    lower: Antichain<U::Time>,
+    /// The lower-bound frontier of the data, after the last call to seal.
+    frontier: Antichain<U::Time>,
+}
+
+impl<U: Update<Time: Timestamp>> Batcher for MergeBatcher<U> {
+    type Input = RecordedUpdates<U>;
+    type Time = U::Time;
+    type Output = UpdatesTyped<U>;
+
+    fn new(_logger: Option<Logger>, _operator_id: usize) -> Self {
+        Self {
+            chunker: TrieChunker::default(),
+            chains: Vec::new(),
+            frontier: Antichain::new(),
+            lower: Antichain::from_elem(U::Time::minimum()),
+        }
+    }
+
+    /// Push a container of data into this merge batcher. Updates the internal chain structure if
+    /// needed.
+    fn push_container(&mut self, container: &mut RecordedUpdates<U>) {
+        self.chunker.push_into(container);
+        while let Some(chunk) = self.chunker.extract() {
+            let chunk = std::mem::take(chunk);
+            self.insert_chain(vec![chunk]);
+        }
+    }
+
+    // Sealing a batch means finding those updates with times not greater or equal to any time
+    // in `upper`. All updates must have time greater or equal to the previously used `upper`,
+    // which we call `lower`, by assumption that after sealing a batcher we receive no more
+    // updates with times not greater or equal to `upper`.
+    fn seal<B: Builder<Input = Self::Output, Time = Self::Time>>(&mut self, upper: Antichain<U::Time>) -> B::Output {
+        // Drain whatever chunks `chunker.finish()` still yields into the chains.
+        while let Some(chunk) = self.chunker.finish() {
+            let chunk = std::mem::take(chunk);
+            self.insert_chain(vec![chunk]);
+        }
+
+        // Merge all remaining chains into a single chain.
+        while self.chains.len() > 1 {
+            let list1 = self.chains.pop().unwrap();
+            let list2 = self.chains.pop().unwrap();
+            let merged = Self::merge_by(list1, list2);
+            self.chains.push(merged);
+        }
+        let merged = self.chains.pop().unwrap_or_default();
+
+        // Split `merged`: `readied` goes into the sealed batch, `kept` returns to `self.chains`.
+        let mut kept = Vec::new();
+        let mut readied = Vec::new();
+        self.frontier.clear();
+
+        trie_merger::extract(merged, upper.borrow(), &mut self.frontier, &mut readied, &mut kept);
+
+        if !kept.is_empty() {
+            self.chains.push(kept);
+        }
+
+        let description = Description::new(self.lower.clone(), upper.clone(), Antichain::from_elem(U::Time::minimum()));
+        let seal = B::seal(&mut readied, description);
+        self.lower = upper;
+        seal
+    }
+
+    /// The frontier of elements remaining after the most recent call to `self.seal`.
+    #[inline]
+    fn frontier(&mut self) -> AntichainRef<'_, U::Time> {
+        self.frontier.borrow()
+    }
+}
+
+impl<U: Update> MergeBatcher<U> {
+    /// Insert a chain and maintain chain properties: Chains are geometrically sized and ordered
+    /// by decreasing length.
+    fn insert_chain(&mut self, chain: Vec<UpdatesTyped<U>>) {
+        if !chain.is_empty() {
+            self.chains.push(chain);
+            while self.chains.len() > 1 && (self.chains[self.chains.len() - 1].len() >= self.chains[self.chains.len() - 2].len() / 2) {
+                let list1 = self.chains.pop().unwrap();
+                let list2 = self.chains.pop().unwrap();
+                let merged = Self::merge_by(list1, list2);
+                self.chains.push(merged);
+            }
+        }
+    }
+
+    /// Merges two sorted input lists into one sorted output list via `trie_merger::merge_batches`.
+    fn merge_by(list1: Vec<UpdatesTyped<U>>, list2: Vec<UpdatesTyped<U>>) -> Vec<UpdatesTyped<U>> {
+        // TODO: `list1` and `list2` get dropped; would be better to reuse?
+        let mut output = Vec::with_capacity(list1.len() + list2.len());
+        trie_merger::merge_batches(list1, list2, &mut output);
+
+        output
+    }
+}
diff --git a/differential-dataflow/src/columnar/mod.rs b/differential-dataflow/src/columnar/mod.rs
index 0a22398b9..e38b4581a 100644
--- a/differential-dataflow/src/columnar/mod.rs
+++ b/differential-dataflow/src/columnar/mod.rs
@@ -36,6 +36,7 @@ pub mod updates;
 pub mod builder;
 pub mod exchange;
 pub mod arrangement;
+pub mod batcher;
 
 pub use updates::UpdatesTyped;
 pub use builder::ValBuilder as ValColBuilder;

From 627e936484b1954cdc5033d38d1319808cc9f9ec Mon Sep 17 00:00:00 2001
From: Frank McSherry <fmcsherry@me.com>
Date: Wed, 6 May 2026 15:35:01 -0400
Subject: [PATCH 2/7] Introduce spill traits

---
 differential-dataflow/src/columnar/mod.rs   |  1 +
 differential-dataflow/src/columnar/spill.rs | 49 +++++++++++++++++++++
 2 files changed, 50 insertions(+)
 create mode 100644 differential-dataflow/src/columnar/spill.rs

diff --git a/differential-dataflow/src/columnar/mod.rs b/differential-dataflow/src/columnar/mod.rs
index e38b4581a..a0a104a18 100644
--- a/differential-dataflow/src/columnar/mod.rs
+++ b/differential-dataflow/src/columnar/mod.rs
@@ -37,6 +37,7 @@ pub mod builder;
 pub mod exchange;
 pub mod arrangement;
 pub mod batcher;
+pub mod spill;
 
 pub use updates::UpdatesTyped;
 pub use builder::ValBuilder as ValColBuilder;
diff --git a/differential-dataflow/src/columnar/spill.rs b/differential-dataflow/src/columnar/spill.rs
new file mode 100644
index 000000000..0b3e8a7e7
--- /dev/null
+++ b/differential-dataflow/src/columnar/spill.rs
@@ -0,0 +1,49 @@
+//! Traits for paging chunks of merge-batcher state to and from backing storage.
+//!
+//! Modeled on timely's pager traits in
+//! `timely-dataflow/communication/src/allocator/zero_copy/spill.rs`
+//! (`SpillPolicy`, `BytesSpill`, `BytesFetch`), but parameterized over a chunk
+//! type `C` rather than fixed to `timely::bytes::arc::Bytes`. For the columnar
+//! batcher we expect `C = UpdatesTyped<U>`; that wiring lives elsewhere — this file
+//! only defines the trait shapes.
+
+use std::collections::VecDeque;
+
+/// A queue entry: either an in-memory chunk or a handle that can fetch one
+/// (or several) from backing storage.
+pub enum Entry<C> {
+    /// In-memory chunk.
+    Typed(C),
+    /// Paged-out chunk(s); fetch via the handle.
+    Paged(Box<dyn Fetch<C>>),
+}
+
+/// Decides which queue entries to spill out and which to keep resident.
+///
+/// Invoked at well-defined moments by the holder of the queue (e.g., after
+/// pushing a new chunk). The implementation may rewrite entries in either
+/// direction: convert `Typed` to `Paged` (spill out) or `Paged` to `Typed`
+/// (fetch back).
+pub trait SpillPolicy<C> {
+    /// Optionally transform the queue.
+    fn apply(&mut self, queue: &mut VecDeque<Entry<C>>);
+}
+
+/// Move in-memory chunks to backing storage, returning fetch handles.
+///
+/// The implementation should drain from `chunks` and push to `handles` as it
+/// goes; on failure it may stop partway, leaving the lists in a consistent
+/// state that will be retried in the future. If it cannot leave the lists in
+/// a consistent state it should panic.
+pub trait Spill<C> {
+    /// Spill `chunks` to storage, producing one fetch handle per spilled group.
+    fn spill(&mut self, chunks: &mut Vec<C>, handles: &mut Vec<Box<dyn Fetch<C>>>);
+}
+
+/// Handle to spilled chunk(s). Consume to retrieve them from storage.
+pub trait Fetch<C> {
+    /// Consume the handle and return the spilled chunks.
+    ///
+    /// On failure, the handle is returned so the caller can retry later.
+    fn fetch(self: Box<Self>) -> Result<Vec<C>, Box<dyn Fetch<C>>>;
+}

From e99c2ec1d878cb08b3b18ed0155dc692a0d35264 Mon Sep 17 00:00:00 2001
From: Frank McSherry <fmcsherry@me.com>
Date: Wed, 6 May 2026 20:59:40 -0400
Subject: [PATCH 3/7] Introduce fetching iteration

---
 differential-dataflow/Cargo.toml              |   1 +
 .../examples/columnar_spill.rs                | 681 ++++++++++++++++++
 .../src/columnar/arrangement/trie_merger.rs   | 116 +--
 differential-dataflow/src/columnar/batcher.rs | 159 +++-
 4 files changed, 892 insertions(+), 65 deletions(-)
 create mode 100644 differential-dataflow/examples/columnar_spill.rs

diff --git a/differential-dataflow/Cargo.toml b/differential-dataflow/Cargo.toml
index 5a42ce9b2..eddb1d88a 100644
--- a/differential-dataflow/Cargo.toml
+++ b/differential-dataflow/Cargo.toml
@@ -25,6 +25,7 @@ itertools="^0.13"
 graph_map = "0.1"
 bytemuck = "1.18.0"
 mimalloc = "0.1.48"
+tempfile = "3"
 
 [dependencies]
 columnar = { workspace = true }
diff --git a/differential-dataflow/examples/columnar_spill.rs b/differential-dataflow/examples/columnar_spill.rs
new file mode 100644
index 000000000..6a59c8d88
--- /dev/null
+++ b/differential-dataflow/examples/columnar_spill.rs
@@ -0,0 +1,681 @@
+//! Example: file-backed spill for the columnar `MergeBatcher`.
+//!
+//! Demonstrates `Spill` / `Fetch` / `SpillPolicy` impls modeled on TD's
+//! `communication/examples/spill_stress.rs`. Spills `UpdatesTyped<U>` chunks
+//! to a tempfile via per-column `Stash::write_bytes`, fetches them back via
+//! `Stash::try_from_bytes` and `Updates::into_typed`.
+//!
+//! Run with: `cargo run --example columnar_spill`
+
+use std::io::{Read, Seek, SeekFrom, Write};
+use std::marker::PhantomData;
+use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
+use std::sync::{Arc, Mutex, OnceLock};
+
+use mimalloc::MiMalloc;
+
+#[global_allocator]
+static GLOBAL: MiMalloc = MiMalloc;
+
+// Static spill-policy config, read by `SpillBatcher::new` when `arrange_core`
+// constructs each worker's batcher (we can't pass parameters through the
+// `Batcher::new(logger, op_id)` constructor).
+static ENABLE_SPILL: AtomicBool = AtomicBool::new(true);
+static HEAD: AtomicUsize = AtomicUsize::new(10_000_000);
+static THRESH: AtomicUsize = AtomicUsize::new(50_000_000);
+
+/// Cross-worker registry of `Threshold` stats so we can sum them after a run.
+static SHARED_STATS: OnceLock<Mutex<Vec<Arc<ThresholdStats>>>> = OnceLock::new();
+
+fn register_stats(stats: Arc<ThresholdStats>) {
+    SHARED_STATS
+        .get_or_init(|| Mutex::new(Vec::new()))
+        .lock()
+        .unwrap()
+        .push(stats);
+}
+
+fn collect_stats() -> (usize, usize) {
+    if let Some(m) = SHARED_STATS.get() {
+        let v = m.lock().unwrap();
+        let fires: usize = v.iter().map(|s| s.fires.load(Ordering::Relaxed)).sum();
+        let chunks: usize = v.iter().map(|s| s.chunks_spilled.load(Ordering::Relaxed)).sum();
+        (fires, chunks)
+    } else {
+        (0, 0)
+    }
+}
+
+fn reset_stats() {
+    if let Some(m) = SHARED_STATS.get() {
+        m.lock().unwrap().clear();
+    }
+}
+
+use columnar::Push;
+use columnar::bytes::stash::Stash;
+
+use differential_dataflow::columnar::{RecordedUpdates, ValBuilder, ValColBuilder, ValSpine};
+use differential_dataflow::columnar::batcher::MergeBatcher;
+use differential_dataflow::columnar::layout::ColumnarUpdate as Update;
+use differential_dataflow::columnar::spill::{Entry, Fetch, Spill, SpillPolicy};
+use differential_dataflow::columnar::updates::{Updates, UpdatesTyped};
+use differential_dataflow::logging::Logger;
+use differential_dataflow::operators::arrange::arrangement::arrange_core;
+use differential_dataflow::trace::{Batcher, Builder};
+use timely::dataflow::channels::pact::Pipeline;
+use timely::dataflow::operators::probe::{Handle as ProbeHandle, Probe};
+use timely::dataflow::operators::Input;
+use timely::dataflow::InputHandle;
+use timely::progress::frontier::AntichainRef;
+use timely::progress::{frontier::Antichain, Timestamp};
+
+/// File-backed `Spill`. Serializes each chunk into a reusable `Vec<u8>` and
+/// writes it with one seek plus one `write_all` per chunk, rather than one
+/// write per column.
+pub struct FileSpill<U: Update> {
+    file: Arc<Mutex<std::fs::File>>,
+    /// Cumulative byte offset for the next write.
+    offset: u64,
+    /// Reusable serialization buffer; grows to fit the largest chunk seen,
+    /// then sticks at that capacity (no per-chunk allocation).
+    buf: Vec<u8>,
+    _marker: PhantomData<U>,
+}
+
+impl<U: Update> FileSpill<U> {
+    pub fn new() -> std::io::Result<Self> {
+        let file = tempfile::tempfile()?;
+        Ok(Self {
+            file: Arc::new(Mutex::new(file)),
+            offset: 0,
+            buf: Vec::new(),
+            _marker: PhantomData,
+        })
+    }
+}
+
+impl<U: Update + 'static> Spill<UpdatesTyped<U>> for FileSpill<U> {
+    fn spill(
+        &mut self,
+        chunks: &mut Vec<UpdatesTyped<U>>,
+        handles: &mut Vec<Box<dyn Fetch<UpdatesTyped<U>>>>,
+    ) {
+        while let Some(chunk) = chunks.pop() {
+            let updates: Updates<U, Vec<u8>> = chunk.into();
+            let keys_len = updates.keys.length_in_bytes() as u64;
+            let vals_len = updates.vals.length_in_bytes() as u64;
+            let times_len = updates.times.length_in_bytes() as u64;
+            let diffs_len = updates.diffs.length_in_bytes() as u64;
+            let total = 32 + keys_len + vals_len + times_len + diffs_len;
+
+            // Serialize the whole chunk (header + four columns) into the
+            // reusable buffer, then issue a single write_all to the file.
+            self.buf.clear();
+            self.buf.extend_from_slice(&keys_len.to_le_bytes());
+            self.buf.extend_from_slice(&vals_len.to_le_bytes());
+            self.buf.extend_from_slice(&times_len.to_le_bytes());
+            self.buf.extend_from_slice(&diffs_len.to_le_bytes());
+            updates.keys.write_bytes(&mut self.buf).unwrap();
+            updates.vals.write_bytes(&mut self.buf).unwrap();
+            updates.times.write_bytes(&mut self.buf).unwrap();
+            updates.diffs.write_bytes(&mut self.buf).unwrap();
+            debug_assert_eq!(self.buf.len() as u64, total);
+
+            let start = self.offset;
+            let mut file = self.file.lock().unwrap();
+            file.seek(SeekFrom::Start(start)).unwrap();
+            file.write_all(&self.buf).unwrap();
+            drop(file);
+            self.offset += total;
+
+            handles.push(Box::new(FileFetch::<U> {
+                file: self.file.clone(),
+                offset: start,
+                _marker: PhantomData,
+            }));
+        }
+    }
+}
+
+/// Per-chunk fetch handle. Reads a 32-byte header (four column lengths) at the
+/// recorded offset, then four `Stash::try_from_bytes` payloads.
+pub struct FileFetch<U: Update> {
+    file: Arc<Mutex<std::fs::File>>,
+    offset: u64,
+    _marker: PhantomData<U>,
+}
+
+impl<U: Update + 'static> Fetch<UpdatesTyped<U>> for FileFetch<U> {
+    fn fetch(self: Box<Self>) -> Result<Vec<UpdatesTyped<U>>, Box<dyn Fetch<UpdatesTyped<U>>>> {
+        let mut file = self.file.lock().unwrap();
+        file.seek(SeekFrom::Start(self.offset)).unwrap();
+        let mut header = [0u8; 32];
+        file.read_exact(&mut header).unwrap();
+        let keys_len = u64::from_le_bytes(header[0..8].try_into().unwrap()) as usize;
+        let vals_len = u64::from_le_bytes(header[8..16].try_into().unwrap()) as usize;
+        let times_len = u64::from_le_bytes(header[16..24].try_into().unwrap()) as usize;
+        let diffs_len = u64::from_le_bytes(header[24..32].try_into().unwrap()) as usize;
+
+        let mut keys_bytes = vec![0u8; keys_len];
+        file.read_exact(&mut keys_bytes).unwrap();
+        let mut vals_bytes = vec![0u8; vals_len];
+        file.read_exact(&mut vals_bytes).unwrap();
+        let mut times_bytes = vec![0u8; times_len];
+        file.read_exact(&mut times_bytes).unwrap();
+        let mut diffs_bytes = vec![0u8; diffs_len];
+        file.read_exact(&mut diffs_bytes).unwrap();
+        drop(file);
+
+        let keys = Stash::try_from_bytes(keys_bytes).unwrap();
+        let vals = Stash::try_from_bytes(vals_bytes).unwrap();
+        let times = Stash::try_from_bytes(times_bytes).unwrap();
+        let diffs = Stash::try_from_bytes(diffs_bytes).unwrap();
+        let updates: Updates<U, Vec<u8>> = Updates { keys, vals, times, diffs };
+        Ok(vec![updates.into_typed()])
+    }
+}
+
+/// Trivial `SpillPolicy`: page out every `Typed` entry on each apply.
+/// Useful for direct queue exercise; not intended as a real policy.
+pub struct SpillEverything<U: Update> {
+    spill: FileSpill<U>,
+}
+
+impl<U: Update + 'static> SpillPolicy<UpdatesTyped<U>> for SpillEverything<U> {
+    fn apply(&mut self, queue: &mut std::collections::VecDeque<Entry<UpdatesTyped<U>>>) {
+        let mut new_queue = std::collections::VecDeque::with_capacity(queue.len());
+        let mut buf = Vec::new();
+        let mut handles: Vec<Box<dyn Fetch<UpdatesTyped<U>>>> = Vec::new();
+        for entry in queue.drain(..) {
+            match entry {
+                Entry::Typed(c) => {
+                    buf.push(c);
+                    self.spill.spill(&mut buf, &mut handles);
+                    let handle = handles.pop().expect("FileSpill produces a handle per chunk");
+                    new_queue.push_back(Entry::Paged(handle));
+                }
+                Entry::Paged(h) => new_queue.push_back(Entry::Paged(h)),
+            }
+        }
+        *queue = new_queue;
+    }
+}
+
+/// Threshold-based spill policy adapted from timely's
+/// `communication::allocator::zero_copy::spill::threshold::Threshold`.
+///
+/// Counts records (not bytes) for the threshold check. When the queue's
+/// resident records exceed `head_reserve_records + threshold_records`, spill
+/// chunks past the head reserve. Unlike TD we don't carve out the last
+/// entry — TD's last entry is a `try_merge` target being extended in place;
+/// our chunks are all finished, so any of them can be spilled.
+pub struct Threshold<U: Update> {
+    spill: FileSpill<U>,
+    /// Records near the head of the queue stay resident.
+    pub head_reserve_records: usize,
+    /// Spillable surplus: trigger when resident exceeds head + threshold.
+    pub threshold_records: usize,
+    /// Counters shared with the caller (chunks_spilled, fires).
+    pub stats: Arc<ThresholdStats>,
+}
+
+#[derive(Default)]
+pub struct ThresholdStats {
+    pub fires: AtomicUsize,
+    pub chunks_spilled: AtomicUsize,
+}
+
+impl<U: Update> Threshold<U> {
+    pub fn new(spill: FileSpill<U>, head_reserve_records: usize, threshold_records: usize) -> Self {
+        Self {
+            spill,
+            head_reserve_records,
+            threshold_records,
+            stats: Arc::new(ThresholdStats::default()),
+        }
+    }
+}
+
+impl<U: Update + 'static> SpillPolicy<UpdatesTyped<U>> for Threshold<U> {
+    fn apply(&mut self, queue: &mut std::collections::VecDeque<Entry<UpdatesTyped<U>>>) {
+        let resident: usize = queue.iter().map(|e| match e {
+            Entry::Typed(c) => c.len(),
+            Entry::Paged(_) => 0,
+        }).sum();
+        if resident <= self.head_reserve_records + self.threshold_records {
+            return;
+        }
+
+        // Walk the queue front to back, counting resident records. A Typed entry is
+        // marked for spill once the records ahead of it already fill the head reserve.
+        let mut cumulative = 0usize;
+        let mut target_indices: Vec<usize> = Vec::new();
+        for (i, entry) in queue.iter().enumerate() {
+            if let Entry::Typed(c) = entry {
+                if cumulative >= self.head_reserve_records {
+                    target_indices.push(i);
+                }
+                cumulative += c.len();
+            }
+        }
+        if target_indices.is_empty() { return; }
+
+        // Take the targeted chunks out, leaving empty placeholders we overwrite below.
+        let mut targets: Vec<UpdatesTyped<U>> = Vec::with_capacity(target_indices.len());
+        for &i in &target_indices {
+            if let Entry::Typed(c) = &mut queue[i] {
+                targets.push(std::mem::take(c));
+            }
+        }
+
+        let mut handles: Vec<Box<dyn Fetch<UpdatesTyped<U>>>> = Vec::new();
+        self.spill.spill(&mut targets, &mut handles);
+        // FileSpill drains via pop (LIFO); reverse so handles align with target_indices order.
+        handles.reverse();
+        assert_eq!(target_indices.len(), handles.len());
+        self.stats.fires.fetch_add(1, Ordering::Relaxed);
+        self.stats.chunks_spilled.fetch_add(handles.len(), Ordering::Relaxed);
+        for (i, handle) in target_indices.into_iter().zip(handles) {
+            queue[i] = Entry::Paged(handle);
+        }
+    }
+}
+
+/// `Batcher` wrapper that installs a `Threshold` policy on a `MergeBatcher`
+/// at construction time, reading config from `HEAD` / `THRESH` / `ENABLE_SPILL`
+/// statics. Slots into `arrange_core` in place of `ValBatcher` and lets the
+/// timely operator drive a spilling merger without surgery to the `Batcher`
+/// trait signature.
+pub struct SpillBatcher<K, V, T, R>(MergeBatcher<(K, V, T, R)>)
+where
+    (K, V, T, R): Update;
+
+impl<K, V, T, R> Batcher for SpillBatcher<K, V, T, R>
+where
+    K: columnar::Columnar + 'static,
+    V: columnar::Columnar + 'static,
+    T: columnar::Columnar + Timestamp + 'static,
+    R: columnar::Columnar + 'static,
+    (K, V, T, R): Update<Time = T> + 'static,
+{
+    type Input = RecordedUpdates<(K, V, T, R)>;
+    type Time = T;
+    type Output = UpdatesTyped<(K, V, T, R)>;
+
+    fn new(logger: Option<Logger>, operator_id: usize) -> Self {
+        let mut inner = <MergeBatcher<(K, V, T, R)> as Batcher>::new(logger, operator_id);
+        if ENABLE_SPILL.load(Ordering::Relaxed) {
+            let head = HEAD.load(Ordering::Relaxed);
+            let thresh = THRESH.load(Ordering::Relaxed);
+            let policy = Threshold::<(K, V, T, R)>::new(
+                FileSpill::new().expect("tempfile"),
+                head,
+                thresh,
+            );
+            register_stats(policy.stats.clone());
+            inner.set_spill_policy(Box::new(policy));
+        }
+        Self(inner)
+    }
+
+    fn push_container(&mut self, container: &mut Self::Input) {
+        self.0.push_container(container);
+    }
+
+    fn seal<B: Builder<Input = Self::Output, Time = Self::Time>>(
+        &mut self,
+        upper: Antichain<T>,
+    ) -> B::Output {
+        self.0.seal::<B>(upper)
+    }
+
+    fn frontier(&mut self) -> AntichainRef<'_, T> {
+        self.0.frontier()
+    }
+}
+
+type TestUpdate = (u64, u64, u64, i64);
+
+fn make_chunk(updates: &[(u64, u64, u64, i64)]) -> UpdatesTyped<TestUpdate> {
+    let mut out = UpdatesTyped::<TestUpdate>::default();
+    for (k, v, t, d) in updates {
+        out.push((k, v, t, d));
+    }
+    out.consolidate()
+}
+
+fn collect(chunk: &UpdatesTyped<TestUpdate>) -> Vec<(u64, u64, u64, i64)> {
+    chunk.iter().map(|(k, v, t, d)| (*k, *v, *t, *d)).collect()
+}
+
+fn main() {
+    // Build a few synthetic chunks.
+    let chunk_a = make_chunk(&[
+        (1, 10, 100, 1),
+        (1, 10, 200, 2),
+        (1, 20, 100, 3),
+        (2, 20, 200, 5),
+    ]);
+    let chunk_b = make_chunk(&[
+        (3, 30, 100, 7),
+        (3, 30, 200, -7),
+        (4, 40, 100, 11),
+    ]);
+    let chunk_c = make_chunk(&[
+        (5, 50, 100, 1),
+    ]);
+
+    let originals = vec![chunk_a, chunk_b, chunk_c];
+    let expected: Vec<Vec<_>> = originals.iter().map(collect).collect();
+
+    // Direct Spill/Fetch roundtrip.
+    {
+        let mut spill = FileSpill::<TestUpdate>::new().unwrap();
+        let mut chunks = originals.clone();
+        let mut handles: Vec<Box<dyn Fetch<UpdatesTyped<TestUpdate>>>> = Vec::new();
+        spill.spill(&mut chunks, &mut handles);
+        assert!(chunks.is_empty(), "spill should drain chunks");
+        assert_eq!(handles.len(), expected.len(), "expected one handle per chunk");
+
+        // Spill drains in pop order (LIFO); reverse to align with original order.
+        handles.reverse();
+
+        for (i, handle) in handles.into_iter().enumerate() {
+            let fetched = handle.fetch().unwrap_or_else(|_| panic!("fetch should succeed"));
+            assert_eq!(fetched.len(), 1, "FileFetch returns one chunk per handle");
+            let got = collect(&fetched[0]);
+            assert_eq!(got, expected[i], "chunk {} mismatch after roundtrip", i);
+        }
+        println!("ok: direct Spill+Fetch roundtripped {} chunks", expected.len());
+    }
+
+    // SpillPolicy roundtrip via a queue: every Typed becomes Paged, then we
+    // fetch each one back and compare.
+    {
+        let mut policy = SpillEverything {
+            spill: FileSpill::<TestUpdate>::new().unwrap(),
+        };
+        let mut queue: std::collections::VecDeque<Entry<UpdatesTyped<TestUpdate>>> =
+            originals.iter().cloned().map(Entry::Typed).collect();
+        policy.apply(&mut queue);
+
+        // Every entry should now be Paged, in original order.
+        assert_eq!(queue.len(), expected.len());
+        for (i, entry) in queue.into_iter().enumerate() {
+            match entry {
+                Entry::Paged(handle) => {
+                    let fetched = handle.fetch().unwrap_or_else(|_| panic!("fetch should succeed"));
+                    assert_eq!(fetched.len(), 1);
+                    assert_eq!(collect(&fetched[0]), expected[i], "queue position {}", i);
+                }
+                Entry::Typed(_) => panic!("SpillEverything should leave nothing typed"),
+            }
+        }
+        println!("ok: SpillEverything paged & retrieved {} chunks in order", expected.len());
+    }
+
+    // End-to-end demo: a real timely dataflow.
+    //
+    // Each worker generates its share of the cancellation workload (positives
+    // then negatives) and feeds them into an `arrange_core` whose batcher is
+    // our `SpillBatcher`. With multiple workers we get parallel mergers, each
+    // with its own `Threshold` policy and tempfile.
+    {
+        let cfg = match parse_args() {
+            Some(cfg) => cfg,
+            None => return,
+        };
+
+        let total_records = (cfg.times * cfg.keys_per_time) as usize * 2;
+        let bytes_per_record = std::mem::size_of::<TestUpdate>();
+        let raw_gb = (total_records * bytes_per_record) as f64 / (1u64 << 30) as f64;
+        let per_worker_head = cfg.head / cfg.workers.max(1);
+        let per_worker_thresh = cfg.thresh / cfg.workers.max(1);
+        println!(
+            "config: times={} keys={} workers={} head={} ({} per worker) thresh={} ({} per worker) mode={:?} sample_secs={}",
+            cfg.times, cfg.keys_per_time, cfg.workers,
+            cfg.head, per_worker_head,
+            cfg.thresh, per_worker_thresh,
+            cfg.mode, cfg.sample_secs,
+        );
+        println!(
+            "workload: {} records ({:.2} GB raw, {} bytes/record)",
+            total_records, raw_gb, bytes_per_record,
+        );
+
+        if cfg.mode != Mode::Baseline {
+            ENABLE_SPILL.store(true, Ordering::Relaxed);
+            // Divide head/threshold across workers so the user-supplied values
+            // are a global budget, not per-worker. With N workers, each gets
+            // 1/N of the budget; total resident across workers is bounded by
+            // the configured head + threshold.
+            let per_worker_head = cfg.head / cfg.workers.max(1);
+            let per_worker_thresh = cfg.thresh / cfg.workers.max(1);
+            HEAD.store(per_worker_head, Ordering::Relaxed);
+            THRESH.store(per_worker_thresh, Ordering::Relaxed);
+            reset_stats();
+            let elapsed = run_timely_dataflow(cfg.times, cfg.keys_per_time, cfg.workers, cfg.sample_secs, "spill");
+            let (fires, chunks) = collect_stats();
+            println!(
+                "spill: {:.2}s | {:.2} M records/s | {:.2} GB/s | threshold fired {} times, spilled {} chunks",
+                elapsed.as_secs_f64(),
+                total_records as f64 / elapsed.as_secs_f64() / 1e6,
+                raw_gb / elapsed.as_secs_f64(),
+                fires, chunks,
+            );
+        }
+
+        if cfg.mode != Mode::Spill {
+            ENABLE_SPILL.store(false, Ordering::Relaxed);
+            reset_stats();
+            let elapsed = run_timely_dataflow(cfg.times, cfg.keys_per_time, cfg.workers, cfg.sample_secs, "baseline");
+            println!(
+                "baseline: {:.2}s | {:.2} M records/s | {:.2} GB/s",
+                elapsed.as_secs_f64(),
+                total_records as f64 / elapsed.as_secs_f64() / 1e6,
+                raw_gb / elapsed.as_secs_f64(),
+            );
+        }
+    }
+}
+
+/// Run a single timely dataflow with `workers` worker threads. For each sign
+/// (+1, then -1) each worker generates about `keys_per_time / workers` records
+/// per timestamp, feeds them to an `arrange_core` over our `SpillBatcher`, then
+/// advances the input and waits for the probe. Returns elapsed wall time.
+fn run_timely_dataflow(
+    times: u64,
+    keys_per_time: u64,
+    workers: usize,
+    sample_secs: u64,
+    label: &str,
+) -> std::time::Duration {
+    let stop = Arc::new(AtomicBool::new(false));
+
+    // RSS sampler thread.
+    let stop_clone = stop.clone();
+    let label_owned = label.to_string();
+    let sampler = if sample_secs > 0 {
+        Some(std::thread::spawn(move || {
+            let start = std::time::Instant::now();
+            while !stop_clone.load(Ordering::Relaxed) {
+                if let Some(rss) = rss_kb() {
+                    println!(
+                        "  [{}] +{:>5.0}s   RSS {:>9} kB",
+                        label_owned,
+                        start.elapsed().as_secs_f64(),
+                        rss
+                    );
+                }
+                std::thread::sleep(std::time::Duration::from_secs(sample_secs));
+            }
+        }))
+    } else {
+        None
+    };
+
+    let timer = std::time::Instant::now();
+
+    timely::execute(timely::Config::process(workers), move |worker| {
+        let index = worker.index();
+        let peers = worker.peers();
+
+        let mut input = <InputHandle<u64, ValColBuilder<TestUpdate>>>::new_with_builder();
+        let mut probe: ProbeHandle<u64> = ProbeHandle::new();
+
+        worker.dataflow::<u64, _, _>(|scope| {
+            let stream = scope.input_from(&mut input);
+            let arranged = arrange_core::<
+                _,
+                SpillBatcher<u64, u64, u64, i64>,
+                ValBuilder<u64, u64, u64, i64>,
+                ValSpine<u64, u64, u64, i64>,
+            >(stream, Pipeline, "ColumnarSpillArrange");
+            arranged.stream.probe_with(&mut probe);
+        });
+
+        // Push positives and negatives at the same timely time so they land
+        // in the batcher together — the merger cancels (k,v,t,+1) against
+        // (k,v,t,-1) during its own merges, instead of producing two giant
+        // sealed batches that cancel only at the spine.
+        //
+        // Keys are mixed through a deterministic bijection so the post-sort
+        // columnar bytes look ~random (otherwise the workload is sequential
+        // u64s plus repeated v/t/r, which macOS' page compressor crushes,
+        // making the baseline-vs-spill comparison unfairly favor baseline).
+        // `mix` is reversible and identical across phases, so cancellation
+        // still pairs positives with negatives.
+        fn mix(k: u64) -> u64 {
+            let x = k.wrapping_mul(0x9E3779B97F4A7C15);
+            x ^ (x >> 32)
+        }
+
+        // Step periodically so the input handle's internal buffer doesn't
+        // pile up unbounded. Diffs are derived from the same `mix` so they
+        // also look random per record (incompressible) but pair-wise negate
+        // exactly across phases.
+        const STEP_EVERY: usize = 1 << 16;
+        let mut sent_since_step = 0usize;
+        for sign in [1i64, -1] {
+            for t in 0..times {
+                let mut k = index as u64;
+                while k < keys_per_time {
+                    let kh = mix(k);
+                    // Half-range, always odd — nonzero and `-d` never overflows.
+                    let d = ((kh as i64) >> 1) | 1;
+                    input.send((kh, kh & 0x3, t, sign * d));
+                    k += peers as u64;
+                    sent_since_step += 1;
+                    if sent_since_step >= STEP_EVERY {
+                        worker.step();
+                        sent_since_step = 0;
+                    }
+                }
+            }
+        }
+        input.advance_to(1);
+        input.flush();
+
+        while probe.less_than(input.time()) {
+            worker.step();
+        }
+    })
+    .expect("timely::execute failed");
+
+    let elapsed = timer.elapsed();
+    stop.store(true, Ordering::Relaxed);
+    if let Some(s) = sampler {
+        let _ = s.join();
+    }
+    elapsed
+}
+
+#[derive(Debug, PartialEq)]
+enum Mode { Both, Spill, Baseline }
+
+struct Config {
+    times: u64,
+    keys_per_time: u64,
+    head: usize,
+    thresh: usize,
+    workers: usize,
+    sample_secs: u64,
+    mode: Mode,
+}
+
+/// Build the run `Config` from the command line, starting from the defaults
+/// below. Returns `None` after `-h`/`--help`; malformed input panics.
+fn parse_args() -> Option<Config> {
+    fn take(it: &mut dyn Iterator<Item = String>, name: &str) -> String {
+        it.next().unwrap_or_else(|| { print_usage(); panic!("--{} requires a value", name) })
+    }
+    let mut cfg = Config {
+        times: 8,
+        keys_per_time: 500_000,
+        head: 10_000_000,
+        thresh: 50_000_000,
+        workers: 1,
+        sample_secs: 0,
+        mode: Mode::Both,
+    };
+    let mut argv = std::env::args().skip(1);
+    while let Some(flag) = argv.next() {
+        match flag.as_str() {
+            "-h" | "--help" => { print_usage(); return None; }
+            "--times" => cfg.times = take(&mut argv, "times").parse().expect("times: integer"),
+            "--keys" => cfg.keys_per_time = take(&mut argv, "keys").parse().expect("keys: integer"),
+            "--head" => cfg.head = take(&mut argv, "head").parse().expect("head: integer"),
+            "--thresh" => cfg.thresh = take(&mut argv, "thresh").parse().expect("thresh: integer"),
+            "--workers" => cfg.workers = take(&mut argv, "workers").parse().expect("workers: integer"),
+            "--sample-secs" => cfg.sample_secs = take(&mut argv, "sample-secs").parse().expect("sample-secs: integer"),
+            "--mode" => cfg.mode = match take(&mut argv, "mode").as_str() {
+                "both" => Mode::Both,
+                "spill" => Mode::Spill,
+                "baseline" => Mode::Baseline,
+                other => { print_usage(); panic!("unknown mode: {}", other); }
+            },
+            other => { print_usage(); panic!("unknown arg: {}", other); }
+        }
+    }
+    Some(cfg)
+}
+
+/// Write the command-line help text to stderr, one `eprintln!` per entry of
+/// the table below. An empty entry produces a blank separator line, so the
+/// output is line-for-line what separate `eprintln!` calls would print.
+fn print_usage() {
+    for line in [
+        "Usage: columnar_spill [OPTIONS]", "",
+        "  --times N            distinct data timestamps         (default 8)",
+        "  --keys N             keys per timestamp               (default 500000)",
+        "  --head N             total head_reserve_records (split across workers)  (default 10000000)",
+        "  --thresh N           total threshold_records    (split across workers)  (default 50000000)",
+        "  --workers N          timely worker threads            (default 1)",
+        "  --sample-secs N      print RSS every N seconds        (default 0 = off)",
+        "  --mode MODE          spill | baseline | both          (default both)", "",
+        "Total records pushed = 2 * times * keys (positives + negatives that cancel).",
+        "Records are partitioned across workers by k % workers.", "",
+        "Examples:",
+        "  # default — 8M records, both runs, 1 worker",
+        "  columnar_spill", "",
+        "  # 100 GB spill-only on 4 workers, RSS every 30s",
+        "  columnar_spill --mode spill --workers 4 --times 64 --keys 24000000 \\",
+        "                 --head 10000000 --thresh 50000000 --sample-secs 30", "",
+        "  # baseline only (no spill installed)",
+        "  columnar_spill --mode baseline",
+    ] { eprintln!("{}", line); }
+}
+
+/// Current resident set size of this process in kB, obtained by running
+/// `ps -o rss= -p <pid>` rather than pulling in a dependency. Yields `None`
+/// if `ps` cannot be run or its output is not UTF-8 holding a single integer.
+fn rss_kb() -> Option<usize> {
+    let pid_arg = std::process::id().to_string();
+    let ps = std::process::Command::new("ps")
+        .args(["-o", "rss=", "-p", pid_arg.as_str()])
+        .output()
+        .ok()?;
+    let text = std::str::from_utf8(&ps.stdout).ok()?;
+    text.trim().parse().ok()
+}
diff --git a/differential-dataflow/src/columnar/arrangement/trie_merger.rs b/differential-dataflow/src/columnar/arrangement/trie_merger.rs
index 20d68cef9..f01de7529 100644
--- a/differential-dataflow/src/columnar/arrangement/trie_merger.rs
+++ b/differential-dataflow/src/columnar/arrangement/trie_merger.rs
@@ -60,17 +60,23 @@ fn form_chunks<'a, U: Update>(
 }
 
 /// Partition `merged` into chunks ready to ship (times strictly less than `upper`)
-/// and chunks kept for future seals (times at-or-after `upper`). Updates `frontier`
-/// to the antichain of kept times.
-pub fn extract<U: Update>(
-    mut merged: Vec<UpdatesTyped<U>>,
+/// and chunks kept for future seals (times at-or-after `upper`), updating
+/// `frontier` to the antichain of kept times. `merged` is consumed lazily,
+/// and outputs flow through `ship` / `kept` sinks so the caller can spill or
+/// forward as chunks are produced rather than buffering them.
+pub fn extract<U, I, FShip, FKept>(
+    merged: I,
     upper: AntichainRef<U::Time>,
     frontier: &mut Antichain<U::Time>,
-    ship: &mut Vec<UpdatesTyped<U>>,
-    kept: &mut Vec<UpdatesTyped<U>>,
+    mut ship: FShip,
+    mut kept: FKept,
 )
 where
+    U: Update,
     U::Time: 'static,
+    I: IntoIterator<Item = UpdatesTyped<U>>,
+    FShip: FnMut(UpdatesTyped<U>),
+    FKept: FnMut(UpdatesTyped<U>),
 {
     use columnar::{Container, ContainerOf, Index, Push};
     use columnar::primitive::offsets::Strides;
@@ -79,7 +85,7 @@ where
     // TODO: rework to move from trie structure to trie structure.
     let mut time_owned = U::Time::default();
     let mut bitmap = Vec::new();    // update should be kept.
-    for chunk in merged.drain(..) {
+    for chunk in merged {
         bitmap.clear();
         let view = chunk.view();
         let times = view.times.values;
@@ -91,8 +97,8 @@ where
             }
             else { bitmap.push(false); }
         }
-        if bitmap.iter().all(|x| *x) { kept.push(chunk); }
-        else if bitmap.iter().all(|x| !*x) { ship.push(chunk); }
+        if bitmap.iter().all(|x| *x) { kept(chunk); }
+        else if bitmap.iter().all(|x| !*x) { ship(chunk); }
         else {
 
             let (times, temp) = retain_items::<ContainerOf<U::Time>>(view.times, &bitmap[..]);
@@ -104,7 +110,7 @@ where
                 if *bit { diffs.values.push(d_borrow.values.get(index)); }
             }
             diffs.bounds = Strides::new(1, times.values.len() as u64);
-            kept.push(UpdatesTyped {
+            kept(UpdatesTyped {
                 keys,
                 vals,
                 times,
@@ -122,7 +128,7 @@ where
                 if *bit { diffs.values.push(d_borrow.values.get(index)); }
             }
             diffs.bounds = Strides::new(1, times.values.len() as u64);
-            ship.push(UpdatesTyped {
+            ship(UpdatesTyped {
                 keys,
                 vals,
                 times,
@@ -155,14 +161,24 @@ where
 }
 
 /// A merge implementation that operates batch-at-a-time.
+///
+/// Inputs are taken as `IntoIterator` so the caller can stream chunks in
+/// lazily — e.g. fetching paged-out chunks one group at a time — rather than
+/// materializing entire chains up front. Output chunks are emitted via the
+/// caller-supplied `sink` as they become stable, so the caller can apply a
+/// spill policy mid-merge rather than buffering the full result.
 #[inline(never)]
-pub fn merge_batches<U: Update>(
-    list1: Vec<UpdatesTyped<U>>,
-    list2: Vec<UpdatesTyped<U>>,
-    output: &mut Vec<UpdatesTyped<U>>,
+pub fn merge_batches<U, I1, I2, S>(
+    list1: I1,
+    list2: I2,
+    sink: S,
 )
 where
+    U: Update,
     U::Time: 'static,
+    I1: IntoIterator<Item = UpdatesTyped<U>>,
+    I2: IntoIterator<Item = UpdatesTyped<U>>,
+    S: FnMut(UpdatesTyped<U>),
 {
 
     // The design for efficient "batch" merginging of chains of links is:
@@ -180,21 +196,21 @@ where
     // The challenging moment is the merge that can start with a suffix of one link, involving a prefix of one link.
     // These could be the same link, different links, and generally there is the potential for complexity here.
 
-    let mut builder = ChainBuilder::default();
+    let mut builder = ChainBuilder::new(sink);
 
-    let mut queue1: std::collections::VecDeque<_> = list1.into();
-    let mut queue2: std::collections::VecDeque<_> = list2.into();
+    let mut iter1 = list1.into_iter();
+    let mut iter2 = list2.into_iter();
 
     // The first unconsumed update in each block, via (k_idx, v_idx, t_idx), or None if exhausted.
     // These are (0,0,0) for a new block, and should become None once there are no remaining updates.
-    let mut cursor1 = queue1.pop_front().map(|b| ((0,0,0), b));
-    let mut cursor2 = queue2.pop_front().map(|b| ((0,0,0), b));
+    let mut cursor1 = iter1.next().map(|b| ((0,0,0), b));
+    let mut cursor2 = iter2.next().map(|b| ((0,0,0), b));
 
     // For each pair of batches
     while cursor1.is_some() && cursor2.is_some() {
         merge_batch(&mut cursor1, &mut cursor2, &mut builder);
-        if cursor1.is_none() { cursor1 = queue1.pop_front().map(|b| ((0,0,0), b)); }
-        if cursor2.is_none() { cursor2 = queue2.pop_front().map(|b| ((0,0,0), b)); }
+        if cursor1.is_none() { cursor1 = iter1.next().map(|b| ((0,0,0), b)); }
+        if cursor2.is_none() { cursor2 = iter2.next().map(|b| ((0,0,0), b)); }
     }
 
     // TODO: create batch for the non-empty cursor.
@@ -229,9 +245,9 @@ where
         builder.push(out_batch);
     }
 
-    builder.extend(queue1);
-    builder.extend(queue2);
-    *output = builder.done();
+    builder.extend(iter1);
+    builder.extend(iter2);
+    builder.done();
     // TODO: Tidy output to satisfy structural invariants.
 }
 
@@ -253,10 +269,10 @@ where
 /// fine to first produce all reports and then reflect on the cursors, rather than use the
 /// cursors as part of the mapping.
 #[inline(never)]
-fn merge_batch<U: Update>(
+fn merge_batch<U: Update, F: FnMut(UpdatesTyped<U>)>(
     batch1: &mut Option<((usize, usize, usize), UpdatesTyped<U>)>,
     batch2: &mut Option<((usize, usize, usize), UpdatesTyped<U>)>,
-    builder: &mut ChainBuilder<U>,
+    builder: &mut ChainBuilder<U, F>,
 )
 where
     U::Time: 'static,
@@ -614,28 +630,42 @@ pub enum Report {
     Both(usize, usize),
 }
 
-/// Accumulates a sequence of `UpdatesTyped` chunks, merging the tail when a new
-/// chunk would extend the current run rather than start a new one.
-pub struct ChainBuilder<U: super::super::layout::ColumnarUpdate> { updates: Vec<UpdatesTyped<U>> }
+/// Accumulates `UpdatesTyped` chunks one at a time, melding small adjacent
+/// chunks. Holds at most one chunk in memory (the meld target); whenever a
+/// push doesn't meld, the prior target becomes "stable" and is emitted via
+/// the caller-provided sink. The sink can spill, count, or forward the chunk
+/// however it likes.
+pub struct ChainBuilder<U: super::super::layout::ColumnarUpdate, F: FnMut(UpdatesTyped<U>)> {
+    last: Option<UpdatesTyped<U>>,
+    sink: F,
+}
 
-impl<U: super::super::layout::ColumnarUpdate> Default for ChainBuilder<U> { fn default() -> Self { Self { updates: Default::default() } } }
+impl<U: super::super::layout::ColumnarUpdate, F: FnMut(UpdatesTyped<U>)> ChainBuilder<U, F> {
+    fn new(sink: F) -> Self { Self { last: None, sink } }
 
-impl<U: super::super::layout::ColumnarUpdate> ChainBuilder<U> {
     fn push(&mut self, mut link: UpdatesTyped<U>) {
         link = link.filter_zero();
-        if link.len() > 0 {
-            if let Some(last) = self.updates.last_mut() {
-                if last.len() + link.len() < 2 * crate::columnar::LINK_TARGET {
-                    let mut build = super::super::updates::UpdatesBuilder::new_from(std::mem::take(last));
-                    build.meld(&link);
-                    *last = build.done();
+        if link.len() == 0 { return; }
+        match self.last.as_mut() {
+            Some(last) if last.len() + link.len() < 2 * crate::columnar::LINK_TARGET => {
+                let mut build = super::super::updates::UpdatesBuilder::new_from(std::mem::take(last));
+                build.meld(&link);
+                *last = build.done();
+            }
+            _ => {
+                if let Some(prev) = self.last.take() {
+                    (self.sink)(prev);
                 }
-                else { self.updates.push(link); }
-
+                self.last = Some(link);
             }
-            else { self.updates.push(link); }
         }
     }
-    fn extend(&mut self, iter: impl IntoIterator<Item=UpdatesTyped<U>>) { for link in iter { self.push(link); }}
-    fn done(self) -> Vec<UpdatesTyped<U>> { self.updates }
+    fn extend(&mut self, iter: impl IntoIterator<Item=UpdatesTyped<U>>) {
+        for link in iter { self.push(link); }
+    }
+    fn done(mut self) {
+        if let Some(last) = self.last.take() {
+            (self.sink)(last);
+        }
+    }
 }
diff --git a/differential-dataflow/src/columnar/batcher.rs b/differential-dataflow/src/columnar/batcher.rs
index 0c622b50a..085c44012 100644
--- a/differential-dataflow/src/columnar/batcher.rs
+++ b/differential-dataflow/src/columnar/batcher.rs
@@ -1,6 +1,8 @@
 //! A `Batcher` for `RecordedUpdates<U>` streams that consolidates input via
 //! `TrieChunker` and merges sorted chains via the free functions in `trie_merger`.
 
+use std::collections::VecDeque;
+
 use timely::progress::frontier::AntichainRef;
 use timely::progress::{frontier::Antichain, Timestamp};
 use timely::container::{ContainerBuilder, PushInto};
@@ -13,17 +15,22 @@ use super::updates::UpdatesTyped;
 use super::RecordedUpdates;
 use super::arrangement::TrieChunker;
 use super::arrangement::trie_merger;
+use super::spill::{Entry, SpillPolicy};
 
 /// Creates batches from `RecordedUpdates<U>` streams.
 pub struct MergeBatcher<U: Update> {
     /// Transforms input streams to chunks of sorted, consolidated data.
     chunker: TrieChunker<U>,
-    /// A sequence of power-of-two length lists of sorted, consolidated containers.
-    chains: Vec<Vec<UpdatesTyped<U>>>,
+    /// A sequence of power-of-two length chains of sorted, consolidated entries.
+    /// Each entry is either an in-memory chunk or a handle to a paged-out chunk.
+    chains: Vec<VecDeque<Entry<UpdatesTyped<U>>>>,
     /// Current lower frontier, we sealed up to here.
     lower: Antichain<U::Time>,
     /// The lower-bound frontier of the data, after the last call to seal.
     frontier: Antichain<U::Time>,
+    /// Optional spill policy, consulted after each chain insert. `None` keeps
+    /// everything resident.
+    policy: Option<Box<dyn SpillPolicy<UpdatesTyped<U>>>>,
 }
 
 impl<U: Update<Time: Timestamp>> Batcher for MergeBatcher<U> {
@@ -37,6 +44,7 @@ impl<U: Update<Time: Timestamp>> Batcher for MergeBatcher<U> {
             chains: Vec::new(),
             frontier: Antichain::new(),
             lower: Antichain::from_elem(U::Time::minimum()),
+            policy: None,
         }
     }
 
@@ -46,7 +54,7 @@ impl<U: Update<Time: Timestamp>> Batcher for MergeBatcher<U> {
         self.chunker.push_into(container);
         while let Some(chunk) = self.chunker.extract() {
             let chunk = std::mem::take(chunk);
-            self.insert_chain(vec![chunk]);
+            self.insert_chain(VecDeque::from([Entry::Typed(chunk)]));
         }
     }
 
@@ -58,27 +66,47 @@ impl<U: Update<Time: Timestamp>> Batcher for MergeBatcher<U> {
         // Finish
         while let Some(chunk) = self.chunker.finish() {
             let chunk = std::mem::take(chunk);
-            self.insert_chain(vec![chunk]);
+            self.insert_chain(VecDeque::from([Entry::Typed(chunk)]));
         }
 
         // Merge all remaining chains into a single chain.
         while self.chains.len() > 1 {
             let list1 = self.chains.pop().unwrap();
             let list2 = self.chains.pop().unwrap();
-            let merged = Self::merge_by(list1, list2);
-            self.chains.push(merged);
+            let merged = self.merge_by(list1, list2);
+            self.push_chain(merged);
         }
         let merged = self.chains.pop().unwrap_or_default();
 
-        // Extract readied data.
-        let mut kept = Vec::new();
-        let mut readied = Vec::new();
+        // Extract readied data, streaming. `merged` is consumed lazily via
+        // `FetchIter`; ship-side chunks flow into `readied` for the
+        // builder; kept-side chunks flow into a fresh chain that is offered
+        // to the spill policy as each chunk lands, so kept never accumulates
+        // resident in full.
+        let mut readied: Vec<UpdatesTyped<U>> = Vec::new();
+        let mut kept_chain: VecDeque<Entry<UpdatesTyped<U>>> = VecDeque::new();
         self.frontier.clear();
+        {
+            let policy = &mut self.policy;
+            let frontier = &mut self.frontier;
+            let ship = |chunk: UpdatesTyped<U>| readied.push(chunk);
+            let keep = |chunk: UpdatesTyped<U>| {
+                kept_chain.push_back(Entry::Typed(chunk));
+                if let Some(p) = policy.as_mut() {
+                    p.apply(&mut kept_chain);
+                }
+            };
+            trie_merger::extract(
+                FetchIter::new(merged),
+                upper.borrow(),
+                frontier,
+                ship,
+                keep,
+            );
+        }
 
-        trie_merger::extract(merged, upper.borrow(), &mut self.frontier, &mut readied, &mut kept);
-
-        if !kept.is_empty() {
-            self.chains.push(kept);
+        if !kept_chain.is_empty() {
+            self.push_chain(kept_chain);
         }
 
         let description = Description::new(self.lower.clone(), upper.clone(), Antichain::from_elem(U::Time::minimum()));
@@ -95,26 +123,113 @@ impl<U: Update<Time: Timestamp>> Batcher for MergeBatcher<U> {
 }
 
 impl<U: Update> MergeBatcher<U> {
+    /// Install a spill policy. Consulted after each chain insert.
+    pub fn set_spill_policy(&mut self, policy: Box<dyn SpillPolicy<UpdatesTyped<U>>>) {
+        self.policy = Some(policy);
+    }
+
+    /// Number of records held in memory by this batcher: the sum of
+    /// `record_count()` over every `Entry::Typed` chunk in every chain.
+    /// `Entry::Paged` entries contribute nothing — they are handles to
+    /// paged-out chunks, not heap-resident data. Spill policies bound this
+    /// quantity; RSS may still grow due to materialize-on-merge.
+    pub fn resident_records(&self) -> usize {
+        use timely::Accountable;
+        let mut resident = 0usize;
+        for entry in self.chains.iter().flatten() {
+            match entry {
+                Entry::Typed(chunk) => resident += chunk.record_count() as usize,
+                // Paged-out entries are not counted as resident.
+                Entry::Paged(_) => {}
+            }
+        }
+        resident
+    }
+
     /// Insert a chain and maintain chain properties: Chains are geometrically sized and ordered
     /// by decreasing length.
-    fn insert_chain(&mut self, chain: Vec<UpdatesTyped<U>>) {
+    fn insert_chain(&mut self, chain: VecDeque<Entry<UpdatesTyped<U>>>) {
         if !chain.is_empty() {
-            self.chains.push(chain);
+            self.push_chain(chain);
             while self.chains.len() > 1 && (self.chains[self.chains.len() - 1].len() >= self.chains[self.chains.len() - 2].len() / 2) {
                 let list1 = self.chains.pop().unwrap();
                 let list2 = self.chains.pop().unwrap();
-                let merged = Self::merge_by(list1, list2);
-                self.chains.push(merged);
+                let merged = self.merge_by(list1, list2);
+                self.push_chain(merged);
             }
         }
     }
 
-    // merges two sorted input lists into one sorted output list.
-    fn merge_by(list1: Vec<UpdatesTyped<U>>, list2: Vec<UpdatesTyped<U>>) -> Vec<UpdatesTyped<U>> {
-        // TODO: `list1` and `list2` get dropped; would be better to reuse?
-        let mut output = Vec::with_capacity(list1.len() + list2.len());
-        trie_merger::merge_batches(list1, list2, &mut output);
+    /// Append `chain` to `chains`, then offer the newly pushed chain to the
+    /// spill policy, if one is installed. Following TD's `MergeQueue::extend`,
+    /// which calls `policy.apply` after each queue extension. Applied to
+    /// inputs and to merge / extract results alike, so threshold-style
+    /// policies see multi-chunk chains.
+    fn push_chain(&mut self, chain: VecDeque<Entry<UpdatesTyped<U>>>) {
+        self.chains.push(chain);
+        // `last_mut` is the chain pushed on the line above.
+        if let (Some(policy), Some(top)) = (self.policy.as_mut(), self.chains.last_mut()) {
+            policy.apply(top);
+        }
+    }
 
+    /// Merge two sorted chains. Inputs are streamed lazily through
+    /// `FetchIter` so paged entries are fetched one group at a time.
+    /// Output chunks flow through a sink that pushes into a fresh chain and
+    /// invokes the spill policy after each emission, so the merge result can
+    /// be paged out as it's produced rather than buffered in full.
+    fn merge_by(
+        &mut self,
+        list1: VecDeque<Entry<UpdatesTyped<U>>>,
+        list2: VecDeque<Entry<UpdatesTyped<U>>>,
+    ) -> VecDeque<Entry<UpdatesTyped<U>>> {
+        let mut output: VecDeque<Entry<UpdatesTyped<U>>> = VecDeque::new();
+        let policy = &mut self.policy;
+        let sink = |chunk: UpdatesTyped<U>| {
+            output.push_back(Entry::Typed(chunk));
+            if let Some(p) = policy.as_mut() {
+                p.apply(&mut output);
+            }
+        };
+        trie_merger::merge_batches(
+            FetchIter::new(list1),
+            FetchIter::new(list2),
+            sink,
+        );
         output
     }
+
+}
+
+/// Streaming iterator over a chain's chunks. Yields `Entry::Typed` chunks
+/// directly; for `Entry::Paged`, calls `Fetch::fetch` on demand and yields
+/// the resulting chunks one by one. Bounds materialized chunks to one fetch
+/// group at a time (plus whatever the consumer is holding).
+struct FetchIter<U: Update> {
+    queue: VecDeque<Entry<UpdatesTyped<U>>>,
+    pending: VecDeque<UpdatesTyped<U>>,
+}
+
+impl<U: Update> FetchIter<U> {
+    fn new(queue: VecDeque<Entry<UpdatesTyped<U>>>) -> Self {
+        Self { queue, pending: VecDeque::new() }
+    }
+}
+
+impl<U: Update> Iterator for FetchIter<U> {
+    type Item = UpdatesTyped<U>;
+    fn next(&mut self) -> Option<UpdatesTyped<U>> {
+        // Refill `pending` only once it is drained, so at most one fetched
+        // group is buffered here at a time.
+        while self.pending.is_empty() {
+            match self.queue.pop_front()? {
+                Entry::Typed(chunk) => return Some(chunk),
+                Entry::Paged(handle) => match handle.fetch() {
+                    Ok(chunks) => self.pending.extend(chunks),
+                    Err(_) => panic!("Fetch::fetch failed; retry path not yet wired"),
+                },
+            }
+        }
+        self.pending.pop_front()
+    }
 }

From 891b8a6393921662cb4635e6f4dff10cc4672704 Mon Sep 17 00:00:00 2001
From: Frank McSherry <fmcsherry@me.com>
Date: Wed, 6 May 2026 22:23:48 -0400
Subject: [PATCH 4/7] Split large chain links

---
 .../src/columnar/arrangement/trie_merger.rs   | 59 ++++++++++++++++++-
 1 file changed, 58 insertions(+), 1 deletion(-)

diff --git a/differential-dataflow/src/columnar/arrangement/trie_merger.rs b/differential-dataflow/src/columnar/arrangement/trie_merger.rs
index f01de7529..fff0c2731 100644
--- a/differential-dataflow/src/columnar/arrangement/trie_merger.rs
+++ b/differential-dataflow/src/columnar/arrangement/trie_merger.rs
@@ -640,12 +640,24 @@ pub struct ChainBuilder<U: super::super::layout::ColumnarUpdate, F: FnMut(Update
     sink: F,
 }
 
-impl<U: super::super::layout::ColumnarUpdate, F: FnMut(UpdatesTyped<U>)> ChainBuilder<U, F> {
+impl<U: super::super::layout::ColumnarUpdate, F: FnMut(UpdatesTyped<U>)> ChainBuilder<U, F>
+where
+    U::Time: 'static,
+{
     fn new(sink: F) -> Self { Self { last: None, sink } }
 
     fn push(&mut self, mut link: UpdatesTyped<U>) {
         link = link.filter_zero();
         if link.len() == 0 { return; }
+        // Split links larger than twice the link target so downstream chains
+        // have multiple entries — required for per-chain spill policies (e.g.
+        // `Threshold` in the columnar_spill example) to actually spill anything.
+        if link.len() > 2 * crate::columnar::LINK_TARGET {
+            let (first, rest) = split_at::<U>(link, crate::columnar::LINK_TARGET);
+            self.push(first);
+            self.push(rest);
+            return;
+        }
         match self.last.as_mut() {
             Some(last) if last.len() + link.len() < 2 * crate::columnar::LINK_TARGET => {
                 let mut build = super::super::updates::UpdatesBuilder::new_from(std::mem::take(last));
@@ -669,3 +681,48 @@ impl<U: super::super::layout::ColumnarUpdate, F: FnMut(UpdatesTyped<U>)> ChainBu
         }
     }
 }
+
+/// Cut `chunk` in two at record index `n`: the first `n` records and the
+/// remaining `chunk.len() - n`, each as its own `UpdatesTyped`. When `n` is
+/// zero or at least `chunk.len()`, `chunk` is returned whole beside a
+/// default-constructed value and nothing is copied.
+fn split_at<U: Update>(chunk: UpdatesTyped<U>, n: usize) -> (UpdatesTyped<U>, UpdatesTyped<U>)
+where
+    U::Time: 'static,
+{
+    use columnar::{Container, ContainerOf, Index, Push};
+    use columnar::primitive::offsets::Strides;
+    use crate::columnar::updates::{Lists, retain_items};
+
+    let total = chunk.len();
+    if n == 0 { return (UpdatesTyped::default(), chunk); }
+    if n >= total { return (chunk, UpdatesTyped::default()); }
+
+    let view = chunk.view();
+    let d_borrow = view.diffs;
+
+    // Build the sub-chunk holding exactly the records whose flag in `keep`
+    // is set. The flags are applied to the times layer; the second value
+    // each `retain_items` call returns is fed to the next layer up (vals,
+    // then keys). Same bitmap pattern as `extract`'s ship/kept split.
+    let select = |keep: &[bool]| -> UpdatesTyped<U> {
+        let (times, temp) = retain_items::<ContainerOf<U::Time>>(view.times, keep);
+        let (vals, temp) = retain_items::<ContainerOf<U::Val>>(view.vals, &temp[..]);
+        let (keys, _temp) = retain_items::<ContainerOf<U::Key>>(view.keys, &temp[..]);
+        let mut diffs = <Lists::<ContainerOf<U::Diff>> as Container>::with_capacity_for([d_borrow].into_iter());
+        for (i, _) in keep.iter().enumerate().filter(|(_, bit)| **bit) {
+            diffs.values.push(d_borrow.values.get(i));
+        }
+        diffs.bounds = Strides::new(1, times.values.len() as u64);
+        UpdatesTyped { keys, vals, times, diffs }
+    };
+
+    // Records [0, n) are flagged first; flipping every flag afterwards
+    // selects records [n, total) for the second half.
+    let mut flags: Vec<bool> = (0..total).map(|i| i < n).collect();
+    let first = select(&flags[..]);
+    for flag in flags.iter_mut() { *flag = !*flag; }
+    let second = select(&flags[..]);
+
+    (first, second)
+}

From 090a53beffffa6384f7506bc907feabb4675dde6 Mon Sep 17 00:00:00 2001
From: Frank McSherry <fmcsherry@me.com>
Date: Wed, 6 May 2026 22:23:58 -0400
Subject: [PATCH 5/7] Rotate spill files

---
 .../examples/columnar_spill.rs                | 51 +++++++++++++------
 1 file changed, 36 insertions(+), 15 deletions(-)

diff --git a/differential-dataflow/examples/columnar_spill.rs b/differential-dataflow/examples/columnar_spill.rs
index 6a59c8d88..5231c8573 100644
--- a/differential-dataflow/examples/columnar_spill.rs
+++ b/differential-dataflow/examples/columnar_spill.rs
@@ -71,12 +71,16 @@ use timely::progress::frontier::AntichainRef;
 use timely::progress::{frontier::Antichain, Timestamp};
 
 /// File-backed `Spill`. Serializes each chunk into a reusable `Vec<u8>` and
-/// writes it with one `write_all` per chunk — one syscall per spill, vs. one
-/// per column.
+/// writes it with one `write_all` per chunk. Rotates to a new tempfile every
+/// `ROTATE_AFTER_BYTES` so disk space is reclaimed as `FileFetch` handles are
+/// consumed: once a file's last handle is dropped, the `Arc` hits zero, the
+/// (already-unlinked) tempfile closes, and the OS gives the space back.
 pub struct FileSpill<U: Update> {
-    file: Arc<Mutex<std::fs::File>>,
-    /// Cumulative byte offset for the next write.
-    offset: u64,
+    /// Current write file. `None` until first spill, or after rotation if no
+    /// chunks have been written to a fresh file yet.
+    current: Option<Arc<Mutex<std::fs::File>>>,
+    /// Bytes written to `current` so far.
+    current_offset: u64,
     /// Reusable serialization buffer; grows to fit the largest chunk seen,
     /// then sticks at that capacity (no per-chunk allocation).
     buf: Vec<u8>,
@@ -84,15 +88,31 @@ pub struct FileSpill<U: Update> {
 }
 
 impl<U: Update> FileSpill<U> {
+    /// Rotate to a new tempfile after this many bytes. Sized so each file
+    /// holds many chunks (amortizing the file-open cost) but small enough
+    /// that we don't accumulate hundreds of GB on disk before any can be
+    /// reclaimed.
+    const ROTATE_AFTER_BYTES: u64 = 1 << 30; // 1 GiB
+
     pub fn new() -> std::io::Result<Self> {
-        let file = tempfile::tempfile()?;
         Ok(Self {
-            file: Arc::new(Mutex::new(file)),
-            offset: 0,
+            current: None,
+            current_offset: 0,
             buf: Vec::new(),
             _marker: PhantomData,
         })
     }
+
+    fn current_file(&mut self) -> std::io::Result<Arc<Mutex<std::fs::File>>> {
+        let reusable = self.current_offset < Self::ROTATE_AFTER_BYTES;
+        if let (true, Some(file)) = (reusable, self.current.as_ref()) { return Ok(Arc::clone(file)); }
+        // Overwriting `current` releases only this writer's reference to the
+        // old file; outstanding `FileFetch` handles still hold it until consumed.
+        let fresh = Arc::new(Mutex::new(tempfile::tempfile()?));
+        self.current = Some(Arc::clone(&fresh));
+        self.current_offset = 0;
+        Ok(fresh)
+    }
 }
 
 impl<U: Update + 'static> Spill<UpdatesTyped<U>> for FileSpill<U> {
@@ -122,15 +142,16 @@ impl<U: Update + 'static> Spill<UpdatesTyped<U>> for FileSpill<U> {
             updates.diffs.write_bytes(&mut self.buf).unwrap();
             debug_assert_eq!(self.buf.len() as u64, total);
 
-            let start = self.offset;
-            let mut file = self.file.lock().unwrap();
-            file.seek(SeekFrom::Start(start)).unwrap();
-            file.write_all(&self.buf).unwrap();
-            drop(file);
-            self.offset += total;
+            let file = self.current_file().expect("tempfile");
+            let start = self.current_offset;
+            let mut f = file.lock().unwrap();
+            f.seek(SeekFrom::Start(start)).unwrap();
+            f.write_all(&self.buf).unwrap();
+            drop(f);
+            self.current_offset += total;
 
             handles.push(Box::new(FileFetch::<U> {
-                file: self.file.clone(),
+                file: file.clone(),
                 offset: start,
                 _marker: PhantomData,
             }));

From cda3f389c87e4d877ac50ff14541145191b19447 Mon Sep 17 00:00:00 2001
From: Frank McSherry <fmcsherry@me.com>
Date: Thu, 7 May 2026 08:50:28 -0400
Subject: [PATCH 6/7] Compression before paging

---
 differential-dataflow/Cargo.toml              |  1 +
 .../examples/columnar_spill.rs                | 71 ++++++++++++++-----
 2 files changed, 56 insertions(+), 16 deletions(-)

diff --git a/differential-dataflow/Cargo.toml b/differential-dataflow/Cargo.toml
index eddb1d88a..0e6b74bfb 100644
--- a/differential-dataflow/Cargo.toml
+++ b/differential-dataflow/Cargo.toml
@@ -26,6 +26,7 @@ graph_map = "0.1"
 bytemuck = "1.18.0"
 mimalloc = "0.1.48"
 tempfile = "3"
+lz4_flex = "0.11"
 
 [dependencies]
 columnar = { workspace = true }
diff --git a/differential-dataflow/examples/columnar_spill.rs b/differential-dataflow/examples/columnar_spill.rs
index 5231c8573..5bd795d45 100644
--- a/differential-dataflow/examples/columnar_spill.rs
+++ b/differential-dataflow/examples/columnar_spill.rs
@@ -24,6 +24,12 @@ static ENABLE_SPILL: AtomicBool = AtomicBool::new(true);
 static HEAD: AtomicUsize = AtomicUsize::new(10_000_000);
 static THRESH: AtomicUsize = AtomicUsize::new(50_000_000);
 
+/// Cumulative bytes serialized (pre-compression) and bytes written
+/// (post-compression) across all `FileSpill` instances. Lets us report a
+/// compression ratio at the end of a run.
+static BYTES_DECOMPRESSED: AtomicUsize = AtomicUsize::new(0);
+static BYTES_COMPRESSED: AtomicUsize = AtomicUsize::new(0);
+
 /// Cross-worker registry of `Threshold` stats so we can sum them after a run.
 static SHARED_STATS: OnceLock<Mutex<Vec<Arc<ThresholdStats>>>> = OnceLock::new();
 
@@ -50,6 +56,8 @@ fn reset_stats() {
     if let Some(m) = SHARED_STATS.get() {
         m.lock().unwrap().clear();
     }
+    BYTES_DECOMPRESSED.store(0, Ordering::Relaxed);
+    BYTES_COMPRESSED.store(0, Ordering::Relaxed);
 }
 
 use columnar::Push;
@@ -130,7 +138,7 @@ impl<U: Update + 'static> Spill<UpdatesTyped<U>> for FileSpill<U> {
             let total = 32 + keys_len + vals_len + times_len + diffs_len;
 
             // Serialize the whole chunk (header + four columns) into the
-            // reusable buffer, then issue a single write_all to the file.
+            // reusable buffer.
             self.buf.clear();
             self.buf.extend_from_slice(&keys_len.to_le_bytes());
             self.buf.extend_from_slice(&vals_len.to_le_bytes());
@@ -142,51 +150,72 @@ impl<U: Update + 'static> Spill<UpdatesTyped<U>> for FileSpill<U> {
             updates.diffs.write_bytes(&mut self.buf).unwrap();
             debug_assert_eq!(self.buf.len() as u64, total);
 
+            // Compress before writing. lz4 block format: caller is responsible
+            // for tracking the decompressed size, which we stash in the handle.
+            let compressed = lz4_flex::block::compress(&self.buf);
+            let comp_len = compressed.len() as u64;
+            BYTES_DECOMPRESSED.fetch_add(total as usize, Ordering::Relaxed);
+            BYTES_COMPRESSED.fetch_add(comp_len as usize, Ordering::Relaxed);
+
             let file = self.current_file().expect("tempfile");
             let start = self.current_offset;
             let mut f = file.lock().unwrap();
             f.seek(SeekFrom::Start(start)).unwrap();
-            f.write_all(&self.buf).unwrap();
+            f.write_all(&compressed).unwrap();
             drop(f);
-            self.current_offset += total;
+            self.current_offset += comp_len;
 
             handles.push(Box::new(FileFetch::<U> {
                 file: file.clone(),
                 offset: start,
+                compressed_len: comp_len,
+                decompressed_len: total,
                 _marker: PhantomData,
             }));
         }
     }
 }
 
-/// Per-chunk fetch handle. Reads a 32-byte header (four column lengths) at the
-/// recorded offset, then four `Stash::try_from_bytes` payloads.
+/// Per-chunk fetch handle. On `fetch`, reads `compressed_len` bytes at
+/// `offset`, decompresses to `decompressed_len`, then parses the 32-byte
+/// header + four column payloads.
 pub struct FileFetch<U: Update> {
     file: Arc<Mutex<std::fs::File>>,
     offset: u64,
+    compressed_len: u64,
+    decompressed_len: u64,
     _marker: PhantomData<U>,
 }
 
 impl<U: Update + 'static> Fetch<UpdatesTyped<U>> for FileFetch<U> {
     fn fetch(self: Box<Self>) -> Result<Vec<UpdatesTyped<U>>, Box<dyn Fetch<UpdatesTyped<U>>>> {
+        // Read the compressed bytes in one shot.
+        let mut compressed = vec![0u8; self.compressed_len as usize];
         let mut file = self.file.lock().unwrap();
         file.seek(SeekFrom::Start(self.offset)).unwrap();
-        let mut header = [0u8; 32];
-        file.read_exact(&mut header).unwrap();
+        file.read_exact(&mut compressed).unwrap();
+        drop(file);
+
+        let decompressed = lz4_flex::block::decompress(&compressed, self.decompressed_len as usize)
+            .expect("lz4 decompress");
+
+        // Parse the 32-byte header from the decompressed buffer.
+        let header = &decompressed[0..32];
         let keys_len = u64::from_le_bytes(header[0..8].try_into().unwrap()) as usize;
         let vals_len = u64::from_le_bytes(header[8..16].try_into().unwrap()) as usize;
         let times_len = u64::from_le_bytes(header[16..24].try_into().unwrap()) as usize;
         let diffs_len = u64::from_le_bytes(header[24..32].try_into().unwrap()) as usize;
 
-        let mut keys_bytes = vec![0u8; keys_len];
-        file.read_exact(&mut keys_bytes).unwrap();
-        let mut vals_bytes = vec![0u8; vals_len];
-        file.read_exact(&mut vals_bytes).unwrap();
-        let mut times_bytes = vec![0u8; times_len];
-        file.read_exact(&mut times_bytes).unwrap();
-        let mut diffs_bytes = vec![0u8; diffs_len];
-        file.read_exact(&mut diffs_bytes).unwrap();
-        drop(file);
+        // Slice the four columns out of the decompressed buffer, each into
+        // its own owned `Vec<u8>` (`Stash::try_from_bytes` requires owned).
+        let mut o = 32;
+        let keys_bytes = decompressed[o..o + keys_len].to_vec();
+        o += keys_len;
+        let vals_bytes = decompressed[o..o + vals_len].to_vec();
+        o += vals_len;
+        let times_bytes = decompressed[o..o + times_len].to_vec();
+        o += times_len;
+        let diffs_bytes = decompressed[o..o + diffs_len].to_vec();
 
         let keys = Stash::try_from_bytes(keys_bytes).unwrap();
         let vals = Stash::try_from_bytes(vals_bytes).unwrap();
@@ -478,6 +507,8 @@ fn main() {
             reset_stats();
             let elapsed = run_timely_dataflow(cfg.times, cfg.keys_per_time, cfg.workers, cfg.sample_secs, "spill");
             let (fires, chunks) = collect_stats();
+            let dec = BYTES_DECOMPRESSED.load(Ordering::Relaxed);
+            let comp = BYTES_COMPRESSED.load(Ordering::Relaxed);
             println!(
                 "spill: {:.2}s | {:.2} M records/s | {:.2} GB/s | threshold fired {} times, spilled {} chunks",
                 elapsed.as_secs_f64(),
@@ -485,6 +516,14 @@ fn main() {
                 raw_gb / elapsed.as_secs_f64(),
                 fires, chunks,
             );
+            if dec > 0 {
+                let dec_gb = dec as f64 / (1u64 << 30) as f64;
+                let comp_gb = comp as f64 / (1u64 << 30) as f64;
+                println!(
+                    "compression: {:.2} GB → {:.2} GB ({:.2}× ratio, lz4)",
+                    dec_gb, comp_gb, dec as f64 / comp.max(1) as f64,
+                );
+            }
         }
 
         if cfg.mode != Mode::Spill {

From f7fb900a1a8b651137f87d586a16bcc7695010f8 Mon Sep 17 00:00:00 2001
From: Frank McSherry <fmcsherry@me.com>
Date: Thu, 7 May 2026 13:47:46 -0400
Subject: [PATCH 7/7] Tidy columnar_spill example

---
 .../examples/columnar_spill.rs                | 51 -------------------
 1 file changed, 51 deletions(-)

diff --git a/differential-dataflow/examples/columnar_spill.rs b/differential-dataflow/examples/columnar_spill.rs
index 5bd795d45..96776121f 100644
--- a/differential-dataflow/examples/columnar_spill.rs
+++ b/differential-dataflow/examples/columnar_spill.rs
@@ -226,32 +226,6 @@ impl<U: Update + 'static> Fetch<UpdatesTyped<U>> for FileFetch<U> {
     }
 }
 
-/// Trivial `SpillPolicy`: page out every `Typed` entry on each apply.
-/// Useful for direct queue exercise; not intended as a real policy.
-pub struct SpillEverything<U: Update> {
-    spill: FileSpill<U>,
-}
-
-impl<U: Update + 'static> SpillPolicy<UpdatesTyped<U>> for SpillEverything<U> {
-    fn apply(&mut self, queue: &mut std::collections::VecDeque<Entry<UpdatesTyped<U>>>) {
-        let mut new_queue = std::collections::VecDeque::with_capacity(queue.len());
-        let mut buf = Vec::new();
-        let mut handles: Vec<Box<dyn Fetch<UpdatesTyped<U>>>> = Vec::new();
-        for entry in queue.drain(..) {
-            match entry {
-                Entry::Typed(c) => {
-                    buf.push(c);
-                    self.spill.spill(&mut buf, &mut handles);
-                    let handle = handles.pop().expect("FileSpill produces a handle per chunk");
-                    new_queue.push_back(Entry::Paged(handle));
-                }
-                Entry::Paged(h) => new_queue.push_back(Entry::Paged(h)),
-            }
-        }
-        *queue = new_queue;
-    }
-}
-
 /// Threshold-based spill policy adapted from timely's
 /// `communication::allocator::zero_copy::spill::threshold::Threshold`.
 ///
@@ -440,31 +414,6 @@ fn main() {
         println!("ok: direct Spill+Fetch roundtripped {} chunks", expected.len());
     }
 
-    // SpillPolicy roundtrip via a queue: every Typed becomes Paged, then we
-    // fetch each one back and compare.
-    {
-        let mut policy = SpillEverything {
-            spill: FileSpill::<TestUpdate>::new().unwrap(),
-        };
-        let mut queue: std::collections::VecDeque<Entry<UpdatesTyped<TestUpdate>>> =
-            originals.iter().cloned().map(Entry::Typed).collect();
-        policy.apply(&mut queue);
-
-        // Every entry should now be Paged, in original order.
-        assert_eq!(queue.len(), expected.len());
-        for (i, entry) in queue.into_iter().enumerate() {
-            match entry {
-                Entry::Paged(handle) => {
-                    let fetched = handle.fetch().unwrap_or_else(|_| panic!("fetch should succeed"));
-                    assert_eq!(fetched.len(), 1);
-                    assert_eq!(collect(&fetched[0]), expected[i], "queue position {}", i);
-                }
-                Entry::Typed(_) => panic!("SpillEverything should leave nothing typed"),
-            }
-        }
-        println!("ok: SpillEverything paged & retrieved {} chunks in order", expected.len());
-    }
-
     // End-to-end demo: a real timely dataflow.
     //
     // Each worker generates its share of the cancellation workload (positives