From 9e89b7cee5656925ce782303efab3c94fff9584e Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Mon, 26 Jan 2026 15:52:50 -0800 Subject: [PATCH 01/55] Specialized x86 implementation of interleave_vectors --- apps/iir_blur/Makefile | 2 +- apps/iir_blur/iir_blur_generator.cpp | 18 +- src/CodeGen_Hexagon.cpp | 4 - src/CodeGen_LLVM.cpp | 15 +- src/CodeGen_LLVM.h | 6 +- src/CodeGen_X86.cpp | 604 +++++++++++++++++++++++++++ src/Util.h | 5 + 7 files changed, 638 insertions(+), 16 deletions(-) diff --git a/apps/iir_blur/Makefile b/apps/iir_blur/Makefile index 49104b3e5fa3..92ed5d2a5b0b 100644 --- a/apps/iir_blur/Makefile +++ b/apps/iir_blur/Makefile @@ -25,7 +25,7 @@ $(BIN)/%/filter: filter.cpp $(BIN)/%/iir_blur.a $(BIN)/%/iir_blur_auto_schedule. $(CXX) $(CXXFLAGS) -I$(BIN)/$* -Wall -O3 $^ -o $@ $(LDFLAGS) $(IMAGE_IO_FLAGS) $(CUDA_LDFLAGS) $(OPENCL_LDFLAGS) $(BIN)/%/out.png: $(BIN)/%/filter - $< ../images/rgba.png $(BIN)/$*/out.png + $< ../images/rgb.png $(BIN)/$*/out.png clean: rm -rf $(BIN) diff --git a/apps/iir_blur/iir_blur_generator.cpp b/apps/iir_blur/iir_blur_generator.cpp index ef3b44eef461..4e4db6e61410 100644 --- a/apps/iir_blur/iir_blur_generator.cpp +++ b/apps/iir_blur/iir_blur_generator.cpp @@ -38,17 +38,25 @@ Func blur_cols_transpose(Func input, Expr height, Expr alpha, bool skip_schedule // CPU schedule. // 8.2ms on an Intel i9-9960X using 16 threads // Split the transpose into tiles of rows. Parallelize over channels - // and strips (Halide supports nested parallelism). - Var xo, yo, t; + // and strips. + Var xo, yo, t, yi; transpose.compute_root() .tile(x, y, xo, yo, x, y, vec, vec * 4) + .split(y, y, yi, vec) + .unroll(yi) .vectorize(x) - .parallel(yo) - .parallel(c); + .fuse(yo, c, t) + .parallel(t); + + blur.in(transpose) + .reorder_storage(y, x) + .compute_at(transpose, y) + .vectorize(x) + .unroll(y); // Run the filter on each row of tiles (which corresponds to a strip of // columns in the input). - blur.compute_at(transpose, yo); + blur.compute_at(transpose, t); // Vectorize computations within the strips. blur.update(0) diff --git a/src/CodeGen_Hexagon.cpp b/src/CodeGen_Hexagon.cpp index 05b68447b6a4..5347f69b279c 100644 --- a/src/CodeGen_Hexagon.cpp +++ b/src/CodeGen_Hexagon.cpp @@ -1404,10 +1404,6 @@ Value *CodeGen_Hexagon::vlut256(Value *lut, Value *idx, int min_index, return slice_vector(concat_vectors(result), 0, idx_elements); } -bool is_power_of_two(int x) { - return (x & (x - 1)) == 0; -} - // vdelta and vrdelta are instructions that take an input vector and // pass it through a network made up of levels. 
Each element x at each // level i can either take the element from the previous level at the diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 4a5b45475533..7715fce28c34 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -1363,10 +1363,6 @@ void CodeGen_LLVM::codegen(const Stmt &s) { s.accept(this); } -bool CodeGen_LLVM::is_power_of_two(int x) const { - return (x & (x - 1)) == 0; -} - Type CodeGen_LLVM::upgrade_type_for_arithmetic(const Type &t) const { if (t.is_bfloat() || (t.is_float() && t.bits() < 32)) { return Float(32, t.lanes()); @@ -2194,6 +2190,17 @@ void CodeGen_LLVM::visit(const Broadcast *op) { value = create_broadcast(v, op->lanes); } +Value *CodeGen_LLVM::optimization_fence(Value *v) { + llvm::Type *t = v->getType(); + internal_assert(!t->isScalableTy()) + << "optimization_fence does not support scalable vectors yet"; + const int bits = t->getPrimitiveSizeInBits(); + llvm::Type *float_type = llvm_type_of(Float(64, bits / 64)); + v = builder->CreateBitCast(v, float_type); + v = builder->CreateArithmeticFence(v, float_type); + return builder->CreateBitCast(v, t); +} + Value *CodeGen_LLVM::interleave_vectors(const std::vector &vecs) { internal_assert(!vecs.empty()); for (size_t i = 1; i < vecs.size(); i++) { diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index 183463d5fdb6..e006a885fc57 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -460,6 +460,10 @@ class CodeGen_LLVM : public IRVisitor { * an arbitrary number of vectors.*/ virtual llvm::Value *interleave_vectors(const std::vector &); + /** A fence to prevent fusion of ops by llvm. Designed for floats, but we + * abuse it to prevent shufflevector fusion too. */ + llvm::Value *optimization_fence(llvm::Value *); + /** Description of an intrinsic function overload. Overloads are resolved * using both argument and return types. The scalar types of the arguments * and return type must match exactly for an overload resolution to succeed. */ @@ -523,8 +527,6 @@ class CodeGen_LLVM : public IRVisitor { /** Shorthand for shuffling a single vector. */ llvm::Value *shuffle_vectors(llvm::Value *v, const std::vector &indices); - bool is_power_of_two(int x) const; - bool is_scalable_vector(llvm::Value *v) const; /** Go looking for a vector version of a runtime function. 
Will diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index 3d2388fdf89c..ab854f72e897 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -11,6 +11,8 @@ #include "Substitute.h" #include "Util.h" +#include + namespace Halide { namespace Internal { @@ -111,6 +113,8 @@ class CodeGen_X86 : public CodeGen_Posix { void codegen_vector_reduce(const VectorReduce *, const Expr &init) override; // @} + llvm::Value *interleave_vectors(const std::vector &) override; + private: Scope mem_type; }; @@ -913,6 +917,606 @@ void CodeGen_X86::codegen_vector_reduce(const VectorReduce *op, const Expr &init CodeGen_Posix::codegen_vector_reduce(op, init); } +Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { + // Only use x86-specific interleaving for AVX and above + if (vecs.empty() || !target.has_feature(Target::AVX)) { + return CodeGen_Posix::interleave_vectors(vecs); + } + + if (vecs.size() == 1) { + return vecs[0]; + } + + // Get the element type and vector properties + llvm::Type *vec_type = vecs[0]->getType(); + llvm::Type *element_type = get_vector_element_type(vec_type); + int vec_elements = get_vector_num_elements(vec_type); + const size_t element_bits = element_type->getScalarSizeInBits(); + const size_t elems_per_native_vec = native_vector_bits() / element_bits; + const size_t elems_per_slice = 128 / element_bits; + + // Only apply special x86 logic for power-of-two interleaves for avx and + // above (TODO: Could slice into native vectors and concat results even if + // not power of two) + + if (!is_power_of_two(vec_elements) || + !is_power_of_two(vecs.size())) { + return CodeGen_Posix::interleave_vectors(vecs); + } + + /* + x86 has a weird set of vector shuffle instructions due to historical + baggage, and the strategy in the base class for interleaving vectors + works poorly. Here we have a somewhat complex algorithm for generating + better sequences of shuffle instructions for avx and avx-512. + + Consider the location of one of the elements of one of the vectors. It + has a vector index, which says which vector it's in, and a vector lane + index, which gives the lane. x86 shuffles work in terms of 128-bit + subvectors, which we will call slices. So we'll decompose that lane index + into a slice index, to identify the 128-bit slice within a vector, and + the lane index within that slice. For avx the slice index is either zero + or one, and for avx-512 it's 0, 1, 2, or 3. Because we have limited + everything to be a power of two, we can write out these indices in + binary. We'll use v for the vector index, s for the slice index, and l + for the lane index. For an avx-512 interleave of 16 vectors of 32 + elements each (i.e. uint16s), a location could thus be written as: + + [l0 l1 l2] [s0 s1] [v0 v1 v2 v3] + + where l0 is the least-significant bit of the lane index, and so on. + + An interleave takes the bits that give the vector index and moves them to + be the least significant bits, shifting everything else over. So the + indices of our vectors after the interleave should be: + + [v0 v1 v2] [v3 l0] [l1 l2 s0 s1] + + Assigning numbers to each according to their final location, we start with: + + [4 5 6] [7 8] [0 1 2 3] + + and we want to issue some sequence of instructions to get us to: + + [0 1 2] [3 4] [5 6 7 8] + + Now let's consider the instructions we have available. These generally + permute these bits. E.g. 
an instruction that interleaves two entire
+    vectors, applied to every pairs of vectors, would take the some vector bit
+    and make it the lowest lane bit instead, shuffling the other bits upwards,
+    with the highest-order within-vector bit taking the place of the vector
+    bit (because we produce separate vectors for the low and high half of the
+    result. So if we used this instruction to push the highest vector bit
+    inwards, we could turn this:
+
+    [4 5 6] [7 8] [0 1 2 3]
+
+    into this:
+
+    [3 4 5] [6 7] [0 1 2 8]
+
+    If we did this three more times, pulling a different vector bit in each
+    time, we'd get:
+
+    [0 1 2] [3 4] [5 6 7 8]
+
+    and we'd be done! This is what the base class does. Unfortunately, x86 has
+    no such instruction, so we'll have to figure out something else.
+    Interleaving vectors often happens in contexts with high register
+    pressure, so we will restrict our attention to instructions that take
+    immediates. The most important one is vunpckl/h. This interleaves lanes
+    between two vectors, staying within each 128-bit slice. So the slice
+    bits will be unchanged, and the lane bits will be rotated right along with
+    one of the vector bits. So if we interleave vectors starting from the
+    second-highest vector bit, we can turn this:
+
+    [4 5 6] [_ _] [_ _ 2 _]
+
+    into this:
+
+    [2 4 5] [_ _] [_ _ 6 _]
+
+    where the underscores indicate bits that are unchanged.
+
+    Unlike a full vector interleave, the slice bits stayed fixed, and the
+    highest within-slice lane bit (6) took the place of the vector bit
+    instead. This is at least a good start. If we do this two more times,
+    pulling in vector bits 0 and 1, we can make this:
+
+    [0 1 2] [7 8] [4 5 6 3]
+
+    The lane bits are now in the desired state. The next instruction to
+    consider is shufi. It's more general than this, but for our purposes there
+    are two interesting things we can do with it. We can concatenate the low
+    halves of two vectors or the high halves of two vectors, which swaps the
+    high-order slice bit with one of the vector bits:
+
+    [_ _ _] [_ 8] [_ _ _ 3] -> [_ _ _] [_ 3] [_ _ _ 8]
+
+    We can also interleave the even slices of a vector with the even slices of
+    another (and do the same for odd), which rotates left the two slice bits
+    together with one of the vector bits:
+
+    [_ _ _] [7 3] [4 _ _ _] -> [_ _ _] [3 4] [7 _ _ _]
+
+    The vector bit became the high slice bit, the low slice bit took the place
+    of the vector bit, and the high slice bit became the low slice
+    bit. Filling in the underscores, we're now in this state:
+
+    [0 1 2] [3 4] [7 5 6 8]
+
+    Only the vector bits are wrong, but permuting entire vectors is free,
+    because that's just changing which register names we're referring to
+    (shuffling our array of llvm::Value *). So all totalled, per vector, we
+    needed three unpckl/h instructions, and one shufi instruction of each
+    kind. If the vectors were a narrower type, it would have just added one
+    more unpckl.
+
+    If you're interleaving lots of complete vectors, that's the whole story,
+    but there are other situations to consider. It's not uncommon to want to
+    interleave half-vectors to make some number of full vectors. We can model
+    this by having some slice or even lane bits start as missing. So
+    interleaving 16 half-vectors of uint16s to 8 full vectors would be
+    starting from this:
+
+    [4 5 6] [7] [0 1 2 3]
+
+    and trying to get here:
+
+    [0 1 2] [3 4] [5 6 7]
+
+    Each of our instructions has to operate on every vector, so to reduce the
+    number of instructions we'd first like to do something to create that
+    missing high slice bit, halving the number of vectors. E.g. we could
+    identify pairs of vectors to concatenate. Let's try concatenating pairs
+    using the high vector bit (3):
+
+    [4 5 6] [7 3] [0 1 2]
+
+    Now we do three unpcks to rotate 0 1 2 into the correct place:
+
+    [0 1 2] [7 3] [4 5 6]
+
+    Now a single shufi can rotate 7 3 and 4:
+
+    [0 1 2] [3 4] [7 5 6]
+
+    and we just need to reorder whole vectors and we're done. So in this case
+    we needed only a single shufi instruction, because our desired low slice
+    bit (3) was already sitting there as the high slice bit after
+    pairwise concatenation.
+
+    Now consider the case where we had only four half-vectors to interleave to
+    produce two whole vectors:
+
+    [2 3 4] [5] [0 1]
+
+    There's no good concatenation we can do to make whole vectors. That 0 and 1
+    both need to end up as lane bits, and we have no instructions that swap
+    slice bits with lane bits. So we'll just have to run unpck instructions at
+    half-vector width to push that 4 into the vector bit range:
+
+    [1 2 3] [5] [0 4]
+
+    and now we can concatenate according to bit 4 to make whole vectors
+
+    [1 2 3] [5 4] [0]
+
+    We then do one more unpck to pull the 0 down:
+
+    [0 1 2] [5 4] [3]
+
+    Next, we need to make 3 a slice bit. We can use shufi to swap it with 4:
+
+    [0 1 2] [5 3] [4]
+
+    and then another shufi to rotate those three
+
+    [0 1 2] [3 4] [5]
+
+    and we're done.
+
+    Depending on how many of each bit we start with, we can also end up in
+    situations where everything is correct except the two slice bits are in
+    the wrong order, in which case we can use a shufi instruction with a
+    vector and itself to swap those two bits.
+
+    So there are many possible paths depending on the number of elements per
+    vector, the number of elements per 128-bit slice of each vector, and the
+    number of vectors to interleave. The way to stay sane is to just
+    explicitly track the vectors above as l_bits, s_bits, and v_bits, and
+    transform it alongside all our instructions as we try to get the right
+    bits in the right final places.
+    */
+
+    // Make a working copy
+    std::vector<Value *> v = vecs;
+
+    // The number of 128-bit slices per vector is 2 for avx and 4 for avx512
+    const int final_num_s_bits = ctz64(native_vector_bits() / 128);
+    internal_assert(final_num_s_bits == 1 || final_num_s_bits == 2) << native_vector_bits() << " " << final_num_s_bits << "\n";
+
+    const int num_v_bits = ctz64(v.size());
+    const int num_s_bits = ((size_t)vec_elements <= elems_per_slice) ? 0 : ctz64(vec_elements / elems_per_slice);
+    const int num_l_bits = ctz64(std::min((size_t)vec_elements, elems_per_slice));
+
+    // Construct the initial tracking vectors for each bit location
+    std::vector<int> v_bits(num_v_bits), l_bits(num_l_bits), s_bits(num_s_bits);
+    int c = 0;
+    for (int i = 0; i < num_v_bits; i++) {
+        // We want the v bits to end up innermost, so number them 0, 1, 2 ...
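+        // (For the avx-512 uint16 example above, the three loops produce
+        // v_bits = {0, 1, 2, 3}, l_bits = {4, 5, 6}, s_bits = {7, 8},
+        // matching the starting state [4 5 6] [7 8] [0 1 2 3].)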
+ v_bits[i] = c++; + } + for (int i = 0; i < num_l_bits; i++) { + // Then come the l bits + l_bits[i] = c++; + } + for (int i = 0; i < num_s_bits; i++) { + // and finally, the slice bits + s_bits[i] = c++; + } + + // Now we define helpers for each instruction we are going to use + + // unpckl/h instruction + auto unpck = [&](Value *a, Value *b) -> std::pair { + int n = get_vector_num_elements(a->getType()); + std::vector lo_indices, hi_indices; + + for (int i = 0; i < n; i += (int)elems_per_slice) { + int half = (int)elems_per_slice / 2; + // For the low result, interleave the first half of each slice + for (int j = 0; j < half; j++) { + lo_indices.push_back(i + j); + lo_indices.push_back(n + i + j); + } + // For the high result, interleave the second half of each slice + for (int j = half; j < (int)elems_per_slice; j++) { + hi_indices.push_back(i + j); + hi_indices.push_back(n + i + j); + } + } + + Value *lo = shuffle_vectors(a, b, lo_indices); + Value *hi = shuffle_vectors(a, b, hi_indices); + // Everything falls apart if we let LLVM fuse shuffles, so we add + // optimization fences around the results to ensure we get the + // instructions we're asking for. + return {optimization_fence(lo), optimization_fence(hi)}; + }; + + // shufi instruction, with or without cross-over + auto shufi = [&](Value *a, Value *b, bool crossover) -> std::pair { + int n = get_vector_num_elements(a->getType()); + std::vector lo_indices, hi_indices; + if (final_num_s_bits == 2) { + // AVX-512 + for (int i = 0; i < (int)elems_per_slice; i++) { + lo_indices.push_back(i); + hi_indices.push_back(i + (crossover ? 1 : 2) * (int)elems_per_slice); + } + for (int i = 0; i < (int)elems_per_slice; i++) { + lo_indices.push_back(i + (crossover ? 2 : 1) * (int)elems_per_slice); + hi_indices.push_back(i + 3 * (int)elems_per_slice); + } + for (int i = 0; i < (int)elems_per_slice * 2; i++) { + lo_indices.push_back(lo_indices[i] + n); + hi_indices.push_back(hi_indices[i] + n); + } + } else { + // AVX-2 + for (int i = 0; i < (int)elems_per_slice; i++) { + lo_indices.push_back(i); + hi_indices.push_back(i + elems_per_slice); + } + for (int i = 0; i < (int)elems_per_slice; i++) { + lo_indices.push_back(lo_indices[i] + n); + hi_indices.push_back(hi_indices[i] + n); + } + } + Value *lo = shuffle_vectors(a, b, lo_indices); + Value *hi = shuffle_vectors(a, b, hi_indices); + return {optimization_fence(lo), optimization_fence(hi)}; + }; + + // A 2x2 transpose of slices within a single vector + auto self_shufi = [&](Value *a) -> Value * { + internal_assert(4 * (int)elems_per_slice == vec_elements) + << "Should only be using shufi helper when targeting avx-512 shuffles on native vectors\n" + << elems_per_slice << " " << vec_elements << " " << native_vector_bits() << "\n"; + std::vector indices; + for (int j : {0, 2, 1, 3}) { + for (int i = 0; i < (int)elems_per_slice; i++) { + indices.push_back(i + j * (int)elems_per_slice); + } + } + return optimization_fence(shuffle_vectors(a, a, indices)); + }; + + // First, if the vectors are wider than native, that will manifest as too + // many slice bits. Cut them into separate native vectors. This will not + // create any instructions. 
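+    // (E.g. vectors twice the native width get cut in half here: the low
+    // halves come first in the new list, then the high halves, so the old top
+    // slice bit becomes the new top vector bit, matching the s_bits -> v_bits
+    // move at the end of the loop.)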
+ while ((size_t)vec_elements > elems_per_native_vec) { + int cut = vec_elements / 2; + std::vector new_v; + for (auto *vec : v) { + new_v.push_back(slice_vector(vec, 0, cut)); + } + for (auto *vec : v) { + new_v.push_back(slice_vector(vec, cut, cut)); + } + v = new_v; + vec_elements = cut; + + v_bits.push_back(s_bits.back()); + s_bits.pop_back(); + } + + // Interleave pairs if we have vectors smaller than a single slice. Choosing + // which pairs to interleave is important because we want to pull down v + // bits that are destined to end up as l bits, and we want to pull them down + // in order. + if ((size_t)vec_elements < elems_per_slice) { + int highest_desired_l_bit = ctz64(elems_per_slice) - 1; + int bit = highest_desired_l_bit; + if (!v_bits.empty() && std::find(v_bits.begin(), v_bits.end(), bit) == v_bits.end()) { + bit = v_bits.back(); + } + + while (bit >= 0 && (size_t)vec_elements < elems_per_slice && !v_bits.empty()) { + auto it = std::find(v_bits.begin(), v_bits.end(), bit); + if (it == v_bits.end()) { + break; + } + int j = it - v_bits.begin(); + v_bits.erase(it); + l_bits.insert(l_bits.begin(), bit); + + // The distance in the vecs array is the index of the corresponding + // v bit we're pulling down. + int step = 1 << j; + std::vector new_v; + new_v.reserve(v.size() / 2); + for (size_t i = 0; i < v.size(); i++) { + // Pair each vector with the one separated by the step. + size_t j = i ^ step; + + // Don't process vectors twice. + if (j < i) continue; + + // Just interleave the two vectors. Because we have fewer + // elements than one slice, unpckl/h is a straight interleave. + std::vector indices; + for (int k = 0; k < vec_elements; k++) { + indices.push_back(k); + indices.push_back(vec_elements + k); + } + new_v.push_back(shuffle_vectors(v[i], v[j], indices)); + } + v.swap(new_v); + vec_elements *= 2; + bit--; + } + } + + // Concatenate/repack to get at least the desired number of slice bits. + while ((int)s_bits.size() < final_num_s_bits && !v_bits.empty()) { + int desired_low_slice_bit = ctz64(elems_per_slice); + int desired_high_slice_bit = desired_low_slice_bit + 1; + + int bit; + if (!s_bits.empty() && + s_bits[0] == desired_low_slice_bit) { + // Only the avx-512 path should land here due to the while condition. + internal_assert(final_num_s_bits == 2); + bit = desired_high_slice_bit; + } else { + bit = desired_low_slice_bit; + } + + auto v_it = std::find(v_bits.begin(), v_bits.end(), bit); + if (v_it != v_bits.end()) { + int j = v_it - v_bits.begin(); + v_bits.erase(v_it); + s_bits.push_back(bit); + + int step = 1 << j; + std::vector new_v; + new_v.reserve(v.size() / 2); + for (size_t i = 0; i < v.size(); i++) { + size_t k = i ^ step; + if (k < i) continue; + new_v.push_back(concat_vectors({v[i], v[k]})); + } + v.swap(new_v); + vec_elements *= 2; + } else { + // Oh no, the bit we wanted to use isn't in v_bits, it's in l_bits. + // We'll do sub-width unpck instead with an appropriate v bit to try + // to push it out. This is in a while loop, so it will keep doing + // this until it pops out the top of the l bits and we identify it + // as a v bit. 
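// (This is the four half-vector situation worked through in the comment
+            // above, where the needed bit (3) starts out among the lane bits.)
+            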
+ if (std::find(l_bits.begin(), l_bits.end(), bit) != l_bits.end()) { + int b = l_bits[0] - 1; + if (std::find(v_bits.begin(), v_bits.end(), b) == v_bits.end()) { + b = v_bits.back(); + } + + auto vb_it = std::find(v_bits.begin(), v_bits.end(), b); + int j = vb_it - v_bits.begin(); + *vb_it = l_bits.back(); + l_bits.pop_back(); + l_bits.insert(l_bits.begin(), b); + + int step = 1 << j; + for (size_t i = 0; i < v.size(); i++) { + size_t k = i ^ step; + if (k < i) continue; + auto [lo, hi] = unpck(v[i], v[k]); + v[i] = lo; + v[k] = hi; + } + } + } + } + + // If only one vector is left, we just need to check if the slice bits are + // in the right order: + if (v_bits.empty()) { + internal_assert(v.size() == 1); + if (s_bits.size() == 2 && s_bits[0] > s_bits[1]) { + v[0] = self_shufi(v[0]); + std::swap(s_bits[0], s_bits[1]); + } + return v[0]; + } + + // Now we have at least two whole vectors. Next we finalize lane bits using + // unpck instructions. + while (l_bits[0] != 0) { + int bit = std::min(l_bits[0], (int)ctz64(elems_per_slice)) - 1; + + auto vb_it = std::find(v_bits.begin(), v_bits.end(), bit); + internal_assert(vb_it != v_bits.end()); + + int j = vb_it - v_bits.begin(); + *vb_it = l_bits.back(); + l_bits.pop_back(); + l_bits.insert(l_bits.begin(), bit); + + int step = 1 << j; + for (size_t i = 0; i < v.size(); i++) { + size_t k = i ^ step; + if (k < i) continue; + auto [lo, hi] = unpck(v[i], v[k]); + v[i] = lo; + v[k] = hi; + } + } + + // They should be 0, 1, 2, 3... + for (int i = 0; i < (int)l_bits.size(); i++) { + internal_assert(l_bits[i] == i); + } + + // Then we fix the slice bits with shufi instructions + + // First the low slice bit + int low_slice_bit = l_bits.size(); + auto ls_in_v = std::find(v_bits.begin(), v_bits.end(), low_slice_bit); + if (ls_in_v != v_bits.end()) { + int i = ls_in_v - v_bits.begin(); + int step = 1 << i; + std::swap(*ls_in_v, s_bits.back()); + + for (size_t idx = 0; idx < v.size(); idx++) { + size_t j = idx ^ step; + if (j <= idx) continue; + auto [lo, hi] = shufi(v[idx], v[j], false); + v[idx] = lo; + v[j] = hi; + } + } + + // And then the high slice bit, if there is one + if (final_num_s_bits == 2) { + // AVX-512 + int high_slice_bit = low_slice_bit + 1; + auto hs_in_v = std::find(v_bits.begin(), v_bits.end(), high_slice_bit); + if (hs_in_v != v_bits.end()) { + // The high slice bit is in the v_bits. Note that if it's not, it'll + // be one of the slice bits. It can't be an l bit, because we've + // already finalized them. + int i = hs_in_v - v_bits.begin(); + int step = 1 << i; + + if (!s_bits.empty() && s_bits.back() == low_slice_bit) { + // The low slice bit is currently occupying the high slice bit slot, + // so we need to shuffle it over at the same time by using the + // crossover variant of shufi. + int temp = s_bits[0]; + s_bits[0] = s_bits.back(); + s_bits.back() = *hs_in_v; + *hs_in_v = temp; + + for (size_t idx = 0; idx < v.size(); idx++) { + size_t j = idx ^ step; + if (j <= idx) continue; + auto [lo, hi] = shufi(v[idx], v[j], true); + v[idx] = lo; + v[j] = hi; + } + } else { + // The low slice bit must be already in place, so no crossover required. 
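// (The crossover form would rotate the low slice bit into place at the
+                // same time; here it is already where it belongs.)
+                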
+ internal_assert(s_bits[0] == low_slice_bit); + std::swap(*hs_in_v, s_bits.back()); + + for (size_t idx = 0; idx < v.size(); idx++) { + size_t j = idx ^ step; + if (j <= idx) continue; + auto [lo, hi] = shufi(v[idx], v[j], false); + v[idx] = lo; + v[j] = hi; + } + } + } else if (s_bits.size() == 2 && + s_bits[0] == high_slice_bit && + s_bits[1] == low_slice_bit) { + // The slice bits are both there, but in the wrong order + std::swap(s_bits[0], s_bits[1]); + for (size_t i = 0; i < v.size(); i++) { + v[i] = self_shufi(v[i]); + } + } + + // Both slice bits should be correct now + internal_assert(s_bits.size() == 2 && + s_bits[0] == low_slice_bit && + s_bits[1] == high_slice_bit); + + } else { + // AVX-2 The sole slice bit should be correct now. + internal_assert(s_bits.size() == 1 && + s_bits[0] == low_slice_bit); + } + + // The lane and slice bits are correct, but the vectors are in some + // arbitrary order. We'll reorder them by deinterleaving the list according + // to each bit position, in increasing order. + for (size_t i = 0; i < v_bits.size(); i++) { + int bit = i + s_bits.size() + l_bits.size(); + auto vb_it = std::find(v_bits.begin(), v_bits.end(), bit); + internal_assert(vb_it != v_bits.end()); + + int j = vb_it - v_bits.begin(); + v_bits.erase(vb_it); + v_bits.push_back(bit); + + std::vector a, b; + a.reserve(v.size() / 2); + b.reserve(v.size() / 2); + int mask = 1 << j; + for (size_t k = 0; k < v.size(); k++) { + if ((k & mask) == 0) { + a.push_back(v[k]); + } else { + b.push_back(v[k]); + } + } + v.clear(); + v.insert(v.end(), a.begin(), a.end()); + v.insert(v.end(), b.begin(), b.end()); + } + + // The v bits should be correct now + for (int i = 0; i < (int)v_bits.size(); i++) { + internal_assert(v_bits[i] == i + (int)(l_bits.size() + s_bits.size())); + } + + // Concatenate all results into a single vector. Phew. + return concat_vectors(v); +} + void CodeGen_X86::visit(const Allocate *op) { ScopedBinding bind(mem_type, op->name, op->memory_type); CodeGen_Posix::visit(op); diff --git a/src/Util.h b/src/Util.h index 3196c1966cbb..4a6c84d9e594 100644 --- a/src/Util.h +++ b/src/Util.h @@ -568,6 +568,11 @@ inline int64_t next_power_of_two(int64_t x) { return static_cast(1) << static_cast(std::ceil(std::log2(x))); } +/** Return whether or not an integer is a power of two. 
*/ +inline bool is_power_of_two(int64_t x) { + return (x & (x - 1)) == 0; +} + template inline T align_up(T x, int n) { return (x + n - 1) / n * n; From 188bee0d01f154fd08d594bbe94403e9a8a03e1e Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 27 Jan 2026 11:00:39 -0800 Subject: [PATCH 02/55] Update test to be more exhaustive --- test/performance/block_transpose.cpp | 188 +++++++++++++-------------- 1 file changed, 89 insertions(+), 99 deletions(-) diff --git a/test/performance/block_transpose.cpp b/test/performance/block_transpose.cpp index 740908358443..9915cf8e5f51 100644 --- a/test/performance/block_transpose.cpp +++ b/test/performance/block_transpose.cpp @@ -7,108 +7,73 @@ using namespace Halide; using namespace Halide::Tools; -enum { - scalar_trans, - vec_y_trans, - vec_x_trans +struct Result { + int type_size, block_width, block_height; + double bandwidth; }; -Buffer test_transpose(int mode) { - Func input, block, block_transpose, output; - Var x, y; - - input(x, y) = cast(x + y); - input.compute_root(); +template +Result test_transpose(int block_width, int block_height, const Target &t) { + const int N = 256; + Buffer in(N, N), out(N, N); - block(x, y) = input(x, y); - block_transpose(x, y) = block(y, x); - output(x, y) = block_transpose(x, y); - - Var xi, yi; - output.tile(x, y, xi, yi, 8, 8).vectorize(xi).unroll(yi); - - // Do 8 vectorized loads from the input. - block.compute_at(output, x).vectorize(x).unroll(y); - - std::string algorithm; - switch (mode) { - case scalar_trans: - block_transpose.compute_at(output, x).unroll(x).unroll(y); - algorithm = "Scalar transpose"; - output.compile_to_assembly(Internal::get_test_tmp_dir() + "scalar_transpose.s", std::vector()); - break; - case vec_y_trans: - block_transpose.compute_at(output, x).vectorize(y).unroll(x); - algorithm = "Transpose vectorized in y"; - output.compile_to_assembly(Internal::get_test_tmp_dir() + "fast_transpose_y.s", std::vector()); - break; - case vec_x_trans: - block_transpose.compute_at(output, x).vectorize(x).unroll(y); - algorithm = "Transpose vectorized in x"; - output.compile_to_assembly(Internal::get_test_tmp_dir() + "fast_transpose_x.s", std::vector()); - break; + for (int y = 0; y < N; y++) { + for (int x = 0; x < N; x++) { + in(x, y) = (T)(x + y * N); + } } - Buffer result(1024, 1024); - output.compile_jit(); - - output.realize(result); - - double t = benchmark([&]() { - output.realize(result); - }); - - std::cout << "Dummy Func version: " << algorithm << " bandwidth " << 1024 * 1024 / t << " byte/s.\n"; - return result; -} - -/* This illustrates how to achieve the same scheduling behavior using the 'in()' - * directive as opposed to creating dummy Funcs as done in 'test_transpose()' */ -Buffer test_transpose_wrap(int mode) { Func input, block_transpose, block, output; Var x, y; - input(x, y) = cast(x + y); - input.compute_root(); + input(x, y) = in(x, y); output(x, y) = input(y, x); Var xi, yi; - output.tile(x, y, xi, yi, 8, 8).vectorize(xi).unroll(yi); - - // Do 8 vectorized loads from the input. 
- block_transpose = input.in(output).compute_at(output, x).vectorize(x).unroll(y); - - std::string algorithm; - switch (mode) { - case scalar_trans: - block = block_transpose.in(output).reorder_storage(y, x).compute_at(output, x).unroll(x).unroll(y); - algorithm = "Scalar transpose"; - output.compile_to_assembly(Internal::get_test_tmp_dir() + "scalar_transpose.s", std::vector()); - break; - case vec_y_trans: - block = block_transpose.in(output).reorder_storage(y, x).compute_at(output, x).vectorize(y).unroll(x); - algorithm = "Transpose vectorized in y"; - output.compile_to_assembly(Internal::get_test_tmp_dir() + "fast_transpose_y.s", std::vector()); - break; - case vec_x_trans: - block = block_transpose.in(output).reorder_storage(y, x).compute_at(output, x).vectorize(x).unroll(y); - algorithm = "Transpose vectorized in x"; - output.compile_to_assembly(Internal::get_test_tmp_dir() + "fast_transpose_x.s", std::vector()); - break; - } + output.tile(x, y, xi, yi, block_width, block_height, TailStrategy::RoundUp) + .vectorize(xi) + .unroll(yi); + + // Do vectorized loads from the input. + input.in().compute_at(output, x).vectorize(x).unroll(y); + + // Transpose in registers + input.in().in().reorder_storage(y, x).compute_at(output, x).vectorize(x).unroll(y); + + // TODO: Should not be necessary, but prevents licm from doing something dumb. + output.output_buffer().dim(0).set_bounds(0, 256); - Buffer result(1024, 1024); output.compile_jit(); - output.realize(result); + output.realize(out); - double t = benchmark([&]() { - output.realize(result); + double time = benchmark(10, 10, [&]() { + output.realize(out); }); - std::cout << "Wrapper version: " << algorithm << " bandwidth " << 1024 * 1024 / t << " byte/s.\n"; - return result; + for (int y = 0; y < N; y++) { + for (int x = 0; x < N; x++) { + T actual = out(x, y), correct = in(y, x); + if (actual != correct) { + std::cerr << "out(" << x << ", " << y << ") = " + << actual << " instead of " << correct << "\n"; + exit(1); + } + } + } + + // Uncomment to dump asm for inspection + /* + output.compile_to_assembly(Internal::get_test_tmp_dir() + "transpose_uint" + + std::to_string(sizeof(T) * 8) + "_" + + std::to_string(block_width) + "x" + + std::to_string(block_height) + ".s", + std::vector{in}, "transpose", t); + */ + + return Result{(int)sizeof(T), block_width, block_height, + out.size_in_bytes() / (1.0e9 * time)}; } int main(int argc, char **argv) { @@ -118,23 +83,48 @@ int main(int argc, char **argv) { return 0; } - test_transpose(scalar_trans); - test_transpose_wrap(scalar_trans); - test_transpose(vec_y_trans); - test_transpose_wrap(vec_y_trans); - - Buffer im1 = test_transpose(vec_x_trans); - Buffer im2 = test_transpose_wrap(vec_x_trans); - - // Check correctness of the wrapper version - for (int y = 0; y < im2.height(); y++) { - for (int x = 0; x < im2.width(); x++) { - if (im2(x, y) != im1(x, y)) { - printf("wrapper(%d, %d) = %d instead of %d\n", - x, y, im2(x, y), im1(x, y)); - return 1; + // Set the target features to use for dumping to assembly + target.set_features({Target::NoRuntime, Target::NoAsserts, Target::NoBoundsQuery}); + + std::cout << "Computing best tile sizes for each type\n"; + std::vector results; + int limit = 64 * 64; + for (int bh : {1, 2, 4, 8, 16, 32, 64}) { + for (int bw : {1, 2, 4, 8, 16, 32, 64}) { + std::cout << "." 
<< std::flush; + results.push_back(test_transpose(bw, bh, target)); + if (bw * bh <= limit / 2) { + results.push_back(test_transpose(bw, bh, target)); + } + if (bw * bh <= limit / 4) { + results.push_back(test_transpose(bw, bh, target)); + } + if (bw * bh <= limit / 8) { + results.push_back(test_transpose(bw, bh, target)); + } + } + } + std::cout << "\nbytes, tile width, tile height, bandwidth (GB/s):\n"; + + // Sort the results by bandwidth + std::sort(results.begin(), results.end(), + [](const Result &a, const Result &b) { + return a.bandwidth > b.bandwidth; + }); + + // Print top n tile sizes for each type + for (int t : {1, 2, 4, 8}) { + int top_n = 5; + for (size_t i = 0; i < results.size() && top_n > 0; i++) { + if (results[i].type_size == t) { + std::cout << t << " " + << results[i].block_width << " " + << results[i].block_height << " " + << results[i].bandwidth << "\n"; + top_n--; } } + std::cout << "\n"; } printf("Success!\n"); From 2ba8ddeac15016095f2b41fec6936f0ba80eb820 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 27 Jan 2026 11:38:15 -0800 Subject: [PATCH 03/55] Fix comment. The previous comment reported a time that seemed to have regressed. It was not 8.2ms on main - more like 11 --- apps/iir_blur/iir_blur_generator.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/iir_blur/iir_blur_generator.cpp b/apps/iir_blur/iir_blur_generator.cpp index 4e4db6e61410..3c4dee4304af 100644 --- a/apps/iir_blur/iir_blur_generator.cpp +++ b/apps/iir_blur/iir_blur_generator.cpp @@ -36,7 +36,7 @@ Func blur_cols_transpose(Func input, Expr height, Expr alpha, bool skip_schedule if (!skip_schedule) { if (!target.has_gpu_feature()) { // CPU schedule. - // 8.2ms on an Intel i9-9960X using 16 threads + // 9.7ms on an Intel i9-9960X at 3.1 GHz using 16 threads // Split the transpose into tiles of rows. Parallelize over channels // and strips. Var xo, yo, t, yi; From d102f7bee65116d980284ecdc60fd2b8997e9db3 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 27 Jan 2026 11:42:50 -0800 Subject: [PATCH 04/55] Comment fix --- src/CodeGen_X86.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index ab854f72e897..6ef95f6b51bc 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -982,7 +982,7 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { Now let's consider the instructions we have available. These generally permute these bits. E.g. an instruction that interleaves two entire - vectors, applied to every pairs of vectors, would take the some vector bit + vectors, applied to pairs of vectors, would take the some vector bit and make it the lowest lane bit instead, shuffling the other bits upwards, with the highest-order within-vector bit taking the place of the vector bit (because we produce separate vectors for the low and high half of the From 46d41ddbe8bfe1bccf33c211902072407f0ead39 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 27 Jan 2026 12:54:39 -0800 Subject: [PATCH 05/55] clang-tidy fixes --- src/CodeGen_X86.cpp | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index 6ef95f6b51bc..968e9f25e54a 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -982,8 +982,8 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { Now let's consider the instructions we have available. These generally permute these bits. E.g. 
an instruction that interleaves two entire - vectors, applied to pairs of vectors, would take the some vector bit - and make it the lowest lane bit instead, shuffling the other bits upwards, + vectors, applied to pairs of vectors, would take the some vector bit and + make it the lowest lane bit instead, shuffling the other bits upwards, with the highest-order within-vector bit taking the place of the vector bit (because we produce separate vectors for the low and high half of the result. So if we used this instruction to push the highest vector bit @@ -1239,6 +1239,7 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { while ((size_t)vec_elements > elems_per_native_vec) { int cut = vec_elements / 2; std::vector new_v; + new_v.reserve(v.size() * 2); for (auto *vec : v) { new_v.push_back(slice_vector(vec, 0, cut)); } @@ -1282,7 +1283,9 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { size_t j = i ^ step; // Don't process vectors twice. - if (j < i) continue; + if (j < i) { + continue; + } // Just interleave the two vectors. Because we have fewer // elements than one slice, unpckl/h is a straight interleave. @@ -1411,7 +1414,9 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { for (size_t idx = 0; idx < v.size(); idx++) { size_t j = idx ^ step; - if (j <= idx) continue; + if (j <= idx) { + continue; + } auto [lo, hi] = shufi(v[idx], v[j], false); v[idx] = lo; v[j] = hi; @@ -1441,7 +1446,9 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { for (size_t idx = 0; idx < v.size(); idx++) { size_t j = idx ^ step; - if (j <= idx) continue; + if (j <= idx) { + continue; + } auto [lo, hi] = shufi(v[idx], v[j], true); v[idx] = lo; v[j] = hi; @@ -1453,7 +1460,9 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { for (size_t idx = 0; idx < v.size(); idx++) { size_t j = idx ^ step; - if (j <= idx) continue; + if (j <= idx) { + continue; + } auto [lo, hi] = shufi(v[idx], v[j], false); v[idx] = lo; v[j] = hi; @@ -1464,8 +1473,8 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { s_bits[1] == low_slice_bit) { // The slice bits are both there, but in the wrong order std::swap(s_bits[0], s_bits[1]); - for (size_t i = 0; i < v.size(); i++) { - v[i] = self_shufi(v[i]); + for (auto &vec : v) { + vec = self_shufi(vec); } } From 27f122026c317b18afc3b39b0044bd135d8e135a Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 27 Jan 2026 14:16:52 -0800 Subject: [PATCH 06/55] Make variable names more consistent --- src/CodeGen_X86.cpp | 61 +++++++++++++++++++++++++-------------------- 1 file changed, 34 insertions(+), 27 deletions(-) diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index 968e9f25e54a..2b086538eee5 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -1328,7 +1328,9 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { new_v.reserve(v.size() / 2); for (size_t i = 0; i < v.size(); i++) { size_t k = i ^ step; - if (k < i) continue; + if (k < i) { + continue; + } new_v.push_back(concat_vectors({v[i], v[k]})); } v.swap(new_v); @@ -1341,11 +1343,12 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { // as a v bit. 
if (std::find(l_bits.begin(), l_bits.end(), bit) != l_bits.end()) { int b = l_bits[0] - 1; - if (std::find(v_bits.begin(), v_bits.end(), b) == v_bits.end()) { - b = v_bits.back(); + auto vb_it = std::find(v_bits.begin(), v_bits.end(), b); + if (vb_it == v_bits.end()) { + vb_it = v_bits.end() - 1; + b = *vb_it; } - auto vb_it = std::find(v_bits.begin(), v_bits.end(), b); int j = vb_it - v_bits.begin(); *vb_it = l_bits.back(); l_bits.pop_back(); @@ -1354,7 +1357,9 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { int step = 1 << j; for (size_t i = 0; i < v.size(); i++) { size_t k = i ^ step; - if (k < i) continue; + if (k < i) { + continue; + } auto [lo, hi] = unpck(v[i], v[k]); v[i] = lo; v[k] = hi; @@ -1390,7 +1395,9 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { int step = 1 << j; for (size_t i = 0; i < v.size(); i++) { size_t k = i ^ step; - if (k < i) continue; + if (k < i) { + continue; + } auto [lo, hi] = unpck(v[i], v[k]); v[i] = lo; v[k] = hi; @@ -1408,18 +1415,18 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { int low_slice_bit = l_bits.size(); auto ls_in_v = std::find(v_bits.begin(), v_bits.end(), low_slice_bit); if (ls_in_v != v_bits.end()) { - int i = ls_in_v - v_bits.begin(); - int step = 1 << i; + int j = ls_in_v - v_bits.begin(); + int step = 1 << j; std::swap(*ls_in_v, s_bits.back()); - for (size_t idx = 0; idx < v.size(); idx++) { - size_t j = idx ^ step; - if (j <= idx) { + for (size_t i = 0; i < v.size(); i++) { + size_t k = i ^ step; + if (k < i) { continue; } - auto [lo, hi] = shufi(v[idx], v[j], false); - v[idx] = lo; - v[j] = hi; + auto [lo, hi] = shufi(v[i], v[k], false); + v[i] = lo; + v[k] = hi; } } @@ -1432,8 +1439,8 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { // The high slice bit is in the v_bits. Note that if it's not, it'll // be one of the slice bits. It can't be an l bit, because we've // already finalized them. - int i = hs_in_v - v_bits.begin(); - int step = 1 << i; + int j = hs_in_v - v_bits.begin(); + int step = 1 << j; if (!s_bits.empty() && s_bits.back() == low_slice_bit) { // The low slice bit is currently occupying the high slice bit slot, @@ -1444,27 +1451,27 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { s_bits.back() = *hs_in_v; *hs_in_v = temp; - for (size_t idx = 0; idx < v.size(); idx++) { - size_t j = idx ^ step; - if (j <= idx) { + for (size_t i = 0; i < v.size(); i++) { + size_t k = i ^ step; + if (k < i) { continue; } - auto [lo, hi] = shufi(v[idx], v[j], true); - v[idx] = lo; - v[j] = hi; + auto [lo, hi] = shufi(v[i], v[k], true); + v[i] = lo; + v[k] = hi; } } else { // The low slice bit must be already in place, so no crossover required. 
internal_assert(s_bits[0] == low_slice_bit); std::swap(*hs_in_v, s_bits.back()); - for (size_t idx = 0; idx < v.size(); idx++) { - size_t j = idx ^ step; - if (j <= idx) { + for (size_t i = 0; i < v.size(); i++) { + size_t k = i ^ step; + if (k < i) { continue; } - auto [lo, hi] = shufi(v[idx], v[j], false); - v[idx] = lo; + auto [lo, hi] = shufi(v[i], v[k], false); + v[i] = lo; v[j] = hi; } } From 5576f46776bbfc0f629c6ec8adf8986943056fd5 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 27 Jan 2026 15:18:19 -0800 Subject: [PATCH 07/55] Simplify code with helper lambda --- src/CodeGen_X86.cpp | 118 ++++++++++++++++++-------------------------- 1 file changed, 49 insertions(+), 69 deletions(-) diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index 2b086538eee5..8cfcedccd50d 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -1233,6 +1233,23 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { return optimization_fence(shuffle_vectors(a, a, indices)); }; + // A helper to iterate over all pairs of entries in v, separated by some + // power-of-two spacing. + auto for_all_pairs = [&](size_t log_step, auto fn) { + size_t step = 1 << log_step; + for (size_t i = 0; i < v.size(); i++) { + // Pair each vector with the one separated by the step. + size_t j = i ^ step; + + // Don't process vectors twice. + if (j < i) { + continue; + } + + fn(&v[i], &v[j]); + } + }; + // First, if the vectors are wider than native, that will manifest as too // many slice bits. Cut them into separate native vectors. This will not // create any instructions. @@ -1275,18 +1292,9 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { // The distance in the vecs array is the index of the corresponding // v bit we're pulling down. - int step = 1 << j; std::vector new_v; new_v.reserve(v.size() / 2); - for (size_t i = 0; i < v.size(); i++) { - // Pair each vector with the one separated by the step. - size_t j = i ^ step; - - // Don't process vectors twice. - if (j < i) { - continue; - } - + for_all_pairs(j, [&](auto *a, auto *b) { // Just interleave the two vectors. Because we have fewer // elements than one slice, unpckl/h is a straight interleave. 
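// (E.g. with vec_elements == 4, indices becomes {0, 4, 1, 5, 2, 6, 3, 7}.)
+                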
std::vector indices; @@ -1294,8 +1302,8 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { indices.push_back(k); indices.push_back(vec_elements + k); } - new_v.push_back(shuffle_vectors(v[i], v[j], indices)); - } + new_v.push_back(shuffle_vectors(*a, *b, indices)); + }); v.swap(new_v); vec_elements *= 2; bit--; @@ -1323,16 +1331,11 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { v_bits.erase(v_it); s_bits.push_back(bit); - int step = 1 << j; std::vector new_v; new_v.reserve(v.size() / 2); - for (size_t i = 0; i < v.size(); i++) { - size_t k = i ^ step; - if (k < i) { - continue; - } - new_v.push_back(concat_vectors({v[i], v[k]})); - } + for_all_pairs(j, [&](auto *a, auto *b) { + new_v.push_back(concat_vectors({*a, *b})); + }); v.swap(new_v); vec_elements *= 2; } else { @@ -1354,16 +1357,11 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { l_bits.pop_back(); l_bits.insert(l_bits.begin(), b); - int step = 1 << j; - for (size_t i = 0; i < v.size(); i++) { - size_t k = i ^ step; - if (k < i) { - continue; - } - auto [lo, hi] = unpck(v[i], v[k]); - v[i] = lo; - v[k] = hi; - } + for_all_pairs(j, [&](auto *a, auto *b) { + auto [lo, hi] = unpck(*a, *b); + *a = lo; + *b = hi; + }); } } } @@ -1392,16 +1390,11 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { l_bits.pop_back(); l_bits.insert(l_bits.begin(), bit); - int step = 1 << j; - for (size_t i = 0; i < v.size(); i++) { - size_t k = i ^ step; - if (k < i) { - continue; - } - auto [lo, hi] = unpck(v[i], v[k]); - v[i] = lo; - v[k] = hi; - } + for_all_pairs(j, [&](auto *a, auto *b) { + auto [lo, hi] = unpck(*a, *b); + *a = lo; + *b = hi; + }); } // They should be 0, 1, 2, 3... @@ -1416,18 +1409,13 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { auto ls_in_v = std::find(v_bits.begin(), v_bits.end(), low_slice_bit); if (ls_in_v != v_bits.end()) { int j = ls_in_v - v_bits.begin(); - int step = 1 << j; std::swap(*ls_in_v, s_bits.back()); - for (size_t i = 0; i < v.size(); i++) { - size_t k = i ^ step; - if (k < i) { - continue; - } - auto [lo, hi] = shufi(v[i], v[k], false); - v[i] = lo; - v[k] = hi; - } + for_all_pairs(j, [&](auto *a, auto *b) { + auto [lo, hi] = shufi(*a, *b, false); + *a = lo; + *b = hi; + }); } // And then the high slice bit, if there is one @@ -1440,7 +1428,6 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { // be one of the slice bits. It can't be an l bit, because we've // already finalized them. int j = hs_in_v - v_bits.begin(); - int step = 1 << j; if (!s_bits.empty() && s_bits.back() == low_slice_bit) { // The low slice bit is currently occupying the high slice bit slot, @@ -1451,29 +1438,22 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { s_bits.back() = *hs_in_v; *hs_in_v = temp; - for (size_t i = 0; i < v.size(); i++) { - size_t k = i ^ step; - if (k < i) { - continue; - } - auto [lo, hi] = shufi(v[i], v[k], true); - v[i] = lo; - v[k] = hi; - } + for_all_pairs(j, [&](auto *a, auto *b) { + auto [lo, hi] = shufi(*a, *b, true); + *a = lo; + *b = hi; + }); + } else { // The low slice bit must be already in place, so no crossover required. 
internal_assert(s_bits[0] == low_slice_bit); std::swap(*hs_in_v, s_bits.back()); - for (size_t i = 0; i < v.size(); i++) { - size_t k = i ^ step; - if (k < i) { - continue; - } - auto [lo, hi] = shufi(v[i], v[k], false); - v[i] = lo; - v[j] = hi; - } + for_all_pairs(j, [&](auto *a, auto *b) { + auto [lo, hi] = shufi(*a, *b, false); + *a = lo; + *b = hi; + }); } } else if (s_bits.size() == 2 && s_bits[0] == high_slice_bit && From 107aaa5122b6c962c51b2cefb181671315368635 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 27 Jan 2026 15:20:06 -0800 Subject: [PATCH 08/55] Comment tweaks --- src/CodeGen_X86.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index 8cfcedccd50d..b68987bebd71 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -950,17 +950,17 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { works poorly. Here we have a somewhat complex algorithm for generating better sequences of shuffle instructions for avx and avx-512. - Consider the location of one of the elements of one of the vectors. It - has a vector index, which says which vector it's in, and a vector lane - index, which gives the lane. x86 shuffles work in terms of 128-bit - subvectors, which we will call slices. So we'll decompose that lane index - into a slice index, to identify the 128-bit slice within a vector, and - the lane index within that slice. For avx the slice index is either zero - or one, and for avx-512 it's 0, 1, 2, or 3. Because we have limited - everything to be a power of two, we can write out these indices in - binary. We'll use v for the vector index, s for the slice index, and l - for the lane index. For an avx-512 interleave of 16 vectors of 32 - elements each (i.e. uint16s), a location could thus be written as: + Consider the location of one of the elements of one of the vectors. It has + a vector index, which says which vector it's in, and a vector lane index, + which gives the lane. x86 shuffles work in terms of 128-bit subvectors, + which we will call slices. So we'll decompose that lane index into a slice + index, to identify the 128-bit slice within a vector, and the lane index + within that slice. For avx the slice index is either zero or one, and for + avx-512 it can be zero through three. Because we have limited everything + to be a power of two, we can write out these indices in binary. We'll use + v for the vector index, s for the slice index, and l for the lane + index. For an avx-512 interleave of 16 vectors of 32 elements each + (i.e. uint16s), a location could thus be written as: [l0 l1 l2] [s0 s1] [v0 v1 v2 v3] @@ -982,7 +982,7 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { Now let's consider the instructions we have available. These generally permute these bits. E.g. 
an instruction that interleaves two entire - vectors, applied to pairs of vectors, would take the some vector bit and + vectors, applied to pairs of vectors, would take the same vector bit and make it the lowest lane bit instead, shuffling the other bits upwards, with the highest-order within-vector bit taking the place of the vector bit (because we produce separate vectors for the low and high half of the From 0bc1b9f28c6f0c1d3a0b48b43b578014ee116f83 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Wed, 28 Jan 2026 13:36:52 -0800 Subject: [PATCH 09/55] Don't do half-width unpcks --- src/CodeGen_X86.cpp | 150 +++++++++++++-------------- test/performance/block_transpose.cpp | 3 +- 2 files changed, 75 insertions(+), 78 deletions(-) diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index b68987bebd71..4bf504967ab3 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -936,11 +936,13 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { const size_t elems_per_slice = 128 / element_bits; // Only apply special x86 logic for power-of-two interleaves for avx and - // above (TODO: Could slice into native vectors and concat results even if - // not power of two) + // above where we're going to end up with multiple native vectors (TODO: + // Could slice into native vectors and concat results even if not power of + // two) if (!is_power_of_two(vec_elements) || - !is_power_of_two(vecs.size())) { + !is_power_of_two(vecs.size()) || + (vecs.size() * vec_elements * element_bits) <= (size_t)native_vector_bits()) { return CodeGen_Posix::interleave_vectors(vecs); } @@ -1091,35 +1093,27 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { [2 3 4] [5] [0 1] - There's no good concatenation we can do to make whole vectors. That 0 and 1 - both need to end up as lanes bits, and we have no instructions that swap - slice bits with lanes bits. So we'll just have to run unpck instructions at - half-vector width to push that 4 into the vector bit range: + Let's concatenate adjacent pairs as before. - [1 2 3] [5] [0 4] + [2 3 4] [5 0] [1] - and now we can concatenate according to bit 4 to make whole vectors + Now we do one unpck - [1 2 3] [5 4] [0] - - We then do one more unpck to pull the 0 down: - - [0 1 2] [5 4] [3] - - Next, we need to make 3 a slice bit. We can use shufi to swap it with 4: + [1 2 3] [5 0] [4] - [0 1 2] [5 3] [4] + And we encounter a problem when it comes to the second one. The next bit + we want pull in is hiding in the slice bits, which unpck instructions + can't access. So at this point we use a shufi to push it back into the + vector bits, swapping 0 and 4. - and then another shufi to rotate those three + [1 2 3] [5 4] [0] - [0 1 2] [3 4] [5] + Now we can do the last unpck. - and we're done. + [0 1 2] [5 4] [3] - Depending on how many of each bit we start with, we can also end up in - situations where everything is correct except the two slice bits are in - the wrong order, in which case we can use a shufi instruction with a - vector and itself to swap those two bits. + From here we can use two shufi instructions to fix up the vector and slice + bits. 
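(One shufi swaps 3 with 4, giving [0 1 2] [5 3] [4], and a second rotates
+    those three into place, giving [0 1 2] [3 4] [5].)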
So there are many possible paths depending on the number of elements per vector, the number of elements per 128-bit slice of each vector, and the @@ -1134,7 +1128,8 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { // The number of 128-bit slices per vector is 2 for avx and 4 for avx512 const int final_num_s_bits = ctz64(native_vector_bits() / 128); - internal_assert(final_num_s_bits == 1 || final_num_s_bits == 2) << native_vector_bits() << " " << final_num_s_bits << "\n"; + internal_assert(final_num_s_bits == 1 || final_num_s_bits == 2) + << native_vector_bits() << " " << final_num_s_bits; const int num_v_bits = ctz64(v.size()); const int num_s_bits = ((size_t)vec_elements <= elems_per_slice) ? 0 : ctz64(vec_elements / elems_per_slice); @@ -1216,7 +1211,7 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { } Value *lo = shuffle_vectors(a, b, lo_indices); Value *hi = shuffle_vectors(a, b, hi_indices); - return {optimization_fence(lo), optimization_fence(hi)}; + return {lo, hi}; }; // A 2x2 transpose of slices within a single vector @@ -1230,7 +1225,7 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { indices.push_back(i + j * (int)elems_per_slice); } } - return optimization_fence(shuffle_vectors(a, a, indices)); + return shuffle_vectors(a, a, indices); }; // A helper to iterate over all pairs of entries in v, separated by some @@ -1326,64 +1321,64 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { } auto v_it = std::find(v_bits.begin(), v_bits.end(), bit); - if (v_it != v_bits.end()) { - int j = v_it - v_bits.begin(); - v_bits.erase(v_it); - s_bits.push_back(bit); - std::vector new_v; - new_v.reserve(v.size() / 2); - for_all_pairs(j, [&](auto *a, auto *b) { - new_v.push_back(concat_vectors({*a, *b})); - }); - v.swap(new_v); - vec_elements *= 2; - } else { - // Oh no, the bit we wanted to use isn't in v_bits, it's in l_bits. - // We'll do sub-width unpck instead with an appropriate v bit to try - // to push it out. This is in a while loop, so it will keep doing - // this until it pops out the top of the l bits and we identify it - // as a v bit. - if (std::find(l_bits.begin(), l_bits.end(), bit) != l_bits.end()) { - int b = l_bits[0] - 1; - auto vb_it = std::find(v_bits.begin(), v_bits.end(), b); - if (vb_it == v_bits.end()) { - vb_it = v_bits.end() - 1; - b = *vb_it; - } + if (v_it == v_bits.end()) { + // Just concatenate according to the lowest vector bit. + v_it = v_bits.begin(); + bit = *v_it; + } - int j = vb_it - v_bits.begin(); - *vb_it = l_bits.back(); - l_bits.pop_back(); - l_bits.insert(l_bits.begin(), b); + int j = v_it - v_bits.begin(); + v_bits.erase(v_it); + s_bits.push_back(bit); - for_all_pairs(j, [&](auto *a, auto *b) { - auto [lo, hi] = unpck(*a, *b); - *a = lo; - *b = hi; - }); - } - } + std::vector new_v; + new_v.reserve(v.size() / 2); + for_all_pairs(j, [&](auto *a, auto *b) { + new_v.push_back(concat_vectors({*a, *b})); + }); + v.swap(new_v); + vec_elements *= 2; } - // If only one vector is left, we just need to check if the slice bits are - // in the right order: - if (v_bits.empty()) { - internal_assert(v.size() == 1); - if (s_bits.size() == 2 && s_bits[0] > s_bits[1]) { - v[0] = self_shufi(v[0]); - std::swap(s_bits[0], s_bits[1]); - } - return v[0]; - } + // There should be more than one vector left + internal_assert(v.size() > 1); - // Now we have at least two whole vectors. Next we finalize lane bits using + // Now we have at least two whole vectors. 
Next we try to finalize lane bits using
     // unpck instructions.
     while (l_bits[0] != 0) {
         int bit = std::min(l_bits[0], (int)ctz64(elems_per_slice)) - 1;

         auto vb_it = std::find(v_bits.begin(), v_bits.end(), bit);
-        internal_assert(vb_it != v_bits.end());
+
+        // internal_assert(vb_it != v_bits.end());
+        if (vb_it == v_bits.end()) {
+            // The next bit is not in vector bits. It must be hiding in the
+            // slice bits due to earlier concatenation. Move it into the v_bits
+            // with a shufi
+            if (s_bits.back() == bit) {
+                // It's the last (or sole) slice bit. Swap it with the first v bit
+                std::swap(s_bits.back(), v_bits[0]);
+                for_all_pairs(0, [&](auto *a, auto *b) {
+                    auto [lo, hi] = shufi(*a, *b, false);
+                    *a = lo;
+                    *b = hi;
+                });
+            } else {
+                internal_assert(s_bits.size() == 2 && s_bits[0] == bit);
+                // It's the low slice bit. We need shufi with crossover.
+                int v_bit = v_bits[0];
+                v_bits[0] = s_bits[0];
+                s_bits[0] = s_bits[1];
+                s_bits[1] = v_bit;
+                for_all_pairs(0, [&](auto *a, auto *b) {
+                    auto [lo, hi] = shufi(*a, *b, true);
+                    *a = lo;
+                    *b = hi;
+                });
+            }
+            vb_it = v_bits.begin();
+        }

         int j = vb_it - v_bits.begin();
         *vb_it = l_bits.back();
@@ -1397,14 +1392,15 @@ Value *CodeGen_X86::interleave_vectors(const std::vector<Value *> &vecs) {
         });
     }

-    // They should be 0, 1, 2, 3...
+    // Lane bits should now be 0, 1, 2, 3...
     for (int i = 0; i < (int)l_bits.size(); i++) {
         internal_assert(l_bits[i] == i);
     }

-    // Then we fix the slice bits with shufi instructions
+    // Time to fix the slice bits

-    // First the low slice bit
+    // First the low slice bit. If it's one of the v bits, move it to be the
+    // high slice bit with a shufi.
     int low_slice_bit = l_bits.size();
     auto ls_in_v = std::find(v_bits.begin(), v_bits.end(), low_slice_bit);
     if (ls_in_v != v_bits.end()) {
diff --git a/test/performance/block_transpose.cpp b/test/performance/block_transpose.cpp
index 9915cf8e5f51..8760d8ac5495 100644
--- a/test/performance/block_transpose.cpp
+++ b/test/performance/block_transpose.cpp
@@ -56,7 +56,8 @@ Result test_transpose(int block_width, int block_height, const Target &t) {
         for (int x = 0; x < N; x++) {
             T actual = out(x, y), correct = in(y, x);
             if (actual != correct) {
-                std::cerr << "out(" << x << ", " << y << ") = "
+                std::cerr << "For block size (" << block_width << ", " << block_height << "): "
+                          << "out(" << x << ", " << y << ") = "
                           << actual << " instead of " << correct << "\n";
                 exit(1);
             }

From cdc1de283010d19d58bc5944ae96f1e5ca4171f6 Mon Sep 17 00:00:00 2001
From: Andrew Adams
Date: Fri, 30 Jan 2026 15:50:35 +1100
Subject: [PATCH 10/55] Use optimization fences in the base class too

Before:

Computing best tile sizes for each type
.................................................
bytes, tile width, tile height, bandwidth (GB/s):
1   8   8   20.9997
1  16   8   20.8329
1   8  16   18.5702
1   8  32   17.2463
1   8  64   14.312
2   8  16   19.2047
2   8   8   18.8368
2  16   8   17.0593
2   8  32   17.0591
2   4   8   15.7681
4   8   8   24.9364
4   4  16   22.9699
4   8  16   22.5743
4   4  32   22.255
4   4   8   20.4468
8   8   8   38.4094
8  16   4   28.4167
8  16   8   27.6184
8   8   4   27.6062
8   8  16   26.8693

After:

Computing best tile sizes for each type
.................................................
bytes, tile width, tile height, bandwidth (GB/s):
1  16  32   34.1921
1  16  16   31.8399
1   8  16   25.575
1  16  64   25.1665
1  32  16   25.0061
2   8  32   28.2635
2   8  16   27.7648
2  16  16   27.2126
2  16  32   23.9034
2   8   8   23.6345
4   8  16   34.5303
4   8   8   28.3653
4  16   8   26.8521
4   8  32   26.084
4  16  16   24.4519
8   8   8   33.7163
8   8   4   29.1339
8   4  16   26.418
8  16   4   25.4663
8   2   8   24.3949
---
 src/CodeGen_LLVM.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp
index 7715fce28c34..72b648feff1b 100644
--- a/src/CodeGen_LLVM.cpp
+++ b/src/CodeGen_LLVM.cpp
@@ -2195,7 +2195,10 @@ Value *CodeGen_LLVM::optimization_fence(Value *v) {
     internal_assert(!t->isScalableTy())
         << "optimization_fence does not support scalable vectors yet";
     const int bits = t->getPrimitiveSizeInBits();
-    llvm::Type *float_type = llvm_type_of(Float(64, bits / 64));
+    if (bits % 16) {
+        return v;
+    }
+    llvm::Type *float_type = llvm_type_of(Float(16, bits / 16));
     v = builder->CreateBitCast(v, float_type);
     v = builder->CreateArithmeticFence(v, float_type);
     return builder->CreateBitCast(v, t);
@@ -2217,7 +2220,7 @@ Value *CodeGen_LLVM::interleave_vectors(const std::vector<Value *> &vecs) {
         for (int i = 0; i < vec_elements * 2; i++) {
             indices[i] = i % 2 == 0 ? i / 2 : i / 2 + vec_elements;
         }
-        return shuffle_vectors(a, b, indices);
+        return optimization_fence(shuffle_vectors(a, b, indices));
     } else {
         // Grab the even and odd elements of vecs.
         vector<Value *> even_vecs;

From 3eef5dbac3c531283fd752e3d50f7db1d194ce55 Mon Sep 17 00:00:00 2001
From: Andrew Adams
Date: Thu, 12 Feb 2026 07:49:33 -0800
Subject: [PATCH 11/55] Use Catanzaro's algorithm for non-power-of-two interleaves

---
 src/CodeGen_LLVM.cpp            | 120 ++++++++++++++--------
 test/performance/CMakeLists.txt |   1 +
 test/performance/interleave.cpp | 159 ++++++++++++++++++++++++++++
 3 files changed, 241 insertions(+), 39 deletions(-)
 create mode 100644 test/performance/interleave.cpp

diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp
index b5aa069e673d..57aa43299eea 100644
--- a/src/CodeGen_LLVM.cpp
+++ b/src/CodeGen_LLVM.cpp
@@ -2211,6 +2211,8 @@ Value *CodeGen_LLVM::interleave_vectors(const std::vector<Value *> &vecs) {
     }

     int vec_elements = get_vector_num_elements(vecs[0]->getType());
+    int factor = gcd(vec_elements, (int)vecs.size());
+
     if (vecs.size() == 1) {
         return vecs[0];
     } else if (vecs.size() == 2) {
@@ -2221,57 +2223,97 @@ Value *CodeGen_LLVM::interleave_vectors(const std::vector<Value *> &vecs) {
             indices[i] = i % 2 == 0 ? i / 2 : i / 2 + vec_elements;
         }
         return optimization_fence(shuffle_vectors(a, b, indices));
-    } else {
-        // Grab the even and odd elements of vecs.
-        vector<Value *> even_vecs;
-        vector<Value *> odd_vecs;
-        for (size_t i = 0; i < vecs.size(); i++) {
-            if (i % 2 == 0) {
-                even_vecs.push_back(vecs[i]);
-            } else {
-                odd_vecs.push_back(vecs[i]);
+    } else if (factor == 1) {
+        // The number of vectors and the vector length are
+        // coprime. (E.g. interleaving an odd number of vectors of some
+        // power-of-two length). Use the algorithm from "A Decomposition for
+        // In-place Matrix Transposition" by Catanzaro et al.
+        std::vector<Value *> v = vecs;
+
+        // Using unary shuffles, get each element into the right ultimate
+        // lane. This works out without collisions because the number of vectors
+        // and the length of each vector are coprime.
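As a quick standalone check of that claim (not part of the patch), the
following uses the same num_vecs and vec_elements names as the code below and
asserts that no two lanes of a source vector are sent to the same target lane
when the counts are coprime:

    #include <cassert>
    #include <numeric>

    int main() {
        const int num_vecs = 3, vec_elements = 8;
        assert(std::gcd(num_vecs, vec_elements) == 1);
        // Lane j of source vector i must end at flat index j * num_vecs + i,
        // so within vector i it is sent to lane (j * num_vecs + i) % vec_elements.
        for (int i = 0; i < num_vecs; i++) {
            bool taken[vec_elements] = {false};
            for (int j = 0; j < vec_elements; j++) {
                int k = (j * num_vecs + i) % vec_elements;
                // j -> (j * num_vecs) % vec_elements is a bijection, so no
                // two lanes collide.
                assert(!taken[k]);
                taken[k] = true;
            }
        }
    }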
+ const int num_vecs = (int)v.size(); + std::vector shuffle(vec_elements); + for (int i = 0; i < num_vecs; i++) { + for (int j = 0; j < vec_elements; j++) { + int k = j * num_vecs + i; + shuffle[k % vec_elements] = j; } + v[i] = shuffle_vectors(v[i], v[i], shuffle); } - // If the number of vecs is odd, save the last one for later. - Value *last = nullptr; - if (even_vecs.size() > odd_vecs.size()) { - last = even_vecs.back(); - even_vecs.pop_back(); + // We intentionally don't put an optimization fence after the unary + // shuffles, because some architectures have a two-way shuffle, so it + // helps to fuse the unary shuffle into the first layer of two-way + // blends below. + + // Now we need to transfer the elements across the vectors. If we + // reorder the vectors, this becomes a rotation across the vectors of a + // different amount per lane. + std::vector new_v(v.size()); + for (int i = 0; i < num_vecs; i++) { + int j = (i * vec_elements) % num_vecs; + new_v[i] = v[j]; } - internal_assert(even_vecs.size() == odd_vecs.size()); + v.swap(new_v); - // Interleave the even and odd parts. - Value *even = interleave_vectors(even_vecs); - Value *odd = interleave_vectors(odd_vecs); + std::vector rotation(vec_elements, 0); + for (int i = 0; i < vec_elements; i++) { + int k = (i * num_vecs) % vec_elements; + rotation[k] = (i * num_vecs) / vec_elements; + } + internal_assert(rotation[0] == 0); - if (last) { - int result_elements = vec_elements * vecs.size(); + // We'll handle each bit of the rotation one at a time with a two-way + // shuffle. + int d = 1; + while (d < num_vecs) { - // Interleave even and odd, leaving a space for the last element. - vector indices(result_elements, -1); - for (int i = 0, idx = 0; i < result_elements; i++) { - if (i % vecs.size() < vecs.size() - 1) { - indices[i] = idx % 2 == 0 ? idx / 2 : idx / 2 + vec_elements * even_vecs.size(); - idx++; - } + for (int i = 0; i < vec_elements; i++) { + shuffle[i] = ((rotation[i] & d) == 0) ? i : (i + vec_elements); } - Value *even_odd = shuffle_vectors(even, odd, indices); - // Interleave the last vector into the result. - last = slice_vector(last, 0, result_elements); - for (int i = 0; i < result_elements; i++) { - if (i % vecs.size() < vecs.size() - 1) { - indices[i] = i; - } else { - indices[i] = i / vecs.size() + result_elements; - } + for (int i = 0; i < num_vecs; i++) { + int j = (i + num_vecs - d) % num_vecs; + new_v[i] = shuffle_vectors(v[i], v[j], shuffle); } - return shuffle_vectors(even_odd, last, indices); - } else { - return interleave_vectors({even, odd}); + v.swap(new_v); + + d *= 2; } + + return concat_vectors(v); + + } else { + // The number of vectors shares a factor with the length of the + // vectors. Pick some large factor of the number of vectors, interleave + // in separate groups, and then interleave the results. 
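Before the composite-factor code below, it may help to see the whole
factor == 1 branch above condensed into a scalar reference model (not part of
the patch; num_vecs = 3 and lanes = 4 are chosen so the counts are coprime).
Each element carries the flat index it must reach, and the three steps are the
unary lane-fix shuffles, the vector reordering, and the per-lane rotation
applied one bit at a time:

    #include <cassert>
    #include <vector>

    int main() {
        const int num_vecs = 3, lanes = 4;
        // v[i][j] holds the flat index its element must reach: j * num_vecs + i.
        std::vector<std::vector<int>> v(num_vecs, std::vector<int>(lanes));
        for (int i = 0; i < num_vecs; i++)
            for (int j = 0; j < lanes; j++)
                v[i][j] = j * num_vecs + i;

        // Step 1: unary shuffles put each element into its final lane (mod lanes).
        for (int i = 0; i < num_vecs; i++) {
            std::vector<int> sh(lanes), shuffled(lanes);
            for (int j = 0; j < lanes; j++) sh[(j * num_vecs + i) % lanes] = j;
            for (int j = 0; j < lanes; j++) shuffled[j] = v[i][sh[j]];
            v[i] = shuffled;
        }

        // Step 2: reorder the vectors so the cross-vector move is a rotation.
        std::vector<std::vector<int>> nv(num_vecs);
        for (int i = 0; i < num_vecs; i++) nv[i] = v[(i * lanes) % num_vecs];
        v = nv;

        // Step 3: apply the per-lane rotation one bit at a time with blends.
        std::vector<int> rot(lanes, 0);
        for (int i = 0; i < lanes; i++) rot[(i * num_vecs) % lanes] = (i * num_vecs) / lanes;
        for (int d = 1; d < num_vecs; d *= 2) {
            std::vector<std::vector<int>> b(num_vecs, std::vector<int>(lanes));
            for (int i = 0; i < num_vecs; i++)
                for (int l = 0; l < lanes; l++)
                    b[i][l] = (rot[l] & d) ? v[(i + num_vecs - d) % num_vecs][l] : v[i][l];
            v = b;
        }

        // The concatenation is now the full interleave: 0, 1, ..., 11.
        for (int i = 0; i < num_vecs; i++)
            for (int j = 0; j < lanes; j++)
                assert(v[i][j] == i * lanes + j);
    }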
+ const int n = (int)vecs.size(); + int f = 1; + for (int i = 2; i < n; i++) { + if (n % i == 0) { + f = i; + break; + } + } + + internal_assert(f > 1 && f < n); + + vector> groups(f); + for (size_t i = 0; i < vecs.size(); i++) { + groups[i % f].push_back(vecs[i]); + } + + // Interleave each group + vector interleaved(f); + for (int i = 0; i < f; i++) { + interleaved[i] = optimization_fence(interleave_vectors(groups[i])); + } + + // Interleave the result + return interleave_vectors(interleaved); } } diff --git a/test/performance/CMakeLists.txt b/test/performance/CMakeLists.txt index 851e7e3ae506..5978ac2961a3 100644 --- a/test/performance/CMakeLists.txt +++ b/test/performance/CMakeLists.txt @@ -16,6 +16,7 @@ tests(GROUPS performance fast_pow.cpp fast_sine_cosine.cpp gpu_half_throughput.cpp + interleave.cpp jit_stress.cpp lots_of_inputs.cpp memcpy.cpp diff --git a/test/performance/interleave.cpp b/test/performance/interleave.cpp new file mode 100644 index 000000000000..f73d7b687ac4 --- /dev/null +++ b/test/performance/interleave.cpp @@ -0,0 +1,159 @@ +#include "Halide.h" +#include "halide_benchmark.h" +#include "halide_test_dirs.h" + +#include + +using namespace Halide; +using namespace Halide::Tools; + +struct Result { + int type_size, factor; + double bandwidth; +}; + +template +Result test_interleave(int factor, const Target &t) { + const int N = 8192; + Buffer in(N, factor), out(N * factor); + + for (int y = 0; y < factor; y++) { + for (int x = 0; x < N; x++) { + in(x, y) = (T)(x * factor + y); + } + } + + Func output; + Var x, y; + + output(x) = in(x / factor, x % factor); + + Var xi, yi; + output.unroll(x, factor, TailStrategy::RoundUp).vectorize(x, t.natural_vector_size(), TailStrategy::RoundUp); + output.output_buffer().dim(0).set_min(0); + + output.compile_jit(); + + output.realize(out); + + double time = benchmark(20, 20, [&]() { + output.realize(out); + }); + + for (int y = 0; y < factor; y++) { + for (int x = 0; x < N; x++) { + uint64_t actual = out(x * factor + y), correct = in(x, y); + if (actual != correct) { + std::cerr << "For factor " << factor + << "out(" << x << " * " << factor << " + " << y << ") = " + << actual << " instead of " << correct << "\n"; + exit(1); + } + } + } + + // Uncomment to dump asm for inspection + // output.compile_to_assembly("/dev/stdout", + // std::vector{in}, "interleave", t); + + return Result{(int)sizeof(T), factor, out.size_in_bytes() / (1.0e9 * time)}; +} + +template +Result test_deinterleave(int factor, const Target &t) { + const int N = 8192; + Buffer in(N * factor), out(N, factor); + + for (int x = 0; x < N; x++) { + for (int y = 0; y < factor; y++) { + in(x * factor + y) = (T)(x + y * N); + } + } + + Func output; + Var x, y; + + output(x, y) = in(x * factor + y); + + Var xi, yi; + output.reorder(y, x).bound(y, 0, factor).unroll(y).vectorize(x, t.natural_vector_size(), TailStrategy::RoundUp); + // output.output_buffer().dim(0).set_min(0); + + output.compile_jit(); + + output.realize(out); + + double time = benchmark(20, 20, [&]() { + output.realize(out); + }); + + for (int y = 0; y < factor; y++) { + for (int x = 0; x < N; x++) { + uint64_t actual = out(x, y), correct = in(x * factor + y); + if (actual != correct) { + std::cerr << "For factor " << factor + << "out(" << x << ", " << y << ") = " + << actual << " instead of " << correct << "\n"; + exit(1); + } + } + } + + // Uncomment to dump asm for inspection + output.compile_to_assembly("/dev/stdout", + std::vector{in}, "interleave", t); + + return Result{(int)sizeof(T), factor, 
out.size_in_bytes() / (1.0e9 * time)}; +} + +int main(int argc, char **argv) { + Target target = get_jit_target_from_environment(); + if (target.arch == Target::WebAssembly) { + printf("[SKIP] Performance tests are meaningless and/or misleading under WebAssembly interpreter.\n"); + return 0; + } + + // Set the target features to use for dumping to assembly + target.set_features({Target::NoRuntime, Target::NoAsserts, Target::NoBoundsQuery}); + + std::cout << "\nbytes, interleave factor, interleave bandwidth (GB/s), deinterleave bandwidth (GB/s):\n"; +#if 0 + for (int t : {1, 2, 4, 8}) { + for (int f = 2; f < 16; f++) { +#else + { + { + int t = 1, f = 4; +#endif + Result r1, r2; + switch (t) { + case 1: + r1 = test_interleave(f, target); + r2 = test_deinterleave(f, target); + break; + case 2: + r1 = test_interleave(f, target); + r2 = test_deinterleave(f, target); + break; + case 4: + r1 = test_interleave(f, target); + r2 = test_deinterleave(f, target); + break; + case 8: + r1 = test_interleave(f, target); + r2 = test_deinterleave(f, target); + break; + default: + break; + } + std::cout << r1.type_size << " " + << r1.factor << " " + << r1.bandwidth << " " + << r2.bandwidth << "\n"; + + } + } + + printf("Success!\n"); + return 0; +} From 678a353650869e42cafd5e5e9168a21a99f67b08 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Wed, 18 Feb 2026 14:28:04 -0800 Subject: [PATCH 12/55] Support more interleave and deinterleave patterns --- src/CodeGen_LLVM.cpp | 137 +++++++++++++++++++++- src/CodeGen_LLVM.h | 3 + src/CodeGen_X86.cpp | 164 +++++++++++++++++++++++++-- src/IR.cpp | 40 +++++++ src/IR.h | 8 ++ src/IRPrinter.cpp | 5 + src/Simplify_Exprs.cpp | 16 ++- src/Simplify_Stmts.cpp | 20 ++++ src/StageStridedLoads.cpp | 109 +++++++++++++++++- test/performance/block_transpose.cpp | 13 ++- test/performance/interleave.cpp | 17 +-- 11 files changed, 496 insertions(+), 36 deletions(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 57aa43299eea..0350ed0e5035 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -2288,8 +2288,9 @@ Value *CodeGen_LLVM::interleave_vectors(const std::vector &vecs) { } else { // The number of vectors shares a factor with the length of the - // vectors. Pick some large factor of the number of vectors, interleave - // in separate groups, and then interleave the results. + // vectors. Pick some factor of the number of vectors, interleave in + // separate groups, and then interleave the results. Doing the smallest + // factor first seems to be fastest. const int n = (int)vecs.size(); int f = 1; for (int i = 2; i < n; i++) { @@ -2317,6 +2318,120 @@ Value *CodeGen_LLVM::interleave_vectors(const std::vector &vecs) { } } +std::vector CodeGen_LLVM::deinterleave_vector(Value *vec, int num_vecs) { + int vec_elements = get_vector_num_elements(vec->getType()); + internal_assert(vec_elements % num_vecs == 0); + vec_elements /= num_vecs; + + int factor = gcd(vec_elements, num_vecs); + + if (num_vecs == 1) { + return {vec}; + } else if (num_vecs == 2) { + std::vector result(2); + std::vector indices(vec_elements); + for (int i = 0; i < vec_elements; i++) { + indices[i] = i * 2; + } + result[0] = shuffle_vectors(vec, vec, indices); + for (int i = 0; i < vec_elements; i++) { + indices[i]++; + } + result[1] = shuffle_vectors(vec, vec, indices); + return result; + } else if (factor == 1) { + // Use the inverse of Catanzaro's algorithm from above. 
We slice into + // distinct vectors, then rotate each element into the correct final + // vector, then do a unary permutation of each vector. + std::vector shuffle(vec_elements); + + // Instead of concatenating, we slice. + std::vector v(num_vecs); + for (int i = 0; i < num_vecs; i++) { + v[i] = slice_vector(vec, i * vec_elements, vec_elements); + } + + // Compute the same rotation as above + std::vector rotation(vec_elements, 0); + for (int i = 0; i < vec_elements; i++) { + int k = (i * num_vecs) % vec_elements; + rotation[k] = (i * num_vecs) / vec_elements; + } + internal_assert(rotation[0] == 0); + + // We'll handle each bit of the rotation one at a time with a two-way + // shuffle. + std::vector new_v(v.size()); + int d = 1; + while (d < num_vecs) { + + for (int i = 0; i < vec_elements; i++) { + shuffle[i] = ((rotation[i] & d) == 0) ? i : (i + vec_elements); + } + + for (int i = 0; i < num_vecs; i++) { + // The rotation is in the opposite direction to the interleaving + // version, so num_vecs - d becomes just d. + int j = (i + d) % num_vecs; + // An optimization fence here keeps it as a blend and stops it + // from getting fused with the unary shuffle below. + new_v[i] = optimization_fence(shuffle_vectors(v[i], v[j], shuffle)); + } + + v.swap(new_v); + d *= 2; + } + + // Now reorder the vectors in the inverse order to the above. + for (int i = 0; i < num_vecs; i++) { + int j = (i * vec_elements) % num_vecs; + // j and i are swapped below, because we're doing the inverse of the algorithm above + new_v[j] = v[i]; + } + v.swap(new_v); + + // The elements are now in the correct vector. Finish up with a unary + // shuffle of each. + for (int i = 0; i < num_vecs; i++) { + for (int j = 0; j < vec_elements; j++) { + int k = j * num_vecs + i; + // This is the inverse shuffle of the interleaving version, so + // the index and the arg of the assignment below are swapped + // compared to the above. + shuffle[j] = k % vec_elements; + } + + v[i] = shuffle_vectors(v[i], v[i], shuffle); + } + + return v; + + } else { + // Do a lower-factor deinterleave, then deinterleave each result + // again. We know there's a non-trivial factor because if it were prime + // the gcd above would have been 1. Unlike interleave, doing the largest + // factor first seems to be fastest. + int f = 1; + for (int i = 2; i < num_vecs; i++) { + if (num_vecs % i == 0) { + f = i; + } + } + + auto partial = deinterleave_vector(vec, f); + std::vector result(num_vecs); + for (size_t i = 0; i < partial.size(); i++) { + Value *v = partial[i]; + auto vecs = deinterleave_vector(v, num_vecs / f); + for (size_t j = 0; j < vecs.size(); j++) { + result[j * f + i] = vecs[j]; + } + } + + return result; + } +} + void CodeGen_LLVM::scalarize(const Expr &e) { llvm::Type *result_type = llvm_type_of(e.type()); @@ -4178,6 +4293,24 @@ void CodeGen_LLVM::visit(const Shuffle *op) { if (op->is_interleave()) { value = interleave_vectors(vecs); + } else if (op->is_transpose()) { + int cols = op->transpose_factor(); + int rows = op->vectors[0].type().lanes() / cols; + if (is_power_of_two(cols) && + !is_power_of_two(rows)) { + // We're doing something like vectorizing over c and x when storing + // packed rgb. Best handled as an interleave. + std::vector slices(rows); + for (int i = 0; i < rows; i++) { + slices[i] = slice_vector(vecs[0], i * cols, cols); + } + value = interleave_vectors(slices); + } else { + // Deinterleave out the cols of the input matrix and concat + // them. Occurs when, for example, loading packed RGB and + // vectorizing across x. 
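For reference, this is the index mapping such a transpose shuffle performs, in
a standalone sketch (not part of the patch) that uses the same
indices[j * rows + i] = i * cols + j convention as Shuffle::make_transpose
later in this series, shown for 8 pixels of packed RGB (cols = 3):

    #include <cstdio>

    int main() {
        const int cols = 3, rows = 8;  // 8 pixels of packed RGB in 24 lanes
        int indices[rows * cols];
        for (int j = 0; j < cols; j++)
            for (int i = 0; i < rows; i++)
                indices[j * rows + i] = i * cols + j;
        // Prints 0 3 6 ... 21, then 1 4 7 ... 22, then 2 5 8 ... 23: all
        // the R lanes, then all the G lanes, then all the B lanes, i.e. the
        // concatenation of the three deinterleaved color channels.
        for (int k = 0; k < rows * cols; k++) printf("%d ", indices[k]);
        printf("\n");
    }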
+ value = concat_vectors(deinterleave_vector(vecs[0], cols)); + } } else if (op->is_concat()) { value = concat_vectors(vecs); } else { diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index e006a885fc57..46ec05638e3f 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -460,6 +460,9 @@ class CodeGen_LLVM : public IRVisitor { * an arbitrary number of vectors.*/ virtual llvm::Value *interleave_vectors(const std::vector &); + /** The inverse of interleave_vectors. */ + virtual std::vector deinterleave_vector(llvm::Value *vec, int num_vecs); + /** A fence to prevent fusion of ops by llvm. Designed for floats, but we * abuse it to prevent shufflevector fusion too. */ llvm::Value *optimization_fence(llvm::Value *); diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index 21e399a16965..0e09443859b8 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -113,6 +113,7 @@ class CodeGen_X86 : public CodeGen_Posix { void codegen_vector_reduce(const VectorReduce *, const Expr &init) override; // @} + std::vector deinterleave_vector(llvm::Value *, int) override; llvm::Value *interleave_vectors(const std::vector &) override; private: @@ -910,6 +911,30 @@ void CodeGen_X86::codegen_vector_reduce(const VectorReduce *op, const Expr &init CodeGen_Posix::codegen_vector_reduce(op, init); } +std::vector CodeGen_X86::deinterleave_vector(Value *vec, int num_vecs) { + int vec_elements = get_vector_num_elements(vec->getType()) / num_vecs; + const size_t element_bits = vec->getType()->getScalarSizeInBits(); + if (target.has_feature(Target::AVX) && + is_power_of_two(num_vecs) && + is_power_of_two(vec_elements) && + (int)(vec_elements * num_vecs * element_bits) > native_vector_bits()) { + + // Our interleaving logic below supports this case + std::vector slices(vec_elements); + for (int i = 0; i < vec_elements; i++) { + slices[i] = slice_vector(vec, i * num_vecs, num_vecs); + } + vec = interleave_vectors(slices); + std::vector result(num_vecs); + for (int i = 0; i < num_vecs; i++) { + result[i] = slice_vector(vec, i * vec_elements, vec_elements); + } + return result; + } else { + return CodeGen_Posix::deinterleave_vector(vec, num_vecs); + } +} + Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { // Only use x86-specific interleaving for AVX and above if (vecs.empty() || !target.has_feature(Target::AVX)) { @@ -1146,6 +1171,24 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { // Now we define helpers for each instruction we are going to use + // Useful for debugging or enhancing this algorithm + /* + auto dump_bits = [&]() { + for (int b : l_bits) { + debug(0) << b << " "; + } + debug(0) << "| "; + for (int b : s_bits) { + debug(0) << b << " "; + } + debug(0) << "| "; + for (int b : v_bits) { + debug(0) << b << " "; + } + debug(0) << "\n"; + }; + */ + // unpckl/h instruction auto unpck = [&](Value *a, Value *b) -> std::pair { int n = get_vector_num_elements(a->getType()); @@ -1258,6 +1301,99 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { s_bits.pop_back(); } + // If adjacent vectors are shuffles of the same underlying vector(s), + // concatenate pairs, because this is probably free. 
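The concatenation is free in the following sense: if both inputs are
shufflevectors of the same two operands, the concat folds into one shuffle
whose mask is the two masks appended. A standalone scalar model of
shufflevector (not part of the patch) makes this checkable:

    #include <cassert>
    #include <vector>

    // Scalar model of LLVM's shufflevector on two source vectors.
    std::vector<int> shuf(const std::vector<int> &x, const std::vector<int> &y,
                          const std::vector<int> &mask) {
        std::vector<int> r;
        for (int m : mask) r.push_back(m < (int)x.size() ? x[m] : y[m - x.size()]);
        return r;
    }

    int main() {
        std::vector<int> x = {10, 11, 12, 13}, y = {20, 21, 22, 23};
        auto a = shuf(x, y, {0, 4, 1, 5});
        auto b = shuf(x, y, {2, 6, 3, 7});
        // Concatenating a and b matches one shuffle of the original operands
        // with the two masks appended, so no extra data movement appears.
        auto ab = shuf(x, y, {0, 4, 1, 5, 2, 6, 3, 7});
        std::vector<int> concat = a;
        concat.insert(concat.end(), b.begin(), b.end());
        assert(concat == ab);
    }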
+ while ((size_t)vec_elements < elems_per_native_vec && !v_bits.empty()) { + std::vector new_v; + new_v.reserve(v.size() / 2); + bool fail = false; + std::vector indices; + indices.reserve(vec_elements * 2); + for (size_t i = 0; i < v.size(); i += 2) { + ShuffleVectorInst *a = llvm::dyn_cast(v[i]); + ShuffleVectorInst *b = llvm::dyn_cast(v[i + 1]); + if (a && + b && + a->getOperand(0) == b->getOperand(0) && + a->getOperand(1) == b->getOperand(1)) { + + // Concatenate the two shuffles + indices.clear(); + for (int j : a->getShuffleMask()) { + indices.push_back(j); + } + for (int j : b->getShuffleMask()) { + indices.push_back(j); + } + new_v.push_back(shuffle_vectors(a->getOperand(0), a->getOperand(1), indices)); + } else { + fail = true; + } + } + if (fail) { + break; + } + + v.swap(new_v); + // The lowest vector bit becomes the highest lane or slice bit + if ((size_t)vec_elements < elems_per_slice) { + l_bits.push_back(v_bits[0]); + } else { + s_bits.push_back(v_bits[0]); + } + v_bits.erase(v_bits.begin()); + vec_elements *= 2; + } + + if (final_num_s_bits > 1 && + (size_t)vec_elements == elems_per_native_vec && + (size_t)v_bits[0] >= l_bits.size() - 1) { + // A big binary shuffle of adjacent pairs will fix the l bits + // entirely. AVX-512 has these. Yes, this will use registers for the + // shuffle indices, but the alternative requires very many unpck + // operations to completely cycle out the v_bits that are hiding in the + // bottom of the l_bits. + + std::vector lo_indices(vec_elements); + std::vector hi_indices(vec_elements); + std::vector sorted_bits = l_bits; + sorted_bits.insert(sorted_bits.end(), s_bits.begin(), s_bits.end()); + sorted_bits.push_back(v_bits[0]); + std::sort(sorted_bits.begin(), sorted_bits.end()); + std::vector idx_of_bit(l_bits.size() + s_bits.size() + v_bits.size(), 0); + for (size_t b = 0; b < sorted_bits.size(); b++) { + idx_of_bit[sorted_bits[b]] = b; + } + + for (size_t dst_idx = 0; dst_idx < (size_t)vec_elements * 2; dst_idx++) { + size_t src_idx = 0; + for (size_t b = 0; b < l_bits.size(); b++) { + src_idx |= ((dst_idx >> idx_of_bit[l_bits[b]]) & 1) << b; + } + for (size_t b = 0; b < s_bits.size(); b++) { + src_idx |= ((dst_idx >> idx_of_bit[s_bits[b]]) & 1) << (b + l_bits.size()); + } + src_idx |= ((dst_idx >> idx_of_bit[v_bits[0]]) & 1) << (l_bits.size() + s_bits.size()); + if (dst_idx < (size_t)vec_elements) { + lo_indices[dst_idx] = (int)src_idx; + } else { + hi_indices[dst_idx - vec_elements] = (int)src_idx; + } + } + + for_all_pairs(0, [&](auto *a, auto *b) { + Value *lo = shuffle_vectors(*a, *b, lo_indices); + Value *hi = shuffle_vectors(*a, *b, hi_indices); + *a = lo; + *b = hi; + }); + + auto first_s_bit = sorted_bits.begin() + l_bits.size(); + std::copy(sorted_bits.begin(), first_s_bit, l_bits.begin()); + std::copy(first_s_bit, first_s_bit + s_bits.size(), s_bits.begin()); + v_bits[0] = sorted_bits.back(); + } + // Interleave pairs if we have vectors smaller than a single slice. Choosing // which pairs to interleave is important because we want to pull down v // bits that are destined to end up as l bits, and we want to pull them down @@ -1300,9 +1436,8 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { // Concatenate/repack to get at least the desired number of slice bits. 
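As a sanity check on why concatenation manufactures slice bits (a standalone
sketch, not part of the patch): concatenating two 8-lane vectors with 4
elements per 128-bit slice sends element l of vector v to flat index
v * 8 + l, so the former vector bit lands on top of the slice bits while the
lane bits are untouched:

    #include <cassert>

    int main() {
        const int lanes = 8, elems_per_slice = 4;
        for (int v = 0; v < 2; v++) {
            for (int l = 0; l < lanes; l++) {
                int idx = v * lanes + l;
                // Lane bits are unchanged by the concatenation.
                assert(idx % elems_per_slice == l % elems_per_slice);
                // The vector-choice bit becomes the new high slice bit.
                assert(idx / elems_per_slice == v * 2 + l / elems_per_slice);
            }
        }
    }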
while ((int)s_bits.size() < final_num_s_bits && !v_bits.empty()) { - int desired_low_slice_bit = ctz64(elems_per_slice); - int desired_high_slice_bit = desired_low_slice_bit + 1; - + const int desired_low_slice_bit = ctz64(elems_per_slice); + const int desired_high_slice_bit = desired_low_slice_bit + 1; int bit; if (!s_bits.empty() && s_bits[0] == desired_low_slice_bit) { @@ -1340,7 +1475,9 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { // Now we have at least two whole vectors. Next we try to finalize lane bits using // unpck instructions. while (l_bits[0] != 0) { - int bit = std::min(l_bits[0], (int)ctz64(elems_per_slice)) - 1; + + int first_s_bit = (int)ctz64(elems_per_slice); + int bit = std::min(l_bits[0], first_s_bit) - 1; auto vb_it = std::find(v_bits.begin(), v_bits.end(), bit); @@ -1348,11 +1485,17 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { if (vb_it == v_bits.end()) { // The next bit is not in vector bits. It must be hiding in the // slice bits due to earlier concatenation. Move it into the v_bits - // with a shufi + // with a shufi. We'll need to pick a v bit to take its place, + // ideally one destined to end up in the s bits. + vb_it = std::find_if(v_bits.begin(), v_bits.end(), [&](int b) { return b >= first_s_bit; }); + if (vb_it == v_bits.end()) { + vb_it = v_bits.begin(); + } + if (s_bits.back() == bit) { // It's the last (or sole) slice bit. Swap it with the first v bit - std::swap(s_bits.back(), v_bits[0]); - for_all_pairs(0, [&](auto *a, auto *b) { + std::swap(s_bits.back(), *vb_it); + for_all_pairs(vb_it - v_bits.begin(), [&](auto *a, auto *b) { auto [lo, hi] = shufi(*a, *b, false); *a = lo; *b = hi; @@ -1360,17 +1503,16 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { } else { internal_assert(s_bits.size() == 2 && s_bits[0] == bit); // It's the low slice bit. We need shufi with crossover. 
- int v_bit = v_bits[0]; - v_bits[0] = s_bits[0]; + int v_bit = *vb_it; + *vb_it = s_bits[0]; s_bits[0] = s_bits[1]; s_bits[1] = v_bit; - for_all_pairs(0, [&](auto *a, auto *b) { + for_all_pairs(vb_it - v_bits.begin(), [&](auto *a, auto *b) { auto [lo, hi] = shufi(*a, *b, true); *a = lo; *b = hi; }); } - vb_it = v_bits.begin(); } int j = vb_it - v_bits.begin(); diff --git a/src/IR.cpp b/src/IR.cpp index c82ae4ebd252..049ad8848aaa 100644 --- a/src/IR.cpp +++ b/src/IR.cpp @@ -815,6 +815,21 @@ Expr Shuffle::make_interleave(const std::vector &vectors) { return make(vectors, indices); } +Expr Shuffle::make_transpose(Expr e, int cols) { + internal_assert(e.type().lanes() % cols == 0) + << "Transpose cols must divide the number of lanes.\n"; + int rows = e.type().lanes() / cols; + + std::vector indices(e.type().lanes()); + for (int j = 0; j < cols; j++) { + for (int i = 0; i < rows; i++) { + indices[j * rows + i] = i * cols + j; + } + } + + return make({std::move(e)}, indices); +} + Expr Shuffle::make_concat(const std::vector &vectors) { internal_assert(!vectors.empty()) << "Concat of zero vectors.\n"; @@ -1012,6 +1027,31 @@ bool Shuffle::is_concat() const { return indices.size() == input_lanes && is_ramp(indices); } +bool Shuffle::is_transpose() const { + if (vectors.size() > 1 || + (int)indices.size() != vectors[0].type().lanes() || + indices.size() < 2) { + return false; + } + int cols = indices[1] - indices[0]; + int rows = vectors[0].type().lanes() / cols; + if ((int)indices.size() != rows * cols) { + return false; + } + for (int row = 0; row < rows; row++) { + for (int col = 0; col < cols; col++) { + if (indices[col * rows + row] != row * cols + col) { + return false; + } + } + } + return true; +} + +int Shuffle::transpose_factor() const { + return indices[1] - indices[0]; +} + bool Shuffle::is_slice() const { size_t input_lanes = 0; for (const Expr &i : vectors) { diff --git a/src/IR.h b/src/IR.h index da27019a93c7..78d61be2349c 100644 --- a/src/IR.h +++ b/src/IR.h @@ -910,6 +910,10 @@ struct Shuffle : public ExprNode { * interleaving of vectors of the same length. */ static Expr make_interleave(const std::vector &vectors); + /** Convenience constructor for making a shuffle representing an + * in-place transpose of a matrix with the given number of columns. */ + static Expr make_transpose(Expr e, int cols); + /** Convenience constructor for making a shuffle representing a * concatenation of the vectors. */ static Expr make_concat(const std::vector &vectors); @@ -930,6 +934,10 @@ struct Shuffle : public ExprNode { * arguments. */ bool is_interleave() const; + /** Check if this shuffle is an in-place transpose of a single vector */ + bool is_transpose() const; + int transpose_factor() const; + /** Check if this shuffle can be represented as a repeating pattern that * repeats the same shuffle of the single input vector some number of times. 
* For example: 0, 3, 1, 1, 0, 3, 1, 1, ....., 0, 3, 1, 1 diff --git a/src/IRPrinter.cpp b/src/IRPrinter.cpp index e95286af03ee..9cd5527b09a6 100644 --- a/src/IRPrinter.cpp +++ b/src/IRPrinter.cpp @@ -1461,6 +1461,11 @@ void IRPrinter::visit(const Shuffle *op) { stream << paren(", ") << imm_int(op->slice_begin()) << paren(", ") << imm_int(op->slice_stride()) << paren(", ") << imm_int(op->indices.size()); + } else if (op->is_transpose()) { + openf("transpose_vector"); + print_list(op->vectors); + stream << paren(", ") << imm_int(op->transpose_factor()); + } else { openf("shuffle"); print_list(op->vectors); diff --git a/src/Simplify_Exprs.cpp b/src/Simplify_Exprs.cpp index 0eb3bbaf3c15..52665c0c2894 100644 --- a/src/Simplify_Exprs.cpp +++ b/src/Simplify_Exprs.cpp @@ -327,8 +327,9 @@ Expr Simplify::visit(const Load *op, ExprInfo *info) { } ExprInfo base_info; - if (const Ramp *r = index.as()) { - mutate(r->base, &base_info); + const Ramp *r_index = index.as(); + if (r_index) { + mutate(r_index->base, &base_info); } base_info.alignment = ModulusRemainder::intersect(base_info.alignment, index_info.alignment); @@ -360,6 +361,17 @@ Expr Simplify::visit(const Load *op, ExprInfo *info) { loaded_vecs.emplace_back(std::move(load)); } return Shuffle::make(loaded_vecs, s_index->indices); + } else if (const Ramp *inner_ramp = r_index ? r_index->base.as() : nullptr; + inner_ramp && is_const_one(r_index->stride)) { + // If it's a nested ramp and the outer ramp has stride 1, swap the + // nesting order of the ramps to make dense loads and transpose the + // resulting vector instead. + Expr transposed_index = + Ramp::make(Ramp::make(inner_ramp->base, make_one(inner_ramp->base.type()), r_index->lanes), + Broadcast::make(inner_ramp->stride, r_index->lanes), inner_ramp->lanes); + Expr transposed_load = + Load::make(op->type, op->name, transposed_index, op->image, op->param, predicate, align); + return mutate(Shuffle::make_transpose(transposed_load, r_index->lanes), info); } else if (predicate.same_as(op->predicate) && index.same_as(op->index) && align == op->alignment) { return op; } else { diff --git a/src/Simplify_Stmts.cpp b/src/Simplify_Stmts.cpp index bbacbe69b55d..308254ff1b9a 100644 --- a/src/Simplify_Stmts.cpp +++ b/src/Simplify_Stmts.cpp @@ -348,6 +348,7 @@ Stmt Simplify::visit(const Store *op) { base_info.alignment = ModulusRemainder::intersect(base_info.alignment, index_info.alignment); const Load *load = value.as(); + const Shuffle *shuf = index.as(); const Broadcast *scalar_pred = predicate.as(); if (scalar_pred && !scalar_pred->value.type().is_scalar()) { // Nested vectorization @@ -365,6 +366,25 @@ Stmt Simplify::visit(const Store *op) { } else if (is_undef(value) || (load && load->name == op->name && equal(load->index, index))) { // foo[x] = foo[x] or foo[x] = undef is a no-op return Evaluate::make(0); + } else if (shuf && shuf->is_concat()) { + // Break a store of a concat of vector indices into separate stores + std::string var_name = unique_name('t'); + Expr var = Variable::make(value.type(), var_name); + std::vector stores; + int lanes = 0; + for (size_t i = 0; i < shuf->vectors.size(); i++) { + Expr idx = shuf->vectors[i]; + stores.push_back(Store::make(op->name, + Shuffle::make_slice(var, lanes, 1, idx.type().lanes()), + shuf->vectors[i], + op->param, + Shuffle::make_slice(predicate, lanes, 1, idx.type().lanes()), + ModulusRemainder{})); + lanes += idx.type().lanes(); + } + Stmt s = Block::make(stores); + s = LetStmt::make(var_name, value, s); + return mutate(s); } else if 
(predicate.same_as(op->predicate) && value.same_as(op->value) && index.same_as(op->index) && align == op->alignment) { return op; } else { diff --git a/src/StageStridedLoads.cpp b/src/StageStridedLoads.cpp index 85691921bc8d..16e4680323f4 100644 --- a/src/StageStridedLoads.cpp +++ b/src/StageStridedLoads.cpp @@ -1,5 +1,6 @@ #include "StageStridedLoads.h" #include "CSE.h" +#include "ExprUsesVar.h" #include "IREquality.h" #include "IRMutator.h" #include "IROperator.h" @@ -95,12 +96,15 @@ class FindStridedLoads : public IRVisitor { base = base_add->a; offset = *off; } + } else if (auto off = as_const_int(base)) { + base = 0; + offset = *off; } // TODO: We do not yet handle nested vectorization here for // ramps which have not already collapsed. We could potentially // handle more interesting types of shuffle than simple flat slices. - if (stride >= 2 && stride < r->lanes && r->stride.type().is_scalar()) { + if (stride >= 2 && stride <= r->lanes && r->stride.type().is_scalar()) { const IRNode *s = scope; const Allocate *a = nullptr; if (const Allocate *const *a_ptr = allocation_scope.find(op->name)) { @@ -157,6 +161,19 @@ class ReplaceStridedLoads : public IRMutator { std::map, Expr> replacements; std::map padding; Scope allocation_scope; + std::map> let_injections; + + using IRMutator::mutate; + + Stmt mutate(const Stmt &s) override { + auto it = let_injections.find(s); + if (it != let_injections.end()) { + const auto &[name, value] = it->second; + return LetStmt::make(name, value, IRMutator::mutate(s)); + } else { + return IRMutator::mutate(s); + } + } protected: Expr visit(const Load *op) override { @@ -191,6 +208,61 @@ class ReplaceStridedLoads : public IRMutator { using IRMutator::visit; }; +Stmt innermost_containing_stmt(const Stmt &root, const std::set &exprs) { + std::vector path, result; + mutate_with(root, // + [&](auto *self, const Stmt &s) { + path.push_back(s); + self->mutate_base(s); + path.pop_back(); + return s; // + }, + [&](auto *self, const Expr &e) { + const Load *l = e.as(); + if (l && exprs.count(l)) { + if (result.empty()) { + result = path; + } else { + // Find the common prefix of path and result + size_t i = 0; + while (i < path.size() && + i < result.size() && + path[i].get() == result[i].get()) { + i++; + } + result.resize(i); + } + }; + return self->mutate_base(e); // + }); + internal_assert(!result.empty()) << "None of the exprs were found\n"; + return result.back(); +} + +bool can_hoist_shared_load(const Stmt &s, const std::string &buf, const Expr &idx) { + // Check none of the variables the idx depends on are defined somewhere + // within this stmt, and there are no stores to the given buffer in the + // stmt. 
+ bool result = true; + visit_with(s, // + [&](auto *self, const Let *let) { // + result &= !expr_uses_var(idx, let->name); + }, + [&](auto *self, const LetStmt *let) { // + result &= !expr_uses_var(idx, let->name); + }, + [&](auto *self, const For *loop) { // + result &= !expr_uses_var(idx, loop->name); + }, + [&](auto *self, const Allocate *alloc) { // + result &= alloc->name != buf; + }, + [&](auto *self, const Store *store) { // + result &= store->name != buf; + }); + return result; +} + } // namespace Stmt stage_strided_loads(const Stmt &s) { @@ -218,6 +290,7 @@ Stmt stage_strided_loads(const Stmt &s) { const bool can_lift = l.second.lower_bound(load->first + k.stride - 1) != l.second.end(); if (!can_lift) { + debug(0) << "Can't lift: " << Expr(load->second[0]->index) << "\n"; load++; continue; } @@ -228,13 +301,39 @@ Stmt stage_strided_loads(const Stmt &s) { Expr idx = Ramp::make(k.base + (int)first_offset, make_one(k.base.type()), lanes); Type t = k.type.with_lanes(lanes); const Load *op = load->second[0]; + + std::set all_loads; + for (auto l = load; l != v.end() && l->first < first_offset + k.stride; l++) { + all_loads.insert(l->second.begin(), l->second.end()); + } + Expr shared_load = Load::make(t, k.buf, idx, op->image, op->param, const_true(lanes), op->alignment); shared_load = common_subexpression_elimination(shared_load); - for (; load != v.end() && load->first < first_offset + k.stride; load++) { - Expr shuf = Shuffle::make_slice(shared_load, load->first - first_offset, k.stride, k.lanes); - for (const Load *l : load->second) { - replacer.replacements.emplace(std::make_pair(alloc, l), shuf); + + // If possible, we do the shuffle as an in-place transpose followed + // by a dense slice. This is more efficient when extracting multiple + // slices. + Stmt let_site = innermost_containing_stmt(alloc ? Stmt(alloc) : s, all_loads); + if (can_hoist_shared_load(let_site, k.buf, idx)) { + shared_load = Shuffle::make_transpose(shared_load, k.stride); + std::string name = unique_name('t'); + Expr var = Variable::make(shared_load.type(), name); + for (; load != v.end() && load->first < first_offset + k.stride; load++) { + int row = load->first - first_offset; + Expr shuf = Shuffle::make_slice(var, row * k.lanes, 1, k.lanes); + for (const Load *l : load->second) { + replacer.replacements.emplace(std::make_pair(alloc, l), shuf); + } + } + replacer.let_injections.emplace(let_site, std::make_pair(name, shared_load)); + } else { + for (; load != v.end() && load->first < first_offset + k.stride; load++) { + int row = load->first - first_offset; + Expr shuf = Shuffle::make_slice(shared_load, row, k.stride, k.lanes); + for (const Load *l : load->second) { + replacer.replacements.emplace(std::make_pair(alloc, l), shuf); + } } } } diff --git a/test/performance/block_transpose.cpp b/test/performance/block_transpose.cpp index 8760d8ac5495..921d7f9a913b 100644 --- a/test/performance/block_transpose.cpp +++ b/test/performance/block_transpose.cpp @@ -33,13 +33,16 @@ Result test_transpose(int block_width, int block_height, const Target &t) { Var xi, yi; output.tile(x, y, xi, yi, block_width, block_height, TailStrategy::RoundUp) .vectorize(xi) - .unroll(yi); + .vectorize(yi); - // Do vectorized loads from the input. - input.in().compute_at(output, x).vectorize(x).unroll(y); + // Explicitly vectorized loads from the input. Was necessary before we + // automatically swizzled the 2D load into dense order. 
+ // input.in().compute_at(output, x).vectorize(x).unroll(y); - // Transpose in registers - input.in().in().reorder_storage(y, x).compute_at(output, x).vectorize(x).unroll(y); + // Explicit transpose in registers. This used to be the idiom, but is no + // longer necessary because stage_strided_loads should detect the strided + // loads from input.in() and turn it into a transpose. + // input.in().in().reorder_storage(y, x).compute_at(output, x).vectorize(x).unroll(y); // TODO: Should not be necessary, but prevents licm from doing something dumb. output.output_buffer().dim(0).set_bounds(0, 256); diff --git a/test/performance/interleave.cpp b/test/performance/interleave.cpp index f73d7b687ac4..3df42ed0237f 100644 --- a/test/performance/interleave.cpp +++ b/test/performance/interleave.cpp @@ -76,8 +76,10 @@ Result test_deinterleave(int factor, const Target &t) { output(x, y) = in(x * factor + y); Var xi, yi; - output.reorder(y, x).bound(y, 0, factor).unroll(y).vectorize(x, t.natural_vector_size(), TailStrategy::RoundUp); - // output.output_buffer().dim(0).set_min(0); + output.bound(y, 0, factor) + .reorder(y, x) + .unroll(y) // Also works if we vectorize y + .vectorize(x, t.natural_vector_size(), TailStrategy::RoundUp); output.compile_jit(); @@ -100,8 +102,8 @@ Result test_deinterleave(int factor, const Target &t) { } // Uncomment to dump asm for inspection - output.compile_to_assembly("/dev/stdout", - std::vector{in}, "interleave", t); + // output.compile_to_assembly("/dev/stdout", + // std::vector{in}, "deinterleave", t); return Result{(int)sizeof(T), factor, out.size_in_bytes() / (1.0e9 * time)}; } @@ -117,14 +119,8 @@ int main(int argc, char **argv) { target.set_features({Target::NoRuntime, Target::NoAsserts, Target::NoBoundsQuery}); std::cout << "\nbytes, interleave factor, interleave bandwidth (GB/s), deinterleave bandwidth (GB/s):\n"; -#if 0 for (int t : {1, 2, 4, 8}) { for (int f = 2; f < 16; f++) { -#else - { - { - int t = 1, f = 4; -#endif Result r1, r2; switch (t) { case 1: @@ -150,7 +146,6 @@ int main(int argc, char **argv) { << r1.factor << " " << r1.bandwidth << " " << r2.bandwidth << "\n"; - } } From 4c1adf779b2644103ed63458dcc4cbce449a480c Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Thu, 19 Feb 2026 09:48:06 -0800 Subject: [PATCH 13/55] clang-tidy fix --- src/Simplify_Stmts.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/Simplify_Stmts.cpp b/src/Simplify_Stmts.cpp index 308254ff1b9a..1b4588342096 100644 --- a/src/Simplify_Stmts.cpp +++ b/src/Simplify_Stmts.cpp @@ -372,11 +372,10 @@ Stmt Simplify::visit(const Store *op) { Expr var = Variable::make(value.type(), var_name); std::vector stores; int lanes = 0; - for (size_t i = 0; i < shuf->vectors.size(); i++) { - Expr idx = shuf->vectors[i]; + for (const Expr &idx : shuf->vectors) { stores.push_back(Store::make(op->name, Shuffle::make_slice(var, lanes, 1, idx.type().lanes()), - shuf->vectors[i], + idx, op->param, Shuffle::make_slice(predicate, lanes, 1, idx.type().lanes()), ModulusRemainder{})); From 1c940e8fb34459738f7d0e1ace636685bba47ea4 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Thu, 19 Feb 2026 09:48:26 -0800 Subject: [PATCH 14/55] Handle multiple let injections at same site Also better algorithm for innermost containing stmt --- src/StageStridedLoads.cpp | 43 +++++++++++++++++---------------------- 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/src/StageStridedLoads.cpp b/src/StageStridedLoads.cpp index 16e4680323f4..c159d9d62b0a 100644 --- 
a/src/StageStridedLoads.cpp +++ b/src/StageStridedLoads.cpp @@ -161,18 +161,19 @@ class ReplaceStridedLoads : public IRMutator { std::map, Expr> replacements; std::map padding; Scope allocation_scope; - std::map> let_injections; + std::map>> let_injections; using IRMutator::mutate; Stmt mutate(const Stmt &s) override { + Stmt stmt = IRMutator::mutate(s); auto it = let_injections.find(s); if (it != let_injections.end()) { - const auto &[name, value] = it->second; - return LetStmt::make(name, value, IRMutator::mutate(s)); - } else { - return IRMutator::mutate(s); + for (const auto &[name, value] : it->second) { + stmt = LetStmt::make(name, value, stmt); + } } + return stmt; } protected: @@ -209,34 +210,29 @@ class ReplaceStridedLoads : public IRMutator { }; Stmt innermost_containing_stmt(const Stmt &root, const std::set &exprs) { - std::vector path, result; + Stmt result; + // The innermost containing stmt is whichever stmt node contains the + // largest number of our exprs, with ties breaking inwards. + int seen = 0, best = 0; mutate_with(root, // [&](auto *self, const Stmt &s) { - path.push_back(s); + int old = seen; self->mutate_base(s); - path.pop_back(); + if (old == 0 && seen > best) { + result = s; + best = seen; + } return s; // }, [&](auto *self, const Expr &e) { const Load *l = e.as(); if (l && exprs.count(l)) { - if (result.empty()) { - result = path; - } else { - // Find the common prefix of path and result - size_t i = 0; - while (i < path.size() && - i < result.size() && - path[i].get() == result[i].get()) { - i++; - } - result.resize(i); - } + seen++; }; return self->mutate_base(e); // }); - internal_assert(!result.empty()) << "None of the exprs were found\n"; - return result.back(); + internal_assert(seen) << "None of the exprs were found\n"; + return result; } bool can_hoist_shared_load(const Stmt &s, const std::string &buf, const Expr &idx) { @@ -290,7 +286,6 @@ Stmt stage_strided_loads(const Stmt &s) { const bool can_lift = l.second.lower_bound(load->first + k.stride - 1) != l.second.end(); if (!can_lift) { - debug(0) << "Can't lift: " << Expr(load->second[0]->index) << "\n"; load++; continue; } @@ -326,7 +321,7 @@ Stmt stage_strided_loads(const Stmt &s) { replacer.replacements.emplace(std::make_pair(alloc, l), shuf); } } - replacer.let_injections.emplace(let_site, std::make_pair(name, shared_load)); + replacer.let_injections[let_site].emplace_back(name, shared_load); } else { for (; load != v.end() && load->first < first_offset + k.stride; load++) { int row = load->first - first_offset; From c39b1a0505396bf95c5b1829872981b4bb709b4a Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Thu, 19 Feb 2026 16:30:41 -0800 Subject: [PATCH 15/55] better simplification and better handling of composite factors --- apps/iir_blur/iir_blur_generator.cpp | 3 +- src/CSE.cpp | 35 ++++++++++++++++-- src/CodeGen_LLVM.cpp | 35 ++++++++++-------- src/CodeGen_X86.cpp | 20 +++++++++-- src/IRMatch.h | 54 ++++++++++++++++++++++++++++ src/Simplify_Add.cpp | 1 + src/Simplify_EQ.cpp | 1 + src/Simplify_Max.cpp | 1 + src/Simplify_Min.cpp | 1 + src/Simplify_Mul.cpp | 1 + src/Simplify_Sub.cpp | 1 + src/StageStridedLoads.cpp | 43 ++++++++++++++-------- 12 files changed, 160 insertions(+), 36 deletions(-) diff --git a/apps/iir_blur/iir_blur_generator.cpp b/apps/iir_blur/iir_blur_generator.cpp index 3c4dee4304af..7f411d7e8fef 100644 --- a/apps/iir_blur/iir_blur_generator.cpp +++ b/apps/iir_blur/iir_blur_generator.cpp @@ -43,13 +43,12 @@ Func blur_cols_transpose(Func input, Expr height, Expr alpha, bool 
skip_schedule transpose.compute_root() .tile(x, y, xo, yo, x, y, vec, vec * 4) .split(y, y, yi, vec) - .unroll(yi) + .vectorize(yi) .vectorize(x) .fuse(yo, c, t) .parallel(t); blur.in(transpose) - .reorder_storage(y, x) .compute_at(transpose, y) .vectorize(x) .unroll(y); diff --git a/src/CSE.cpp b/src/CSE.cpp index c2a46d93bc4d..e7e56bb4df09 100644 --- a/src/CSE.cpp +++ b/src/CSE.cpp @@ -237,10 +237,39 @@ class CSEEveryExprInStmt : public IRMutator { } const Call *bundle = Call::as_intrinsic(dummy, {Call::bundle}); internal_assert(bundle && bundle->args.size() == 2); - Stmt s = Store::make(op->name, bundle->args[0], bundle->args[1], + + Expr value = bundle->args[0], index = bundle->args[1]; + + // Figure out which ones are actually needed by the index + + auto add_all_vars_to_set = [&](const Expr &e, std::set &s) { + visit_with(e, [&](auto *, const Variable *var) { + s.insert(var->name); + }); + }; + + std::set index_lets; + add_all_vars_to_set(index, index_lets); + for (const auto &[var, val] : reverse_view(lets)) { + if (index_lets.count(var)) { + add_all_vars_to_set(val, index_lets); + } + } + + vector> deferred; + for (const auto &[var, val] : reverse_view(lets)) { + if (index_lets.count(var)) { + deferred.emplace_back(var, val); + } else { + value = Let::make(var, val, value); + } + } + + Stmt s = Store::make(op->name, value, index, op->param, mutate(op->predicate), op->alignment); - for (const auto &[var, value] : reverse_view(lets)) { - s = LetStmt::make(var, value, s); + + for (const auto &[var, val] : deferred) { + s = LetStmt::make(var, val, s); } return s; } diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 9fcc3a6cd046..a5937f123cfe 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -2289,18 +2289,20 @@ Value *CodeGen_LLVM::interleave_vectors(const std::vector &vecs) { } else { // The number of vectors shares a factor with the length of the // vectors. Pick some factor of the number of vectors, interleave in - // separate groups, and then interleave the results. Doing the smallest - // factor first seems to be fastest. + // separate groups, and then interleave the results. Do the largest + // power of two factor first. const int n = (int)vecs.size(); - int f = 1; - for (int i = 2; i < n; i++) { - if (n % i == 0) { - f = i; - break; + int f = n & -n; + if (f == 1 || f == n) { + for (int i = 2; i < n; i++) { + if (n % i == 0) { + f = i; + break; + } } } - internal_assert(f > 1 && f < n); + internal_assert(f > 1 && f < n && n % f == 0) << f << " " << n; vector> groups(f); for (size_t i = 0; i < vecs.size(); i++) { @@ -2409,15 +2411,20 @@ std::vector CodeGen_LLVM::deinterleave_vector(Value *vec, int num_vecs) } else { // Do a lower-factor deinterleave, then deinterleave each result // again. We know there's a non-trivial factor because if it were prime - // the gcd above would have been 1. Unlike interleave, doing the largest - // factor first seems to be fastest. - int f = 1; - for (int i = 2; i < num_vecs; i++) { - if (num_vecs % i == 0) { - f = i; + // the gcd above would have been 1. Do the largest power-of-two factor + // first. 
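Both factor choices (n & -n above, and num_vecs & -num_vecs on the line that
follows) rely on the two's-complement identity that n & -n isolates the lowest
set bit of n, which is exactly the largest power-of-two factor of n. A
standalone check, not part of the patch:

    #include <cassert>

    int main() {
        assert((12 & -12) == 4);  // 12 = 4 * 3
        assert((10 & -10) == 2);  // 10 = 2 * 5
        assert((8 & -8) == 8);    // a power of two divides itself
        assert((15 & -15) == 1);  // odd: fall back to the smallest odd factor
    }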
+ int f = num_vecs & -num_vecs; + if (f == 1 || f == num_vecs) { + for (int i = 2; i < num_vecs; i++) { + if (num_vecs % i == 0) { + f = i; + break; + } } } + internal_assert(f > 1 && f < num_vecs && num_vecs % f == 0) << f << " " << num_vecs; + auto partial = deinterleave_vector(vec, f); std::vector result(num_vecs); for (size_t i = 0; i < partial.size(); i++) { diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index 0e09443859b8..edf79a4db15c 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -954,9 +954,23 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { const size_t elems_per_slice = 128 / element_bits; // Only apply special x86 logic for power-of-two interleaves for avx and - // above where we're going to end up with multiple native vectors (TODO: - // Could slice into native vectors and concat results even if not power of - // two) + // above where we're going to end up with multiple native vectors. + + if (!is_power_of_two(vec_elements) && + vec_elements % elems_per_native_vec == 0) { + // It's not a power of two, but it's a multiple of the native vector + // length, so slice it and recurse. + std::vector results; + for (int i = 0; i < vec_elements; i += elems_per_native_vec) { + std::vector slices; + slices.reserve(vecs.size()); + for (auto *v : vecs) { + slices.push_back(slice_vector(v, i, (int)elems_per_native_vec)); + } + results.push_back(interleave_vectors(slices)); + } + return concat_vectors(results); + } if (!is_power_of_two(vec_elements) || !is_power_of_two(vecs.size()) || diff --git a/src/IRMatch.h b/src/IRMatch.h index 671a6e086e1f..7e9abc80789b 100644 --- a/src/IRMatch.h +++ b/src/IRMatch.h @@ -2249,6 +2249,60 @@ HALIDE_ALWAYS_INLINE auto slice(Vec vec, Base base, Stride stride, Lanes lanes) return {pattern_arg(vec), pattern_arg(base), pattern_arg(stride), pattern_arg(lanes)}; } +template +struct TransposeOp { + struct pattern_tag {}; + Vec vec; + Factor factor; + + static constexpr uint32_t binds = Vec::binds | Factor::binds; + + constexpr static IRNodeType min_node_type = IRNodeType::Shuffle; + constexpr static IRNodeType max_node_type = IRNodeType::Shuffle; + constexpr static bool canonical = Vec::canonical && Factor::canonical; + + template + HALIDE_ALWAYS_INLINE bool match(const BaseExprNode &e, MatcherState &state) const noexcept { + if (e.node_type != IRNodeType::Shuffle) { + return false; + } + const Shuffle &v = (const Shuffle &)e; + return v.vectors.size() == 1 && + v.is_transpose() && + vec.template match(*v.vectors[0].get(), state) && + factor.template match<(bound | bindings::mask)>(v.transpose_factor(), state); + } + + HALIDE_ALWAYS_INLINE + Expr make(MatcherState &state, halide_type_t type_hint) const { + halide_scalar_value_t factor_val; + halide_type_t ty; + factor.make_folded_const(factor_val, ty, state); + int f = (int)factor_val.u.i64; + return Shuffle::make_transpose(vec.make(state, type_hint), f); + } + + constexpr static bool foldable = false; + + HALIDE_ALWAYS_INLINE + TransposeOp(Vec v, Factor f) + : vec(v), factor(f) { + static_assert(Factor::foldable, "Factor of transpose should consist only of operations that constant-fold"); + } +}; + +template +std::ostream &operator<<(std::ostream &s, const TransposeOp &op) { + s << "transpose(" << op.vec << ", " << op.factor << ")"; + return s; +} + +template +HALIDE_ALWAYS_INLINE auto transpose(Vec vec, Factor factor) noexcept + -> TransposeOp { + return {pattern_arg(vec), pattern_arg(factor)}; +} + template struct Fold { struct pattern_tag {}; diff --git 
a/src/Simplify_Add.cpp b/src/Simplify_Add.cpp index 6158cc9cd48c..06967a8d32d3 100644 --- a/src/Simplify_Add.cpp +++ b/src/Simplify_Add.cpp @@ -120,6 +120,7 @@ Expr Simplify::visit(const Add *op, ExprInfo *info) { rewrite(slice(x, c0, c1, c2) + (slice(y, c0, c1, c2) + z), slice(x + y, c0, c1, c2) + z, c2 > 1 && lanes_of(x) == lanes_of(y)) || rewrite(slice(x, c0, c1, c2) + (z - slice(y, c0, c1, c2)), slice(x - y, c0, c1, c2) + z, c2 > 1 && lanes_of(x) == lanes_of(y)) || rewrite(slice(x, c0, c1, c2) + (slice(y, c0, c1, c2) - z), slice(x + y, c0, c1, c2) - z, c2 > 1 && lanes_of(x) == lanes_of(y)) || + rewrite(transpose(x, c0) + transpose(y, c0), transpose(x + y, c0)) || (no_overflow(op->type) && (rewrite(x + x * y, x * (y + 1)) || diff --git a/src/Simplify_EQ.cpp b/src/Simplify_EQ.cpp index 994d14cd4cee..5d8c09901b49 100644 --- a/src/Simplify_EQ.cpp +++ b/src/Simplify_EQ.cpp @@ -195,6 +195,7 @@ Expr Simplify::visit(const EQ *op, ExprInfo *info) { slice(x - y, c0, c1, c2) == z, c2 > 1 && lanes_of(x) == lanes_of(y)) || rewrite(slice(x, c0, c1, c2) == slice(y, c0, c1, c2) + z, slice(x - y, c0, c1, c2) == z, c2 > 1 && lanes_of(x) == lanes_of(y)) || + rewrite(transpose(x, c0) == transpose(y, c0), transpose(x == y, c0)) || false) || (no_overflow(a.type()) && EVAL_IN_LAMBDA // (rewrite(x * y == 0, (x == 0) || (y == 0)) || diff --git a/src/Simplify_Max.cpp b/src/Simplify_Max.cpp index 1926bc9a069e..cc4253ca718f 100644 --- a/src/Simplify_Max.cpp +++ b/src/Simplify_Max.cpp @@ -212,6 +212,7 @@ Expr Simplify::visit(const Max *op, ExprInfo *info) { rewrite(max(slice(x, c0, c1, c2), slice(y, c0, c1, c2)), slice(max(x, y), c0, c1, c2), c2 > 1 && lanes_of(x) == lanes_of(y)) || rewrite(max(slice(x, c0, c1, c2), max(slice(y, c0, c1, c2), z)), max(slice(max(x, y), c0, c1, c2), z), c2 > 1 && lanes_of(x) == lanes_of(y)) || rewrite(max(slice(x, c0, c1, c2), max(z, slice(y, c0, c1, c2))), max(slice(max(x, y), c0, c1, c2), z), c2 > 1 && lanes_of(x) == lanes_of(y)) || + rewrite(max(transpose(x, c0), transpose(y, c0)), transpose(max(x, y), c0)) || (no_overflow(op->type) && (rewrite(max(max(x, y) + c0, x), max(x, y + c0), c0 < 0) || diff --git a/src/Simplify_Min.cpp b/src/Simplify_Min.cpp index 3f6084c6c4f1..e6515ab280e9 100644 --- a/src/Simplify_Min.cpp +++ b/src/Simplify_Min.cpp @@ -214,6 +214,7 @@ Expr Simplify::visit(const Min *op, ExprInfo *info) { rewrite(min(slice(x, c0, c1, c2), slice(y, c0, c1, c2)), slice(min(x, y), c0, c1, c2), c2 > 1 && lanes_of(x) == lanes_of(y)) || rewrite(min(slice(x, c0, c1, c2), min(slice(y, c0, c1, c2), z)), min(slice(min(x, y), c0, c1, c2), z), c2 > 1 && lanes_of(x) == lanes_of(y)) || rewrite(min(slice(x, c0, c1, c2), min(z, slice(y, c0, c1, c2))), min(slice(min(x, y), c0, c1, c2), z), c2 > 1 && lanes_of(x) == lanes_of(y)) || + rewrite(min(transpose(x, c0), transpose(y, c0)), transpose(min(x, y), c0)) || (no_overflow(op->type) && (rewrite(min(min(x, y) + c0, x), min(x, y + c0), c0 > 0) || rewrite(min(min(x, y) + c0, x), min(x, y) + c0, c0 < 0) || diff --git a/src/Simplify_Mul.cpp b/src/Simplify_Mul.cpp index dfa38d39111c..e1bcb68fe7bc 100644 --- a/src/Simplify_Mul.cpp +++ b/src/Simplify_Mul.cpp @@ -81,6 +81,7 @@ Expr Simplify::visit(const Mul *op, ExprInfo *info) { rewrite(slice(x, c0, c1, c2) * slice(y, c0, c1, c2), slice(x * y, c0, c1, c2), c2 > 1 && lanes_of(x) == lanes_of(y)) || rewrite(slice(x, c0, c1, c2) * (slice(y, c0, c1, c2) * z), slice(x * y, c0, c1, c2) * z, c2 > 1 && lanes_of(x) == lanes_of(y)) || rewrite(slice(x, c0, c1, c2) * (z * slice(y, c0, c1, c2)), slice(x * y, 
c0, c1, c2) * z, c2 > 1 && lanes_of(x) == lanes_of(y)) || + rewrite(transpose(x, c0) * transpose(y, c0), transpose(x * y, c0)) || false) { return mutate(rewrite.result, info); diff --git a/src/Simplify_Sub.cpp b/src/Simplify_Sub.cpp index 29bd02c78ed6..2444cb6fd1d9 100644 --- a/src/Simplify_Sub.cpp +++ b/src/Simplify_Sub.cpp @@ -177,6 +177,7 @@ Expr Simplify::visit(const Sub *op, ExprInfo *info) { rewrite(slice(x, c0, c1, c2) - (slice(y, c0, c1, c2) + z), slice(x - y, c0, c1, c2) - z, c2 > 1 && lanes_of(x) == lanes_of(y)) || rewrite((slice(x, c0, c1, c2) - z) - slice(y, c0, c1, c2), slice(x - y, c0, c1, c2) - z, c2 > 1 && lanes_of(x) == lanes_of(y)) || rewrite((z - slice(x, c0, c1, c2)) - slice(y, c0, c1, c2), z - slice(x + y, c0, c1, c2), c2 > 1 && lanes_of(x) == lanes_of(y)) || + rewrite(transpose(x, c0) - transpose(y, c0), transpose(x - y, c0)) || (no_overflow(op->type) && EVAL_IN_LAMBDA // (rewrite(max(x, y) - x, max(y - x, 0)) || diff --git a/src/StageStridedLoads.cpp b/src/StageStridedLoads.cpp index c159d9d62b0a..5711e36e92ec 100644 --- a/src/StageStridedLoads.cpp +++ b/src/StageStridedLoads.cpp @@ -161,13 +161,11 @@ class ReplaceStridedLoads : public IRMutator { std::map, Expr> replacements; std::map padding; Scope allocation_scope; - std::map>> let_injections; - - using IRMutator::mutate; + std::map>> let_injections; Stmt mutate(const Stmt &s) override { Stmt stmt = IRMutator::mutate(s); - auto it = let_injections.find(s); + auto it = let_injections.find(s.get()); if (it != let_injections.end()) { for (const auto &[name, value] : it->second) { stmt = LetStmt::make(name, value, stmt); @@ -176,6 +174,17 @@ class ReplaceStridedLoads : public IRMutator { return stmt; } + Expr mutate(const Expr &e) override { + Expr expr = IRMutator::mutate(e); + auto it = let_injections.find(e.get()); + if (it != let_injections.end()) { + for (const auto &[name, value] : it->second) { + expr = Let::make(name, value, expr); + } + } + return expr; + } + protected: Expr visit(const Load *op) override { const Allocate *alloc = nullptr; @@ -209,8 +218,8 @@ class ReplaceStridedLoads : public IRMutator { using IRMutator::visit; }; -Stmt innermost_containing_stmt(const Stmt &root, const std::set &exprs) { - Stmt result; +const IRNode *innermost_containing_node(const Stmt &root, const std::set &exprs) { + const IRNode *result = nullptr; // The innermost containing stmt is whichever stmt node contains the // largest number of our exprs, with ties breaking inwards. int seen = 0, best = 0; @@ -219,28 +228,34 @@ Stmt innermost_containing_stmt(const Stmt &root, const std::set &e int old = seen; self->mutate_base(s); if (old == 0 && seen > best) { - result = s; + result = s.get(); best = seen; } return s; // }, [&](auto *self, const Expr &e) { + int old = seen; const Load *l = e.as(); if (l && exprs.count(l)) { seen++; }; - return self->mutate_base(e); // + self->mutate_base(e); + if (old == 0 && seen > best) { + result = e.get(); + best = seen; + } + return e; // }); internal_assert(seen) << "None of the exprs were found\n"; return result; } -bool can_hoist_shared_load(const Stmt &s, const std::string &buf, const Expr &idx) { +bool can_hoist_shared_load(const IRNode *n, const std::string &buf, const Expr &idx) { // Check none of the variables the idx depends on are defined somewhere // within this stmt, and there are no stores to the given buffer in the // stmt. 
bool result = true; - visit_with(s, // + visit_with(n, // [&](auto *self, const Let *let) { // result &= !expr_uses_var(idx, let->name); }, @@ -293,7 +308,8 @@ Stmt stage_strided_loads(const Stmt &s) { // We have a complete cluster of loads. Make a single dense load int lanes = k.lanes * k.stride; int64_t first_offset = load->first; - Expr idx = Ramp::make(k.base + (int)first_offset, make_one(k.base.type()), lanes); + Expr base = common_subexpression_elimination(k.base); + Expr idx = Ramp::make(base + (int)first_offset, make_one(k.base.type()), lanes); Type t = k.type.with_lanes(lanes); const Load *op = load->second[0]; @@ -304,14 +320,12 @@ Stmt stage_strided_loads(const Stmt &s) { Expr shared_load = Load::make(t, k.buf, idx, op->image, op->param, const_true(lanes), op->alignment); - shared_load = common_subexpression_elimination(shared_load); // If possible, we do the shuffle as an in-place transpose followed // by a dense slice. This is more efficient when extracting multiple // slices. - Stmt let_site = innermost_containing_stmt(alloc ? Stmt(alloc) : s, all_loads); + const IRNode *let_site = innermost_containing_node(alloc ? Stmt(alloc) : s, all_loads); if (can_hoist_shared_load(let_site, k.buf, idx)) { - shared_load = Shuffle::make_transpose(shared_load, k.stride); std::string name = unique_name('t'); Expr var = Variable::make(shared_load.type(), name); for (; load != v.end() && load->first < first_offset + k.stride; load++) { @@ -321,6 +335,7 @@ Stmt stage_strided_loads(const Stmt &s) { replacer.replacements.emplace(std::make_pair(alloc, l), shuf); } } + shared_load = Shuffle::make_transpose(shared_load, k.stride); replacer.let_injections[let_site].emplace_back(name, shared_load); } else { for (; load != v.end() && load->first < first_offset + k.stride; load++) { From 794df0bf30950dbb32cb0b4f4c07a7023e655cdd Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Fri, 20 Feb 2026 09:54:19 -0800 Subject: [PATCH 16/55] Fix innermost_containing_node --- src/IRMutator.h | 9 +++++++++ src/StageStridedLoads.cpp | 9 +++++++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/src/IRMutator.h b/src/IRMutator.h index c170b37eb42b..4c06e12eda97 100644 --- a/src/IRMutator.h +++ b/src/IRMutator.h @@ -343,6 +343,15 @@ auto mutate_with(const T &ir, Lambdas &&...lambdas) { } } +template +auto mutate_with(const IRNode *ir, Lambdas &&...lambdas) -> IRHandle { + if (ir->node_type <= StrongestExprNodeType) { + return mutate_with(Expr((const BaseExprNode *)ir), std::forward(lambdas)...); + } else { + return mutate_with(Stmt((const BaseStmtNode *)ir), std::forward(lambdas)...); + } +} + /** A helper function for mutator-like things to mutate regions */ template std::pair mutate_region(Mutator *mutator, const Region &bounds, Args &&...args) { diff --git a/src/StageStridedLoads.cpp b/src/StageStridedLoads.cpp index 5711e36e92ec..05799c433062 100644 --- a/src/StageStridedLoads.cpp +++ b/src/StageStridedLoads.cpp @@ -218,7 +218,7 @@ class ReplaceStridedLoads : public IRMutator { using IRMutator::visit; }; -const IRNode *innermost_containing_node(const Stmt &root, const std::set &exprs) { +const IRNode *innermost_containing_node(const IRNode *root, const std::set &exprs) { const IRNode *result = nullptr; // The innermost containing stmt is whichever stmt node contains the // largest number of our exprs, with ties breaking inwards. 
@@ -258,18 +258,23 @@ bool can_hoist_shared_load(const IRNode *n, const std::string &buf, const Expr & visit_with(n, // [&](auto *self, const Let *let) { // result &= !expr_uses_var(idx, let->name); + self->visit_base(let); }, [&](auto *self, const LetStmt *let) { // result &= !expr_uses_var(idx, let->name); + self->visit_base(let); }, [&](auto *self, const For *loop) { // result &= !expr_uses_var(idx, loop->name); + self->visit_base(loop); }, [&](auto *self, const Allocate *alloc) { // result &= alloc->name != buf; + self->visit_base(alloc); }, [&](auto *self, const Store *store) { // result &= store->name != buf; + self->visit_base(store); }); return result; } @@ -324,7 +329,7 @@ Stmt stage_strided_loads(const Stmt &s) { // If possible, we do the shuffle as an in-place transpose followed // by a dense slice. This is more efficient when extracting multiple // slices. - const IRNode *let_site = innermost_containing_node(alloc ? Stmt(alloc) : s, all_loads); + const IRNode *let_site = innermost_containing_node(k.scope, all_loads); if (can_hoist_shared_load(let_site, k.buf, idx)) { std::string name = unique_name('t'); Expr var = Variable::make(shared_load.type(), name); From 486addd0c8a6c934a891e1f5b9ec2c54c0b0027c Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Sat, 21 Feb 2026 11:44:53 -0800 Subject: [PATCH 17/55] Fix some simd op check failures --- src/CodeGen_ARM.cpp | 1 + src/CodeGen_LLVM.cpp | 9 +++++++-- src/Lower.cpp | 2 +- src/StageStridedLoads.cpp | 17 +++++++++++++---- src/StageStridedLoads.h | 2 +- 5 files changed, 23 insertions(+), 8 deletions(-) diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp index 43372183aeb4..d43426857a9a 100644 --- a/src/CodeGen_ARM.cpp +++ b/src/CodeGen_ARM.cpp @@ -1886,6 +1886,7 @@ void CodeGen_ARM::visit(const Shuffle *op) { if (target.os != Target::IOS && target.os != Target::OSX && load && op->vectors.size() == 1 && + op->is_slice() && 2 <= stride && stride <= 4 && op->slice_begin() < stride && load->type.lanes() == stride * op->type.lanes()) { diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index a5937f123cfe..85d3e2f4ce85 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -2195,10 +2195,15 @@ Value *CodeGen_LLVM::optimization_fence(Value *v) { internal_assert(!t->isScalableTy()) << "optimization_fence does not support scalable vectors yet"; const int bits = t->getPrimitiveSizeInBits(); - if (bits % 16) { + if (bits % 32) { + const int lanes = get_vector_num_elements(t); + const int padded_lanes = (lanes + 3) / 4 * 4; + v = slice_vector(v, 0, padded_lanes); + v = optimization_fence(v); + v = slice_vector(v, 0, lanes); return v; } - llvm::Type *float_type = llvm_type_of(Float(16, bits / 16)); + llvm::Type *float_type = llvm_type_of(Float(32, bits / 32)); v = builder->CreateBitCast(v, float_type); v = builder->CreateArithmeticFence(v, float_type); return builder->CreateBitCast(v, t); diff --git a/src/Lower.cpp b/src/Lower.cpp index 32b64e83a2bd..e633cc99c1d2 100644 --- a/src/Lower.cpp +++ b/src/Lower.cpp @@ -381,7 +381,7 @@ void lower_impl(const vector &output_funcs, log("Lowering after partitioning loops:", s); debug(1) << "Staging strided loads...\n"; - s = stage_strided_loads(s); + s = stage_strided_loads(s, t); log("Lowering after staging strided loads:", s); debug(1) << "Trimming loops to the region over which they do something...\n"; diff --git a/src/StageStridedLoads.cpp b/src/StageStridedLoads.cpp index 05799c433062..3df12a6d1592 100644 --- a/src/StageStridedLoads.cpp +++ b/src/StageStridedLoads.cpp @@ -281,7 +281,7 
@@ bool can_hoist_shared_load(const IRNode *n, const std::string &buf, const Expr & } // namespace -Stmt stage_strided_loads(const Stmt &s) { +Stmt stage_strided_loads(const Stmt &s, const Target &target) { FindStridedLoads finder; ReplaceStridedLoads replacer; @@ -329,18 +329,27 @@ Stmt stage_strided_loads(const Stmt &s) { // If possible, we do the shuffle as an in-place transpose followed // by a dense slice. This is more efficient when extracting multiple // slices. - const IRNode *let_site = innermost_containing_node(k.scope, all_loads); + const IRNode *let_site = innermost_containing_node(k.scope ? k.scope : s.get(), all_loads); if (can_hoist_shared_load(let_site, k.buf, idx)) { + // For larger strides we can do a better job at shuffling if we + // do it as one big task. For stride 2 it interferes with + // horizontal add pattern matching. On ARM it also interferes + // with LLVM's pattern matching for vld3 and vld4. + bool transpose_shared_load = k.stride > 2 && (target.arch != Target::ARM || k.stride > 4); std::string name = unique_name('t'); Expr var = Variable::make(shared_load.type(), name); for (; load != v.end() && load->first < first_offset + k.stride; load++) { int row = load->first - first_offset; - Expr shuf = Shuffle::make_slice(var, row * k.lanes, 1, k.lanes); + Expr shuf = transpose_shared_load ? + Shuffle::make_slice(var, row * k.lanes, 1, k.lanes) : + Shuffle::make_slice(var, row, k.stride, k.lanes); for (const Load *l : load->second) { replacer.replacements.emplace(std::make_pair(alloc, l), shuf); } } - shared_load = Shuffle::make_transpose(shared_load, k.stride); + if (transpose_shared_load) { + shared_load = Shuffle::make_transpose(shared_load, k.stride); + } replacer.let_injections[let_site].emplace_back(name, shared_load); } else { for (; load != v.end() && load->first < first_offset + k.stride; load++) { diff --git a/src/StageStridedLoads.h b/src/StageStridedLoads.h index a29cef2438f1..b6afd3770981 100644 --- a/src/StageStridedLoads.h +++ b/src/StageStridedLoads.h @@ -37,7 +37,7 @@ namespace Internal { * internal allocations it adds padding to the allocation explicitly, by setting * the padding field on Allocate nodes. 
*/ -Stmt stage_strided_loads(const Stmt &s); +Stmt stage_strided_loads(const Stmt &s, const Target &target); } // namespace Internal } // namespace Halide From a1ecca90b95d67a04b16578de6d80aee9a56e6a0 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Mon, 23 Feb 2026 09:48:18 -0800 Subject: [PATCH 18/55] Fix infinite recursion issue and missed case in interleave codegen --- src/CodeGen_LLVM.cpp | 61 ++++++++++++++++++++++++++++++--------- src/StageStridedLoads.cpp | 11 +++++-- 2 files changed, 55 insertions(+), 17 deletions(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 85d3e2f4ce85..3a759091a9bf 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -2197,7 +2197,9 @@ Value *CodeGen_LLVM::optimization_fence(Value *v) { const int bits = t->getPrimitiveSizeInBits(); if (bits % 32) { const int lanes = get_vector_num_elements(t); - const int padded_lanes = (lanes + 3) / 4 * 4; + const int element_bits = t->getScalarSizeInBits(); + const int lanes_per_32_bits = 32 / element_bits; + const int padded_lanes = align_up(lanes, lanes_per_32_bits); v = slice_vector(v, 0, padded_lanes); v = optimization_fence(v); v = slice_vector(v, 0, lanes); @@ -2215,19 +2217,20 @@ Value *CodeGen_LLVM::interleave_vectors(const std::vector &vecs) { internal_assert(vecs[0]->getType() == vecs[i]->getType()); } int vec_elements = get_vector_num_elements(vecs[0]->getType()); + const int num_vecs = (int)vecs.size(); - int factor = gcd(vec_elements, (int)vecs.size()); + int factor = gcd(vec_elements, num_vecs); - if (vecs.size() == 1) { + if (num_vecs == 1) { return vecs[0]; - } else if (vecs.size() == 2) { + } else if (num_vecs == 2) { Value *a = vecs[0]; Value *b = vecs[1]; vector indices(vec_elements * 2); for (int i = 0; i < vec_elements * 2; i++) { indices[i] = i % 2 == 0 ? i / 2 : i / 2 + vec_elements; } - return optimization_fence(shuffle_vectors(a, b, indices)); + return shuffle_vectors(a, b, indices); } else if (factor == 1) { // The number of vectors and the vector length is // coprime. (E.g. interleaving an odd number of vectors of some @@ -2290,27 +2293,41 @@ Value *CodeGen_LLVM::interleave_vectors(const std::vector &vecs) { } return concat_vectors(v); - } else { // The number of vectors shares a factor with the length of the // vectors. Pick some factor of the number of vectors, interleave in // separate groups, and then interleave the results. Do the largest // power of two factor first. - const int n = (int)vecs.size(); - int f = n & -n; - if (f == 1 || f == n) { - for (int i = 2; i < n; i++) { - if (n % i == 0) { + int f = num_vecs & -num_vecs; + if (f == 1 || f == num_vecs) { + for (int i = 2; i < num_vecs; i++) { + if (num_vecs % i == 0) { f = i; break; } } } - internal_assert(f > 1 && f < n && n % f == 0) << f << " " << n; + // if f == 1 then the vector length is a multiple of the + // interleaving factor and the number of vectors is prime but not two + // (e.g. vec_elements = 24 and num_vecs = 3). Pad each vector out to a + // power of two size, interleave, and discard the tail of the + // result. This buys us some extra room to run Catanzaro's algorithm in. 
+ if (f == 1) { + int padded_size = next_power_of_two(vec_elements); + std::vector padded(num_vecs); + for (int i = 0; i < num_vecs; i++) { + padded[i] = slice_vector(vecs[i], 0, padded_size); + } + Value *v = interleave_vectors(padded); + return slice_vector(v, 0, num_vecs * vec_elements); + } + + internal_assert(f > 1 && f < num_vecs && num_vecs % f == 0) + << f << " " << num_vecs << " " << factor; vector> groups(f); - for (size_t i = 0; i < vecs.size(); i++) { + for (int i = 0; i < num_vecs; i++) { groups[i % f].push_back(vecs[i]); } @@ -2428,7 +2445,23 @@ std::vector CodeGen_LLVM::deinterleave_vector(Value *vec, int num_vecs) } } - internal_assert(f > 1 && f < num_vecs && num_vecs % f == 0) << f << " " << num_vecs; + // if f == 1 then the final vector length is a multiple of the + // deinterleaving factor and the number of vectors is prime but not two + // (e.g. vec_elements = 24 and num_vecs = 3). Pad the vector out to a + // power of two size, deinterleave, and discard the tail of each vector + // result. This buys us some extra room to run Catanzaro's algorithm in. + if (f == 1) { + int padded_size = next_power_of_two(vec_elements); + Value *padded = slice_vector(vec, 0, padded_size * num_vecs); + std::vector result = deinterleave_vector(padded, num_vecs); + for (int i = 0; i < num_vecs; i++) { + result[i] = slice_vector(result[i], 0, vec_elements); + } + return result; + } + + internal_assert(f > 1 && f < num_vecs && num_vecs % f == 0) + << f << " " << num_vecs << " " << factor; auto partial = deinterleave_vector(vec, f); std::vector result(num_vecs); diff --git a/src/StageStridedLoads.cpp b/src/StageStridedLoads.cpp index 3df12a6d1592..541e27be6a2a 100644 --- a/src/StageStridedLoads.cpp +++ b/src/StageStridedLoads.cpp @@ -329,7 +329,12 @@ Stmt stage_strided_loads(const Stmt &s, const Target &target) { // If possible, we do the shuffle as an in-place transpose followed // by a dense slice. This is more efficient when extracting multiple // slices. - const IRNode *let_site = innermost_containing_node(k.scope ? k.scope : s.get(), all_loads); + + // We can't lift the shared load further out than the scope over + // which the loads definition occur. If k.scope is null, the loads + // are valid everywhere (it must be an input buffer) + const IRNode *outermost = k.scope ? k.scope : s.get(); + const IRNode *let_site = innermost_containing_node(outermost, all_loads); if (can_hoist_shared_load(let_site, k.buf, idx)) { // For larger strides we can do a better job at shuffling if we // do it as one big task. For stride 2 it interferes with @@ -341,8 +346,8 @@ Stmt stage_strided_loads(const Stmt &s, const Target &target) { for (; load != v.end() && load->first < first_offset + k.stride; load++) { int row = load->first - first_offset; Expr shuf = transpose_shared_load ? 
- Shuffle::make_slice(var, row * k.lanes, 1, k.lanes) : - Shuffle::make_slice(var, row, k.stride, k.lanes); + Shuffle::make_slice(var, row * k.lanes, 1, k.lanes) : + Shuffle::make_slice(var, row, k.stride, k.lanes); for (const Load *l : load->second) { replacer.replacements.emplace(std::make_pair(alloc, l), shuf); } From f66d5eaa68c815cd098ad2f6bf7a017d2fdac30a Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Mon, 23 Feb 2026 09:53:29 -0800 Subject: [PATCH 19/55] Adjust expectations in stage_strided_loads test --- test/correctness/stage_strided_loads.cpp | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/test/correctness/stage_strided_loads.cpp b/test/correctness/stage_strided_loads.cpp index f791385f7c25..038108844eb4 100644 --- a/test/correctness/stage_strided_loads.cpp +++ b/test/correctness/stage_strided_loads.cpp @@ -86,10 +86,7 @@ int main(int argc, char **argv) { f(x) += {buf(2 * x), buf(2 * x + 1)}; f.update().vectorize(x, 8, TailStrategy::RoundUp); - // In this case, the dense load appears twice across the two store - // statements for the two tuple components, but it will get deduped by - // llvm. - checker.check(f, 2); + checker.check(f, 1); } { @@ -113,7 +110,7 @@ int main(int argc, char **argv) { g.vectorize(x, 8, TailStrategy::RoundUp); f.compute_at(g, x).vectorize(x); - checker.check(g, 2); + checker.check(g, 1); } { @@ -125,7 +122,7 @@ int main(int argc, char **argv) { g(x) = f(x); g.vectorize(x, 8, TailStrategy::RoundUp); - checker.check(g, 2); + checker.check(g, 1); } { @@ -135,7 +132,7 @@ int main(int argc, char **argv) { f(x, c) = buf(4 * x + c) + 4 * x; f.vectorize(x, 8, TailStrategy::RoundUp).bound(c, 0, 4).unroll(c).reorder(c, x); - checker.check(f, 4); + checker.check(f, 1); } { @@ -152,7 +149,7 @@ int main(int argc, char **argv) { f.tile(x, y, xi, yi, 8, 8, TailStrategy::RoundUp).vectorize(xi).reorder(c, x, y); g.compute_at(f, x).vectorize(x); h.compute_at(f, x).vectorize(x); - checker.check(f, 2); + checker.check(f, 1); } // We can always densify strided loads to internal allocations, because we @@ -181,7 +178,7 @@ int main(int argc, char **argv) { { Func f; Var x; - f(x) = buf(16 * x) + buf(16 * x + 15); + f(x) = buf(17 * x) + buf(17 * x + 15); f.vectorize(x, 16, TailStrategy::RoundUp); checker.check_not(f, 0); From c25142f7a27b83fb80d6485b7612b5335cd92b23 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Mon, 23 Feb 2026 10:25:44 -0800 Subject: [PATCH 20/55] Allow reversed suffix or not in sve test --- test/correctness/simd_op_check_sve2.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/test/correctness/simd_op_check_sve2.cpp b/test/correctness/simd_op_check_sve2.cpp index fca748dd60d9..7e1e1e00ddfb 100644 --- a/test/correctness/simd_op_check_sve2.cpp +++ b/test/correctness/simd_op_check_sve2.cpp @@ -447,13 +447,14 @@ class SimdOpCheckArmSve : public SimdOpCheckTest { Expr shift = (i_2 % bits) - (bits / 2); Expr round_s = (cast_i(1) >> min(shift, 0)) / 2; Expr round_u = (cast_u(1) >> min(shift, 0)) / 2; - add_8_16_32(sel_op("vrshl.s", "srshl", "srshlr"), cast_i((widen_i(i_1) + round_s) << shift)); - add_8_16_32(sel_op("vrshl.u", "urshl", "urshlr"), cast_u((widen_u(u_1) + round_u) << shift)); + // The r suffix is optional - it just changes which of the two args gets clobbered + add_8_16_32(sel_op("vrshl.s", "srshlr?"), cast_i((widen_i(i_1) + round_s) << shift)); + add_8_16_32(sel_op("vrshl.u", "urshlr?"), cast_u((widen_u(u_1) + round_u) << shift)); round_s = (cast_i(1) << max(shift, 0)) / 2; round_u = 
(cast_u(1) << max(shift, 0)) / 2; - add_8_16_32(sel_op("vrshl.s", "srshl", "srshlr"), cast_i((widen_i(i_1) + round_s) >> shift)); - add_8_16_32(sel_op("vrshl.u", "urshl", "urshlr"), cast_u((widen_u(u_1) + round_u) >> shift)); + add_8_16_32(sel_op("vrshl.s", "srshlr?"), cast_i((widen_i(i_1) + round_s) >> shift)); + add_8_16_32(sel_op("vrshl.u", "urshlr?"), cast_u((widen_u(u_1) + round_u) >> shift)); // VRSHR I - Rounding Shift Right add_8_16_32(sel_op("vrshr.s", "srshr", "srshl"), cast_i((widen_i(i_1) + 1) >> 1)); From bae3e02d2bcb7499fa52c5fa950189acfc6ed99b Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Mon, 23 Feb 2026 10:26:00 -0800 Subject: [PATCH 21/55] Don't use optimization fences on hexagon --- src/CodeGen_Hexagon.cpp | 7 +++++++ src/CodeGen_LLVM.h | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/CodeGen_Hexagon.cpp b/src/CodeGen_Hexagon.cpp index 4ac47b4d1f3b..563ac00a4972 100644 --- a/src/CodeGen_Hexagon.cpp +++ b/src/CodeGen_Hexagon.cpp @@ -95,6 +95,7 @@ class CodeGen_Hexagon : public CodeGen_Posix { llvm::Value *interleave_vectors(const std::vector &v) override; llvm::Value *shuffle_vectors(llvm::Value *a, llvm::Value *b, const std::vector &indices) override; + llvm::Value *optimization_fence(llvm::Value *v) override; using CodeGen_Posix::shuffle_vectors; ///@} @@ -1296,6 +1297,12 @@ Value *CodeGen_Hexagon::shuffle_vectors(Value *a, Value *b, return vdelta(concat_vectors({a, b}), indices); } +Value *CodeGen_Hexagon::optimization_fence(Value *v) { + // As of llvm 21, the base class version seems to trip up LLVM's hexagon + // backend, possibly because it relies on a floating point type. + return v; +} + Value *CodeGen_Hexagon::vlut256(Value *lut, Value *idx, int min_index, int max_index) { llvm::Type *lut_ty = lut->getType(); diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index 46ec05638e3f..abbf935122c3 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -465,7 +465,7 @@ class CodeGen_LLVM : public IRVisitor { /** A fence to prevent fusion of ops by llvm. Designed for floats, but we * abuse it to prevent shufflevector fusion too. */ - llvm::Value *optimization_fence(llvm::Value *); + virtual llvm::Value *optimization_fence(llvm::Value *); /** Description of an intrinsic function overload. Overloads are resolved * using both argument and return types. The scalar types of the arguments From b7defbd2934330d5eecfd4fa8cc9567f90ad03ec Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Mon, 23 Feb 2026 11:18:18 -0800 Subject: [PATCH 22/55] Fix infinite simplifier loop --- src/Simplify_Exprs.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/Simplify_Exprs.cpp b/src/Simplify_Exprs.cpp index 52665c0c2894..ad8a9827847a 100644 --- a/src/Simplify_Exprs.cpp +++ b/src/Simplify_Exprs.cpp @@ -362,7 +362,9 @@ Expr Simplify::visit(const Load *op, ExprInfo *info) { } return Shuffle::make(loaded_vecs, s_index->indices); } else if (const Ramp *inner_ramp = r_index ? r_index->base.as() : nullptr; - inner_ramp && is_const_one(r_index->stride)) { + inner_ramp && + !is_const_one(inner_ramp->stride) && + is_const_one(r_index->stride)) { // If it's a nested ramp and the outer ramp has stride 1, swap the // nesting order of the ramps to make dense loads and transpose the // resulting vector instead. 
From 23944a0093f5e9d0d03f96b12ea2706d8fba1c6e Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Mon, 23 Feb 2026 11:18:27 -0800 Subject: [PATCH 23/55] Don't hoist transposes on hexagon --- src/StageStridedLoads.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/StageStridedLoads.cpp b/src/StageStridedLoads.cpp index 541e27be6a2a..d8315ecd19cd 100644 --- a/src/StageStridedLoads.cpp +++ b/src/StageStridedLoads.cpp @@ -340,7 +340,10 @@ Stmt stage_strided_loads(const Stmt &s, const Target &target) { // do it as one big task. For stride 2 it interferes with // horizontal add pattern matching. On ARM it also interferes // with LLVM's pattern matching for vld3 and vld4. - bool transpose_shared_load = k.stride > 2 && (target.arch != Target::ARM || k.stride > 4); + bool transpose_shared_load = k.stride > 2; + if (target.arch == Target::ARM || target.arch == Target::Hexagon) { + transpose_shared_load = k.stride > 4; + } std::string name = unique_name('t'); Expr var = Variable::make(shared_load.type(), name); for (; load != v.end() && load->first < first_offset + k.stride; load++) { From 0d110d206dbfcb133fda12d8a7d44c72d770eca3 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Mon, 23 Feb 2026 14:07:10 -0800 Subject: [PATCH 24/55] Make distinct strided load nodes in the IR distinct in memory too --- src/StageStridedLoads.cpp | 46 +++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 19 deletions(-) diff --git a/src/StageStridedLoads.cpp b/src/StageStridedLoads.cpp index d8315ecd19cd..3723f997a871 100644 --- a/src/StageStridedLoads.cpp +++ b/src/StageStridedLoads.cpp @@ -158,9 +158,8 @@ class FindStridedLoads : public IRVisitor { // Replace a bunch of load expressions in a stmt class ReplaceStridedLoads : public IRMutator { public: - std::map, Expr> replacements; + std::map replacements; std::map padding; - Scope allocation_scope; std::map>> let_injections; Stmt mutate(const Stmt &s) override { @@ -187,11 +186,7 @@ class ReplaceStridedLoads : public IRMutator { protected: Expr visit(const Load *op) override { - const Allocate *alloc = nullptr; - if (const Allocate *const *a_ptr = allocation_scope.find(op->name)) { - alloc = *a_ptr; - } - auto it = replacements.find({alloc, op}); + auto it = replacements.find(op); if (it != replacements.end()) { return mutate(it->second); } else { @@ -200,7 +195,6 @@ class ReplaceStridedLoads : public IRMutator { } Stmt visit(const Allocate *op) override { - ScopedBinding bind(allocation_scope, op->name, op); auto it = padding.find(op); Stmt s = IRMutator::visit(op); if (it == padding.end()) { @@ -281,10 +275,25 @@ bool can_hoist_shared_load(const IRNode *n, const std::string &buf, const Expr & } // namespace -Stmt stage_strided_loads(const Stmt &s, const Target &target) { +Stmt stage_strided_loads(const Stmt &stmt, const Target &target) { FindStridedLoads finder; ReplaceStridedLoads replacer; + // Make all strided loads distinct IR nodes so that we can uniquely identify + // them by address. We may want to mutate the same load node in different + // ways depending on the surrounding context. 
+ Stmt s = mutate_with(stmt, [&](auto *self, const Load *l) { + const Ramp *r = l->index.as(); + if (l->type.is_scalar() || (r && is_const_one(r->stride))) { + // Definitely not a strided load + return self->visit_base(l); + } else { + // Might be a strided load after simplification + return Load::make(l->type, l->name, self->mutate(l->index), l->image, l->param, + self->mutate(l->predicate), l->alignment); + } + }); + // Find related clusters of strided loads anywhere in the stmt. While this // appears to look globally, it requires expressions to match exactly, so // really it's only going to find things inside the same loops and let @@ -293,7 +302,6 @@ Stmt stage_strided_loads(const Stmt &s, const Target &target) { for (const auto &l : finder.found_loads) { const FindStridedLoads::Key &k = l.first; - const Allocate *alloc = k.allocation; const std::map> &v = l.second; // Find clusters of strided loads that can share the same dense load. @@ -352,7 +360,7 @@ Stmt stage_strided_loads(const Stmt &s, const Target &target) { Shuffle::make_slice(var, row * k.lanes, 1, k.lanes) : Shuffle::make_slice(var, row, k.stride, k.lanes); for (const Load *l : load->second) { - replacer.replacements.emplace(std::make_pair(alloc, l), shuf); + replacer.replacements.emplace(l, shuf); } } if (transpose_shared_load) { @@ -364,7 +372,7 @@ Stmt stage_strided_loads(const Stmt &s, const Target &target) { int row = load->first - first_offset; Expr shuf = Shuffle::make_slice(shared_load, row, k.stride, k.lanes); for (const Load *l : load->second) { - replacer.replacements.emplace(std::make_pair(alloc, l), shuf); + replacer.replacements.emplace(l, shuf); } } } @@ -374,7 +382,7 @@ Stmt stage_strided_loads(const Stmt &s, const Target &target) { // picked up in a cluster, but for whom we know it's safe to do a // dense load before their start. for (const auto &[offset, loads] : reverse_view(v)) { - if (replacer.replacements.count({alloc, loads[0]})) { + if (replacer.replacements.count(loads[0])) { continue; } int64_t delta = k.stride - 1; @@ -392,14 +400,14 @@ Stmt stage_strided_loads(const Stmt &s, const Target &target) { dense_load = common_subexpression_elimination(dense_load); Expr shuf = Shuffle::make_slice(dense_load, delta, k.stride, k.lanes); for (const Load *l : loads) { - replacer.replacements.emplace(std::make_pair(alloc, l), shuf); + replacer.replacements.emplace(l, shuf); } } // Look for any loads we can densify because an overlapping load occurs // in any parent scope. for (const auto &[offset, loads] : reverse_view(v)) { - if (replacer.replacements.count({alloc, loads[0]})) { + if (replacer.replacements.count(loads[0])) { continue; } int64_t min_offset = offset; @@ -430,7 +438,7 @@ Stmt stage_strided_loads(const Stmt &s, const Target &target) { dense_load = common_subexpression_elimination(dense_load); Expr shuf = Shuffle::make_slice(dense_load, offset - final_offset, k.stride, k.lanes); for (const Load *l : loads) { - replacer.replacements.emplace(std::make_pair(alloc, l), shuf); + replacer.replacements.emplace(l, shuf); } } @@ -439,7 +447,7 @@ Stmt stage_strided_loads(const Stmt &s, const Target &target) { // external allocations by doing a dense load at a trimmed size. We rely // on codegen to do a good job at loading vectors of a funny size. 
for (const auto &[offset, loads] : v) { - if (replacer.replacements.count({alloc, loads[0]})) { + if (replacer.replacements.count(loads[0])) { continue; } @@ -463,7 +471,7 @@ Stmt stage_strided_loads(const Stmt &s, const Target &target) { dense_load = common_subexpression_elimination(dense_load); Expr shuf = Shuffle::make_slice(dense_load, offset - first_offset, k.stride, k.lanes); for (const Load *l : loads) { - replacer.replacements.emplace(std::make_pair(alloc, l), shuf); + replacer.replacements.emplace(l, shuf); } } else if (k.lanes % 2 == 0) { @@ -486,7 +494,7 @@ Stmt stage_strided_loads(const Stmt &s, const Target &target) { Expr shuf2 = Shuffle::make_slice(dense_load2, delta, k.stride, k.lanes / 2); Expr shuf = Shuffle::make_concat({shuf1, shuf2}); for (const Load *l : loads) { - replacer.replacements.emplace(std::make_pair(alloc, l), shuf); + replacer.replacements.emplace(l, shuf); } } } From 84f10b1ce4e26490d77fb60d8626e4b4a34734bb Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 24 Feb 2026 12:35:42 -0800 Subject: [PATCH 25/55] arm-32 has no vst2 for 64-bit elements --- src/CodeGen_ARM.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp index d43426857a9a..31f64272b552 100644 --- a/src/CodeGen_ARM.cpp +++ b/src/CodeGen_ARM.cpp @@ -1471,10 +1471,11 @@ void CodeGen_ARM::visit(const Store *op) { intrin_type = t; Type elt = t.element_of(); int vec_bits = t.bits() * t.lanes(); - if (elt == Float(32) || elt == Float(64) || - is_float16_and_has_feature(elt) || - elt == Int(8) || elt == Int(16) || elt == Int(32) || elt == Int(64) || - elt == UInt(8) || elt == UInt(16) || elt == UInt(32) || elt == UInt(64)) { + if (t.bits() <= target.bits && + (elt == Float(32) || elt == Float(64) || + is_float16_and_has_feature(elt) || + elt == Int(8) || elt == Int(16) || elt == Int(32) || elt == Int(64) || + elt == UInt(8) || elt == UInt(16) || elt == UInt(32) || elt == UInt(64))) { const int target_vector_bits = native_vector_bits(); if (vec_bits % 128 == 0) { type_ok_for_vst = true; From 8d93c3c7c2fc76386af57df8ddf1136c25b9cbc8 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 24 Feb 2026 13:43:51 -0800 Subject: [PATCH 26/55] Windows bad filename fix in simd op check --- test/correctness/simd_op_check_sve2.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/correctness/simd_op_check_sve2.cpp b/test/correctness/simd_op_check_sve2.cpp index 7e1e1e00ddfb..3ed0a70ef380 100644 --- a/test/correctness/simd_op_check_sve2.cpp +++ b/test/correctness/simd_op_check_sve2.cpp @@ -1221,6 +1221,12 @@ class SimdOpCheckArmSve : public SimdOpCheckTest { std::stringstream type_name_stream; type_name_stream << e.type(); std::string decorated_op_name = op_name + "_" + type_name_stream.str() + "_x" + std::to_string(vec_factor); + + // Some regex symbols are illegal in filenames on windows + std::string illegal = "<>:\"/\\|?*"; + std::replace_if(decorated_op_name.begin(), decorated_op_name.end(), // + [&](char c) { return illegal.find(c) != std::string::npos; }, '_'); + auto unique_name = "op_" + decorated_op_name + "_" + std::to_string(parent.tasks.size()); // Bail out after generating the unique_name, so that names are From 36565ce56464486ad2cbdd6fab7a84131bb430d4 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 24 Feb 2026 14:04:19 -0800 Subject: [PATCH 27/55] Temporary dumping of cpu info to debug github actions issue --- Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile b/Makefile index 
54c61a622ae8..0f136b40114d 100644 --- a/Makefile +++ b/Makefile @@ -16,6 +16,8 @@ MAKEFLAGS += --no-builtin-rules UNAME = $(shell uname) +$(info $(shell cat /proc/cpuinfo)) + ifeq ($(OS), Windows_NT) $(error Halide no longer supports the MinGW environment. Please use MSVC through CMake instead.) else From 3f45c47773f0c3cf4ec0dc2582a0b9fe8b0c4f41 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 24 Feb 2026 14:24:17 -0800 Subject: [PATCH 28/55] dump cpuinfo in makefile testing workflow To help diagnose occasional illegal instruction errors --- .github/workflows/testing-make.yml | 1 + Makefile | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/testing-make.yml b/.github/workflows/testing-make.yml index ccd6c600f851..0beeb9a86563 100644 --- a/.github/workflows/testing-make.yml +++ b/.github/workflows/testing-make.yml @@ -47,6 +47,7 @@ jobs: run: | if [ "$RUNNER_OS" = "Linux" ]; then echo "LLVM_CONFIG=llvm-config-$LLVM_VERSION" | tee -a "$GITHUB_ENV" + cat /proc/cpuinfo elif [ "$RUNNER_OS" = "macOS" ]; then echo "LLVM_CONFIG=$(brew --prefix llvm@$LLVM_VERSION)/bin/llvm-config" | tee -a "$GITHUB_ENV" fi diff --git a/Makefile b/Makefile index 0f136b40114d..54c61a622ae8 100644 --- a/Makefile +++ b/Makefile @@ -16,8 +16,6 @@ MAKEFLAGS += --no-builtin-rules UNAME = $(shell uname) -$(info $(shell cat /proc/cpuinfo)) - ifeq ($(OS), Windows_NT) $(error Halide no longer supports the MinGW environment. Please use MSVC through CMake instead.) else From 2695151683836d4597d50a5a9f0aa4decb4fbb7a Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Fri, 6 Mar 2026 14:25:58 -0800 Subject: [PATCH 29/55] Address review comments --- src/CodeGen_LLVM.cpp | 12 +- src/IR.cpp | 6 +- src/IR.h | 12 +- src/Simplify_Exprs.cpp | 16 +- src/Simplify_Shuffle.cpp | 5 +- src/Simplify_Stmts.cpp | 29 +++- src/StageStridedLoads.cpp | 22 ++- src/Util.h | 5 + test/correctness/CMakeLists.txt | 1 + test/correctness/stage_strided_loads.cpp | 2 +- test/correctness/transpose_idioms.cpp | 211 +++++++++++++++++++++++ test/performance/interleave.cpp | 6 +- 12 files changed, 303 insertions(+), 24 deletions(-) create mode 100644 test/correctness/transpose_idioms.cpp diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 259f287354b1..4be2a8ab0577 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -2241,7 +2241,6 @@ Value *CodeGen_LLVM::interleave_vectors(const std::vector &vecs) { // Using unary shuffles, get each element into the right ultimate // lane. This works out without collisions because the number of vectors // and the length of each vector is coprime. - const int num_vecs = (int)v.size(); std::vector shuffle(vec_elements); for (int i = 0; i < num_vecs; i++) { for (int j = 0; j < vec_elements; j++) { @@ -2298,7 +2297,7 @@ Value *CodeGen_LLVM::interleave_vectors(const std::vector &vecs) { // vectors. Pick some factor of the number of vectors, interleave in // separate groups, and then interleave the results. Do the largest // power of two factor first. 
- int f = num_vecs & -num_vecs; + int f = largest_power_of_two_factor(num_vecs); if (f == 1 || f == num_vecs) { for (int i = 2; i < num_vecs; i++) { if (num_vecs % i == 0) { @@ -2317,6 +2316,7 @@ Value *CodeGen_LLVM::interleave_vectors(const std::vector &vecs) { int padded_size = next_power_of_two(vec_elements); std::vector padded(num_vecs); for (int i = 0; i < num_vecs; i++) { + // slice_vector can also be used to pad with don't cares padded[i] = slice_vector(vecs[i], 0, padded_size); } Value *v = interleave_vectors(padded); @@ -2367,7 +2367,6 @@ std::vector CodeGen_LLVM::deinterleave_vector(Value *vec, int num_vecs) // Use the inverse of Catanzaro's algorithm from above. We slice into // distinct vectors, then rotate each element into the correct final // vector, then do a unary permutation of each vector. - std::vector shuffle(vec_elements); // Instead of concatenating, we slice. std::vector v(num_vecs); @@ -2385,6 +2384,7 @@ std::vector CodeGen_LLVM::deinterleave_vector(Value *vec, int num_vecs) // We'll handle each bit of the rotation one at a time with a two-way // shuffle. + std::vector shuffle(vec_elements); std::vector new_v(v.size()); int d = 1; while (d < num_vecs) { @@ -2409,7 +2409,9 @@ std::vector CodeGen_LLVM::deinterleave_vector(Value *vec, int num_vecs) // Now reorder the vectors in the inverse order to the above. for (int i = 0; i < num_vecs; i++) { int j = (i * vec_elements) % num_vecs; - // j and i are swapped below, because we're doing the inverse of the algorithm above + // j and i are swapped below, because we're doing the inverse of the + // algorithm above. This map is 1:1 because vec_elements and + // num_vecs are coprime, so every slot of new_v is stored to. new_v[j] = v[i]; } v.swap(new_v); @@ -2435,7 +2437,7 @@ std::vector CodeGen_LLVM::deinterleave_vector(Value *vec, int num_vecs) // again. We know there's a non-trivial factor because if it were prime // the gcd above would have been 1. Do the largest power-of-two factor // first. - int f = num_vecs & -num_vecs; + int f = largest_power_of_two_factor(num_vecs); if (f == 1 || f == num_vecs) { for (int i = 2; i < num_vecs; i++) { if (num_vecs % i == 0) { diff --git a/src/IR.cpp b/src/IR.cpp index 5f5320c68c87..b53d99960d4b 100644 --- a/src/IR.cpp +++ b/src/IR.cpp @@ -1029,10 +1029,12 @@ bool Shuffle::is_concat() const { bool Shuffle::is_transpose() const { if (vectors.size() > 1 || (int)indices.size() != vectors[0].type().lanes() || - indices.size() < 2) { + indices.size() < 2 || + indices[0] != 0 || + indices[1] <= 0) { return false; } - int cols = indices[1] - indices[0]; + int cols = indices[1]; int rows = vectors[0].type().lanes() / cols; if ((int)indices.size() != rows * cols) { return false; diff --git a/src/IR.h b/src/IR.h index e70312363627..3b1320330df6 100644 --- a/src/IR.h +++ b/src/IR.h @@ -988,8 +988,11 @@ struct Shuffle : public ExprNode { * interleaving of vectors of the same length. */ static Expr make_interleave(const std::vector &vectors); - /** Convenience constructor for making a shuffle representing an - * in-place transpose of a matrix with the given number of columns. */ + /** Convenience constructor for making a shuffle representing an in-place + * transpose of a row-major matrix with the given number of columns. The + * output, interpreted as a row-major matrix, therefore has than number of + * rows. For example, to turn the vector RGBRGBRGBRGB into RRRRGGGGBBBB cols + * would be 3, and to do the reverse cols would be 4. 
*/ static Expr make_transpose(Expr e, int cols); /** Convenience constructor for making a shuffle representing a @@ -1012,7 +1015,10 @@ struct Shuffle : public ExprNode { * arguments. */ bool is_interleave() const; - /** Check if this shuffle is an in-place transpose of a single vector */ + /** Check if this shuffle is an in-place transpose of a single vector. The + * factor is the number of columns of the source matrix, or equivalently, + * the number of rows of the destination matrix, interpreting a vector as a + * matrix stored row-major. */ bool is_transpose() const; int transpose_factor() const; diff --git a/src/Simplify_Exprs.cpp b/src/Simplify_Exprs.cpp index ad8a9827847a..c7eb6c7f802b 100644 --- a/src/Simplify_Exprs.cpp +++ b/src/Simplify_Exprs.cpp @@ -349,13 +349,20 @@ Expr Simplify::visit(const Load *op, ExprInfo *info) { op->image, op->param, const_true(new_lanes, nullptr), align); return Broadcast::make(load, b_index->lanes); } else if (s_index && - is_const_one(predicate) && (s_index->is_concat() || s_index->is_interleave())) { - // Loads of concats/interleaves should be concats/interleaves of loads + // Loads of concats/interleaves should be concats/interleaves of + // loads. We'll need to slice up the predicate though. std::vector loaded_vecs; for (const Expr &new_index : s_index->vectors) { int new_lanes = new_index.type().lanes(); + Expr predicate_slice = + is_const_one(predicate) ? const_true(new_lanes, nullptr) : + s_index->is_concat() ? + Shuffle::make_slice(predicate, (int)loaded_vecs.size() * new_lanes, 1, new_lanes) : + Shuffle::make_slice(predicate, (int)loaded_vecs.size(), op->type.lanes() / new_lanes, new_lanes); + predicate_slice = mutate(predicate_slice, nullptr); + Expr load = Load::make(op->type.with_lanes(new_lanes), op->name, new_index, op->image, op->param, const_true(new_lanes, nullptr), ModulusRemainder{}); loaded_vecs.emplace_back(std::move(load)); @@ -371,8 +378,11 @@ Expr Simplify::visit(const Load *op, ExprInfo *info) { Expr transposed_index = Ramp::make(Ramp::make(inner_ramp->base, make_one(inner_ramp->base.type()), r_index->lanes), Broadcast::make(inner_ramp->stride, r_index->lanes), inner_ramp->lanes); + Expr transposed_predicate = (predicate.as() ? + predicate : // common case optimization + Shuffle::make_transpose(predicate, inner_ramp->lanes)); Expr transposed_load = - Load::make(op->type, op->name, transposed_index, op->image, op->param, predicate, align); + Load::make(op->type, op->name, transposed_index, op->image, op->param, transposed_predicate, align); return mutate(Shuffle::make_transpose(transposed_load, r_index->lanes), info); } else if (predicate.same_as(op->predicate) && index.same_as(op->index) && align == op->alignment) { return op; diff --git a/src/Simplify_Shuffle.cpp b/src/Simplify_Shuffle.cpp index aecb4c6fc99a..2a614ac81744 100644 --- a/src/Simplify_Shuffle.cpp +++ b/src/Simplify_Shuffle.cpp @@ -95,10 +95,11 @@ Expr Simplify::visit(const Shuffle *op, ExprInfo *info) { // broadcast. Note that it doesn't matter what the indices // are. 
const Broadcast *b1 = new_vectors[0].as(); - if (b1) { + if (b1 && b1->value.type().is_scalar()) { bool can_collapse = true; for (size_t i = 1; i < new_vectors.size() && can_collapse; i++) { - if (const Broadcast *b2 = new_vectors[i].as()) { + if (const Broadcast *b2 = new_vectors[i].as(); + b2 && b2->value.type().is_scalar()) { Expr check = mutate(b1->value - b2->value, nullptr); can_collapse &= is_const_zero(check); } else { diff --git a/src/Simplify_Stmts.cpp b/src/Simplify_Stmts.cpp index 1b4588342096..60e80e86c1b5 100644 --- a/src/Simplify_Stmts.cpp +++ b/src/Simplify_Stmts.cpp @@ -326,6 +326,7 @@ Stmt Simplify::visit(const Store *op) { ExprInfo index_info; Expr index = mutate(op->index, &index_info); + // If the store is fully unconditional and out of bounds, drop it. // This should only occur inside branches that make the store unreachable, // but perhaps the branch was hard to prove constant true or false. This @@ -342,8 +343,9 @@ Stmt Simplify::visit(const Store *op) { } ExprInfo base_info; - if (const Ramp *r = index.as()) { - mutate(r->base, &base_info); + const Ramp *r_index = index.as(); + if (r_index) { + mutate(r_index->base, &base_info); } base_info.alignment = ModulusRemainder::intersect(base_info.alignment, index_info.alignment); @@ -367,7 +369,10 @@ Stmt Simplify::visit(const Store *op) { // foo[x] = foo[x] or foo[x] = undef is a no-op return Evaluate::make(0); } else if (shuf && shuf->is_concat()) { - // Break a store of a concat of vector indices into separate stores + // Break a store of a concat of vector indices into separate stores. A + // concat index will result in a general scatter at codegen time. We + // should just break it up here, where there is a hope that the + // individual elements might be simplifiable to dense ramps. std::string var_name = unique_name('t'); Expr var = Variable::make(value.type(), var_name); std::vector stores; @@ -384,6 +389,24 @@ Stmt Simplify::visit(const Store *op) { Stmt s = Block::make(stores); s = LetStmt::make(var_name, value, s); return mutate(s); + } else if (const Ramp *inner_ramp = r_index ? r_index->base.as() : nullptr; + inner_ramp && + !is_const_one(inner_ramp->stride) && + is_const_one(r_index->stride)) { + // If it's a nested ramp and the outer ramp has stride 1, swap the + // nesting order of the ramps to make dense stores and transpose the + // index and value instead. Later in lowering after flattening the + // nested ramps it will turn into a concat of dense ramps and hit the + // case above. + Expr transposed_index = + Ramp::make(Ramp::make(inner_ramp->base, make_one(inner_ramp->base.type()), r_index->lanes), + Broadcast::make(inner_ramp->stride, r_index->lanes), inner_ramp->lanes); + Expr transposed_value = Shuffle::make_transpose(value, inner_ramp->lanes); + Expr transposed_predicate = (predicate.as() ? + predicate : // common case optimization + Shuffle::make_transpose(predicate, inner_ramp->lanes)); + return mutate(Store::make(op->name, transposed_value, transposed_index, + op->param, transposed_predicate, align)); } else if (predicate.same_as(op->predicate) && value.same_as(op->value) && index.same_as(op->index) && align == op->alignment) { return op; } else { diff --git a/src/StageStridedLoads.cpp b/src/StageStridedLoads.cpp index a1319d722112..896a33b5193e 100644 --- a/src/StageStridedLoads.cpp +++ b/src/StageStridedLoads.cpp @@ -104,7 +104,7 @@ class FindStridedLoads : public IRVisitor { // TODO: We do not yet handle nested vectorization here for // ramps which have not already collapsed. 
We could potentially // handle more interesting types of shuffle than simple flat slices. - if (stride >= 2 && stride <= r->lanes && r->stride.type().is_scalar()) { + if (stride >= 2 && r->stride.type().is_scalar()) { const IRNode *s = scope; const Allocate *a = nullptr; if (const Allocate *const *a_ptr = allocation_scope.find(op->name)) { @@ -334,9 +334,23 @@ Stmt stage_strided_loads(const Stmt &stmt, const Target &target) { Type t = k.type.with_lanes(lanes); const Load *op = load->second[0]; + int last_offset = first_offset; + int64_t biggest_gap = 0; std::set all_loads; for (auto l = load; l != v.end() && l->first < first_offset + k.stride; l++) { all_loads.insert(l->second.begin(), l->second.end()); + biggest_gap = std::max(biggest_gap, l->first - last_offset); + last_offset = l->first; + } + biggest_gap = std::max(biggest_gap, (first_offset + k.stride) - last_offset); + + // If our contiguous shared load has contiguous vectors in it of + // size at least k.lanes that are going to be entirely unused, this + // is a bad idea (e.g. a cluster of {ramp(0, 1024, 8) and ramp(37, + // 1024, 8)} should not be staged). + if (biggest_gap >= k.lanes) { + load++; + continue; } Expr shared_load = Load::make(t, k.buf, idx, op->image, op->param, @@ -391,7 +405,7 @@ Stmt stage_strided_loads(const Stmt &stmt, const Target &target) { // picked up in a cluster, but for whom we know it's safe to do a // dense load before their start. for (const auto &[offset, loads] : reverse_view(v)) { - if (replacer.replacements.count(loads[0])) { + if (replacer.replacements.count(loads[0]) || k.lanes < k.stride) { continue; } int64_t delta = k.stride - 1; @@ -416,7 +430,7 @@ Stmt stage_strided_loads(const Stmt &stmt, const Target &target) { // Look for any loads we can densify because an overlapping load occurs // in any parent scope. for (const auto &[offset, loads] : reverse_view(v)) { - if (replacer.replacements.count(loads[0])) { + if (replacer.replacements.count(loads[0]) || k.lanes < k.stride) { continue; } int64_t min_offset = offset; @@ -456,7 +470,7 @@ Stmt stage_strided_loads(const Stmt &stmt, const Target &target) { // external allocations by doing a dense load at a trimmed size. We rely // on codegen to do a good job at loading vectors of a funny size. for (const auto &[offset, loads] : v) { - if (replacer.replacements.count(loads[0])) { + if (replacer.replacements.count(loads[0]) || k.lanes < k.stride) { continue; } diff --git a/src/Util.h b/src/Util.h index a437d18a9ce4..2a9b9a676cc0 100644 --- a/src/Util.h +++ b/src/Util.h @@ -575,6 +575,11 @@ inline int64_t next_power_of_two(int64_t x) { return static_cast(1) << static_cast(std::ceil(std::log2(x))); } +/** Returns the largest power of two which is a factor of the argument. */ +inline int64_t largest_power_of_two_factor(int64_t x) { + return x & -x; +} + /** Return whether or not an integer is a power of two. 
*/ inline bool is_power_of_two(int64_t x) { return (x & (x - 1)) == 0; diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt index 77066a8392bd..5805fe599827 100644 --- a/test/correctness/CMakeLists.txt +++ b/test/correctness/CMakeLists.txt @@ -327,6 +327,7 @@ tests(GROUPS correctness tracing_broadcast.cpp tracing_stack.cpp transitive_bounds.cpp + transpose_idioms.cpp trim_no_ops.cpp tuple_partial_update.cpp tuple_reduction.cpp diff --git a/test/correctness/stage_strided_loads.cpp b/test/correctness/stage_strided_loads.cpp index 757f71acd487..dc09be89d09c 100644 --- a/test/correctness/stage_strided_loads.cpp +++ b/test/correctness/stage_strided_loads.cpp @@ -190,7 +190,7 @@ int main(int argc, char **argv) { { Func f; Var x; - f(x) = buf(17 * x) + buf(17 * x + 15); + f(x) = buf(50 * x) + buf(50 * x + 15); f.vectorize(x, 16, TailStrategy::RoundUp); checker.check_not(f, 0); diff --git a/test/correctness/transpose_idioms.cpp b/test/correctness/transpose_idioms.cpp new file mode 100644 index 000000000000..afe02039fbde --- /dev/null +++ b/test/correctness/transpose_idioms.cpp @@ -0,0 +1,211 @@ +#include "Halide.h" + +using namespace Halide; +using namespace Halide::Internal; + +// This test enumerates all the scheduling idioms in Halide that *should* +// produce good code for a transpose/interleave/deinterleave operation. + +class Checker : public IRMutator { + + using IRMutator::visit; + + Expr visit(const Load *op) override { + if (const Ramp *r = op->index.as(); + r && is_const_one(r->stride)) { + dense_loads++; + } else if (op->type.is_vector()) { + gathers++; + } + return IRMutator::visit(op); + } + + Stmt visit(const Store *op) override { + if (const Ramp *r = op->index.as(); + r && is_const_one(r->stride)) { + dense_stores++; + } else if (op->index.type().is_vector()) { + scatters++; + } + return IRMutator::visit(op); + } + + Expr visit(const Shuffle *op) override { + transposes += op->is_transpose(); + interleaves += op->is_interleave(); + if (op->is_slice()) { + if (op->slice_stride() == 1) { + dense_slices++; + } else { + strided_slices++; + } + } + return IRMutator::visit(op); + } + +public: + int dense_loads = 0; + int gathers = 0; + int dense_stores = 0; + int scatters = 0; + int dense_slices = 0; + int strided_slices = 0; + int interleaves = 0; + int transposes = 0; + + void check() { + internal_assert(gathers == 0) << "Vector gathers found"; + internal_assert(scatters == 0) << "Vector scatters found"; + internal_assert(strided_slices == 0) << "strided slices found"; + internal_assert(dense_loads) << "No dense loads found"; + internal_assert(dense_stores) << "No dense stores found"; + internal_assert(interleaves + transposes) << "No interleaves or transposes found"; + } +}; + +void check(Func g) { + Checker checker; + g.add_custom_lowering_pass(&checker, nullptr); + + // Choose a shape with lots of factors so that our RoundUp schedules work + int n = 16 * 9 * 7; + Buffer out = g.realize({n, n}); + for (int y = 0; y < out.height(); y++) { + for (int x = 0; x < out.width(); x++) { + int correct = 100 * x + y; + internal_assert(out(x, y) == correct) + << "out(" << x << ", " << y << ") = " << out(x, y) + << " instead of " << correct << "\n"; + } + } + + checker.check(); +} + +int main(int argc, char **argv) { + Var x{"x"}, y{"y"}, xi{"xi"}, yi{"yi"}; + + // In each case we'll say g(x, y) = f(y, x) and tile it. We will try power + // of two sizes, and sizes that are coprime, and sizes that are neither + // coprime no powers of two. 
+ + for (auto tile : {std::pair{8, 16}, {7, 3}, {6, 9}}) { + { + // Idiom 1: Strided stores into a staged transposed copy of the + // input. The strided stores that get mashed together into one big + // interleave + store by the pass that interleaves strided + // stores. This has to be done on a staged copy of the input rather + // than g so that the strided stores have a constant stride. + Func f{"f"}, g{"g"}; + f(x, y) = x + 100 * y; + g(x, y) = f(y, x); + f.compute_root(); + + g.tile(x, y, xi, yi, tile.first, tile.second, TailStrategy::RoundUp) + .vectorize(xi) + .unroll(yi); + + f.in().compute_at(g, x).reorder_storage(y, x).vectorize(x).unroll(y); + + check(g); + } + + { + // Idiom 2: Vectorize x, unroll y. Stage a copy of the input but + // don't transpose it. This will create strided loads from the + // staged input that get hoisted out into one big dense load + + // transpose by the stage_strided_stores pass. The staging is + // required so that the strides are constant. + Func f{"f"}, g{"g"}; + f(x, y) = x + 100 * y; + g(x, y) = f(y, x); + f.compute_root(); + + g.tile(x, y, xi, yi, tile.first, tile.second, TailStrategy::RoundUp) + .vectorize(xi) + .unroll(yi); + + f.in().compute_at(g, x).vectorize(x).unroll(y); + + check(g); + } + + { + // Idiom 3: Vectorize both, x innermost. This should be handled by + // shuffle optimization logic in the simplifier: a store of a concat + // of ramps turns into a sequence of stores of slices of the RHS, + // and a load of a ramp of a ramp where the *outer* ramp has stride + // 1 but the inner doesn't turns into a transpose of a concat of + // dense loads. + Func f{"f"}, g{"g"}; + f(x, y) = x + 100 * y; + g(x, y) = f(y, x); + f.compute_root(); + + g.tile(x, y, xi, yi, tile.first, tile.second, TailStrategy::RoundUp) + .vectorize(xi) + .vectorize(yi); + + check(g); + } + + { + // Idiom 4: Vectorize both, y innermost. In this case the store of a + // ramp of a ramp gets rewritten by the simplifier to move the ramp + // with stride one innermost, transposing the RHS. + + Func f{"f"}, g{"g"}; + f(x, y) = x + 100 * y; + g(x, y) = f(y, x); + f.compute_root(); + + g.tile(x, y, xi, yi, tile.first, tile.second, TailStrategy::RoundUp) + .reorder(yi, xi) + .vectorize(xi) + .vectorize(yi); + + check(g); + } + } + + { + // Check the double-vectorization approaches also work when there is a + // vector predicate on one of the two vectors, to be sure the simplifier + // is transforming the predicate correctly. We can't predicate both, + // because the vectorizer can't handle it and generates a scalar tail. 
+ { + Func f{"f"}, g{"g"}; + f(x, y) = x + 100 * y; + g(x, y) = f(y, x); + f.compute_root(); + + g + .never_partition(x, y) + .split(x, x, xi, 13, TailStrategy::Predicate) + .split(y, y, yi, 11, TailStrategy::ShiftInwards) + .reorder(xi, yi, x, y) + .vectorize(xi) + .vectorize(yi); + + check(g); + } + { + Func f{"f"}, g{"g"}; + f(x, y) = x + 100 * y; + g(x, y) = f(y, x); + f.compute_root(); + + g + .never_partition(x, y) + .split(x, x, xi, 13, TailStrategy::ShiftInwards) + .split(y, y, yi, 11, TailStrategy::Predicate) + .reorder(yi, xi, x, y) + .vectorize(xi) + .vectorize(yi); + + check(g); + } + } + + printf("Success!\n"); +} diff --git a/test/performance/interleave.cpp b/test/performance/interleave.cpp index 3df42ed0237f..ee1598e40d41 100644 --- a/test/performance/interleave.cpp +++ b/test/performance/interleave.cpp @@ -29,7 +29,11 @@ Result test_interleave(int factor, const Target &t) { output(x) = in(x / factor, x % factor); Var xi, yi; - output.unroll(x, factor, TailStrategy::RoundUp).vectorize(x, t.natural_vector_size(), TailStrategy::RoundUp); + // We'll use the interleaving-stores scheduling idiom, where unrolling + // strided stores gets mashed together into a single dense store of a + // interleave_vectors call. + output.unroll(x, factor, TailStrategy::RoundUp) + .vectorize(x, t.natural_vector_size(), TailStrategy::RoundUp); output.output_buffer().dim(0).set_min(0); output.compile_jit(); From 2962ea191044b2a06a2040de52b002c62cbb966f Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 10 Mar 2026 13:55:02 -0700 Subject: [PATCH 30/55] Remove duplicate function body --- src/CodeGen_LLVM.cpp | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 7edc6a3e003e..a23e0f52ab0d 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -5211,27 +5211,6 @@ Value *CodeGen_LLVM::slice_vector(Value *vec, int start, int size) { } } -Value *CodeGen_LLVM::optimization_fence(Value *v) { - llvm::Type *t = v->getType(); - internal_assert(!t->isScalableTy()) - << "optimization_fence does not support scalable vectors yet"; - const int bits = t->getPrimitiveSizeInBits(); - if (bits % 32) { - const int lanes = get_vector_num_elements(t); - const int element_bits = t->getScalarSizeInBits(); - const int lanes_per_32_bits = 32 / element_bits; - const int padded_lanes = align_up(lanes, lanes_per_32_bits); - v = slice_vector(v, 0, padded_lanes); - v = optimization_fence(v); - v = slice_vector(v, 0, lanes); - return v; - } - llvm::Type *float_type = llvm_type_of(Float(32, bits / 32)); - v = builder->CreateBitCast(v, float_type); - v = builder->CreateArithmeticFence(v, float_type); - return builder->CreateBitCast(v, t); -} - Value *CodeGen_LLVM::concat_vectors(const vector &v) { if (v.size() == 1) { return v[0]; From fa2fcb7aad2b465f64003d725588456db465befd Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Wed, 11 Mar 2026 11:14:04 -0700 Subject: [PATCH 31/55] Use slice of predicate --- src/Simplify_Exprs.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Simplify_Exprs.cpp b/src/Simplify_Exprs.cpp index c7eb6c7f802b..29f827553789 100644 --- a/src/Simplify_Exprs.cpp +++ b/src/Simplify_Exprs.cpp @@ -364,7 +364,7 @@ Expr Simplify::visit(const Load *op, ExprInfo *info) { predicate_slice = mutate(predicate_slice, nullptr); Expr load = Load::make(op->type.with_lanes(new_lanes), op->name, new_index, - op->image, op->param, const_true(new_lanes, nullptr), ModulusRemainder{}); + op->image, op->param, predicate_slice, 
ModulusRemainder{}); loaded_vecs.emplace_back(std::move(load)); } return Shuffle::make(loaded_vecs, s_index->indices); From dcdfb903637eaa4dbbb0bfa63ff75b1679629722 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Wed, 11 Mar 2026 11:18:11 -0700 Subject: [PATCH 32/55] clang-format --- src/Simplify_Stmts.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Simplify_Stmts.cpp b/src/Simplify_Stmts.cpp index 60e80e86c1b5..b1940482802a 100644 --- a/src/Simplify_Stmts.cpp +++ b/src/Simplify_Stmts.cpp @@ -326,7 +326,6 @@ Stmt Simplify::visit(const Store *op) { ExprInfo index_info; Expr index = mutate(op->index, &index_info); - // If the store is fully unconditional and out of bounds, drop it. // This should only occur inside branches that make the store unreachable, // but perhaps the branch was hard to prove constant true or false. This From 70afc5881deaabba73919503531f41fed0183c0b Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Wed, 11 Mar 2026 13:09:11 -0700 Subject: [PATCH 33/55] SVE fixes Co-authored-by: Claude Code --- src/CodeGen_LLVM.cpp | 10 +++++++--- test/correctness/transpose_idioms.cpp | 5 +++-- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index a23e0f52ab0d..ca66274d5ea4 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -2199,8 +2199,12 @@ void CodeGen_LLVM::visit(const Broadcast *op) { Value *CodeGen_LLVM::optimization_fence(Value *v) { llvm::Type *t = v->getType(); - internal_assert(!t->isScalableTy()) - << "optimization_fence does not support scalable vectors yet"; + if (t->isScalableTy()) { + // Convert to fixed, fence, convert back. + Value *fixed = scalable_to_fixed_vector_type(v); + fixed = optimization_fence(fixed); + return fixed_to_scalable_vector_type(fixed); + } const int bits = t->getPrimitiveSizeInBits(); if (bits % 32) { const int lanes = get_vector_num_elements(t); @@ -2212,7 +2216,7 @@ Value *CodeGen_LLVM::optimization_fence(Value *v) { v = slice_vector(v, 0, lanes); return v; } - llvm::Type *float_type = llvm_type_of(Float(32, bits / 32)); + llvm::Type *float_type = get_vector_type(f32_t, bits / 32, VectorTypeConstraint::Fixed); v = builder->CreateBitCast(v, float_type); v = builder->CreateArithmeticFence(v, float_type); return builder->CreateBitCast(v, t); diff --git a/test/correctness/transpose_idioms.cpp b/test/correctness/transpose_idioms.cpp index afe02039fbde..9fb29c2883e0 100644 --- a/test/correctness/transpose_idioms.cpp +++ b/test/correctness/transpose_idioms.cpp @@ -87,9 +87,10 @@ int main(int argc, char **argv) { // In each case we'll say g(x, y) = f(y, x) and tile it. We will try power // of two sizes, and sizes that are coprime, and sizes that are neither - // coprime no powers of two. + // coprime no powers of two. We'll use sizes larger than 4, because some + // backends like to do different things for small strides. - for (auto tile : {std::pair{8, 16}, {7, 3}, {6, 9}}) { + for (auto tile : {std::pair{8, 16}, {7, 5}, {6, 9}}) { { // Idiom 1: Strided stores into a staged transposed copy of the // input. 
The strided stores that get mashed together into one big From 5d2b5241fa620d9d84d70c8df960315a0dcf7577 Mon Sep 17 00:00:00 2001 From: Alex Reinking Date: Mon, 16 Mar 2026 01:01:39 -0400 Subject: [PATCH 34/55] Move optimization_fence back --- src/CodeGen_LLVM.cpp | 50 ++++++++++++++++++++++---------------------- src/CodeGen_LLVM.h | 9 ++++---- 2 files changed, 30 insertions(+), 29 deletions(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index bc4dd4f8eb3e..2d74f12b4c67 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -2197,31 +2197,6 @@ void CodeGen_LLVM::visit(const Broadcast *op) { value = create_broadcast(v, op->lanes); } -Value *CodeGen_LLVM::optimization_fence(Value *v) { - llvm::Type *t = v->getType(); - if (t->isScalableTy()) { - // Convert to fixed, fence, convert back. - Value *fixed = scalable_to_fixed_vector_type(v); - fixed = optimization_fence(fixed); - return fixed_to_scalable_vector_type(fixed); - } - const int bits = t->getPrimitiveSizeInBits(); - if (bits % 32) { - const int lanes = get_vector_num_elements(t); - const int element_bits = t->getScalarSizeInBits(); - const int lanes_per_32_bits = 32 / element_bits; - const int padded_lanes = align_up(lanes, lanes_per_32_bits); - v = slice_vector(v, 0, padded_lanes); - v = optimization_fence(v); - v = slice_vector(v, 0, lanes); - return v; - } - llvm::Type *float_type = get_vector_type(f32_t, bits / 32, VectorTypeConstraint::Fixed); - v = builder->CreateBitCast(v, float_type); - v = builder->CreateArithmeticFence(v, float_type); - return builder->CreateBitCast(v, t); -} - Value *CodeGen_LLVM::interleave_vectors(const std::vector &vecs) { internal_assert(!vecs.empty()); for (size_t i = 1; i < vecs.size(); i++) { @@ -5215,6 +5190,31 @@ Value *CodeGen_LLVM::slice_vector(Value *vec, int start, int size) { } } +Value *CodeGen_LLVM::optimization_fence(Value *v) { + llvm::Type *t = v->getType(); + if (t->isScalableTy()) { + // Convert to fixed, fence, convert back. + Value *fixed = scalable_to_fixed_vector_type(v); + fixed = optimization_fence(fixed); + return fixed_to_scalable_vector_type(fixed); + } + const int bits = t->getPrimitiveSizeInBits(); + if (bits % 32) { + const int lanes = get_vector_num_elements(t); + const int element_bits = t->getScalarSizeInBits(); + const int lanes_per_32_bits = 32 / element_bits; + const int padded_lanes = align_up(lanes, lanes_per_32_bits); + v = slice_vector(v, 0, padded_lanes); + v = optimization_fence(v); + v = slice_vector(v, 0, lanes); + return v; + } + llvm::Type *float_type = get_vector_type(f32_t, bits / 32, VectorTypeConstraint::Fixed); + v = builder->CreateBitCast(v, float_type); + v = builder->CreateArithmeticFence(v, float_type); + return builder->CreateBitCast(v, t); +} + Value *CodeGen_LLVM::concat_vectors(const vector &v) { if (v.size() == 1) { return v[0]; diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index 415de2463b47..57d78172c4fa 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -465,10 +465,6 @@ class CodeGen_LLVM : public IRVisitor { /** The inverse of interleave_vectors. */ virtual std::vector deinterleave_vector(llvm::Value *vec, int num_vecs); - /** A fence to prevent fusion of ops by llvm. Designed for floats, but we - * abuse it to prevent shufflevector fusion too. */ - virtual llvm::Value *optimization_fence(llvm::Value *); - /** Description of an intrinsic function overload. Overloads are resolved * using both argument and return types. 
The scalar types of the arguments * and return type must match exactly for an overload resolution to succeed. */ @@ -518,6 +514,11 @@ class CodeGen_LLVM : public IRVisitor { * if you ask for more lanes than the vector has. */ virtual llvm::Value *slice_vector(llvm::Value *vec, int start, int extent); + /** Use an arithmetic fence to prevent LLVM from fusing operations + * across this barrier. Works by bitcasting to float, applying + * llvm.arithmetic.fence, and bitcasting back. */ + virtual llvm::Value *optimization_fence(llvm::Value *); + /** Concatenate a bunch of llvm vectors. Must be of the same type. */ virtual llvm::Value *concat_vectors(const std::vector &); From 60cd341da8021bcf5131eafad41262dae3a1b31e Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Fri, 3 Apr 2026 15:36:19 -0700 Subject: [PATCH 35/55] Try to thread the needle with webassembly nonsense --- src/CodeGen_WebAssembly.cpp | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/CodeGen_WebAssembly.cpp b/src/CodeGen_WebAssembly.cpp index 66c043ad32ef..dc590da42f8b 100644 --- a/src/CodeGen_WebAssembly.cpp +++ b/src/CodeGen_WebAssembly.cpp @@ -42,6 +42,8 @@ class CodeGen_WebAssembly : public CodeGen_Posix { void visit(const Cast *) override; void visit(const Call *) override; void codegen_vector_reduce(const VectorReduce *, const Expr &) override; + + llvm::Value *optimization_fence(llvm::Value *v) override; }; CodeGen_WebAssembly::CodeGen_WebAssembly(const Target &t) @@ -198,7 +200,11 @@ void CodeGen_WebAssembly::visit(const Cast *op) { } if (is_load) { llvm::Value *v = codegen(op->value); - v = optimization_fence(v); + // In general we don't emit optimization fences in the + // webassembly backend, because they cause LLVM internal + // errors. However in this specific case it's necessary as a + // workaround, so we call the base class version explicitly. + v = CodeGen_LLVM::optimization_fence(v); value = builder->CreateIntCast(v, llvm_type_of(op->type), op->value.type().is_int()); return; @@ -372,6 +378,12 @@ void CodeGen_WebAssembly::codegen_vector_reduce(const VectorReduce *op, const Ex CodeGen_Posix::codegen_vector_reduce(op, init); } +llvm::Value *CodeGen_WebAssembly::optimization_fence(llvm::Value *v) { + // As of llvm 23, using an arithmetic fence intrinsic causes all kinds of + // errors in LLVM's webassembly backend. + return v; +} + string CodeGen_WebAssembly::mcpu_target() const { return ""; } From 9d7b9041631564f1f83957a90332c839eab73c2a Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Mon, 6 Apr 2026 16:11:52 -0700 Subject: [PATCH 36/55] Fix msvc warning --- src/CodeGen_X86.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index 3e52ac7fa1a5..be32ad486817 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -1304,7 +1304,7 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { // A helper to iterate over all pairs of entries in v, separated by some // power-of-two spacing. auto for_all_pairs = [&](size_t log_step, auto fn) { - size_t step = 1 << log_step; + size_t step = (size_t)1 << log_step; for (size_t i = 0; i < v.size(); i++) { // Pair each vector with the one separated by the step. 
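             // (Example, for illustration: with log_step == 1, step is 2,
             // so i == 0 pairs with j == 2 and i == 1 pairs with j == 3,
             // in both directions.)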
size_t j = i ^ step; From 9dd04eb679366f73563f221a2219d3144f3bdb93 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Mon, 6 Apr 2026 16:12:04 -0700 Subject: [PATCH 37/55] Skip simd_op_check_sve2 on old llvms --- test/correctness/simd_op_check_sve2.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/correctness/simd_op_check_sve2.cpp b/test/correctness/simd_op_check_sve2.cpp index 73d46751b92e..d0a790797394 100644 --- a/test/correctness/simd_op_check_sve2.cpp +++ b/test/correctness/simd_op_check_sve2.cpp @@ -1419,6 +1419,12 @@ class SimdOpCheckArmSve : public SimdOpCheckTest { } // namespace int main(int argc, char **argv) { + if (Internal::get_llvm_version() < 220) { + printf("[SKIP] LLVM %d has known SVE backend bugs for this test.\n", + Internal::get_llvm_version()); + return 0; + } + return SimdOpCheckTest::main( argc, argv, { From e5e6b662fadfec01bf258f0d71bb999113457de9 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Wed, 8 Apr 2026 12:12:35 -0700 Subject: [PATCH 38/55] Skip test on sve2 with llvm 21 --- test/correctness/transpose_idioms.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/test/correctness/transpose_idioms.cpp b/test/correctness/transpose_idioms.cpp index 9fb29c2883e0..a41b3677513d 100644 --- a/test/correctness/transpose_idioms.cpp +++ b/test/correctness/transpose_idioms.cpp @@ -83,6 +83,13 @@ void check(Func g) { } int main(int argc, char **argv) { + if (Internal::get_llvm_version() < 220 && + get_jit_target_from_environment().has_feature(Target::SVE2)) { + printf("[SKIP] LLVM %d has known SVE backend bugs for this test.\n", + Internal::get_llvm_version()); + return 0; + } + Var x{"x"}, y{"y"}, xi{"xi"}, yi{"yi"}; // In each case we'll say g(x, y) = f(y, x) and tile it. We will try power From 2470267ee5dccd971c111e0f2fb62013adec2b2f Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Thu, 9 Apr 2026 10:30:44 -0700 Subject: [PATCH 39/55] Skip block transpose performance test for sve2 on llvm 21 --- test/performance/block_transpose.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/test/performance/block_transpose.cpp b/test/performance/block_transpose.cpp index 921d7f9a913b..21e7c70fe434 100644 --- a/test/performance/block_transpose.cpp +++ b/test/performance/block_transpose.cpp @@ -87,6 +87,13 @@ int main(int argc, char **argv) { return 0; } + if (Internal::get_llvm_version() < 220 && + target.has_feature(Target::SVE2)) { + printf("[SKIP] LLVM %d has known SVE backend bugs for this test.\n", + Internal::get_llvm_version()); + return 0; + } + // Set the target features to use for dumping to assembly target.set_features({Target::NoRuntime, Target::NoAsserts, Target::NoBoundsQuery}); From 479afa85b864f291d0c3204455aa69ef474c3645 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Fri, 10 Apr 2026 10:26:02 -0700 Subject: [PATCH 40/55] Skip sub-test that triggers llvm bug --- test/correctness/transpose_idioms.cpp | 38 +++++++++++++++++---------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/test/correctness/transpose_idioms.cpp b/test/correctness/transpose_idioms.cpp index a41b3677513d..306620736f05 100644 --- a/test/correctness/transpose_idioms.cpp +++ b/test/correctness/transpose_idioms.cpp @@ -181,21 +181,31 @@ int main(int argc, char **argv) { // vector predicate on one of the two vectors, to be sure the simplifier // is transforming the predicate correctly. We can't predicate both, // because the vectorizer can't handle it and generates a scalar tail. 
- { - Func f{"f"}, g{"g"}; - f(x, y) = x + 100 * y; - g(x, y) = f(y, x); - f.compute_root(); - - g - .never_partition(x, y) - .split(x, x, xi, 13, TailStrategy::Predicate) - .split(y, y, yi, 11, TailStrategy::ShiftInwards) - .reorder(xi, yi, x, y) - .vectorize(xi) - .vectorize(yi); - check(g); + { + // LLVM 22/23 have a codegen bug for some x86 versions here, so skip with AVX512 + // See: https://github.com/llvm/llvm-project/issues/191304 + if (Internal::get_llvm_version() >= 220 && + Internal::get_llvm_version() < 240 && + get_jit_target_from_environment().has_feature(Target::AVX512)) { + printf("Skipping one subtest for LLVM %d with AVX-512 due to known backend bugs.\n", + Internal::get_llvm_version()); + } else { + Func f{"f"}, g{"g"}; + f(x, y) = x + 100 * y; + g(x, y) = f(y, x); + f.compute_root(); + + g + .never_partition(x, y) + .split(x, x, xi, 13, TailStrategy::Predicate) + .split(y, y, yi, 11, TailStrategy::ShiftInwards) + .reorder(xi, yi, x, y) + .vectorize(xi) + .vectorize(yi); + + check(g); + } } { Func f{"f"}, g{"g"}; From b46cb04ac848da7641b4ced83f0c9dc698c03d45 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Wed, 15 Apr 2026 09:44:06 -0700 Subject: [PATCH 41/55] Test should hopefully now work with llvm main --- test/correctness/transpose_idioms.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/correctness/transpose_idioms.cpp b/test/correctness/transpose_idioms.cpp index 306620736f05..0b61376e176f 100644 --- a/test/correctness/transpose_idioms.cpp +++ b/test/correctness/transpose_idioms.cpp @@ -183,10 +183,10 @@ int main(int argc, char **argv) { // because the vectorizer can't handle it and generates a scalar tail. { - // LLVM 22/23 have a codegen bug for some x86 versions here, so skip with AVX512 + // LLVM 22 has a codegen bug for some x86 versions here, so skip with AVX512 // See: https://github.com/llvm/llvm-project/issues/191304 if (Internal::get_llvm_version() >= 220 && - Internal::get_llvm_version() < 240 && + Internal::get_llvm_version() < 230 && get_jit_target_from_environment().has_feature(Target::AVX512)) { printf("Skipping one subtest for LLVM %d with AVX-512 due to known backend bugs.\n", Internal::get_llvm_version()); From a75362084d734fa99b13cb82a7072f4a9c7893bf Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Wed, 22 Apr 2026 11:26:55 -0700 Subject: [PATCH 42/55] Introduce MultiRamp, a multi-dimensional ramp abstraction Adds a MultiRamp IR helper that generalizes the old InterleavedRamp: a nested ramp with a scalar base, a vector of strides (innermost first), and a vector of per-dim lane counts. Supports in-place mul/add/div/mod with symbolic strides where possible, reorder, slice, flatten into 1D ramps, shuffle index construction for permutations and slices, and an alias-free predicate. Replaces InterleavedRamp recognition and handling in VectorizeLoops with MultiRamp. The reduction-store path peels stride-zero and non-alias-free dims (turning the latter into unrolled containing loops), computes the per-iteration shuffle mask from the pre-peel shape via shuffle_from_slice, and gracefully falls back when alias-freedom can't be proven. Wires MultiRamp into the simplifier: the Load and Store rules that recognize a ramp-of-ramp index now use MultiRamp to rotate the stride-1 dim outermost via a single Shuffle::make_transpose, fixing a latent correctness bug in the old rule for triply-nested ramps. 
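For example, using the lane-expansion convention documented in MultiRamp.h:
a MultiRamp with base 0, strides [1, 100] (innermost first), and lanes [2, 3]
denotes the six-lane index vector [0, 1, 100, 101, 200, 201].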
In FlattenNestedRamps, teaches the Load and Store visitors to recognize multiramp indices and emit a concat of per-outer-multi-index 1D ramp loads/stores, rather than a single flat-indexed load that downstream passes struggle to combine. The existing bounded-span-to-dense-load path runs first so strided-gather patterns (e.g. HVX vdelta) are preserved. Adds correctness tests for the MultiRamp API (test/correctness/multiramp.cpp) and a nested-vectorize reduction test (transposed_vector_reduce.cpp). Co-authored-by: Claude --- Makefile | 2 + src/CMakeLists.txt | 2 + src/FlattenNestedRamps.cpp | 135 +++- src/MultiRamp.cpp | 606 ++++++++++++++++++ src/MultiRamp.h | 117 ++++ src/Simplify_Exprs.cpp | 79 ++- src/Simplify_Stmts.cpp | 71 +- src/VectorizeLoops.cpp | 396 ++++++------ test/correctness/CMakeLists.txt | 2 + test/correctness/interleave.cpp | 2 +- test/correctness/multiramp.cpp | 493 ++++++++++++++ test/correctness/transposed_vector_reduce.cpp | 137 ++++ 12 files changed, 1810 insertions(+), 232 deletions(-) create mode 100644 src/MultiRamp.cpp create mode 100644 src/MultiRamp.h create mode 100644 test/correctness/multiramp.cpp create mode 100644 test/correctness/transposed_vector_reduce.cpp diff --git a/Makefile b/Makefile index 7edddd719f81..fca7ad491141 100644 --- a/Makefile +++ b/Makefile @@ -548,6 +548,7 @@ SOURCE_FILES = \ Module.cpp \ ModulusRemainder.cpp \ Monotonic.cpp \ + MultiRamp.cpp \ ObjectInstanceRegistry.cpp \ OffloadGPULoops.cpp \ OptimizeShuffles.cpp \ @@ -753,6 +754,7 @@ HEADER_FILES = \ Module.h \ ModulusRemainder.h \ Monotonic.h \ + MultiRamp.h \ ObjectInstanceRegistry.h \ OffloadGPULoops.h \ OptimizeShuffles.h \ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a373136025a9..d478ea14d56a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -160,6 +160,7 @@ target_sources( Module.h ModulusRemainder.h Monotonic.h + MultiRamp.h ObjectInstanceRegistry.h OffloadGPULoops.h OptimizeShuffles.h @@ -337,6 +338,7 @@ target_sources( Module.cpp ModulusRemainder.cpp Monotonic.cpp + MultiRamp.cpp ObjectInstanceRegistry.cpp OffloadGPULoops.cpp OptimizeShuffles.cpp diff --git a/src/FlattenNestedRamps.cpp b/src/FlattenNestedRamps.cpp index efa373f6970a..a428e21c395c 100644 --- a/src/FlattenNestedRamps.cpp +++ b/src/FlattenNestedRamps.cpp @@ -4,6 +4,7 @@ #include "Deinterleave.h" #include "IRMutator.h" #include "IROperator.h" +#include "MultiRamp.h" #include "Simplify.h" using std::vector; @@ -15,17 +16,37 @@ namespace { class FlattenRamps : public IRMutator { using IRMutator::visit; + // Visit the scalar base and strides of a multiramp. They are scalars, + // but technically could contain total reductions of nested vectors, so + // we need to walk them. + void mutate_multiramp_scalars(MultiRamp &mr) { + mr.base = mutate(mr.base); + for (Expr &s : mr.strides) { + s = mutate(s); + } + } + Expr visit(const Ramp *op) override { if (op->base.type().is_vector()) { - Expr base = mutate(op->base); - Expr stride = mutate(op->stride); - std::vector ramp_elems; - ramp_elems.reserve(op->lanes); - for (int ix = 0; ix < op->lanes; ix++) { - ramp_elems.push_back(base + ix * stride); - } + if (MultiRamp mr; + is_multiramp(op, Scope::empty_scope(), &mr)) { + // Flatten multiramps entirely in one go, instead of recursively + // with the general case below, so that we get one big concat + // instead of a concat-of-concats. The innermost dimension is + // left as a Ramp. 
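+                // (Illustrative example: base 0, strides [1, 100], lanes
+                // [4, 3] flattens to concat(ramp(0, 1, 4), ramp(100, 1, 4),
+                // ramp(200, 1, 4)).)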
+ mutate_multiramp_scalars(mr); + return Shuffle::make_concat(mr.flatten()); + } else { + Expr base = mutate(op->base); + Expr stride = mutate(op->stride); + std::vector ramp_elems; + ramp_elems.reserve(op->lanes); + for (int ix = 0; ix < op->lanes; ix++) { + ramp_elems.push_back(base + ix * stride); + } - return Shuffle::make_concat(ramp_elems); + return Shuffle::make_concat(ramp_elems); + } } return IRMutator::visit(op); @@ -40,6 +61,18 @@ class FlattenRamps : public IRMutator { return IRMutator::visit(op); } + // Slice `v` down to `inner_lanes` starting at output lane `n*inner_lanes`, + // matching the slicing done to the flattened index. Broadcasts of scalars + // pass through unchanged (as a fresh broadcast of `inner_lanes`). + static Expr slice_per_inner_ramp(const Expr &v, int n, int inner_lanes) { + if (const Broadcast *b = v.as()) { + if (b->value.type().is_scalar()) { + return Broadcast::make(b->value, inner_lanes); + } + } + return Shuffle::make_slice(v, n * inner_lanes, 1, inner_lanes); + } + Expr visit(const Load *op) override { // Convert a load of a bounded span of indices into a shuffle // of a dense or strided load if possible. @@ -124,6 +157,92 @@ class FlattenRamps : public IRMutator { } } } + + // If the index is a multiramp, emit a concat of per-inner-ramp + // dense/strided loads. This handles the case where the bounded-span + // conversion above didn't fire (e.g. symbolic strides, or the + // access range is too large for a single dense load). Doing the + // concat directly (rather than letting the Ramp visitor flatten + // the nested ramp into a big scalar-index load + a subtracted + // broadcast offset) makes the structure visible to downstream + // shuffle simplification rules. + if (op->type.is_vector()) { + if (MultiRamp mr; + is_multiramp(op->index, Scope::empty_scope(), &mr) && + mr.dimensions() >= 2) { + + Expr predicate = mutate(op->predicate); + mutate_multiramp_scalars(mr); + std::vector sub_indices = mr.flatten(); + int inner_lanes = mr.lanes[0]; + Type elem_type = op->type.with_lanes(inner_lanes); + std::vector loads; + loads.reserve(sub_indices.size()); + for (size_t n = 0; n < sub_indices.size(); n++) { + Expr p = slice_per_inner_ramp(predicate, (int)n, inner_lanes); + ModulusRemainder align = (n == 0) ? op->alignment : ModulusRemainder{}; + loads.push_back(Load::make(elem_type, op->name, sub_indices[n], + op->image, op->param, p, align)); + } + return Shuffle::make_concat(loads); + } + } + + return IRMutator::visit(op); + } + + Stmt visit(const Store *op) override { + // If the index is a multiramp, unroll into a sequence of per-inner-ramp + // stores, for the same reason as the Load visitor above. + if (op->index.type().is_vector()) { + if (MultiRamp mr; + is_multiramp(op->index, Scope::empty_scope(), &mr) && + mr.dimensions() >= 2) { + + Expr predicate = mutate(op->predicate); + Expr value = mutate(op->value); + mutate_multiramp_scalars(mr); + std::vector sub_indices = mr.flatten(); + int inner_lanes = mr.lanes[0]; + + // The value and/or predicate may load from the buffer being + // stored to, so they must be fully evaluated before any of + // the stores run. Hoist non-trivial ones into LetStmts that + // wrap the block of stores. Skip the hoisting if the expr + // is already a Variable or a constant. 
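+            // (E.g. a store of the form f[multiramp] = shuffle(f[...]):
+            // if the RHS were re-evaluated per sub-store, later sub-stores
+            // could read lanes already written by earlier ones.)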
+ auto needs_hoist = [](const Expr &e) { + return !is_const(e) && !e.as(); + }; + std::string value_name, predicate_name; + Expr value_ref = value, predicate_ref = predicate; + if (needs_hoist(value)) { + value_name = unique_name('t'); + value_ref = Variable::make(value.type(), value_name); + } + if (needs_hoist(predicate)) { + predicate_name = unique_name('t'); + predicate_ref = Variable::make(predicate.type(), predicate_name); + } + + std::vector stores; + stores.reserve(sub_indices.size()); + for (size_t n = 0; n < sub_indices.size(); n++) { + Expr p = slice_per_inner_ramp(predicate_ref, (int)n, inner_lanes); + Expr v = slice_per_inner_ramp(value_ref, (int)n, inner_lanes); + ModulusRemainder align = (n == 0) ? op->alignment : ModulusRemainder{}; + stores.push_back(Store::make(op->name, v, sub_indices[n], + op->param, p, align)); + } + Stmt result = Block::make(stores); + if (!predicate_name.empty()) { + result = LetStmt::make(predicate_name, predicate, result); + } + if (!value_name.empty()) { + result = LetStmt::make(value_name, value, result); + } + return result; + } + } return IRMutator::visit(op); } }; diff --git a/src/MultiRamp.cpp b/src/MultiRamp.cpp new file mode 100644 index 000000000000..0caa0dee6029 --- /dev/null +++ b/src/MultiRamp.cpp @@ -0,0 +1,606 @@ +#include "MultiRamp.h" + +#include "IR.h" +#include "IREquality.h" +#include "IROperator.h" +#include "ModulusRemainder.h" +#include "Simplify.h" + +#include +#include + +namespace Halide { +namespace Internal { + +namespace { + +// Collapse adjacent dims whose strides align: if the outer stride equals +// inner_stride · inner_lanes, the two dims describe a single flat dim and +// can be merged. Keeps the output tidy; doesn't affect what values the +// MultiRamp represents. +void collapse_adjacent_dims(MultiRamp *m) { + for (size_t i = 1; i < m->lanes.size();) { + Expr want_outer = simplify(m->strides[i - 1] * m->lanes[i - 1]); + if (equal(m->strides[i], want_outer)) { + m->lanes[i - 1] *= m->lanes[i]; + m->strides.erase(m->strides.begin() + i); + m->lanes.erase(m->lanes.begin() + i); + } else { + i++; + } + } +} + +} // namespace + +// Multiramps with compatible lanes form a vector space. Here is scalar multiplication. +void MultiRamp::mul(const Expr &e) { + internal_assert(e.type().is_scalar()); + base *= e; + for (Expr &s : strides) { + s *= e; + } +} + +// And here is vector addition. Returns false when the two shapes have no +// common refinement (the sum is not a multiramp). Adding multiramps with +// different total lane counts is a caller error and triggers an assertion. +bool MultiRamp::add(const MultiRamp &other) { + // We walk through both ramps' dimensions innermost-to-outermost, consuming + // gcd(a_lanes, b_lanes) of lanes at a time. When a dimension is only + // partially consumed, the remaining part of that dimension corresponds to + // an "outer" sub-dim in the refined shape and its stride must be scaled + // by the factor just consumed. 
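+    // Worked example (illustrative values): adding strides [1], lanes [6]
+    // to strides [10, 100], lanes [2, 3]: the inner step consumes
+    // gcd(6, 2) = 2 lanes with stride 1 + 10 = 11; the remaining 3 lanes of
+    // the first input continue with stride scaled to 1 * 2 = 2, and combine
+    // with the second input's outer dim to give stride 2 + 100 = 102,
+    // lanes 3. Result: strides [11, 102], lanes [2, 3].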
+ internal_assert(!lanes.empty() && !other.lanes.empty()); + int64_t total_a = 1, total_b = 1; + for (int l : lanes) { + total_a *= l; + } + for (int l : other.lanes) { + total_b *= l; + } + internal_assert(total_a == total_b) + << "MultiRamp::add: total lane counts must match (" << total_a + << " vs " << total_b << ")"; + MultiRamp result; + result.base = simplify(base + other.base); + size_t ai = 0, bi = 0; + int a_lanes = lanes[0], b_lanes = other.lanes[0]; + Expr a_stride = strides[0], b_stride = other.strides[0]; + while (true) { + int next_lanes = gcd(a_lanes, b_lanes); + if (next_lanes == 1) { + // The two next lanes are coprime, e.g: + // [0, 1, 2, 100, 101, 102] + [0, 1, 100, 101, 200, 201] + // which has no common refinement. + return false; + } + result.strides.emplace_back(simplify(a_stride + b_stride)); + result.lanes.push_back(next_lanes); + a_lanes /= next_lanes; + b_lanes /= next_lanes; + bool a_done = false, b_done = false; + if (a_lanes == 1) { + ai++; + if (ai >= lanes.size()) { + a_done = true; + } else { + a_lanes = lanes[ai]; + a_stride = strides[ai]; + } + } else { + // Remaining portion of current A-dim has a scaled stride. + a_stride = simplify(a_stride * next_lanes); + } + if (b_lanes == 1) { + bi++; + if (bi >= other.lanes.size()) { + b_done = true; + } else { + b_lanes = other.lanes[bi]; + b_stride = other.strides[bi]; + } + } else { + b_stride = simplify(b_stride * next_lanes); + } + if (a_done && b_done) { + collapse_adjacent_dims(&result); + *this = std::move(result); + return true; + } + // Since the up-front lane-count check passed, both sides must + // always exhaust together. + internal_assert(!a_done && !b_done); + } +} + +namespace { + +// Divide (or mod) a MultiRamp by a positive integer k. Returns a new +// MultiRamp, or false if the quotient/remainder isn't itself a multiramp. +// Shared core of div_by and mod_by. +// +// Precondition: the base is a known multiple of k. Otherwise we return false. +// +// Mental model +// ------------ +// Picture the integers laid out in buckets of size k: [0, k), [k, 2k), .... +// Dividing by k asks "which bucket?", modding by k asks "where inside the +// bucket?". The base sits at the left edge of some bucket. We want every +// lane of the result to remain an affine function of the multi-index — i.e. +// a multiramp. Whether that's possible depends on how the input dims move +// the lanes around relative to those buckets. +// +// Two kinds of input dim +// ---------------------- +// "Pure-carry" dim: stride s is itself a multiple of k. Every step crosses +// a whole number of buckets, so the output stride for this dim is just s/k. +// These are boring in a good way. +// +// "Flex" dim: stride s isn't a multiple of k. Write s = k·q + r with +// r = s mod k in [0, k). A step advances the bucket index by q and shifts +// the position-inside-the-bucket by r. If every lane along this dim still +// lives in the same bucket, the output stride is q and the intra-bucket +// wiggle washes out under /k. The danger is that the position eventually +// exceeds k-1 — at which point the floor jumps and the result isn't a +// multiramp. +// +// Worked example +// -------------- +// base 0, stride 2, lanes 6, k = 4. Values 0, 2, 4, 6, 8, 10. +// +// Treat it as one flat 6-lane dim and it's doomed: the positions inside +// the bucket would be 0, 2, 4, 6, ... — already past k-1 = 3 at lane 2. +// +// But we can reshape the 6 lanes as (inner 2 × outer 3). 
The inner stride +// stays 2, and the outer stride becomes 2·2 = 4 — a whole bucket. Now the +// outer dim is pure-carry, and the inner dim only shows positions 0 and 2, +// safely inside [0, 4). The result is base 0, strides [0, 1], lanes [2, 3], +// which expands to 0, 0, 1, 1, 2, 2. That matches the per-lane division. +// +// The budget +// ---------- +// Because the base is a bucket boundary, every lane starts at position 0. +// At the far corner of the iteration box each flex dim contributes r·(n-1) +// to the position, and the positions have to stay ≤ k-1 everywhere. So the +// flex dims share a single budget of k-1; each one spends r·(n-1) of it. +// If they all fit, we're done. +// +// Joint fit: base 0, strides [2, 3], lanes [2, 2], k = 6. +// Input values: 0, 2, 3, 5 (all in bucket [0, 6)) +// / 6: 0, 0, 0, 0 (a multiramp with strides [0, 0]) +// Inner spends 2·(2-1) = 2 of the budget, outer spends 3·(2-1) = 3. Total +// 5 = k-1, just fitting. +// +// Joint failure: base 0, strides [2, 5], lanes [2, 2], k = 6. +// Input values: 0, 2, 5, 7 (7 is in the next bucket) +// / 6: 0, 0, 0, 1 (not a multiramp of any shape) +// Inner spends 2, outer spends 5. Each alone would fit (≤ 5), but +// together they want 7 > 5. Return false. +// +// The split trick +// --------------- +// When a single dim's r·(n-1) blows the budget by itself, here's the +// escape. Let p = k / gcd(k, r) — the smallest number of stride-s steps +// that reach a bucket boundary (since p·s ≡ p·r (mod k), and we want that +// to be 0). We re-view the dim of lanes n as (inner p × outer n/p) with +// strides (s, s·p). The outer stride s·p is a whole number of buckets, so +// the outer dim is pure-carry. Only the inner still spends budget, and +// only r·(p-1) of it. If even that doesn't fit, we give up. +// +// Algorithm +// --------- +// Walk input dims innermost-first, with budget = k-1. For each dim we only +// need to know r = s mod k (not s itself) — so a symbolic stride is fine as +// long as we can pin down its residue modulo k. If we can't, fail. For the +// first case that applies, emit its output; if none, fail. +// +// (a) r = 0 (pure carry) → emit (s/k, n). +// (b) r·(n-1) ≤ budget → emit (s/k, n); +// budget -= r·(n-1). +// (c) p = k/gcd(k,r); 1 < p < n, +// p divides n, r·(p-1) ≤ budget → emit inner (s/k, p) and +// outer (s·p/k, n/p); +// budget -= r·(p-1). +// (d) otherwise → return false. +// +// Output base is base/k for div, 0 for mod. For mod, emit r in place of +// s/k and 0 in place of s·p/k; the shape is the same. +// +// Finally, collapse any adjacent output dims where the outer stride is +// inner_stride · inner_lanes — e.g. pure-carry dims with matching strides +// from two consecutive inputs, or a split's outer half lining up with the +// next input's contribution. This just keeps the output tidy; it doesn't +// affect which inputs we accept. +// +// Rejection examples +// ------------------ +// base 0, stride 1, lanes 5, / 2: +// Input values: 0, 1, 2, 3, 4 +// / 2: 0, 0, 1, 1, 2 (not a multiramp of any shape) +// p = 2 doesn't divide 5, and the flat dim would spill immediately +// (r·(n-1) = 4 > 1 = budget). Return false. +// +// base 3, stride 2, lanes 2, / 4: +// Input values: 3, 5 +// / 4: 0, 1 (does happen to be a multiramp, but +// our algorithm requires an aligned +// base and skips this case) +// Return false before even looking at the dims. 
+bool div_or_mod_impl(MultiRamp *self, const Expr &k_expr, bool is_div) { + auto ck = as_const_int(k_expr); + if (!ck || *ck <= 0) { + return false; + } + int64_t k = *ck; + Type t = self->base.type(); + + // Aligned-base assumption: require base to be a known multiple of k. + int64_t b_mod = 0; + if (!reduce_expr_modulo(self->base, k, &b_mod) || b_mod != 0) { + return false; + } + + MultiRamp result; + result.base = is_div ? simplify(self->base / (int)k) : make_zero(t); + + // Residual budget: how much room is left inside the single k-bucket + // starting at the base. Starts at k-1 and shrinks as each non-pure-carry + // dim spends r·(lanes-1) of it. + int64_t budget = k - 1; + + for (size_t j = 0; j < self->strides.size(); j++) { + const Expr &s = self->strides[j]; + int n = self->lanes[j]; + + // Everything below only needs s mod k, never s itself. So it's fine + // for s to be symbolic, as long as we can pin down its residue. + int64_t r = 0; + if (!reduce_expr_modulo(s, k, &r)) { + return false; + } + + // Case (a): pure carry. + if (r == 0) { + result.strides.push_back(is_div ? simplify(s / (int)k) : make_zero(t)); + result.lanes.push_back(n); + continue; + } + + // Case (b): whole dim fits in the remaining budget. Note that (b) + // and (c) below are mutually exclusive — if the whole dim fits, n + // is necessarily ≤ p, which means case (c) couldn't apply anyway. + // So their order here doesn't matter for which inputs we accept. + if (r * (n - 1) <= budget) { + result.strides.push_back(is_div ? simplify(s / (int)k) : make_const(t, r)); + result.lanes.push_back(n); + budget -= r * (n - 1); + continue; + } + + // Case (c): split into (inner = p, outer = n/p). The smallest p with + // p·s ≡ 0 (mod k) only depends on r, since p·s ≡ p·r (mod k). So + // p = k / gcd(k, r). + int64_t p = k / gcd(k, r); + + // For r ∈ (0, k), gcd(k, r) ≤ r < k, so p ≥ 2. + internal_assert(p > 1); + + if (p >= (int64_t)n) { + // The smallest split that would work is >= than the number of lanes + // we have in this dimension. + return false; + } + + if (n % p) { + // p must divide n to split n by p. Any larger + // split size would also be a multiple of p, so + // if p does not divide n, no valid split size + // divides n. + return false; + } + + if (r * (p - 1) > budget) { + // We ran out of budget. + return false; + } + + // Inner half: residual fits after shrinking to size p. + result.strides.push_back(is_div ? simplify(s / (int)k) : make_const(t, r)); + result.lanes.push_back((int)p); + budget -= r * (p - 1); + + // Outer half: s·p is a multiple of k by construction, so this divides + // exactly (though Halide's simplifier may or may not fold it). + result.strides.push_back(is_div ? 
simplify((s * (int)p) / (int)k) : make_zero(t));
+        result.lanes.push_back((int)(n / p));
+    }
+
+    collapse_adjacent_dims(&result);
+    *self = std::move(result);
+    return true;
+}
+
+}  // namespace
+
+bool MultiRamp::div(const Expr &k) {
+    return div_or_mod_impl(this, k, /*is_div=*/true);
+}
+
+bool MultiRamp::mod(const Expr &k) {
+    return div_or_mod_impl(this, k, /*is_div=*/false);
+}
+
+namespace {
+std::optional<Expr> unbroadcast(const Expr &e) {
+    if (e.type().is_scalar()) {
+        return e;
+    } else if (const Broadcast *b = e.as<Broadcast>()) {
+        return unbroadcast(b->value);
+    } else {
+        return std::nullopt;
+    }
+}
+}  // namespace
+
+bool is_multiramp(const Expr &e, const Scope<Expr> &scope, MultiRamp *result) {
+    Type elem_t = e.type().element_of();
+    if (e.type().is_scalar()) {
+        result->base = e;
+        return true;
+    } else if (const Variable *v = e.as<Variable>()) {
+        if (const Expr *e = scope.find(v->name)) {
+            return is_multiramp(*e, scope, result);
+        }
+    } else if (const Broadcast *b = e.as<Broadcast>();
+               b && is_multiramp(b->value, scope, result)) {
+        result->strides.push_back(make_zero(elem_t));
+        result->lanes.push_back(b->lanes);
+        return true;
+    } else if (const Ramp *r = e.as<Ramp>()) {
+        if (auto stride = unbroadcast(r->stride)) {
+            if (is_multiramp(r->base, scope, result)) {
+                result->strides.push_back(*stride);
+                result->lanes.push_back(r->lanes);
+                return true;
+            }
+        }
+    } else if (const Add *a = e.as<Add>()) {
+        MultiRamp rb;
+        if (is_multiramp(a->a, scope, result) &&
+            is_multiramp(a->b, scope, &rb)) {
+            return result->add(rb);
+        }
+    } else if (const Sub *s = e.as<Sub>()) {
+        // Convert to Add to reuse logic above.
+        MultiRamp rb;
+        if (is_multiramp(s->a, scope, result) &&
+            is_multiramp(s->b, scope, &rb)) {
+            rb.mul(make_const(elem_t, -1));
+            return result->add(rb);
+        }
+    } else if (const Mul *m = e.as<Mul>()) {
+        if (auto b = unbroadcast(m->b);
+            b && is_multiramp(m->a, scope, result)) {
+            result->mul(*b);
+            return true;
+        } else if (auto a = unbroadcast(m->a);
+                   a && is_multiramp(m->b, scope, result)) {
+            result->mul(*a);
+            return true;
+        }
+    } else if (const Div *d = e.as<Div>
()) { + if (auto denom = unbroadcast(d->b)) { + if (is_multiramp(d->a, scope, result)) { + return result->div(*denom); + } + } + } else if (const Mod *m = e.as()) { + if (auto denom = unbroadcast(m->b)) { + if (is_multiramp(m->a, scope, result)) { + return result->mod(*denom); + } + } + } + return false; +} + +Expr MultiRamp::operator==(const MultiRamp &other) const { + // Construct the difference, and check if all strides are zero. + MultiRamp diff = other; + diff.mul(-1); + if (!diff.add(*this)) { + return const_false(); + } + Expr c = diff.base == 0; + for (const Expr &s : diff.strides) { + c = c && s == 0; + } + return simplify(c); +} + +void MultiRamp::slice(int d, Expr v) { + internal_assert(d >= 0 && d < (int)strides.size()); + internal_assert(v.type() == base.type()); + base += v * strides[d]; + strides.erase(strides.begin() + d); + lanes.erase(lanes.begin() + d); + collapse_adjacent_dims(this); +} + +Expr MultiRamp::alias_free() const { + // A multiramp is alias free if (but not only if) there is an ordering of + // dimensions such that next stride is greater than the max value seen so + // far. In principle we only need to test the ordering with increasing + // strides, but in the presence of symbolic strides, we don't know which one + // that is. So we'll test all permutations (there shouldn't be many, because + // there's only one dimension per nested loop) and or together the + // conditions. + + if (lanes.empty()) { + return const_true(); + } + int d = (int)lanes.size(); + std::vector perm(d); + std::iota(perm.begin(), perm.end(), 0); + Expr result = const_false(); + do { + Expr cond = (strides[perm[0]] != 0); + Expr accum = make_zero(base.type()); // running sum of |s_k|*(n_k − 1) + for (int j = 0; j < d; j++) { + Expr s = strides[perm[j]]; + Expr abs_s = abs(s); + if (j > 0) { + cond = cond && (abs_s > accum); + } + accum = accum + abs_s * (lanes[perm[j]] - 1); + } + result = result || cond; + } while (std::next_permutation(perm.begin(), perm.end())); + return simplify(result); +} + +int MultiRamp::dimensions() const { + return (int)strides.size(); +} + +int MultiRamp::total_lanes() const { + int prod = 1; + for (int l : lanes) { + prod *= l; + } + return prod; +} + +Expr MultiRamp::to_expr() const { + Expr e = base; + for (int i = 0; i < dimensions(); i++) { + if (is_const_zero(strides[i])) { + e = Broadcast::make(e, lanes[i]); + } else if (e.type().is_scalar()) { + e = Ramp::make(e, strides[i], lanes[i]); + } else { + e = Ramp::make(e, Broadcast::make(strides[i], e.type().lanes()), lanes[i]); + } + } + return e; +} + +void MultiRamp::reorder(const std::vector &perm) { + int d = dimensions(); + internal_assert((int)perm.size() == d) << "perm size mismatch\n"; + std::vector new_strides; + std::vector new_lanes; + new_strides.reserve(d); + new_lanes.reserve(d); + for (int k = 0; k < d; k++) { + internal_assert(perm[k] >= 0 && perm[k] < d) << "perm out of range\n"; + new_strides.push_back(std::move(strides[perm[k]])); + new_lanes.push_back(lanes[perm[k]]); + } + strides = std::move(new_strides); + lanes = std::move(new_lanes); +} + +std::vector MultiRamp::shuffle_from_permuted(const std::vector &perm) const { + // For each output lane n (in *this's lane order), we want the shuffle to + // pull from the input (permuted) vector's lane that represents the same + // multi-index. 
Decompose n into multi-index (i_0, ..., i_{d-1}) using + // this->lanes (innermost first); the matching multi-index in the permuted + // MultiRamp is (j_k) with j_k = i_{perm[k]}, flattened with + // this->lanes[perm[k]] as its innermost lane counts. + int d = dimensions(); + internal_assert((int)perm.size() == d); + int total = total_lanes(); + std::vector indices(total); + std::vector i(d); + for (int n = 0; n < total; n++) { + int rem = n; + for (int k = 0; k < d; k++) { + i[k] = rem % lanes[k]; + rem /= lanes[k]; + } + int permuted_flat = 0; + int M = 1; + for (int k = 0; k < d; k++) { + permuted_flat += i[perm[k]] * M; + M *= lanes[perm[k]]; + } + indices[n] = permuted_flat; + } + return indices; +} + +std::vector MultiRamp::flatten() const { + int d = dimensions(); + if (d == 0) { + return {base}; + } + int inner_lanes = lanes[0]; + int outer_total = total_lanes() / inner_lanes; + std::vector result; + result.reserve(outer_total); + for (int n = 0; n < outer_total; n++) { + int rem = n; + Expr offset_base = base; + for (int k = 1; k < d; k++) { + int ik = rem % lanes[k]; + rem /= lanes[k]; + if (ik != 0) { + offset_base = offset_base + ik * strides[k]; + } + } + result.push_back(Ramp::make(offset_base, strides[0], inner_lanes)); + } + return result; +} + +std::vector MultiRamp::shuffle_from_slice(int d, int pos) const { + return shuffle_from_slice(std::vector{d}, std::vector{pos}); +} + +std::vector MultiRamp::shuffle_from_slice(const std::vector &dims, + const std::vector &pos) const { + // For each output lane n (in the sliced MultiRamp's lane order), we want + // the shuffle to pull from the lane of *this whose multi-index matches + // n in the free (non-sliced) dims, and has the specified values in the + // sliced dims. + internal_assert(dims.size() == pos.size()); + int d = dimensions(); + std::vector fixed(d, -1); + for (size_t j = 0; j < dims.size(); j++) { + int dd = dims[j]; + internal_assert(dd >= 0 && dd < d); + internal_assert(pos[j] >= 0 && pos[j] < lanes[dd]); + internal_assert(fixed[dd] == -1) << "duplicate dim in shuffle_from_slice\n"; + fixed[dd] = pos[j]; + } + int total_out = 1; + for (int k = 0; k < d; k++) { + if (fixed[k] == -1) { + total_out *= lanes[k]; + } + } + std::vector indices(total_out); + for (int n = 0; n < total_out; n++) { + int rem = n; + int flat = 0; + int M = 1; + for (int k = 0; k < d; k++) { + int ik; + if (fixed[k] != -1) { + ik = fixed[k]; + } else { + ik = rem % lanes[k]; + rem /= lanes[k]; + } + flat += ik * M; + M *= lanes[k]; + } + indices[n] = flat; + } + return indices; +} + +} // namespace Internal +} // namespace Halide diff --git a/src/MultiRamp.h b/src/MultiRamp.h new file mode 100644 index 000000000000..1a011ba08406 --- /dev/null +++ b/src/MultiRamp.h @@ -0,0 +1,117 @@ +#ifndef HALIDE_MULTI_RAMP_H +#define HALIDE_MULTI_RAMP_H + +/** \file + * Defines the MultiRamp IR helper — a multi-dimensional ramp recognised and + * manipulated by the vectorization pass and its callers. + */ + +#include "Expr.h" +#include "Scope.h" + +namespace Halide { +namespace Internal { + +/** A multi-dimensional ramp. I.e. a ramp of ramps of ramps of ramps... + * + * The scalar-producing operations (mul, add, div, mod) all mutate the + * MultiRamp in place. mul always succeeds; add/div/mod return false when + * the result isn't expressible as a multiramp (leaving *this undefined). */ +struct MultiRamp { + Expr base; + // The first stride is the innermost one. 
So for example, if the base is + // zero, strides are [1, 100] and the extents are [2, 3], the IR node is a + // vector with lanes: [0, 1, 100, 101, 200, 201] + std::vector strides; + std::vector lanes; + + // Multiply by a scalar. Always a multiramp. + void mul(const Expr &e); + + // Add another MultiRamp elementwise. Returns false if the result isn't a + // multiramp (which happens when the two input shapes have no common + // refinement). + bool add(const MultiRamp &other); + + // Floor-divide by a scalar. Returns false if the denominator isn't a + // positive integer constant, or if the quotient isn't a multiramp. The + // result may have one more dim than the input (a single split may be + // introduced per input dim). O(d). + bool div(const Expr &k); + + // Euclidean mod by a scalar. Returns false if the denominator isn't a + // positive integer constant, or if the remainder isn't a multiramp. + // Same shape as div. Rare cases where the remainder is a multiramp but + // the quotient isn't are not recognized here. O(d). + bool mod(const Expr &k); + + // Construct an Expr which gives whether one multiramp is equal to another + // in every lane. Assumes the total lane count matches. + Expr operator==(const MultiRamp &other) const; + + // Remove a dimension, replacing it with the given scalar expression + // (e.g. pass v = 0 to get the first slice along that dimension, pass v = + // some var to get a parameterized slice along that dimension). + void slice(int d, Expr v); + + // Construct an Expr telling us whether the lanes are all unique. This + // expression being false is conservative: it doesn't imply aliasing, only + // that we couldn't construct the tightest condition for it in closed form. + Expr alias_free() const; + + // The dimensionality. May be lower than you expected, because this gets + // flattened when possible by the operations above. + int dimensions() const; + + // The product of all the lane counts + int total_lanes() const; + + // The multiramp as a nested series of ramps + Expr to_expr() const; + + // Flatten the multiramp into a vector of 1D Ramps — one per outer + // multi-index, each with inner_lanes = lanes[0] and stride = strides[0]. + // Ramps are returned in this MultiRamp's lane order: concat'ing the + // returned Ramps reproduces the full lane sequence. The caller is + // responsible for any prior mutation/simplification of `base` and + // `strides` (the Ramps reference them directly). + std::vector flatten() const; + + // Reorder this MultiRamp's dimensions in place. perm[k] is the index + // into this's current dims that becomes the k-th dim after reordering + // (innermost first, as always). perm must be a permutation of + // {0, ..., dimensions()-1}. + void reorder(const std::vector &perm); + + // Given a permutation `perm`, return shuffle indices `idx` such that if + // `p` is a copy of `*this` with `reorder(perm)` applied, then + // Shuffle::make({p.to_expr()}, idx) + // produces the same vector of lane values as `this->to_expr()`. In other + // words: given a vector in the permuted lane order, the returned indices + // put it back into this MultiRamp's original lane order. + std::vector shuffle_from_permuted(const std::vector &perm) const; + + // Given a dimension `d` and a position `pos` within it, return shuffle + // indices `idx` such that + // Shuffle::make({this->to_expr()}, idx) + // produces the same vector of lane values as a copy of *this with + // slice(d, pos) applied. 
Since slicing reduces the lane count, the + // shuffle selects the subset of *this's lanes whose d-th coordinate + // equals `pos`. + std::vector shuffle_from_slice(int d, int pos) const; + + // Variant that slices multiple dims simultaneously. Returns shuffle + // indices selecting the lanes of *this where dim `dims[j]` equals + // `pos[j]` for all j. `dims` and `pos` must have the same length and + // `dims` must list distinct dim indices. + std::vector shuffle_from_slice(const std::vector &dims, + const std::vector &pos) const; +}; + +/** Check if a vector Expr is a multiramp, and assign to result if so. */ +bool is_multiramp(const Expr &e, const Scope &scope, MultiRamp *result); + +} // namespace Internal +} // namespace Halide + +#endif diff --git a/src/Simplify_Exprs.cpp b/src/Simplify_Exprs.cpp index 7c3181b426f3..df26ff3cce66 100644 --- a/src/Simplify_Exprs.cpp +++ b/src/Simplify_Exprs.cpp @@ -1,5 +1,10 @@ #include "Simplify_Internal.h" +#include +#include + +#include "MultiRamp.h" + using std::string; namespace Halide { @@ -375,22 +380,64 @@ Expr Simplify::visit(const Load *op, ExprInfo *info) { loaded_vecs.emplace_back(std::move(load)); } return Shuffle::make(loaded_vecs, s_index->indices); - } else if (const Ramp *inner_ramp = r_index ? r_index->base.as() : nullptr; - inner_ramp && - !is_const_one(inner_ramp->stride) && - is_const_one(r_index->stride)) { - // If it's a nested ramp and the outer ramp has stride 1, swap the - // nesting order of the ramps to make dense loads and transpose the - // resulting vector instead. - Expr transposed_index = - Ramp::make(Ramp::make(inner_ramp->base, make_one(inner_ramp->base.type()), r_index->lanes), - Broadcast::make(inner_ramp->stride, r_index->lanes), inner_ramp->lanes); - Expr transposed_predicate = (predicate.as() ? - predicate : // common case optimization - Shuffle::make_transpose(predicate, inner_ramp->lanes)); - Expr transposed_load = - Load::make(op->type, op->name, transposed_index, op->image, op->param, transposed_predicate, align); - return mutate(Shuffle::make_transpose(transposed_load, r_index->lanes), info); + } else if (MultiRamp mr; + index.type().is_vector() && + // Don't do expensive analysis in the common case of a load of a ramp of scalars. + !(r_index && r_index->base.type().is_scalar()) && + is_multiramp(index, Scope::empty_scope(), &mr) && + mr.dimensions() > 1) { + // If the index is a multi-dimensional ramp with a stride-1 dim that + // isn't already innermost, rotate it (together with all subsequent + // dims) to the innermost position so the resulting load is dense, + // and restore the original lane order with a transpose. Splitting + // the dims into a contiguous "outer half + inner half" pair and + // swapping them lets the shuffle be expressed as a single + // make_transpose, which downstream code can recognise and (in + // future) represent more compactly than a general shuffle. + int k = -1; + for (int i = 0; i < mr.dimensions(); i++) { + if (is_const_one(mr.strides[i])) { + k = i; + break; + } + } + if (k > 0) { + // Permutation: [k, k+1, ..., d-1, 0, 1, ..., k-1]. This is a pure + // rotation of the halves, which Shuffle::make_transpose can + // express. 
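+            // (Illustrative shape: an index with strides [100, 1], lanes
+            // [4, 8] has its stride-1 dim at k = 1; rotated, it has strides
+            // [1, 100], lanes [8, 4], so the permuted load is dense along
+            // the new innermost dim, with A = 4 and B = 8.)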
+            int d = mr.dimensions();
+            std::vector<int> perm(d);
+            std::iota(perm.begin(), perm.end(), 0);
+            std::rotate(perm.begin(), perm.begin() + k, perm.end());
+            MultiRamp permuted = mr;
+            permuted.reorder(perm);
+            int A = 1;  // product of lanes[0..k-1]
+            for (int i = 0; i < k; i++) {
+                A *= mr.lanes[i];
+            }
+            int B = op->type.lanes() / A;  // product of lanes[k..d-1]
+
+            // The predicate applied to the permuted load must be in the
+            // permuted lane order. For the halves-swap rotation, that's just
+            // make_transpose(predicate, A) (except for scalar broadcasts,
+            // which are invariant).
+            Expr permuted_predicate;
+            const Broadcast *b_pred = predicate.as<Broadcast>();
+            if (b_pred && b_pred->value.type().is_scalar()) {
+                permuted_predicate = predicate;
+            } else {
+                permuted_predicate = Shuffle::make_transpose(predicate, A);
+            }
+
+            Expr permuted_load =
+                Load::make(op->type, op->name, permuted.to_expr(), op->image,
+                           op->param, permuted_predicate, align);
+            return mutate(Shuffle::make_transpose(permuted_load, B), info);
+        }
+        if (predicate.same_as(op->predicate) && index.same_as(op->index) && align == op->alignment) {
+            return op;
+        }
+        return Load::make(op->type, op->name, index, op->image, op->param, predicate, align);
     } else if (predicate.same_as(op->predicate) && index.same_as(op->index) && align == op->alignment) {
         return op;
     } else {
diff --git a/src/Simplify_Stmts.cpp b/src/Simplify_Stmts.cpp
index 3a6b459b5c88..4d29a9509124 100644
--- a/src/Simplify_Stmts.cpp
+++ b/src/Simplify_Stmts.cpp
@@ -1,7 +1,11 @@
 #include "Simplify_Internal.h"
 
+#include <algorithm>
+#include <numeric>
+
 #include "ExprUsesVar.h"
 #include "IRMutator.h"
+#include "MultiRamp.h"
 #include "Substitute.h"
 
 namespace Halide {
@@ -388,24 +392,57 @@ Stmt Simplify::visit(const Store *op) {
         Stmt s = Block::make(stores);
         s = LetStmt::make(var_name, value, s);
         return mutate(s);
-    } else if (const Ramp *inner_ramp = r_index ? r_index->base.as<Ramp>() : nullptr;
-               inner_ramp &&
-               !is_const_one(inner_ramp->stride) &&
-               is_const_one(r_index->stride)) {
-        // If it's a nested ramp and the outer ramp has stride 1, swap the
-        // nesting order of the ramps to make dense stores and transpose the
-        // index and value instead. Later in lowering after flattening the
-        // nested ramps it will turn into a concat of dense ramps and hit the
+    } else if (MultiRamp mr;
+               index.type().is_vector() &&
+               // Don't do expensive analysis in the common case of a store to a ramp of scalars.
+               !(r_index && r_index->base.type().is_scalar()) &&
+               is_multiramp(index, Scope<Expr>::empty_scope(), &mr) &&
+               mr.dimensions() > 1) {
+        // If the index is a multi-dimensional ramp with a stride-1 dim that
+        // isn't already innermost, rotate it (together with all subsequent
+        // dims) to the innermost position so the resulting store is dense.
+        // Permute the value and predicate to match the new lane order using
+        // a single make_transpose, which downstream code can recognise and
+        // represent compactly. Later in lowering, after flattening the
+        // nested ramps, this turns into a concat of dense ramps and hits the
         // case above.
-        Expr transposed_index =
-            Ramp::make(Ramp::make(inner_ramp->base, make_one(inner_ramp->base.type()), r_index->lanes),
-                       Broadcast::make(inner_ramp->stride, r_index->lanes), inner_ramp->lanes);
-        Expr transposed_value = Shuffle::make_transpose(value, inner_ramp->lanes);
-        Expr transposed_predicate = (predicate.as<Broadcast>() ?
-                                             predicate :  // common case optimization
-                                             Shuffle::make_transpose(predicate, inner_ramp->lanes));
-            return mutate(Store::make(op->name, transposed_value, transposed_index,
-                                      op->param, transposed_predicate, align));
+            int k = -1;
+            for (int i = 0; i < mr.dimensions(); i++) {
+                if (is_const_one(mr.strides[i])) {
+                    k = i;
+                    break;
+                }
+            }
+            if (k > 0) {
+                int d = mr.dimensions();
+                std::vector<int> perm(d);
+                std::iota(perm.begin(), perm.end(), 0);
+                std::rotate(perm.begin(), perm.begin() + k, perm.end());
+                MultiRamp permuted = mr;
+                permuted.reorder(perm);
+                int A = 1;
+                for (int i = 0; i < k; i++) {
+                    A *= mr.lanes[i];
+                }
+
+                // Transpose the value and predicate so their lane ordering
+                // matches the permuted index.
+                Expr permuted_value = Shuffle::make_transpose(value, A);
+                Expr permuted_predicate;
+                const Broadcast *b_pred = predicate.as<Broadcast>();
+                if (b_pred && b_pred->value.type().is_scalar()) {
+                    permuted_predicate = predicate;
+                } else {
+                    permuted_predicate = Shuffle::make_transpose(predicate, A);
+                }
+                return mutate(Store::make(op->name, permuted_value, permuted.to_expr(),
+                                          op->param, permuted_predicate, align));
+            }
+            if (predicate.same_as(op->predicate) && value.same_as(op->value) &&
+                index.same_as(op->index) && align == op->alignment) {
+                return op;
+            }
+            return Store::make(op->name, value, index, op->param, predicate, align);
         } else if (predicate.same_as(op->predicate) && value.same_as(op->value) &&
                    index.same_as(op->index) && align == op->alignment) {
             return op;
         } else {
diff --git a/src/VectorizeLoops.cpp b/src/VectorizeLoops.cpp
index ebfd63e860bd..3d767826ae93 100644
--- a/src/VectorizeLoops.cpp
+++ b/src/VectorizeLoops.cpp
@@ -9,6 +9,7 @@
 #include "IRMutator.h"
 #include "IROperator.h"
 #include "IRPrinter.h"
+#include "MultiRamp.h"
 #include "Scope.h"
 #include "Simplify.h"
 #include "Solve.h"
@@ -41,7 +42,7 @@ const Broadcast *as_scalar_broadcast(const Expr &e) {
     } else {
         return nullptr;
     }
-};
+}
 
 /** Find the exact scalar max and min lanes of a vector expression. Not
 * conservative like bounds_of_expr, but uses similar rules for some common node
@@ -189,119 +190,6 @@ Interval bounds_of_lanes(const Expr &e) {
     Expr max_lane = VectorReduce::make(VectorReduce::Max, e, 1);
     return {min_lane, max_lane};
 }
-};
-
-// A ramp with the lanes repeated inner_repetitions times, and then
-// the whole vector repeated outer_repetitions times.
-// E.g: <0 0 2 2 4 4 6 6 0 0 2 2 4 4 6 6>.
-struct InterleavedRamp { - Expr base, stride; - int lanes, inner_repetitions, outer_repetitions; -}; - -bool equal_or_zero(int a, int b) { - return a == 0 || b == 0 || a == b; -} - -bool is_interleaved_ramp(const Expr &e, const Scope &scope, InterleavedRamp *result) { - if (const Ramp *r = e.as()) { - const Broadcast *b_base = r->base.as(); - const Broadcast *b_stride = r->stride.as(); - if (r->base.type().is_scalar()) { - result->base = r->base; - result->stride = r->stride; - result->lanes = r->lanes; - result->inner_repetitions = 1; - result->outer_repetitions = 1; - return true; - } else if (b_base && b_stride && b_base->lanes == b_stride->lanes) { - // Ramp of broadcast - result->base = b_base->value; - result->stride = b_stride->value; - result->lanes = r->lanes; - result->inner_repetitions = b_base->lanes; - result->outer_repetitions = 1; - return true; - } - } else if (const Broadcast *b = e.as()) { - if (b->value.type().is_scalar()) { - result->base = b->value; - result->stride = 0; - result->lanes = b->lanes; - result->inner_repetitions = 0; - result->outer_repetitions = 0; - return true; - } else if (is_interleaved_ramp(b->value, scope, result)) { - // Broadcast of interleaved ramp - result->outer_repetitions *= b->lanes; - return true; - } - } else if (const Add *add = e.as()) { - InterleavedRamp ra; - if (is_interleaved_ramp(add->a, scope, &ra) && - is_interleaved_ramp(add->b, scope, result) && - equal_or_zero(ra.inner_repetitions, result->inner_repetitions) && - equal_or_zero(ra.outer_repetitions, result->outer_repetitions)) { - result->base = simplify(result->base + ra.base); - result->stride = simplify(result->stride + ra.stride); - result->inner_repetitions = std::max(result->inner_repetitions, ra.inner_repetitions); - result->outer_repetitions = std::max(result->outer_repetitions, ra.outer_repetitions); - return true; - } - } else if (const Sub *sub = e.as()) { - InterleavedRamp ra; - if (is_interleaved_ramp(sub->a, scope, &ra) && - is_interleaved_ramp(sub->b, scope, result) && - equal_or_zero(ra.inner_repetitions, result->inner_repetitions) && - equal_or_zero(ra.outer_repetitions, result->outer_repetitions)) { - result->base = simplify(ra.base - result->base); - result->stride = simplify(ra.stride - result->stride); - result->inner_repetitions = std::max(result->inner_repetitions, ra.inner_repetitions); - result->outer_repetitions = std::max(result->outer_repetitions, ra.outer_repetitions); - return true; - } - } else if (const Mul *mul = e.as()) { - std::optional b; - if (is_interleaved_ramp(mul->a, scope, result) && - (b = as_const_int(mul->b))) { - result->base = simplify(result->base * (int)(*b)); - result->stride = simplify(result->stride * (int)(*b)); - return true; - } - } else if (const Div *div = e.as
()) { - std::optional b; - if (is_interleaved_ramp(div->a, scope, result) && - (b = as_const_int(div->b)) && - is_const_one(result->stride) && - (result->inner_repetitions == 1 || - result->inner_repetitions == 0) && - can_prove((result->base % (int)(*b)) == 0)) { - // TODO: Generalize this. Currently only matches - // ramp(base*b, 1, lanes) / b - // broadcast(base * b, lanes) / b - result->base = simplify(result->base / (int)(*b)); - result->inner_repetitions *= (int)(*b); - return true; - } - } else if (const Mod *mod = e.as()) { - std::optional b; - if (is_interleaved_ramp(mod->a, scope, result) && - (b = as_const_int(mod->b)) && - (result->outer_repetitions == 1 || - result->outer_repetitions == 0) && - can_prove(((int)(*b) % result->stride) == 0)) { - // ramp(base, 2, lanes) % 8 - result->base = simplify(result->base % (int)(*b)); - result->stride = simplify(result->stride % (int)(*b)); - result->outer_repetitions *= (int)(*b); - return true; - } - } else if (const Variable *var = e.as()) { - if (const Expr *e = scope.find(var->name)) { - return is_interleaved_ramp(*e, scope, result); - } - } - return false; } // Allocations inside vectorized loops grow an additional inner @@ -749,41 +637,8 @@ class VectorSubs : public IRMutator { op->call_type, op->func, op->value_index, op->image, op->param); } - Expr visit(const Let *op) override { - // Vectorize the let value and check to see if it was vectorized by - // this mutator. The type of the expression might already be vector - // width. - Expr mutated_value = simplify(mutate(op->value)); - bool was_vectorized = (!op->value.type().is_vector() && - mutated_value.type().is_vector()); - - // If the value was vectorized by this mutator, add a new name to - // the scope for the vectorized value expression. - string vectorized_name; - if (was_vectorized) { - vectorized_name = get_widened_var_name(op->name); - scope.push(op->name, op->value); - vector_scope.push(vectorized_name, mutated_value); - } - - Expr mutated_body = mutate(op->body); - - InterleavedRamp ir; - if (is_interleaved_ramp(mutated_value, vector_scope, &ir)) { - return substitute(vectorized_name, mutated_value, mutated_body); - } else if (mutated_value.same_as(op->value) && - mutated_body.same_as(op->body)) { - return op; - } else if (was_vectorized) { - scope.pop(op->name); - vector_scope.pop(vectorized_name); - return Let::make(vectorized_name, mutated_value, mutated_body); - } else { - return Let::make(op->name, mutated_value, mutated_body); - } - } - - Stmt visit(const LetStmt *op) override { + template + auto visit_let(const LetOrLetStmt *op) -> decltype(op->body) { Expr mutated_value = simplify(mutate(op->value)); string vectorized_name = op->name; @@ -791,33 +646,42 @@ class VectorSubs : public IRMutator { bool was_vectorized = (!op->value.type().is_vector() && mutated_value.type().is_vector()); + decltype(op->body) mutated_body; if (was_vectorized) { vectorized_name = get_widened_var_name(op->name); - scope.push(op->name, op->value); - vector_scope.push(vectorized_name, mutated_value); // Also keep track of the original let, in case inner code scalarizes. 
containing_lets.emplace_back(op->name, op->value); - } - Stmt mutated_body = mutate(op->body); + ScopedBinding + bind(scope, op->name, op->value), + bind_vec(vector_scope, vectorized_name, mutated_value); - if (was_vectorized) { + mutated_body = mutate(op->body); containing_lets.pop_back(); - scope.pop(op->name); - vector_scope.pop(vectorized_name); + } else { + mutated_body = mutate(op->body); } - InterleavedRamp ir; - if (is_interleaved_ramp(mutated_value, vector_scope, &ir)) { + MultiRamp m; + if (mutated_value.type().is_vector() && + is_multiramp(mutated_value, vector_scope, &m)) { return substitute(vectorized_name, mutated_value, mutated_body); } else if (mutated_value.same_as(op->value) && mutated_body.same_as(op->body)) { return op; } else { - return LetStmt::make(vectorized_name, mutated_value, mutated_body); + return LetOrLetStmt::make(vectorized_name, mutated_value, mutated_body); } } + Expr visit(const Let *op) override { + return visit_let(op); + } + + Stmt visit(const LetStmt *op) override { + return visit_let(op); + } + Stmt visit(const Provide *op) override { internal_error << "Vectorizing a Provide node is unimplemented. " << "Vectorization usually runs after storage flattening.\n"; @@ -1017,8 +881,8 @@ class VectorSubs : public IRMutator { string vectorized_name = get_widened_var_name(var); Expr vectorized_value = vector_scope.get(vectorized_name); vector_scope.pop(vectorized_name); - InterleavedRamp ir; - if (is_interleaved_ramp(vectorized_value, vector_scope, &ir)) { + MultiRamp m; + if (is_multiramp(vectorized_value, vector_scope, &m)) { body = substitute(vectorized_name, vectorized_value, body); } else { body = LetStmt::make(vectorized_name, vectorized_value, body); @@ -1097,6 +961,18 @@ class VectorSubs : public IRMutator { Stmt visit(const Atomic *op) override { // Recognize a few special cases that we can handle as within-vector reduction trees. + + // We may partially succeed, in which case we'll have loops to rewrap + struct ContainingLoop { + std::string name; + int extent = 0; + // The index of this loop's dim in the pre-peel store_mr. Only + // used in the alias-free peeling path; other uses can leave it + // at -1. + int dim = -1; + }; + std::vector containing_loops; + do { if (!op->mutex_name.empty()) { // We can't vectorize over a mutex @@ -1203,19 +1079,19 @@ class VectorSubs : public IRMutator { Expr store_index = mutate(store->index); Expr load_index = mutate(load_a->index); - // The load and store indices must be the same interleaved - // ramp (or the same scalar, in the total reduction case). - InterleavedRamp store_ir, load_ir; + // The load and store indices must be the same multiramp + // (or the same scalar, in the total reduction case). 
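+            // (E.g. for f(x, y) += in(x, y, r) with x, y, and r all
+            // vectorized, both sides access f at the same index, which is
+            // a multiramp with a stride-0 dim for r.)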
+            MultiRamp store_mr, load_mr;
             Expr test;
             if (store_index.type().is_scalar()) {
                 test = simplify(load_index == store_index);
-            } else if (is_interleaved_ramp(store_index, vector_scope, &store_ir) &&
-                       is_interleaved_ramp(load_index, vector_scope, &load_ir) &&
-                       store_ir.inner_repetitions == load_ir.inner_repetitions &&
-                       store_ir.outer_repetitions == load_ir.outer_repetitions &&
-                       store_ir.lanes == load_ir.lanes) {
-                test = simplify(store_ir.base == load_ir.base &&
-                                store_ir.stride == load_ir.stride);
+            } else if (is_multiramp(store_index, vector_scope, &store_mr) &&
+                       is_multiramp(load_index, vector_scope, &load_mr)) {
+                test = store_mr == load_mr;
             }
 
             if (!test.defined()) {
@@ -1250,25 +1126,119 @@
             };
 
             int output_lanes = 1;
+            MultiRamp pre_peel_mr;
             if (store_index.type().is_scalar()) {
                 // The index doesn't depend on the value being
                 // vectorized, so it's a total reduction.
                 b = VectorReduce::make(reduce_op, b, 1);
             } else {
-                output_lanes = store_index.type().lanes() / (store_ir.inner_repetitions * store_ir.outer_repetitions);
+                // The output lane count is > 1, so there must be at least one
+                // multiramp dimension with non-zero stride. There may be
+                // dimensions with zero stride, however.
+
+                // Here we identify any stride-0 dimensions in the
+                // multiramp. Innermost ones with stride zero will be handled
+                // with a vector reduce. Others will be handled by taking slices
+                // and combining in a tree. We first shuffle the other
+                // stride-zero ones outermost so that the slices are
+                // dense. TODO: is this the best policy? We could also transpose
+                // them inwards and vector reduce.
+
+                // TODO: There may also be dimensions with unknown (symbolic)
+                // stride. We need to handle these carefully because they might
+                // be zero at runtime. This requires injecting a loop that
+                // handles one slice at a time along that dimension. Finally,
+                // there might be dimensions with known strides such that they
+                // overlap, e.g. if some lunatic vectorizes a reduction like
+                // f(r.x + r.y) += .... We need to slice out at least one of the
+                // two conflicting dimensions and turn it into a loop.
+
+                int inner_repetitions = 1;
+                int outer_repetitions = 1;
+                if (is_const_zero(store_mr.strides[0])) {
+                    inner_repetitions = store_mr.lanes[0];
+                    store_mr.slice(0, 0);
+                }
 
-                store_index = Ramp::make(store_ir.base, store_ir.stride, output_lanes / store_ir.base.type().lanes());
-                if (store_ir.inner_repetitions > 1) {
-                    b = VectorReduce::make(reduce_op, b, output_lanes * store_ir.outer_repetitions);
+                std::vector<int> perm, zero_dims;
+
+                // Look for stride-zero dimensions
+                perm.reserve(store_mr.dimensions());
+                bool needs_shuffle = false;
+                for (int d = 0; d < store_mr.dimensions(); d++) {
+                    if (is_const_zero(store_mr.strides[d])) {
+                        zero_dims.push_back(d);
+                        outer_repetitions *= store_mr.lanes[d];
+                    } else {
+                        // If any non-stride-zero dims come after a stride-zero
+                        // dim, we'll need a shuffle.
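+                        // (E.g. strides [1, 0, 8]: the stride-0 dim sits
+                        // between two non-zero dims, so perm becomes [0, 2, 1]
+                        // and the shuffle moves its lanes outermost.)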
+                        needs_shuffle |= !zero_dims.empty();
+                        perm.push_back(d);
+                    }
+                }
+                std::vector<int> shuffle;
+                if (needs_shuffle) {
+                    perm.insert(perm.end(), zero_dims.begin(), zero_dims.end());
+                    shuffle = store_mr.shuffle_from_permuted(perm);
+                    store_mr.reorder(perm);
+                }
+                for (size_t i = 0; i < zero_dims.size(); i++) {
+                    store_mr.strides.pop_back();
+                    store_mr.lanes.pop_back();
+                }
+
+                // Snapshot the pre-peel MultiRamp so we can figure out
+                // which slice of b each unrolled iteration should store.
+                pre_peel_mr = store_mr;
+
+                if (!can_prove(store_mr.alias_free())) {
+                    // There may be more collisions. We don't know. This means
+                    // we need to genuinely do an interleaved sequence of loads
+                    // and stores to the target buffer. There may be multiple
+                    // alias-free subsets of the dimensions of store_mr. We'll
+                    // do it greedily. Starting from the innermost, we'll add
+                    // each dimension provided that we maintain the alias-free
+                    // property. Even if we find none, we've at least peeled off
+                    // the stride-0 dimensions already, so it's better than
+                    // bailing and scalarizing.
+
+                    MultiRamp alias_free_slice;
+                    alias_free_slice.base = store_mr.base;
+                    for (int i = 0; i < store_mr.dimensions(); i++) {
+                        Expr s = store_mr.strides[i];
+                        int l = store_mr.lanes[i];
+                        alias_free_slice.strides.push_back(s);
+                        alias_free_slice.lanes.push_back(l);
+                        if (!can_prove(alias_free_slice.alias_free())) {
+                            containing_loops.emplace_back(
+                                ContainingLoop{unique_name('t'), l, i});
+                            alias_free_slice.base +=
+                                Variable::make(Int(32), containing_loops.back().name) * s;
+                            alias_free_slice.strides.pop_back();
+                            alias_free_slice.lanes.pop_back();
+                        }
+                    }
+                    store_mr = std::move(alias_free_slice);
+                }
+
+                output_lanes = store_mr.total_lanes();
+                store_index = store_mr.to_expr();
+                int pre_peel_total = pre_peel_mr.total_lanes();
+                if (inner_repetitions > 1) {
+                    b = VectorReduce::make(reduce_op, b, pre_peel_total * outer_repetitions);
+                }
+
+                if (needs_shuffle) {
+                    b = Shuffle::make({b}, shuffle);
                 }
 
-                // Handle outer repetitions by unrolling the reduction
-                // over slices.
-                if (store_ir.outer_repetitions > 1) {
-                    // First remove all powers of two with a binary reduction tree.
-                    int reps = store_ir.outer_repetitions;
+                // Handle outer repetitions with a reduction tree over dense
+                // slices. Reduces b down to pre_peel_total lanes (peeled dims
+                // are handled by the unroll below).
+                if (outer_repetitions > 1) {
+                    int reps = outer_repetitions;
                     while (reps % 2 == 0) {
                         int l = b.type().lanes() / 2;
                         Expr b0 = Shuffle::make_slice(b, 0, 1, l);
@@ -1276,12 +1246,10 @@
                         b = binop(b0, b1);
                         reps /= 2;
                     }
-
-                    // Then reduce linearly over slices for the rest.
                    if (reps > 1) {
-                        Expr v = Shuffle::make_slice(b, 0, 1, output_lanes);
+                        Expr v = Shuffle::make_slice(b, 0, 1, pre_peel_total);
                         for (int i = 1; i < reps; i++) {
-                            Expr slice = simplify(Shuffle::make_slice(b, i * output_lanes, 1, output_lanes));
+                            Expr slice = simplify(Shuffle::make_slice(b, i * pre_peel_total, 1, pre_peel_total));
                             v = binop(v, slice);
                         }
                         b = v;
@@ -1294,12 +1262,60 @@
                                        load_a->param,
                                        const_true(output_lanes),
                                        ModulusRemainder{});
-            Expr lhs = cast(b.type(), new_load);
-            b = binop(lhs, b);
-            b = cast(new_load.type(), b);
+            Expr lhs = cast(b.type().with_lanes(output_lanes), new_load);
+
+            Stmt s;
+            if (containing_loops.empty()) {
+                b = binop(lhs, b);
+                b = cast(new_load.type(), b);
+                s = Store::make(store->name, b, store_index, store->param,
+                                const_true(b.type().lanes()), store->alignment);
+            } else {
+                // Wrap any containing loops we still need (unrolled). We
+                // enumerate the cartesian product of loop iteration values
+                // directly, so that each store's b-slice can be computed
+                // from the full multi-index.
+                std::string b_var_name = unique_name('b');
+                Expr b_var = Variable::make(b.type().with_lanes(output_lanes), b_var_name);
+                Stmt store_template =
+                    Store::make(store->name, cast(new_load.type(), binop(lhs, b_var)),
+                                store_index, store->param,
+                                const_true(output_lanes), ModulusRemainder{});
+                std::string full_b_var_name = unique_name('b');
+                Expr full_b_var = Variable::make(b.type(), full_b_var_name);
+
+                int total_iters = pre_peel_mr.total_lanes() / output_lanes;
+                std::vector<int> peeled_dims;
+                peeled_dims.reserve(containing_loops.size());
+                for (const auto &loop : containing_loops) {
+                    peeled_dims.push_back(loop.dim);
+                }
+                std::vector<Stmt> block;
+                block.reserve(total_iters);
+                for (int n = 0; n < total_iters; n++) {
+                    // Decompose n into per-loop iteration values (innermost
+                    // loop first, matching the order in containing_loops).
+                    std::vector<int> v(containing_loops.size());
+                    int rem = n;
+                    for (size_t j = 0; j < containing_loops.size(); j++) {
+                        int e = containing_loops[j].extent;
+                        v[j] = rem % e;
+                        rem /= e;
+                    }
 
-            Stmt s = Store::make(store->name, b, store_index, store->param,
-                                 const_true(b.type().lanes()), store->alignment);
+                    std::vector<int> indices = pre_peel_mr.shuffle_from_slice(peeled_dims, v);
+                    Expr b_slice = Shuffle::make({full_b_var}, indices);
+
+                    Stmt this_store = store_template;
+                    for (size_t j = 0; j < containing_loops.size(); j++) {
+                        this_store = substitute(containing_loops[j].name, v[j], this_store);
+                    }
+                    this_store = substitute(b_var_name, b_slice, this_store);
+                    block.push_back(this_store);
+                }
+                s = Block::make(block);
+                s = LetStmt::make(full_b_var_name, b, s);
+            }
 
             // We may still need the atomic node, if there was more
             // parallelism than just the vectorization.
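
The unrolled block above enumerates the cartesian product of the peeled
loops' iteration values by decomposing a flat counter in mixed radix,
innermost loop first. A minimal standalone sketch of that decomposition
(the decompose helper and the extents here are hypothetical, for
illustration only):

    #include <cstdio>
    #include <vector>

    // Decompose a flat iteration index n into one value per peeled loop,
    // innermost loop first, using the loop extents as mixed radices.
    std::vector<int> decompose(int n, const std::vector<int> &extents) {
        std::vector<int> v(extents.size());
        for (size_t j = 0; j < extents.size(); j++) {
            v[j] = n % extents[j];
            n /= extents[j];
        }
        return v;
    }

    int main() {
        // Two hypothetical peeled loops: inner extent 2, outer extent 3,
        // giving 2 * 3 = 6 unrolled iterations.
        const std::vector<int> extents = {2, 3};
        for (int n = 0; n < 6; n++) {
            std::vector<int> v = decompose(n, extents);
            std::printf("n = %d -> (inner = %d, outer = %d)\n", n, v[0], v[1]);
        }
        return 0;
    }

Each of the six stores in the block then substitutes its (inner, outer)
pair into the loop variables and uses shuffle_from_slice to select the
matching slice of the reduced vector.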
diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt index 77ddcb5d6caf..bbbd06e68d74 100644 --- a/test/correctness/CMakeLists.txt +++ b/test/correctness/CMakeLists.txt @@ -229,6 +229,7 @@ tests(GROUPS correctness multi_way_select.cpp multipass_constraints.cpp multiple_outputs.cpp + multiramp.cpp mux.cpp narrow_predicates.cpp negative_split_factors.cpp @@ -323,6 +324,7 @@ tests(GROUPS correctness tracing_stack.cpp transitive_bounds.cpp transpose_idioms.cpp + transposed_vector_reduce.cpp trim_no_ops.cpp tuple_partial_update.cpp tuple_reduction.cpp diff --git a/test/correctness/interleave.cpp b/test/correctness/interleave.cpp index cbee263f5487..6b437c0651d2 100644 --- a/test/correctness/interleave.cpp +++ b/test/correctness/interleave.cpp @@ -16,7 +16,7 @@ class CountInterleaves : public IRVisitor { using IRVisitor::visit; void visit(const Shuffle *op) override { - if (op->is_interleave()) { + if (op->is_interleave() || op->is_transpose()) { result++; } IRVisitor::visit(op); diff --git a/test/correctness/multiramp.cpp b/test/correctness/multiramp.cpp new file mode 100644 index 000000000000..e6677eb64e27 --- /dev/null +++ b/test/correctness/multiramp.cpp @@ -0,0 +1,493 @@ +#include "Halide.h" + +#include +#include + +using namespace Halide; +using namespace Halide::Internal; + +namespace { + +int failures = 0; + +// Expand a MultiRamp (with const base and const strides) to a flat vector +// using the same innermost-fastest enumeration the IR uses. +std::vector expand(const MultiRamp &m) { + auto cb = as_const_int(simplify(m.base)); + internal_assert(cb) << "expand() only supports const bases, got " << m.base << "\n"; + int64_t b = *cb; + std::vector strides; + for (const Expr &s : m.strides) { + auto cs = as_const_int(simplify(s)); + internal_assert(cs) << "expand() only supports const strides, got " << s << "\n"; + strides.push_back(*cs); + } + int total = 1; + for (int n : m.lanes) total *= n; + std::vector result; + result.reserve(total); + for (int flat = 0; flat < total; flat++) { + int rem = flat; + int64_t v = b; + for (size_t i = 0; i < m.lanes.size(); i++) { + int idx = rem % m.lanes[i]; + rem /= m.lanes[i]; + v += strides[i] * idx; + } + result.push_back((int)v); + } + return result; +} + +void print_vec(const std::vector &v) { + printf("["); + for (size_t i = 0; i < v.size(); i++) { + printf("%s%d", i ? ", " : "", v[i]); + } + printf("]"); +} + +void check_seq(const std::vector &got, const std::vector &want, + const char *msg, int line) { + if (got != want) { + printf("FAIL at %d: %s\n got ", line, msg); + print_vec(got); + printf("\n want "); + print_vec(want); + printf("\n"); + failures++; + } +} + +#define CHECK(cond, msg) do { \ + if (!(cond)) { \ + printf("FAIL at %d: %s\n", __LINE__, msg); \ + failures++; \ + } \ +} while (0) + +#define CHECK_SEQ_LIT(got, msg, ...) check_seq((got), std::vector{__VA_ARGS__}, (msg), __LINE__) +#define CHECK_SEQ(got, want, msg) check_seq((got), (want), (msg), __LINE__) + +// ---- MultiRamp::add ------------------------------------------------------ + +void check_add_refinable_shapes() { + // From the math problem: A = ramp(0,1,6) = [0,1,2,3,4,5], + // B = ramp(ramp(0,2,2),100,3) = [0,2,100,102,200,202], + // A + B = [0,3,102,105,204,207]. + // Shapes (6,) and (2,3) (innermost first) must refine to (2,3). 
+ MultiRamp A{0, {1}, {6}}; + MultiRamp B{0, {2, 100}, {2, 3}}; + CHECK(A.add(B), "add with refinable shapes"); + CHECK_SEQ_LIT(expand(A), "refinable-shape add values", 0, 3, 102, 105, 204, 207); +} + +void check_add_same_shape() { + MultiRamp A{10, {3, 100}, {4, 2}}; + MultiRamp B{5, {-1, 50}, {4, 2}}; + auto a_seq = expand(A), b_seq = expand(B); + CHECK(A.add(B), "same-shape add"); + std::vector want(8); + for (size_t i = 0; i < a_seq.size(); i++) want[i] = a_seq[i] + b_seq[i]; + CHECK_SEQ(expand(A), want, "same-shape add values"); +} + +void check_add_incompatible_shapes() { + // Shapes with innermost sizes 3 vs 2 and outer sizes 2 vs 3 can't refine. + MultiRamp A{0, {1, 100}, {3, 2}}; + MultiRamp B{0, {1, 100}, {2, 3}}; + CHECK(!A.add(B), "incompatible shapes rejected"); +} + +void check_add_cancels_to_zero() { + // 2·A + (-2)·A should simplify to a single zero-stride dim (one flat dim + // of the total lane count). + MultiRamp A{7, {3, 100}, {4, 2}}; + MultiRamp B = A; + A.mul(2); + B.mul(-2); + CHECK(A.add(B), "add of cancelling multiramps"); + CHECK(A.lanes.size() == 1, "cancelled add should collapse to 1 dim"); + if (A.lanes.size() == 1) { + CHECK(A.lanes[0] == 8, "cancelled add lanes = 8"); + auto s = as_const_int(simplify(A.strides[0])); + CHECK(s && *s == 0, "cancelled add stride = 0"); + auto b = as_const_int(simplify(A.base)); + CHECK(b && *b == 0, "cancelled add base = 0"); + } +} + +void check_add_scaled_outer() { + // Regression test for the stride-scaling bug: adding a 1D ramp of length 6 + // to a 2D ramp with shape (2,3) must scale the 1D's stride by 2 when + // producing the outer dim of the result. + // A = ramp(0,1,6) -> [0,1,2,3,4,5] + // B = ramp(ramp(0,0,2),100,3) = broadcast(0,2) then + ramp-of-100s + // -> [0,0,100,100,200,200] + // A+B = [0,1,102,103,204,205] + MultiRamp A{0, {1}, {6}}; + MultiRamp B{0, {0, 100}, {2, 3}}; + CHECK(A.add(B), "scaled-outer add"); + CHECK_SEQ_LIT(expand(A), "scaled-outer values", 0, 1, 102, 103, 204, 205); +} + +// ---- MultiRamp::div ----------------------------------------------------- + +void check_div_pure_carry_const() { + MultiRamp A{8, {4, 12}, {2, 3}}; + auto a_seq = expand(A); + CHECK(A.div(4), "pure-carry div (const k)"); + std::vector want(a_seq.size()); + for (size_t i = 0; i < a_seq.size(); i++) want[i] = a_seq[i] / 4; + CHECK_SEQ(expand(A), want, "pure-carry div values"); +} + +void check_div_symbolic_strides() { + // Symbolic base and strides, all provably multiples of the denominator — + // every dim is pure carry. + Var v("v"); + MultiRamp A{2 * v, {2 * v, 8 * v}, {4, 5}}; + CHECK(A.div(2), "pure-carry div with symbolic strides"); + if (A.strides.size() == 2) { + // Strides become (2*v/2, 8*v/2) = (v, 4*v). + Expr want0 = simplify(A.strides[0] - v); + Expr want1 = simplify(A.strides[1] - 4 * v); + CHECK(is_const_zero(want0), "sym-stride div inner"); + CHECK(is_const_zero(want1), "sym-stride div outer"); + } +} + +void check_div_merges_adjacent_pure_carry() { + // Two pure-carry input dims whose output strides line up should collapse + // into a single output dim. + // Input values: 0, 4, 8, 12, 16, 20 (strides [4, 12], lanes [3, 2]). + // Divided by 4: 0, 1, 2, 3, 4, 5 — a flat 1D ramp of length 6. 
+ MultiRamp A{0, {4, 12}, {3, 2}}; + CHECK(A.div(4), "div of two pure-carry dims"); + CHECK(A.lanes.size() == 1, "adjacent dims should merge into one"); + if (A.lanes.size() == 1) { + CHECK(A.lanes[0] == 6, "merged lane count"); + } + CHECK_SEQ_LIT(expand(A), "merged values", 0, 1, 2, 3, 4, 5); +} + +void check_div_with_split() { + // ramp(0,2,6) / 4 = [0,0,1,1,2,2], needs a split of dim 6 -> (2,3). + MultiRamp A{0, {2}, {6}}; + CHECK(A.div(4), "div with split"); + CHECK_SEQ_LIT(expand(A), "split div values", 0, 0, 1, 1, 2, 2); +} + +void check_div_split_with_symbolic_stride() { + // Non-constant stride whose residue mod k is still pinned down: stride + // is 4*v + 2, which is always ≡ 2 (mod 4). The split needs p=2, which + // divides 6. The budget check uses r = 2 only. + Var v("v"); + MultiRamp A{0, {4 * v + 2}, {6}}; + CHECK(A.div(4), "div split with symbolic stride"); + // Expected shape after split: lanes (2, 3); inner stride = (4v+2)/4 + // (symbolic), outer stride = (4v+2)*2/4 = 2v+1. + CHECK(A.lanes.size() == 2, "split produced two output dims"); + if (A.lanes.size() == 2) { + CHECK(A.lanes[0] == 2 && A.lanes[1] == 3, "split lanes (2, 3)"); + // Outer stride should simplify to 2v + 1. + Expr outer = simplify(A.strides[1]); + Expr want = simplify(2 * v + 1); + CHECK(equal(outer, want), "outer stride is 2v+1"); + } +} + +void check_div_rejects_non_multiramp() { + // ramp(0,1,5)/2 = [0,0,1,1,2], not a multiramp (5 has no usable factor). + MultiRamp A{0, {1}, {5}}; + CHECK(!A.div(2), "should reject ramp(0,1,5)/2"); +} + +void check_div_rejects_unaligned_base() { + // ramp(2,2,6)/4 = [0,1,1,2,2,3] would be a multiramp, but our algorithm + // requires the base to be a known multiple of the denominator, and 2 is + // not a multiple of 4. + MultiRamp A{2, {2}, {6}}; + CHECK(!A.div(4), "should reject div when base isn't aligned"); +} + +void check_div_rejects_symbolic_denominator() { + // A symbolic (non-constant) denominator should fail cleanly. The code + // needs k as a known positive integer to reason about bucket sizes. + Var k("k"); + MultiRamp A{0, {1}, {4}}; + CHECK(!A.div(k), "should reject div with symbolic denominator"); + CHECK(!A.mod(k), "should reject mod with symbolic denominator"); +} + +// ---- MultiRamp::mod ----------------------------------------------------- + +void check_mod_basic() { + MultiRamp A{0, {1}, {6}}; + CHECK(A.mod(2), "mod basic"); + CHECK_SEQ_LIT(expand(A), "mod basic values", 0, 1, 0, 1, 0, 1); +} + +void check_mod_with_split() { + MultiRamp A{0, {2}, {6}}; + CHECK(A.mod(4), "mod with split"); + CHECK_SEQ_LIT(expand(A), "mod split values", 0, 2, 0, 2, 0, 2); +} + +void check_mod_symbolic_strides() { + // Symbolic base and strides, all provably multiples of the denominator: + // mod result is entirely zero. + Var v("v"); + MultiRamp A{2 * v, {6 * v, 10 * v}, {3, 2}}; + CHECK(A.mod(2), "mod pure-carry symbolic strides"); + for (const Expr &s : A.strides) { + auto c = as_const_int(simplify(s)); + CHECK(c && *c == 0, "sym-stride mod stride = 0"); + } + auto b = as_const_int(simplify(A.base)); + CHECK(b && *b == 0, "sym-stride mod base = 0"); +} + +void check_mod_rejects_non_multiramp() { + // ramp(0,1,5)%2 = [0,1,0,1,0], not a multiramp. 
+ MultiRamp A{0, {1}, {5}}; + CHECK(!A.mod(2), "should reject ramp(0,1,5)%2"); +} + +// ---- End-to-end is_multiramp tests -------------------------------------- + +void check_recognize_1d_ramp() { + Expr e = Ramp::make(Expr(0), Expr(2), 4); + Scope scope; + MultiRamp m; + CHECK(is_multiramp(e, scope, &m), "recognize 1D ramp"); + if (m.lanes.size() == 1) { + CHECK(m.lanes[0] == 4, "1D lanes"); + } +} + +void check_recognize_nested_ramp() { + // ramp(ramp(0,1,2), broadcast(100,2), 3) -> strides [1,100], lanes [2,3]. + Expr inner = Ramp::make(Expr(0), Expr(1), 2); + Expr e = Ramp::make(inner, Broadcast::make(Expr(100), 2), 3); + Scope scope; + MultiRamp m; + CHECK(is_multiramp(e, scope, &m), "recognize nested ramp"); + if (m.lanes.size() == 2) { + CHECK(m.lanes[0] == 2 && m.lanes[1] == 3, "nested ramp lanes"); + } +} + +void check_recognize_add() { + Expr a = Ramp::make(Expr(0), Expr(1), 6); + Expr inner = Ramp::make(Expr(0), Expr(2), 2); + Expr b = Ramp::make(inner, Broadcast::make(Expr(100), 2), 3); + Expr sum = Add::make(a, b); + Scope scope; + MultiRamp m; + CHECK(is_multiramp(sum, scope, &m), "recognize add of two multiramps"); +} + +void check_recognize_div_const() { + Expr e = Div::make(Ramp::make(Expr(0), Expr(2), 6), + Broadcast::make(Expr(4), 6)); + Scope scope; + MultiRamp m; + CHECK(is_multiramp(e, scope, &m), "recognize ramp/const"); + CHECK_SEQ_LIT(expand(m), "recognized div values", 0, 0, 1, 1, 2, 2); +} + +void check_recognize_mod_const() { + Expr e = Mod::make(Ramp::make(Expr(0), Expr(1), 6), + Broadcast::make(Expr(2), 6)); + Scope scope; + MultiRamp m; + CHECK(is_multiramp(e, scope, &m), "recognize ramp%const"); + CHECK_SEQ_LIT(expand(m), "recognized mod values", 0, 1, 0, 1, 0, 1); +} + +void check_recognize_div_symbolic_strides() { + // (2*x) + ramp(0, 4, 4), divided by 2. Numerator has symbolic base, const + // strides that are multiples of 2. + Var x("x"); + Expr num = Broadcast::make(2 * x, 4) + Ramp::make(Expr(0), Expr(4), 4); + Expr e = Div::make(num, Broadcast::make(Expr(2), 4)); + Scope scope; + MultiRamp m; + CHECK(is_multiramp(e, scope, &m), "recognize symbolic-strides div"); + if (m.strides.size() == 1) { + auto s = as_const_int(simplify(m.strides[0])); + CHECK(s && *s == 2, "symbolic-strides div stride = 2"); + } +} + +// ---- Reordering and shuffle_from_permuted ------------------------------- + +void check_reorder() { + // Swap the two dims of a 2D multiramp. + // base 0, strides [1, 10], lanes [2, 3]: 0, 1, 10, 11, 20, 21 + // reordered [1, 0] -> strides [10, 1], lanes [3, 2]: 0, 10, 20, 1, 11, 21 + MultiRamp A{0, {1, 10}, {2, 3}}; + MultiRamp R = A; + R.reorder({1, 0}); + CHECK(R.lanes.size() == 2, "reordered dims"); + if (R.lanes.size() == 2) { + CHECK(R.lanes[0] == 3 && R.lanes[1] == 2, "reordered lane counts"); + auto s0 = as_const_int(simplify(R.strides[0])); + auto s1 = as_const_int(simplify(R.strides[1])); + CHECK(s0 && *s0 == 10, "reordered stride 0"); + CHECK(s1 && *s1 == 1, "reordered stride 1"); + } + CHECK_SEQ_LIT(expand(R), "reordered values", 0, 10, 20, 1, 11, 21); +} + +void check_shuffle_from_permuted_2d() { + // A has 2 dims; perm = [1, 0] swaps them. The shuffle takes the + // permuted lane order back to the original lane order. + MultiRamp A{0, {1, 10}, {2, 3}}; + MultiRamp P = A; + P.reorder({1, 0}); + std::vector idx = A.shuffle_from_permuted({1, 0}); + // For each output lane n (A's order), idx[n] is the input lane in P's + // order that carries the same value. 
+ auto a_seq = expand(A); // 0, 1, 10, 11, 20, 21 + auto p_seq = expand(P); // 0, 10, 20, 1, 11, 21 + CHECK(idx.size() == a_seq.size(), "shuffle indices size"); + for (size_t n = 0; n < a_seq.size(); n++) { + CHECK(p_seq[idx[n]] == a_seq[n], "shuffle restores original lane"); + } + // And as a vector: [0, 3, 1, 4, 2, 5]. + std::vector want = {0, 3, 1, 4, 2, 5}; + CHECK(idx == want, "shuffle indices match expected"); +} + +void check_shuffle_from_permuted_identity() { + // perm = identity => indices = [0, 1, 2, ..., total_lanes-1]. + MultiRamp A{0, {1, 10, 100}, {2, 3, 4}}; + std::vector idx = A.shuffle_from_permuted({0, 1, 2}); + for (size_t n = 0; n < idx.size(); n++) { + CHECK((int)n == idx[n], "identity permutation indices"); + } +} + +void check_shuffle_from_permuted_3d() { + // 3D with cyclic permutation. Check by comparing expanded sequences. + // base 0, strides [1, 4, 20], lanes [2, 3, 2]. Values: + // i_0 + 4*i_1 + 20*i_2 for (i_0, i_1, i_2) in [2)x[3)x[2). + MultiRamp A{0, {1, 4, 20}, {2, 3, 2}}; + std::vector perm = {2, 0, 1}; // outermost becomes innermost + MultiRamp P = A; + P.reorder(perm); + std::vector idx = A.shuffle_from_permuted(perm); + auto a_seq = expand(A); + auto p_seq = expand(P); + CHECK(idx.size() == a_seq.size(), "3D shuffle size"); + for (size_t n = 0; n < a_seq.size(); n++) { + CHECK(p_seq[idx[n]] == a_seq[n], "3D shuffle restores original"); + } +} + +void check_shuffle_from_slice_2d() { + // A has 2 dims, lanes [2, 3]. Slice dim 1 at pos 2 should yield lanes + // [2]; the shuffle indices pick those lanes of A. + MultiRamp A{0, {1, 10}, {2, 3}}; + MultiRamp S = A; + S.slice(1, Expr(2)); + std::vector idx = A.shuffle_from_slice(1, 2); + auto a_seq = expand(A); // 0, 1, 10, 11, 20, 21 + auto s_seq = expand(S); // 20, 21 + CHECK(idx.size() == s_seq.size(), "slice shuffle size"); + for (size_t n = 0; n < s_seq.size(); n++) { + CHECK(a_seq[idx[n]] == s_seq[n], "slice shuffle picks right lanes"); + } + std::vector want = {4, 5}; + CHECK(idx == want, "slice shuffle indices match expected"); +} + +void check_shuffle_from_slice_inner() { + // Slice the innermost dim. + MultiRamp A{0, {1, 10}, {2, 3}}; + MultiRamp S = A; + S.slice(0, Expr(1)); + std::vector idx = A.shuffle_from_slice(0, 1); + auto a_seq = expand(A); // 0, 1, 10, 11, 20, 21 + auto s_seq = expand(S); // 1, 11, 21 + CHECK(idx.size() == s_seq.size(), "inner slice shuffle size"); + for (size_t n = 0; n < s_seq.size(); n++) { + CHECK(a_seq[idx[n]] == s_seq[n], "inner slice picks right lanes"); + } + std::vector want = {1, 3, 5}; + CHECK(idx == want, "inner slice indices match expected"); +} + +void check_shuffle_from_slice_3d() { + // 3D: strides [1, 4, 20], lanes [2, 3, 2]. Slice middle dim at pos 1. + MultiRamp A{0, {1, 4, 20}, {2, 3, 2}}; + MultiRamp S = A; + S.slice(1, Expr(1)); + std::vector idx = A.shuffle_from_slice(1, 1); + auto a_seq = expand(A); + auto s_seq = expand(S); + CHECK(idx.size() == s_seq.size(), "3D slice shuffle size"); + for (size_t n = 0; n < s_seq.size(); n++) { + CHECK(a_seq[idx[n]] == s_seq[n], "3D slice picks right lanes"); + } +} + +void check_reject_non_multiramp_sum() { + // [0,1,2,100,101,102] + [0,2,100,102,200,202] = sum with shape conflict. 
+ Expr a_inner = Ramp::make(Expr(0), Expr(1), 3); + Expr a = Ramp::make(a_inner, Broadcast::make(Expr(100), 3), 2); + Expr b_inner = Ramp::make(Expr(0), Expr(2), 2); + Expr b = Ramp::make(b_inner, Broadcast::make(Expr(100), 2), 3); + Expr sum = Add::make(a, b); + Scope scope; + MultiRamp m; + CHECK(!is_multiramp(sum, scope, &m), "reject coprime-shape add"); +} + +} // namespace + +int main(int argc, char **argv) { + check_add_refinable_shapes(); + check_add_same_shape(); + check_add_incompatible_shapes(); + check_add_cancels_to_zero(); + check_add_scaled_outer(); + + check_div_pure_carry_const(); + check_div_symbolic_strides(); + check_div_merges_adjacent_pure_carry(); + check_div_with_split(); + check_div_split_with_symbolic_stride(); + check_div_rejects_non_multiramp(); + check_div_rejects_unaligned_base(); + check_div_rejects_symbolic_denominator(); + + check_mod_basic(); + check_mod_with_split(); + check_mod_symbolic_strides(); + check_mod_rejects_non_multiramp(); + + check_recognize_1d_ramp(); + check_recognize_nested_ramp(); + check_recognize_add(); + check_recognize_div_const(); + check_recognize_mod_const(); + check_recognize_div_symbolic_strides(); + check_reorder(); + check_shuffle_from_permuted_2d(); + check_shuffle_from_permuted_identity(); + check_shuffle_from_permuted_3d(); + check_shuffle_from_slice_2d(); + check_shuffle_from_slice_inner(); + check_shuffle_from_slice_3d(); + check_reject_non_multiramp_sum(); + + if (failures) { + printf("%d failures\n", failures); + return 1; + } + printf("Success!\n"); + return 0; +} diff --git a/test/correctness/transposed_vector_reduce.cpp b/test/correctness/transposed_vector_reduce.cpp new file mode 100644 index 000000000000..aa38fc4426f8 --- /dev/null +++ b/test/correctness/transposed_vector_reduce.cpp @@ -0,0 +1,137 @@ +#include "Halide.h" + +using namespace Halide; +using namespace Halide::Internal; + +constexpr int all = -1, success = 0, bad_output = 1, failed_vectorization = 2; + +int test(int which_case = all) { + + constexpr int vec = 8; + + Func g{"g"}; + Var x{"x"}, y{"y"}, z{"z"}; + RDom r(0, vec); + + ImageParam input(Int(32), 3); + Buffer input_buf(vec, vec, vec); + input_buf.for_each_element([&](int x, int y, int z) { + input_buf(x, y, z) = x + y * 10 + z * 100; + }); + input.set(input_buf); + + for (int i = 0; i < 6; i++) { + for (int j = 0; j < 3; j++) { + int idx = i * 3 + j; + if (which_case == all || which_case == idx) { + switch (j) { + case 0: + g(x, y) += input(x, y, r); + break; + case 1: + g(x, y) += input(x, r, y); + break; + case 2: + g(x, y) += input(r, x, y); + break; + } + } + } + } + + std::vector orders[6] = + {{x, y, r}, + {x, r, y}, + {r, x, y}, + {y, x, r}, + {y, r, x}, + {r, y, x}}; + + Buffer correct = g.realize({vec, vec}); + + g.bound(x, 0, vec) + .bound(y, 0, vec) + .vectorize(x) + .vectorize(y); + + int u = 0; + for (int i = 0; i < 6; i++) { + for (int j = 0; j < 3; j++) { + int idx = i * 3 + j; + if (which_case == all || idx == which_case) { + g.update(u++) + .vectorize(x) + .vectorize(y) + .atomic() + .vectorize(r) + .reorder(orders[i]); + } + } + } + + // We need to know the stride on the output buffer is such that rows don't + // alias each other. That would be UB, but not UB that the vectorizer knows + // how to exploit. It's more interesting if the stride is not vec - it's a genuine 2D store. 
+ // g.output_buffer().dim(1).set_stride(vec + 7); + + int for_loops = 0, gathers = 0; + auto checker = LambdaMutator{ + [&](auto *self, const For *op) { + for_loops++; + return self->visit_base(op); + }, + [&](auto *self, const Load *op) { + const Ramp *r = op->index.as(); + gathers += !r || !is_const_one(r->stride); + return self->visit_base(op); + }}; + + g.add_custom_lowering_pass(&checker, nullptr); + + Buffer out = g.realize({vec, vec}); + + for (int y = 0; y < vec; y++) { + for (int x = 0; x < vec; x++) { + if (out(x, y) != correct(x, y)) { + printf("out(%d, %d) = %d instead of %d\n", x, y, out(x, y), correct(x, y)); + return bad_output; + } + } + } + + if (which_case == all && for_loops) { + printf("Atomic vectorization failed. Lowered code contained %d for loops\n", for_loops); + return failed_vectorization; + } + + if (which_case == all && gathers) { + printf("Atomic vectorization produced %d vector gathers\n", gathers); + return failed_vectorization; + } + + if (which_case != all) { + g.compile_to_lowered_stmt(std::string("test_") + std::to_string(which_case) + ".stmt", {input}, StmtOutputFormat::Text, Target{"host-no_asserts-no_runtime-no_bounds_query"}); + g.compile_to_assembly(std::string("test_") + std::to_string(which_case) + ".s", {input}, Target{"host-no_asserts-no_runtime-no_bounds_query"}); + } + + return success; +} + +int main(int argc, char **argv) { + + int result = test(all); + + if (result == bad_output) { + for (int i = 0; i < 18; i++) { + if (test(i) != success) { + printf("Test case %d failed\n", i); + return result; + } + } + } else if (result != success) { + return result; + } + + printf("Success!\n"); + return 0; +} From 3f81633353a8cf8cbb27a458dee971d51d651826 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Wed, 22 Apr 2026 13:15:50 -0700 Subject: [PATCH 43/55] Clarify MultiRamp API and simplify the atomic-store reduction path API and doc cleanup on MultiRamp: - Add a real invariants block to the class header; clarify that 0-dim (scalar) multiramps are legal and all methods handle them. - Switch all class member comments to doxygen style. - Reword alias_free, alias_free_slice, rotate_stride_one_innermost, and the shuffle_from_* overloads so each leads with what it does. - Fix is_multiramp's Mul branch to use fresh local MultiRamps rather than letting a failed first attempt leak partial state into the second. - Drop the single-dim shuffle_from_slice overload in favour of the multi-dim version. - Relax add() so it handles 0-dim inputs trivially (base+base); this also makes operator== work for 0-dim via its existing add path. - Add accept/mutate methods (Function-idiom) so callers don't reach into base/strides to walk scalar subexpressions. - Add alias_free_slice (replaces ad-hoc in-caller peeling) and rotate_stride_one_innermost (replaces near-duplicate dance in the simplifier rules). Use those APIs from VectorizeLoops, FlattenNestedRamps, Simplify_Exprs, and Simplify_Stmts. In particular the atomic-store reduction block in VectorizeLoops is restructured: one alias_free_slice call discovers both stride-zero peels (handled via VectorReduce or a tree reduction over a reordered b) and symbolic/overlapping aliasing peels (handled via an unrolled cartesian-product loop block). The b's current lane layout is tracked as a MultiRamp (b_shape_mr) so the per-iteration slice of the reduced vector can be computed via shuffle_from_slice. 
Add a downsampling/atomic-vectorize test (downsampling_reduce.cpp)
exercising MultiRamp::div through the vectorize path, and expand the
multiramp API tests.

Co-authored-by: Claude 
---
 src/FlattenNestedRamps.cpp               |  22 +--
 src/MultiRamp.cpp                        | 140 +++++++++----
 src/MultiRamp.h                          | 190 ++++++++++++------
 src/Simplify_Exprs.cpp                   |  36 +---
 src/Simplify_Stmts.cpp                   |  27 +--
 src/VectorizeLoops.cpp                   | 237 ++++++++++++-----------
 test/correctness/CMakeLists.txt          |   1 +
 test/correctness/downsampling_reduce.cpp |  80 ++++++++
 test/correctness/multiramp.cpp           |   6 +-
 9 files changed, 459 insertions(+), 280 deletions(-)
 create mode 100644 test/correctness/downsampling_reduce.cpp

diff --git a/src/FlattenNestedRamps.cpp b/src/FlattenNestedRamps.cpp
index a428e21c395c..7ee062ac5850 100644
--- a/src/FlattenNestedRamps.cpp
+++ b/src/FlattenNestedRamps.cpp
@@ -16,16 +16,6 @@ namespace {
 class FlattenRamps : public IRMutator {
     using IRMutator::visit;
 
-    // Visit the scalar base and strides of a multiramp. They are scalars,
-    // but technically could contain total reductions of nested vectors, so
-    // we need to walk them.
-    void mutate_multiramp_scalars(MultiRamp &mr) {
-        mr.base = mutate(mr.base);
-        for (Expr &s : mr.strides) {
-            s = mutate(s);
-        }
-    }
-
     Expr visit(const Ramp *op) override {
         if (op->base.type().is_vector()) {
             if (MultiRamp mr;
@@ -34,7 +24,7 @@ class FlattenRamps : public IRMutator {
                 // with the general case below, so that we get one big concat
                 // instead of a concat-of-concats. The innermost dimension is
                 // left as a Ramp.
-                mutate_multiramp_scalars(mr);
+                mr.mutate(this);
                 return Shuffle::make_concat(mr.flatten());
             } else {
                 Expr base = mutate(op->base);
@@ -61,9 +51,9 @@ class FlattenRamps : public IRMutator {
         return IRMutator::visit(op);
     }
 
-    // Slice `v` down to `inner_lanes` starting at output lane `n*inner_lanes`,
-    // matching the slicing done to the flattened index. Broadcasts of scalars
-    // pass through unchanged (as a fresh broadcast of `inner_lanes`).
+    // Return the sub-vector of `v` corresponding to the n-th sub-ramp of a
+    // flattened multiramp of width `inner_lanes`. Scalar broadcasts get
+    // rebroadcast to `inner_lanes`; everything else is a slice.
     static Expr slice_per_inner_ramp(const Expr &v, int n, int inner_lanes) {
         if (const Broadcast *b = v.as<Broadcast>()) {
             if (b->value.type().is_scalar()) {
@@ -172,7 +162,7 @@ class FlattenRamps : public IRMutator {
             mr.dimensions() >= 2) {
 
             Expr predicate = mutate(op->predicate);
-            mutate_multiramp_scalars(mr);
+            mr.mutate(this);
             std::vector<Expr> sub_indices = mr.flatten();
             int inner_lanes = mr.lanes[0];
             Type elem_type = op->type.with_lanes(inner_lanes);
@@ -201,7 +191,7 @@ class FlattenRamps : public IRMutator {
 
             Expr predicate = mutate(op->predicate);
             Expr value = mutate(op->value);
-            mutate_multiramp_scalars(mr);
+            mr.mutate(this);
             std::vector<Expr> sub_indices = mr.flatten();
             int inner_lanes = mr.lanes[0];
diff --git a/src/MultiRamp.cpp b/src/MultiRamp.cpp
index 0caa0dee6029..b18b63d2e042 100644
--- a/src/MultiRamp.cpp
+++ b/src/MultiRamp.cpp
@@ -2,7 +2,9 @@
 
 #include "IR.h"
 #include "IREquality.h"
+#include "IRMutator.h"
 #include "IROperator.h"
+#include "IRVisitor.h"
 #include "ModulusRemainder.h"
 #include "Simplify.h"
 
@@ -51,17 +53,14 @@ bool MultiRamp::add(const MultiRamp &other) {
     // partially consumed, the remaining part of that dimension corresponds to
     // an "outer" sub-dim in the refined shape and its stride must be scaled
     // by the factor just consumed.
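    // (For example: adding shapes lanes = [6] and lanes = [2, 3] refines
    // both to [2, 3]. The first step consumes 2 of the 6, leaving an outer
    // sub-dim of extent 3 whose stride is the original stride times 2, so
    // strides [1] and [2, 100] combine to strides [3, 102]: the lanes
    // [0,1,2,3,4,5] + [0,2,100,102,200,202] = [0,3,102,105,204,207].)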
- internal_assert(!lanes.empty() && !other.lanes.empty()); - int64_t total_a = 1, total_b = 1; - for (int l : lanes) { - total_a *= l; - } - for (int l : other.lanes) { - total_b *= l; + internal_assert(total_lanes() == other.total_lanes()) + << "MultiRamp::add: total lane counts must match (" << total_lanes() + << " vs " << other.total_lanes() << ")"; + if (lanes.empty()) { + // Both are 0-dim scalars. + base = simplify(base + other.base); + return true; } - internal_assert(total_a == total_b) - << "MultiRamp::add: total lane counts must match (" << total_a - << " vs " << total_b << ")"; MultiRamp result; result.base = simplify(base + other.base); size_t ai = 0, bi = 0; @@ -108,9 +107,8 @@ bool MultiRamp::add(const MultiRamp &other) { *this = std::move(result); return true; } - // Since the up-front lane-count check passed, both sides must - // always exhaust together. - internal_assert(!a_done && !b_done); + // The up-front lane-count check ensures both sides always exhaust + // together, so neither side should be done here. } } @@ -380,14 +378,26 @@ bool is_multiramp(const Expr &e, const Scope &scope, MultiRamp *result) { return result->add(rb); } } else if (const Mul *m = e.as()) { - if (auto b = unbroadcast(m->b); - b && is_multiramp(m->a, scope, result)) { - result->mul(*b); - return true; - } else if (auto a = unbroadcast(m->a); - a && is_multiramp(m->b, scope, result)) { - result->mul(*a); - return true; + // Try each side as the scalar factor. Use a fresh local MultiRamp + // for each attempt: if the first is_multiramp call partially + // mutates its output and returns false, that state shouldn't leak + // into the fallback attempt (is_multiramp's contract leaves the + // output unspecified on failure). + if (auto b = unbroadcast(m->b)) { + MultiRamp r; + if (is_multiramp(m->a, scope, &r)) { + r.mul(*b); + *result = std::move(r); + return true; + } + } + if (auto a = unbroadcast(m->a)) { + MultiRamp r; + if (is_multiramp(m->b, scope, &r)) { + r.mul(*a); + *result = std::move(r); + return true; + } } } else if (const Div *d = e.as
<Div>()) {
         if (auto denom = unbroadcast(d->b)) {
@@ -429,13 +439,16 @@ void MultiRamp::slice(int d, Expr v) {
 }
 
 Expr MultiRamp::alias_free() const {
-    // A multiramp is alias free if (but not only if) there is an ordering of
-    // dimensions such that next stride is greater than the max value seen so
-    // far. In principle we only need to test the ordering with increasing
-    // strides, but in the presence of symbolic strides, we don't know which one
-    // that is. So we'll test all permutations (there shouldn't be many, because
-    // there's only one dimension per nested loop) and or together the
-    // conditions.
+    // A sufficient condition: there exists an ordering of dims such that
+    // each stride's absolute value is strictly greater than the sum of the
+    // spans of all earlier dims, where span(k) = |strides[k]| * (lanes[k] -
+    // 1). Under such an ordering each dim's full span fits strictly inside
+    // one step of the next, so all lane offsets are distinct. In principle
+    // we'd only need to test the ordering with increasing |strides|, but
+    // symbolic strides leave the ordering unknown, so we try all
+    // permutations and OR the conditions. (The permutation count is small
+    // in practice: one dim per nested loop.) This ignores base, which is
+    // fine for uniqueness within the ramp (base is a uniform offset).
 
     if (lanes.empty()) {
         return const_true();
@@ -460,6 +473,55 @@ Expr MultiRamp::alias_free() const {
     return simplify(result);
 }
 
+std::vector<MultiRamp::PeeledDim> MultiRamp::alias_free_slice() {
+    // Greedy: starting from an empty MultiRamp (same base), try adding dims
+    // one by one from innermost to outermost. Any dim that would break the
+    // alias-free condition is peeled off instead. Stride-zero dims always
+    // break alias-freedom (except as the single dim of a 1-dim ramp, which
+    // is a scalar), so we fast-path them to skip the can_prove call.
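+    // (For example: strides [1, 0, 16] with lanes [8, 4, 2] keeps dims 0
+    // and 2 (stride 16 exceeds dim 0's span of 7) and peels the stride-0
+    // middle dim, returning PeeledDim{0, 4, 1}.)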
+    std::vector<PeeledDim> peeled;
+    MultiRamp remaining;
+    remaining.base = base;
+    for (int i = 0; i < dimensions(); i++) {
+        bool must_peel = is_const_zero(strides[i]) && !remaining.lanes.empty();
+        if (!must_peel) {
+            remaining.strides.push_back(strides[i]);
+            remaining.lanes.push_back(lanes[i]);
+            if (can_prove(remaining.alias_free())) {
+                continue;
+            }
+            remaining.strides.pop_back();
+            remaining.lanes.pop_back();
+        }
+        peeled.push_back(PeeledDim{strides[i], lanes[i], i});
+    }
+    *this = std::move(remaining);
+    return peeled;
+}
+
+int MultiRamp::rotate_stride_one_innermost() {
+    int k = -1;
+    for (int i = 0; i < dimensions(); i++) {
+        if (is_const_one(strides[i])) {
+            k = i;
+            break;
+        }
+    }
+    if (k <= 0) {
+        return 0;
+    }
+    int A = 1;
+    for (int i = 0; i < k; i++) {
+        A *= lanes[i];
+    }
+    int d = dimensions();
+    std::vector<int> perm(d);
+    std::iota(perm.begin(), perm.end(), 0);
+    std::rotate(perm.begin(), perm.begin() + k, perm.end());
+    reorder(perm);
+    return A;
+}
+
 int MultiRamp::dimensions() const {
     return (int)strides.size();
 }
@@ -502,6 +564,20 @@ void MultiRamp::reorder(const std::vector<int> &perm) {
     lanes = std::move(new_lanes);
 }
 
+void MultiRamp::accept(IRVisitor *visitor) const {
+    base.accept(visitor);
+    for (const Expr &s : strides) {
+        s.accept(visitor);
+    }
+}
+
+void MultiRamp::mutate(IRMutator *mutator) {
+    base = mutator->mutate(base);
+    for (Expr &s : strides) {
+        s = mutator->mutate(s);
+    }
+}
+
 std::vector<int> MultiRamp::shuffle_from_permuted(const std::vector<int> &perm) const {
     // For each output lane n (in *this's lane order), we want the shuffle to
     // pull from the input (permuted) vector's lane that represents the same
@@ -546,19 +622,13 @@ std::vector<Expr> MultiRamp::flatten() const {
     for (int k = 1; k < d; k++) {
         int ik = rem % lanes[k];
         rem /= lanes[k];
-        if (ik != 0) {
-            offset_base = offset_base + ik * strides[k];
-        }
+        offset_base = offset_base + ik * strides[k];
     }
         result.push_back(Ramp::make(offset_base, strides[0], inner_lanes));
     }
     return result;
 }
 
-std::vector<int> MultiRamp::shuffle_from_slice(int d, int pos) const {
-    return shuffle_from_slice(std::vector<int>{d}, std::vector<int>{pos});
-}
-
 std::vector<int> MultiRamp::shuffle_from_slice(const std::vector<int> &dims,
                                                const std::vector<int> &pos) const {
     // For each output lane n (in the sliced MultiRamp's lane order), we want
diff --git a/src/MultiRamp.h b/src/MultiRamp.h
index 1a011ba08406..147d5a44f71d 100644
--- a/src/MultiRamp.h
+++ b/src/MultiRamp.h
@@ -12,103 +12,173 @@
 namespace Halide {
 namespace Internal {
 
+class IRMutator;
+class IRVisitor;
+
 /** A multi-dimensional ramp. I.e. a ramp of ramps of ramps of ramps...
  *
- * The scalar-producing operations (mul, add, div, mod) all mutate the
- * MultiRamp in place. mul always succeeds; add/div/mod return false when
- * the result isn't expressible as a multiramp (leaving *this undefined). */
+ * Represents a vector whose lanes are produced by
+ *
+ *   base + i_0 * strides[0] + i_1 * strides[1] + ...
+ *
+ * where i_k iterates over [0, lanes[k]) and the innermost dim is dim 0.
+ * For example, with base = 0, strides = [1, 100], lanes = [2, 3] the lane
+ * sequence is [0, 1, 100, 101, 200, 201].
+ *
+ * Invariants:
+ *   - base is scalar; every entry of strides is scalar and has the same
+ *     type as base.
+ *   - strides.size() == lanes.size() (this value is dimensions()).
+ *   - Each lanes[k] >= 1. An entry of 1 is legal but methods that flatten
+ * - dimensions() == 0 represents a scalar (total_lanes() == 1); + * to_expr() yields `base` unchanged, and the other methods handle + * this case trivially. + * + * mul, add, div, mod mutate in place. mul always succeeds; add/div/mod + * return false when the result isn't expressible as a multiramp (leaving + * *this undefined). */ struct MultiRamp { Expr base; - // The first stride is the innermost one. So for example, if the base is - // zero, strides are [1, 100] and the extents are [2, 3], the IR node is a - // vector with lanes: [0, 1, 100, 101, 200, 201] std::vector strides; std::vector lanes; - // Multiply by a scalar. Always a multiramp. + /** Multiply by a scalar. Always a multiramp. */ void mul(const Expr &e); - // Add another MultiRamp elementwise. Returns false if the result isn't a - // multiramp (which happens when the two input shapes have no common - // refinement). + /** Add another MultiRamp elementwise. Returns false if the result isn't + * a multiramp (which happens when the two input shapes have no common + * refinement). */ bool add(const MultiRamp &other); - // Floor-divide by a scalar. Returns false if the denominator isn't a - // positive integer constant, or if the quotient isn't a multiramp. The - // result may have one more dim than the input (a single split may be - // introduced per input dim). O(d). + /** Floor-divide by a scalar. The main use case is recognizing + * downsampling reductions like `f(r/4) += g(r)` as multiramps, so that + * the vectorize pass can handle them as within-vector reductions. + * + * Returns false if the denominator isn't a positive integer constant, + * or if the quotient isn't a multiramp. The result may have one more + * dim than the input (a single split may be introduced per input dim, + * e.g. ramp(0,2,6)/4 requires splitting a dim of extent 6 into 2x3 + * because the quotient changes mid-dim). See div_or_mod_impl in + * MultiRamp.cpp for the derivation. O(d). */ bool div(const Expr &k); - // Euclidean mod by a scalar. Returns false if the denominator isn't a - // positive integer constant, or if the remainder isn't a multiramp. - // Same shape as div. Rare cases where the remainder is a multiramp but - // the quotient isn't are not recognized here. O(d). + /** Euclidean mod by a scalar. Returns false if the denominator isn't a + * positive integer constant, or if the remainder isn't a multiramp. + * Same shape transformations as div. Rare cases where the remainder is + * a multiramp but the quotient isn't are not recognized here. O(d). */ bool mod(const Expr &k); - // Construct an Expr which gives whether one multiramp is equal to another - // in every lane. Assumes the total lane count matches. + /** Construct an Expr which gives whether one multiramp is equal to + * another in every lane. Assumes the total lane count matches. Returns + * a symbolic Expr (not a bool) matching operator== semantics on + * Exprs. */ Expr operator==(const MultiRamp &other) const; - // Remove a dimension, replacing it with the given scalar expression - // (e.g. pass v = 0 to get the first slice along that dimension, pass v = - // some var to get a parameterized slice along that dimension). + /** Remove dim `d`, adding `v * strides[d]` to base. Pass v = 0 for the + * first slice along that dim, or a Variable to get a parameterized + * slice. */ void slice(int d, Expr v); - // Construct an Expr telling us whether the lanes are all unique. 
This - // expression being false is conservative: it doesn't imply aliasing, only - // that we couldn't construct the tightest condition for it in closed form. + /** Construct an Expr telling us whether the lanes are all unique. A + * sufficient condition is that there is an ordering of the dims along + * which each stride is greater than the sum of the spans of earlier + * dims (span of dim k = |strides[k]| * (lanes[k] - 1)). We check all + * dim orderings and OR the resulting conditions together (ignoring + * base, since base is a uniform offset and doesn't affect uniqueness + * within the ramp). If this simplifies to const_false the lanes may + * or may not actually alias. */ Expr alias_free() const; - // The dimensionality. May be lower than you expected, because this gets - // flattened when possible by the operations above. + /** Information about one peeled dim, produced by alias_free_slice. + * `dim` is the dim's position in the *pre-call* MultiRamp. */ + struct PeeledDim { + Expr stride; + int lanes; + int dim; + }; + + /** Build an alias-free slice of *this by greedily adding dims innermost + * to outermost, keeping a dim only if the slice remains alias-free + * after it's added. Replace *this with the resulting slice, and return + * a description of the dims that weren't kept (innermost first). + * Always succeeds; *this may be reduced to a 0-dim scalar if no prefix + * of dims is alias-free. The omitted dims' contributions are NOT + * folded into base — callers usually want to add back + * `var * omitted.stride` per omitted dim before using *this. */ + std::vector alias_free_slice(); + + /** No-op returning 0 if the stride-1 dim is already innermost (or + * there isn't one). Otherwise rotate the dims so the stride-1 dim + * moves to position 0, with the previously-inner dims moved to the + * outermost end, and return A = the product of those previously-inner + * dims' lane counts. After this call, + * Shuffle::make_transpose(new_to_expr(), total_lanes / A) reconstructs + * a vector in the old lane order from one in the new order. */ + int rotate_stride_one_innermost(); + + /** The dimensionality. May be lower than you expected, because this + * gets flattened when possible by the operations above. */ int dimensions() const; - // The product of all the lane counts + /** The product of all the lane counts. */ int total_lanes() const; - // The multiramp as a nested series of ramps + /** The multiramp as a nested series of ramps. */ Expr to_expr() const; - // Flatten the multiramp into a vector of 1D Ramps — one per outer - // multi-index, each with inner_lanes = lanes[0] and stride = strides[0]. - // Ramps are returned in this MultiRamp's lane order: concat'ing the - // returned Ramps reproduces the full lane sequence. The caller is - // responsible for any prior mutation/simplification of `base` and - // `strides` (the Ramps reference them directly). + /** Flatten the multiramp into a vector of 1D Ramps — one per outer + * multi-index, each with inner_lanes = lanes[0] and stride = + * strides[0]. Ramps are returned in this MultiRamp's lane order: + * concat'ing the returned Ramps reproduces the full lane sequence. The + * caller is responsible for any prior mutation/simplification of `base` + * and `strides` (the Ramps reference them directly). */ std::vector flatten() const; - // Reorder this MultiRamp's dimensions in place. perm[k] is the index - // into this's current dims that becomes the k-th dim after reordering - // (innermost first, as always). 
perm must be a permutation of - // {0, ..., dimensions()-1}. + /** Reorder this MultiRamp's dimensions in place. perm[k] is the index + * into this's current dims that becomes the k-th dim after reordering + * (innermost first, as always). perm must be a permutation of + * {0, ..., dimensions()-1}. E.g. with dims [s0, s1, s2] and + * perm = [2, 0, 1], after reorder the new dims are [s2, s0, s1]. */ void reorder(const std::vector &perm); - // Given a permutation `perm`, return shuffle indices `idx` such that if - // `p` is a copy of `*this` with `reorder(perm)` applied, then - // Shuffle::make({p.to_expr()}, idx) - // produces the same vector of lane values as `this->to_expr()`. In other - // words: given a vector in the permuted lane order, the returned indices - // put it back into this MultiRamp's original lane order. + /** Pass an IRVisitor through all Exprs referenced (base and each + * stride). Note that base and strides are scalar but may nonetheless + * contain nested vector reductions. */ + void accept(IRVisitor *visitor) const; + + /** Pass an IRMutator through all Exprs referenced, replacing base and + * strides with the mutated results. Note that base and strides are + * scalar but may nonetheless contain nested vector reductions. */ + void mutate(IRMutator *mutator); + + /** Given a permutation `perm`, return shuffle indices `idx` such that + * if `p` is a copy of `*this` with `reorder(perm)` applied, then + * + * Shuffle::make({p.to_expr()}, idx) + * + * produces the same vector of lane values as `this->to_expr()`. I.e. + * given a vector in the permuted lane order, the returned indices put + * it back into this MultiRamp's original lane order. */ std::vector shuffle_from_permuted(const std::vector &perm) const; - // Given a dimension `d` and a position `pos` within it, return shuffle - // indices `idx` such that - // Shuffle::make({this->to_expr()}, idx) - // produces the same vector of lane values as a copy of *this with - // slice(d, pos) applied. Since slicing reduces the lane count, the - // shuffle selects the subset of *this's lanes whose d-th coordinate - // equals `pos`. - std::vector shuffle_from_slice(int d, int pos) const; - - // Variant that slices multiple dims simultaneously. Returns shuffle - // indices selecting the lanes of *this where dim `dims[j]` equals - // `pos[j]` for all j. `dims` and `pos` must have the same length and - // `dims` must list distinct dim indices. + /** Return shuffle indices `idx` such that + * + * Shuffle::make({this->to_expr()}, idx) + * + * produces the same vector of lane values as a copy of *this with + * slice(dims[j], pos[j]) applied for each j. Since slicing reduces + * the lane count, the shuffle selects the subset of *this's lanes + * whose coordinate along dim `dims[j]` equals `pos[j]` for all j. + * `dims` and `pos` must have the same length and `dims` must list + * distinct dim indices. */ std::vector shuffle_from_slice(const std::vector &dims, const std::vector &pos) const; }; -/** Check if a vector Expr is a multiramp, and assign to result if so. */ +/** Check if a vector Expr is a multiramp, and assign to result if so. + * Contract: on failure, *result is left in an unspecified state; callers + * must not read *result unless is_multiramp returned true. 
*/ bool is_multiramp(const Expr &e, const Scope &scope, MultiRamp *result); } // namespace Internal diff --git a/src/Simplify_Exprs.cpp b/src/Simplify_Exprs.cpp index df26ff3cce66..5fb2245cdda2 100644 --- a/src/Simplify_Exprs.cpp +++ b/src/Simplify_Exprs.cpp @@ -388,37 +388,15 @@ Expr Simplify::visit(const Load *op, ExprInfo *info) { mr.dimensions() > 1) { // If the index is a multi-dimensional ramp with a stride-1 dim that // isn't already innermost, rotate it (together with all subsequent - // dims) to the innermost position so the resulting load is dense, - // and restore the original lane order with a transpose. Splitting - // the dims into a contiguous "outer half + inner half" pair and - // swapping them lets the shuffle be expressed as a single - // make_transpose, which downstream code can recognise and (in - // future) represent more compactly than a general shuffle. - int k = -1; - for (int i = 0; i < mr.dimensions(); i++) { - if (is_const_one(mr.strides[i])) { - k = i; - break; - } - } - if (k > 0) { - // Permutation: [k, k+1, ..., d-1, 0, 1, ..., k-1]. This is a pure - // rotation of the halves, which Shuffle::make_transpose can - // express. - int d = mr.dimensions(); - std::vector perm(d); - std::iota(perm.begin(), perm.end(), 0); - std::rotate(perm.begin(), perm.begin() + k, perm.end()); - MultiRamp permuted = mr; - permuted.reorder(perm); - int A = 1; // product of lanes[0..k-1] - for (int i = 0; i < k; i++) { - A *= mr.lanes[i]; - } - int B = op->type.lanes() / A; // product of lanes[k..d-1] + // dims) to the outermost position so the resulting load is dense, + // and restore the original lane order with a single make_transpose. + MultiRamp permuted = mr; + int A = permuted.rotate_stride_one_innermost(); + if (A > 0) { + int B = op->type.lanes() / A; // The predicate applied to the permuted load must be in the - // permuted lane order. For the halves-swap rotation, that's just + // permuted lane order. For a halves-swap rotation that's just // make_transpose(predicate, A) (except for scalar broadcasts, // which are invariant). Expr permuted_predicate; diff --git a/src/Simplify_Stmts.cpp b/src/Simplify_Stmts.cpp index 4d29a9509124..e949b3f443f2 100644 --- a/src/Simplify_Stmts.cpp +++ b/src/Simplify_Stmts.cpp @@ -400,31 +400,14 @@ Stmt Simplify::visit(const Store *op) { mr.dimensions() > 1) { // If the index is a multi-dimensional ramp with a stride-1 dim that // isn't already innermost, rotate it (together with all subsequent - // dims) to the innermost position so the resulting store is dense. + // dims) to the outermost position so the resulting store is dense. // Permute the value and predicate to match the new lane order using - // a single make_transpose, which downstream code can recognise and - // represent compactly. Later in lowering, after flattening the + // a single make_transpose. Later in lowering, after flattening the // nested ramps, this turns into a concat of dense ramps and hits the // case above. 
- int k = -1; - for (int i = 0; i < mr.dimensions(); i++) { - if (is_const_one(mr.strides[i])) { - k = i; - break; - } - } - if (k > 0) { - int d = mr.dimensions(); - std::vector perm(d); - std::iota(perm.begin(), perm.end(), 0); - std::rotate(perm.begin(), perm.begin() + k, perm.end()); - MultiRamp permuted = mr; - permuted.reorder(perm); - int A = 1; - for (int i = 0; i < k; i++) { - A *= mr.lanes[i]; - } - + MultiRamp permuted = mr; + int A = permuted.rotate_stride_one_innermost(); + if (A > 0) { // Transpose the value and predicate so their lane ordering // matches the permuted index. Expr permuted_value = Shuffle::make_transpose(value, A); diff --git a/src/VectorizeLoops.cpp b/src/VectorizeLoops.cpp index 3d767826ae93..f514d062033e 100644 --- a/src/VectorizeLoops.cpp +++ b/src/VectorizeLoops.cpp @@ -1,4 +1,5 @@ #include +#include #include #include "CSE.h" @@ -960,16 +961,18 @@ class VectorSubs : public IRMutator { } Stmt visit(const Atomic *op) override { - // Recognize a few special cases that we can handle as within-vector reduction trees. + // Recognize a few special cases that we can handle as within-vector + // reduction trees. - // We may partially succeed, in which case we'll have loops to rewrap + // We may partially succeed, in which case we'll have (unrolled) loops + // to rewrap. struct ContainingLoop { std::string name; - int extent = 0; - // The index of this loop's dim in the pre-peel store_mr. Only - // used in the alias-free peeling path; other uses can leave it - // at -1. - int dim = -1; + int extent; + // Index of this loop's dim in the pre-alias-peel MultiRamp. Used + // by the unroll below to construct a shuffle mask selecting the + // corresponding slice of the reduced value vector. + int dim; }; std::vector containing_loops; @@ -1087,11 +1090,7 @@ class VectorSubs : public IRMutator { test = simplify(load_index == store_index); } else if (is_multiramp(store_index, vector_scope, &store_mr) && is_multiramp(load_index, vector_scope, &load_mr)) { - debug(0) << "Store multiramp:\n " - << store_mr.to_expr() << "\n"; test = store_mr == load_mr; - debug(0) << "Store == load test:\n " - << test << "\n"; } if (!test.defined()) { @@ -1126,7 +1125,7 @@ class VectorSubs : public IRMutator { }; int output_lanes = 1; - MultiRamp pre_peel_mr; + MultiRamp b_shape_mr; if (store_index.type().is_scalar()) { // The index doesn't depend on the value being // vectorized, so it's a total reduction. @@ -1134,111 +1133,89 @@ class VectorSubs : public IRMutator { } else { // The output lanes is >1, so there must be at least one - // multiramp dimension with non-zero stride. There may be - // dimensions with zero stride, however. - - // Here we identify any stride-0 dimensions in the - // multiramp. Innermost ones with stride zero will be handled - // with a vector reduce. Others will be handled by taking slices - // and combining in a tree. We first shuffle the other - // stride-zero ones outermost so that the slices are - // dense. TODO: is this the best policy? We could also transpose - // them inwards and vector reduce. - - // TODO: There may also be dimensions with unknown (symbolic) - // stride. We need to handle these carefully because they might - // be zero at runtime. This is require injecting a loop that - // handles one slice at a time along that dimension. Finally, - // there might be dimensions with known strides such that they - // overlap, e.g. if some lunatic vectorizes a reduction like - // f(r.x + r.y) += .... 
We need to slice out at least one of the - // two conflicting dimensions and turn it into a loop. - - int inner_repetitions = 1; - int outer_repetitions = 1; - if (is_const_zero(store_mr.strides[0])) { - inner_repetitions = store_mr.lanes[0]; - store_mr.slice(0, 0); - } - - std::vector perm, zero_dims; - - // Look for stride-zero dimensions - perm.reserve(store_mr.dimensions()); - bool needs_shuffle = false; - for (int d = 0; d < store_mr.dimensions(); d++) { - if (is_const_zero(store_mr.strides[d])) { - zero_dims.push_back(d); - outer_repetitions *= store_mr.lanes[d]; + // multiramp dimension with non-zero stride. Dims that + // can't be part of an alias-free store fall into two + // kinds, both discovered by one call to alias_free_slice: + // + // - Stride-zero dims: lanes duplicate a value across the + // store, so we fold the duplicates with the reduction + // op. The innermost-in-original stride-zero dim (if + // any) becomes a VectorReduce; others need a + // reduction tree over slices of b. + // - Non-zero-stride aliasing dims (symbolic strides, or + // strides that overlap such that we can't prove + // uniqueness): different lanes of the store go to + // different addresses, so we unroll a containing loop + // and do a slice-per-iteration. + // + // TODO: the innermost-VectorReduce fast-path is keyed on + // "dim 0 of the original". We could move other stride-zero + // dims inward via a transpose and VectorReduce them too; + // might be better on some targets. + + // b's current lane layout. Starts matching the full + // store_mr (before any peel); updated as reductions and + // shuffles reshape it. We use this for the shuffle masks + // that slice b per unrolled iteration below. + b_shape_mr = store_mr; + + std::vector peeled = + store_mr.alias_free_slice(); + + // Snapshot the original strides so we can identify which + // original dims had stride zero after b_shape_mr gets + // reordered below. + std::vector orig_strides = b_shape_mr.strides; + + // Partition peels by handling strategy. + int inner_dup = 1; // >1 if a VectorReduce applies. + int outer_dup = 1; // >1 if a reduction tree applies. + std::vector loop_peels; + for (const auto &p : peeled) { + if (is_const_zero(p.stride)) { + if (p.dim == 0) { + // Stride-zero peel at the innermost position: + // its duplicates are contiguous in b, so we + // can use VectorReduce directly. + inner_dup = p.lanes; + } else { + outer_dup *= p.lanes; + } } else { - // If any non-stride-zero dims come after a stride-zero - // dim, we'll need a shuffle. - needs_shuffle |= !zero_dims.empty(); - perm.push_back(d); + loop_peels.push_back(p); } } - std::vector shuffle; - if (needs_shuffle) { - perm.insert(perm.end(), zero_dims.begin(), zero_dims.end()); - shuffle = store_mr.shuffle_from_permuted(perm); - store_mr.reorder(perm); - } - for (size_t i = 0; i < zero_dims.size(); i++) { - store_mr.strides.pop_back(); - store_mr.lanes.pop_back(); - } - // Snapshot the pre-peel MultiRamp so we can figure out - // which slice of b each unrolled iteration should store. - pre_peel_mr = store_mr; - - if (!can_prove(store_mr.alias_free())) { - debug(0) << "Alias-free check failed\n"; - // There may be more collisions. We don't know. This means - // we need to genuinely do an interleaved sequence of loads - // and stores to the target buffer. There may be multiple - // alias-free subsets of the dimensions of store_mr. We'll - // do it greedily. Starting from the innermost, we'll add - // each dimension provided that we maintain the alias-free - // property. 
Even if we find none, we've at least peeled off - // the stride-0 dimensions already, so it's better than - // bailing and scalarizing. - - MultiRamp alias_free_slice; - alias_free_slice.base = store_mr.base; - for (int i = 0; i < store_mr.dimensions(); i++) { - Expr s = store_mr.strides[i]; - int l = store_mr.lanes[i]; - alias_free_slice.strides.push_back(s); - alias_free_slice.lanes.push_back(l); - if (!can_prove(alias_free_slice.alias_free())) { - containing_loops.emplace_back( - ContainingLoop{unique_name('t'), l, i}); - alias_free_slice.base += - Variable::make(Int(32), containing_loops.back().name) * s; - alias_free_slice.strides.pop_back(); - alias_free_slice.lanes.pop_back(); - } - } - store_mr = std::move(alias_free_slice); + if (inner_dup > 1) { + int new_lanes = b_shape_mr.total_lanes() / inner_dup; + b = VectorReduce::make(reduce_op, b, new_lanes); + b_shape_mr.slice(0, make_zero(b_shape_mr.base.type())); } - output_lanes = store_mr.total_lanes(); - store_index = store_mr.to_expr(); - int pre_peel_total = pre_peel_mr.total_lanes(); - if (inner_repetitions > 1) { - b = VectorReduce::make(reduce_op, b, pre_peel_total * outer_repetitions); - } - - if (needs_shuffle) { - b = Shuffle::make({b}, shuffle); - } - - // Handle outer repetitions with a reduction tree over dense - // slices. Reduces b down to pre_peel_total lanes (peeled dims - // are handled by the unroll below). - if (outer_repetitions > 1) { - int reps = outer_repetitions; + // If any non-innermost stride-zero dims need combining, + // shuffle b so their duplicates become contiguous, then + // reduce them with a tree over contiguous sub-vectors. + if (outer_dup > 1) { + // Reorder the remaining zero-stride dims outermost, + // keeping the rest in their relative order. + int d = b_shape_mr.dimensions(); + std::vector perm(d); + std::iota(perm.begin(), perm.end(), 0); + auto mid = std::stable_partition(perm.begin(), perm.end(), + [&](int i) { return !is_const_zero(b_shape_mr.strides[i]); }); + int n_kept = mid - perm.begin(); + b = Shuffle::make({b}, b_shape_mr.shuffle_from_permuted(perm)); + b_shape_mr.reorder(perm); + + // An inner reduction is a VectorReduce node. An outer + // reduction is cutting the vector into contiguous pieces, + // and adding those pieces together. Now that all the + // remaining stride-0 dims are outermost, we can do that in + // a binary tree. We slice the vector in half and add the + // halves for as long as possible, and then slice up what's + // left into pieces and add the pieces. For big power-of-two + // reductions this produces log(n) IR nodes. + int reps = outer_dup; while (reps % 2 == 0) { int l = b.type().lanes() / 2; Expr b0 = Shuffle::make_slice(b, 0, 1, l); @@ -1247,14 +1224,44 @@ class VectorSubs : public IRMutator { reps /= 2; } if (reps > 1) { - Expr v = Shuffle::make_slice(b, 0, 1, pre_peel_total); + int chunk = b.type().lanes() / reps; + Expr v = Shuffle::make_slice(b, 0, 1, chunk); for (int i = 1; i < reps; i++) { - Expr slice = simplify(Shuffle::make_slice(b, i * pre_peel_total, 1, pre_peel_total)); + Expr slice = simplify(Shuffle::make_slice(b, i * chunk, 1, chunk)); v = binop(v, slice); } b = v; } + + // Drop the outer-zero peeled dims from b_shape_mr (they + // are the trailing dims after the reorder above). + b_shape_mr.strides.resize(n_kept); + b_shape_mr.lanes.resize(n_kept); + } + + // We still have peeled dims without zero stride to handle. + // Emit the unrolled containing loops for non-zero aliasing + // peels. 
Their shuffle indices below select the right slice of + // b per iteration. The loop.dim field is the dim's position in + // b_shape_mr's current layout: the count of earlier original + // dims that survived both the inner-dim reduction and the + // outer-zero drop. + // orig dim 0 was removed if we VectorReduce'd it away. + const int dim_offset = inner_dup > 1 ? 1 : 0; + for (const auto &p : loop_peels) { + int pos = 0; + for (int i = dim_offset; i < p.dim; i++) { + if (!is_const_zero(orig_strides[i])) { + pos++; + } + } + std::string name = unique_name('t'); + containing_loops.emplace_back( + ContainingLoop{name, p.lanes, pos}); + store_mr.base += Variable::make(Int(32), name) * p.stride; } + output_lanes = store_mr.total_lanes(); + store_index = store_mr.to_expr(); } Expr new_load = Load::make(load_a->type.with_lanes(output_lanes), @@ -1284,7 +1291,7 @@ class VectorSubs : public IRMutator { std::string full_b_var_name = unique_name('b'); Expr full_b_var = Variable::make(b.type(), full_b_var_name); - int total_iters = pre_peel_mr.total_lanes() / output_lanes; + int total_iters = b_shape_mr.total_lanes() / output_lanes; std::vector peeled_dims; peeled_dims.reserve(containing_loops.size()); for (const auto &loop : containing_loops) { @@ -1303,7 +1310,7 @@ class VectorSubs : public IRMutator { rem /= e; } - std::vector indices = pre_peel_mr.shuffle_from_slice(peeled_dims, v); + std::vector indices = b_shape_mr.shuffle_from_slice(peeled_dims, v); Expr b_slice = Shuffle::make({full_b_var}, indices); Stmt this_store = store_template; diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt index bbbd06e68d74..a5dd980b8a88 100644 --- a/test/correctness/CMakeLists.txt +++ b/test/correctness/CMakeLists.txt @@ -89,6 +89,7 @@ tests(GROUPS correctness device_crop.cpp device_slice.cpp dilate3x3.cpp + downsampling_reduce.cpp div_by_zero.cpp div_round_to_zero.cpp dynamic_allocation_in_gpu_kernel.cpp diff --git a/test/correctness/downsampling_reduce.cpp b/test/correctness/downsampling_reduce.cpp new file mode 100644 index 000000000000..ed84345e3a4e --- /dev/null +++ b/test/correctness/downsampling_reduce.cpp @@ -0,0 +1,80 @@ +#include "Halide.h" + +using namespace Halide; +using namespace Halide::Internal; + +// Test that an atomic vectorized reduction with a downsampling write pattern +// (f(r/4) += g(r)) lowers to within-vector reductions rather than +// scalarizing. This exercises MultiRamp::div in the vectorize path. + +int main(int argc, char **argv) { + const int vec = 16; + const int factor = 4; + const int reduction_extent = vec; // r has `vec` lanes; f's output has vec/factor + + Func g{"g"}; + Var x{"x"}; + RDom r(0, reduction_extent); + + ImageParam input(Int(32), 1); + Buffer input_buf(reduction_extent); + input_buf.for_each_element([&](int i) { input_buf(i) = i * 3 + 7; }); + input.set(input_buf); + + // f(r/4) += g(r): four consecutive lanes of the reduction contribute to + // one output lane. Within one vector of r, the output multiramp has a + // stride-zero innermost dim of extent `factor` and a stride-1 outer dim + // of extent vec/factor. + g(x) = 0; + g(r / factor) += input(r); + + Buffer correct = g.realize({reduction_extent / factor}); + + g.bound(x, 0, reduction_extent / factor) + .update() + .atomic() + .vectorize(r); + + // Check that the reduction over r was vectorized away: after vectorize, + // there should be no inner for-loop over r, and the lowered IR should + // contain a VectorReduce node. 
+    int inner_for_loops = 0;
+    int vector_reduces = 0;
+    auto checker = LambdaMutator{
+        [&](auto *self, const For *op) {
+            if (op->name.find("r") != std::string::npos) {
+                inner_for_loops++;
+            }
+            return self->visit_base(op);
+        },
+        [&](auto *self, const VectorReduce *op) {
+            vector_reduces++;
+            return self->visit_base(op);
+        }};
+    g.add_custom_lowering_pass(&checker, nullptr);
+
+    Buffer<int> out = g.realize({reduction_extent / factor});
+
+    for (int i = 0; i < reduction_extent / factor; i++) {
+        if (out(i) != correct(i)) {
+            printf("out(%d) = %d instead of %d\n", i, out(i), correct(i));
+            return 1;
+        }
+    }
+
+    if (inner_for_loops > 0) {
+        printf("Atomic vectorization of downsampling reduction failed: "
+               "lowered code contained %d for loop(s) over r\n",
+               inner_for_loops);
+        return 1;
+    }
+
+    if (vector_reduces == 0) {
+        printf("Expected a VectorReduce node in the lowered IR, but "
+               "didn't find one\n");
+        return 1;
+    }
+
+    printf("Success!\n");
+    return 0;
+}
diff --git a/test/correctness/multiramp.cpp b/test/correctness/multiramp.cpp
index e6677eb64e27..e69c79f6256f 100644
--- a/test/correctness/multiramp.cpp
+++ b/test/correctness/multiramp.cpp
@@ -393,7 +393,7 @@ void check_shuffle_from_slice_2d() {
     MultiRamp A{0, {1, 10}, {2, 3}};
     MultiRamp S = A;
     S.slice(1, Expr(2));
-    std::vector<int> idx = A.shuffle_from_slice(1, 2);
+    std::vector<int> idx = A.shuffle_from_slice(std::vector<int>{1}, std::vector<int>{2});
     auto a_seq = expand(A);  // 0, 1, 10, 11, 20, 21
     auto s_seq = expand(S);  // 20, 21
     CHECK(idx.size() == s_seq.size(), "slice shuffle size");
@@ -409,7 +409,7 @@ void check_shuffle_from_slice_inner() {
     MultiRamp A{0, {1, 10}, {2, 3}};
     MultiRamp S = A;
     S.slice(0, Expr(1));
-    std::vector<int> idx = A.shuffle_from_slice(0, 1);
+    std::vector<int> idx = A.shuffle_from_slice(std::vector<int>{0}, std::vector<int>{1});
     auto a_seq = expand(A);  // 0, 1, 10, 11, 20, 21
     auto s_seq = expand(S);  // 1, 11, 21
     CHECK(idx.size() == s_seq.size(), "inner slice shuffle size");
@@ -425,7 +425,7 @@ void check_shuffle_from_slice_3d() {
     MultiRamp A{0, {1, 4, 20}, {2, 3, 2}};
     MultiRamp S = A;
     S.slice(1, Expr(1));
-    std::vector<int> idx = A.shuffle_from_slice(1, 1);
+    std::vector<int> idx = A.shuffle_from_slice(std::vector<int>{1}, std::vector<int>{1});
     auto a_seq = expand(A);
     auto s_seq = expand(S);
     CHECK(idx.size() == s_seq.size(), "3D slice shuffle size");

From 21bbcace6a2aa815149dbc8eb817d62bff447ac6 Mon Sep 17 00:00:00 2001
From: Andrew Adams
Date: Wed, 22 Apr 2026 14:37:14 -0700
Subject: [PATCH 44/55] Add for_each_coordinate helper and use it
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Four places in Halide iterate the cartesian product of a box of
integer sizes — MultiRamp::shuffle_from_permuted, MultiRamp::flatten,
MultiRamp::shuffle_from_slice, and the unroll block in
VectorizeLoops's atomic-store reduction path. Replace each manual
decompose-via-(rem % size, rem /= size) loop with a call to a new
for_each_coordinate helper in Util.h that invokes a callback on each
coordinate in lex order.
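As an illustration of the helper's contract (a usage sketch, not code
from this patch; `visit` is a hypothetical callback): with sizes =
{2, 3}, the callback fires six times, on (0,0), (1,0), (0,1), (1,1),
(0,2), (1,2), with the first axis varying fastest:

    for_each_coordinate({2, 3}, [&](const std::vector<int> &coord) {
        visit(coord[0], coord[1]);  // hypothetical per-point visitor
    });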
Co-authored-by: Claude --- src/MultiRamp.cpp | 64 +++++++++++++++++------------------------- src/Util.h | 20 +++++++++++++ src/VectorizeLoops.cpp | 29 ++++++++----------- 3 files changed, 56 insertions(+), 57 deletions(-) diff --git a/src/MultiRamp.cpp b/src/MultiRamp.cpp index b18b63d2e042..af1c929fb273 100644 --- a/src/MultiRamp.cpp +++ b/src/MultiRamp.cpp @@ -7,6 +7,7 @@ #include "IRVisitor.h" #include "ModulusRemainder.h" #include "Simplify.h" +#include "Util.h" #include #include @@ -587,23 +588,16 @@ std::vector MultiRamp::shuffle_from_permuted(const std::vector &perm) // this->lanes[perm[k]] as its innermost lane counts. int d = dimensions(); internal_assert((int)perm.size() == d); - int total = total_lanes(); - std::vector indices(total); - std::vector i(d); - for (int n = 0; n < total; n++) { - int rem = n; + std::vector indices; + indices.reserve(total_lanes()); + for_each_coordinate(lanes, [&](const std::vector &coord) { + int permuted_flat = 0, M = 1; for (int k = 0; k < d; k++) { - i[k] = rem % lanes[k]; - rem /= lanes[k]; - } - int permuted_flat = 0; - int M = 1; - for (int k = 0; k < d; k++) { - permuted_flat += i[perm[k]] * M; + permuted_flat += coord[perm[k]] * M; M *= lanes[perm[k]]; } - indices[n] = permuted_flat; - } + indices.push_back(permuted_flat); + }); return indices; } @@ -613,19 +607,16 @@ std::vector MultiRamp::flatten() const { return {base}; } int inner_lanes = lanes[0]; - int outer_total = total_lanes() / inner_lanes; + std::vector outer_sizes(lanes.begin() + 1, lanes.end()); std::vector result; - result.reserve(outer_total); - for (int n = 0; n < outer_total; n++) { - int rem = n; + result.reserve(total_lanes() / inner_lanes); + for_each_coordinate(outer_sizes, [&](const std::vector &coord) { Expr offset_base = base; - for (int k = 1; k < d; k++) { - int ik = rem % lanes[k]; - rem /= lanes[k]; - offset_base = offset_base + ik * strides[k]; + for (size_t k = 0; k < coord.size(); k++) { + offset_base = offset_base + coord[k] * strides[k + 1]; } result.push_back(Ramp::make(offset_base, strides[0], inner_lanes)); - } + }); return result; } @@ -645,30 +636,25 @@ std::vector MultiRamp::shuffle_from_slice(const std::vector &dims, internal_assert(fixed[dd] == -1) << "duplicate dim in shuffle_from_slice\n"; fixed[dd] = pos[j]; } - int total_out = 1; + // Sizes of the free (non-fixed) dims, in the same order as they + // appear in the full dim list. + std::vector free_sizes; for (int k = 0; k < d; k++) { if (fixed[k] == -1) { - total_out *= lanes[k]; + free_sizes.push_back(lanes[k]); } } - std::vector indices(total_out); - for (int n = 0; n < total_out; n++) { - int rem = n; - int flat = 0; - int M = 1; + std::vector indices; + for_each_coordinate(free_sizes, [&](const std::vector &free_coord) { + int flat = 0, M = 1; + size_t fj = 0; for (int k = 0; k < d; k++) { - int ik; - if (fixed[k] != -1) { - ik = fixed[k]; - } else { - ik = rem % lanes[k]; - rem /= lanes[k]; - } + int ik = (fixed[k] != -1) ? fixed[k] : free_coord[fj++]; flat += ik * M; M *= lanes[k]; } - indices[n] = flat; - } + indices.push_back(flat); + }); return indices; } diff --git a/src/Util.h b/src/Util.h index f29e0ad9b6f0..01d89eb3b6ff 100644 --- a/src/Util.h +++ b/src/Util.h @@ -183,6 +183,26 @@ bool ends_with(const std::string &str, const std::string &suffix); * this function to return the same string without any copies being made. 
 */
 std::string replace_all(std::string str, const std::string &find, const std::string &replace);

+/** Invoke `f(coord)` for each integer coordinate in the box
+ * `[0, sizes[0]) x [0, sizes[1]) x ...`, in lex order with the first
+ * axis varying fastest. `coord` is a `const std::vector<int> &` of the
+ * same length as `sizes`. The empty-sizes case invokes `f` once with an
+ * empty coord (a 0-dim box has one point). */
+template<typename F>
+void for_each_coordinate(const std::vector<int> &sizes, F &&f) {
+    std::vector<int> coord(sizes.size(), 0);
+    while (true) {
+        f(coord);
+        size_t k = 0;
+        while (k < sizes.size() && ++coord[k] == sizes[k]) {
+            coord[k++] = 0;
+        }
+        if (k == sizes.size()) {
+            return;
+        }
+    }
+}
+
 /** Split the source string using 'delim' as the divider. */
 std::vector<std::string> split_string(const std::string &source, const std::string &delim);

diff --git a/src/VectorizeLoops.cpp b/src/VectorizeLoops.cpp
index f514d062033e..9cd81e9e4b6e 100644
--- a/src/VectorizeLoops.cpp
+++ b/src/VectorizeLoops.cpp
@@ -15,6 +15,7 @@
 #include "Simplify.h"
 #include "Solve.h"
 #include "Substitute.h"
+#include "Util.h"
 #include "VectorizeLoops.h"

 namespace Halide {
@@ -1291,35 +1292,27 @@ class VectorSubs : public IRMutator {
             std::string full_b_var_name = unique_name('b');
             Expr full_b_var = Variable::make(b.type(), full_b_var_name);

-            int total_iters = b_shape_mr.total_lanes() / output_lanes;
-            std::vector<int> peeled_dims;
+            std::vector<int> peeled_dims, loop_extents;
             peeled_dims.reserve(containing_loops.size());
+            loop_extents.reserve(containing_loops.size());
             for (const auto &loop : containing_loops) {
                 peeled_dims.push_back(loop.dim);
+                loop_extents.push_back(loop.extent);
             }
             std::vector<Stmt> block;
-            block.reserve(total_iters);
-            for (int n = 0; n < total_iters; n++) {
-                // Decompose n into per-loop iteration values (innermost
-                // loop first, matching the order in containing_loops).
-                std::vector<int> v(containing_loops.size());
-                int rem = n;
-                for (size_t j = 0; j < containing_loops.size(); j++) {
-                    int e = containing_loops[j].extent;
-                    v[j] = rem % e;
-                    rem /= e;
-                }
-
-                std::vector<int> indices = b_shape_mr.shuffle_from_slice(peeled_dims, v);
-                Expr b_slice = Shuffle::make({full_b_var}, indices);
-
+            block.reserve(b_shape_mr.total_lanes() / output_lanes);
+            for_each_coordinate(loop_extents, [&](const std::vector<int> &v) {
+                // v is the loop iteration multi-index (innermost first,
+                // matching the order in containing_loops).
+                Expr b_slice = Shuffle::make({full_b_var},
+                                             b_shape_mr.shuffle_from_slice(peeled_dims, v));
                 Stmt this_store = store_template;
                 for (size_t j = 0; j < containing_loops.size(); j++) {
                     this_store = substitute(containing_loops[j].name, v[j], this_store);
                 }
                 this_store = substitute(b_var_name, b_slice, this_store);
                 block.push_back(this_store);
-            }
+            });
             s = Block::make(block);
             s = LetStmt::make(full_b_var_name, b, s);
         }

From fea1d7c4c0c8dc6863997e3c37c7dd2b731eb6fa Mon Sep 17 00:00:00 2001
From: Andrew Adams
Date: Wed, 22 Apr 2026 14:39:22 -0700
Subject: [PATCH 45/55] Fix three bugs found by the randomized vectorized-reduction test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

is_multiramp's recursive impl may leave its output in a partial state
on failure (e.g. after a successful Ramp::base recursion followed by a
failed stride check). The contract previously told callers "don't read
*result on failure," but honoring that required every recursive call
site to use a fresh local MultiRamp.
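(Illustrative sketch of the hazard, not code from the tree; e1 and e2
stand in for arbitrary vector Exprs:

    MultiRamp mr;
    if (!is_multiramp(e1, scope, &mr)) {
        // Under the old contract, mr.base/strides could be partially
        // written at this point, so passing &mr straight into a second
        // attempt risked mixing stale state into the new parse:
        is_multiramp(e2, scope, &mr);
    }

Each retry had to re-declare a fresh mr to stay safe.)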
Split is_multiramp into an internal impl that keeps the old partial-state behavior plus a thin public wrapper that commits to *result only on success. The public contract is now the cleaner "untouched on failure." The Mul branch's fresh-local workaround falls out. VectorizeLoops's atomic-store reduction path needs to shuffle `b` from its original lane order into a permuted order (so a subsequent reduction tree can slice contiguous sub-vectors per stride-zero peel). It was calling shuffle_from_permuted directly, but that method returns indices for the opposite direction — "permuted → original," which is what the Simplify_Exprs caller wants. Invert the result as a permutation. Without this, any case with multiple stride-zero peels that needed a real reorder summed lanes that should have gone to different output addresses. Simplify_Shuffle's "slice of concat" rule drops concat vectors that don't overlap with the slice's range. The overlap check tested whether the concat vector's start OR its last lane was inside the slice, but missed the case where the slice is entirely contained within one concat vector (neither endpoint inside). When every vector contained the slice, new_concat_vectors came out empty and Shuffle::make_concat tripped its empty-vector assert. Replaced with a standard interval- overlap check. This is a pre-existing bug that the randomized test exercised through multiramp-derived slice patterns. Co-authored-by: Claude --- src/MultiRamp.cpp | 50 +++++++++++++++++++++++----------------- src/MultiRamp.h | 3 +-- src/Simplify_Shuffle.cpp | 7 ++++-- src/VectorizeLoops.cpp | 11 ++++++++- 4 files changed, 45 insertions(+), 26 deletions(-) diff --git a/src/MultiRamp.cpp b/src/MultiRamp.cpp index af1c929fb273..2c277b3c32a3 100644 --- a/src/MultiRamp.cpp +++ b/src/MultiRamp.cpp @@ -340,9 +340,13 @@ std::optional unbroadcast(const Expr &e) { return std::nullopt; } } -} // namespace -bool is_multiramp(const Expr &e, const Scope &scope, MultiRamp *result) { +// Internal is_multiramp. May leave *result in a partial state on failure; +// the public is_multiramp below protects callers by only committing on +// success. Recursive calls go through the public wrapper, so each branch +// here can assume *result is either freshly initialized (on entry) or +// freshly filled by a successful recursion. +bool is_multiramp_impl(const Expr &e, const Scope &scope, MultiRamp *result) { Type elem_t = e.type().element_of(); if (e.type().is_scalar()) { result->base = e; @@ -379,26 +383,18 @@ bool is_multiramp(const Expr &e, const Scope &scope, MultiRamp *result) { return result->add(rb); } } else if (const Mul *m = e.as()) { - // Try each side as the scalar factor. Use a fresh local MultiRamp - // for each attempt: if the first is_multiramp call partially - // mutates its output and returns false, that state shouldn't leak - // into the fallback attempt (is_multiramp's contract leaves the - // output unspecified on failure). - if (auto b = unbroadcast(m->b)) { - MultiRamp r; - if (is_multiramp(m->a, scope, &r)) { - r.mul(*b); - *result = std::move(r); - return true; - } + // Try each side as the scalar factor. The public wrapper's + // untouched-on-failure guarantee means a failed first attempt + // leaves *result clean for the second. 
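+        // (Illustrative example, not from the original source: given
+        // ramp(x, 2, 8) * broadcast(3, 8), the first attempt below
+        // unbroadcasts the 3 and recurses on the ramp operand.)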
+        if (auto b = unbroadcast(m->b);
+            b && is_multiramp(m->a, scope, result)) {
+            result->mul(*b);
+            return true;
         }
-        if (auto a = unbroadcast(m->a)) {
-            MultiRamp r;
-            if (is_multiramp(m->b, scope, &r)) {
-                r.mul(*a);
-                *result = std::move(r);
-                return true;
-            }
+        if (auto a = unbroadcast(m->a);
+            a && is_multiramp(m->b, scope, result)) {
+            result->mul(*a);
+            return true;
         }
     } else if (const Div *d = e.as<Div>
()) {
         if (auto denom = unbroadcast(d->b)) {
@@ -415,6 +411,18 @@ bool is_multiramp(const Expr &e, const Scope &scope, MultiRamp *result) {
     }
     return false;
 }
+}  // namespace
+
+bool is_multiramp(const Expr &e, const Scope &scope, MultiRamp *result) {
+    // Wrap the impl so that callers get a clean "untouched on failure"
+    // contract regardless of how the impl leaves its scratch space.
+    MultiRamp tmp;
+    if (is_multiramp_impl(e, scope, &tmp)) {
+        *result = std::move(tmp);
+        return true;
+    }
+    return false;
+}
diff --git a/src/MultiRamp.h b/src/MultiRamp.h
index 147d5a44f71d..c0ae2709dca0 100644
--- a/src/MultiRamp.h
+++ b/src/MultiRamp.h
@@ -177,8 +177,7 @@ struct MultiRamp {
 };

 /** Check if a vector Expr is a multiramp, and assign to result if so.
- * Contract: on failure, *result is left in an unspecified state; callers
- * must not read *result unless is_multiramp returned true. */
+ * Returns false and leaves *result untouched if not. */
 bool is_multiramp(const Expr &e, const Scope &scope, MultiRamp *result);

 } // namespace Internal
diff --git a/src/Simplify_Shuffle.cpp b/src/Simplify_Shuffle.cpp
index 2a614ac81744..9f339dd26614 100644
--- a/src/Simplify_Shuffle.cpp
+++ b/src/Simplify_Shuffle.cpp
@@ -295,8 +295,11 @@ Expr Simplify::visit(const Shuffle *op, ExprInfo *info) {
             vector<Expr> new_concat_vectors;
             for (const auto &v : inner_shuffle->vectors) {
                 // Check if current concat vector overlaps with slice.
-                if ((concat_index >= slice_min && concat_index <= slice_max) ||
-                    ((concat_index + v.type().lanes() - 1) >= slice_min && (concat_index + v.type().lanes() - 1) <= slice_max)) {
+                // Standard interval overlap: [a, b] and [c, d] overlap
+                // iff a <= d && c <= b.
+                int v_start = concat_index;
+                int v_end = concat_index + v.type().lanes() - 1;
+                if (v_start <= slice_max && slice_min <= v_end) {
                     if (new_slice_start < 0) {
                         new_slice_start = concat_index;
                     }
diff --git a/src/VectorizeLoops.cpp b/src/VectorizeLoops.cpp
index 9cd81e9e4b6e..47c839d1c5ad 100644
--- a/src/VectorizeLoops.cpp
+++ b/src/VectorizeLoops.cpp
@@ -1205,7 +1205,16 @@ class VectorSubs : public IRMutator {
                 auto mid = std::stable_partition(perm.begin(), perm.end(),
                                                  [&](int i) { return !is_const_zero(b_shape_mr.strides[i]); });
                 int n_kept = mid - perm.begin();
-                b = Shuffle::make({b}, b_shape_mr.shuffle_from_permuted(perm));
+                // shuffle_from_permuted gives us idx such that
+                // Shuffle(<permuted>, idx) == <original>. Here we have
+                // b in original lane order and want it in permuted
+                // order, so we invert that as a permutation.
+                std::vector<int> idx = b_shape_mr.shuffle_from_permuted(perm);
+                std::vector<int> inverted(idx.size());
+                for (size_t i = 0; i < idx.size(); i++) {
+                    inverted[idx[i]] = (int)i;
+                }
+                b = Shuffle::make({b}, inverted);
                 b_shape_mr.reorder(perm);

                 // An inner reduction is a VectorReduce node. An outer

From a4604f4f7dcd53a49759faf9d83c9a40b6184fac Mon Sep 17 00:00:00 2001
From: Andrew Adams
Date: Wed, 22 Apr 2026 14:42:29 -0700
Subject: [PATCH 46/55] Expand MultiRamp and vectorized-reduction tests

Add hand-picked tests for MultiRamp API properties that weren't
previously covered: mul, operator==, alias_free_slice (unique lanes /
zero-stride peeling / degenerate scalar), rotate_stride_one_innermost
(rotation + transpose round-trip), and is_multiramp round-trips for a
handful of shapes.
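As a concrete instance of the rotation round-trip being checked (the
numbers come from the new check_rotate_stride_one_innermost test):
strides [10, 1] with lanes [3, 4] expands to

    0, 10, 20, 1, 11, 21, 2, 12, 22, 3, 13, 23

Rotating the stride-1 dim innermost returns A = 3 and reorders the
lanes to 0..3, 10..13, 20..23; transposing back with
cols = total_lanes / A then recovers the original lane order.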
Add test_random to transposed_vector_reduce.cpp: 1000 random quasi-affine store/load index pairs over a 3-dim RDom, each compiled scalarly and with .atomic().vectorize() across all three RVars and compared. This test found all three bugs fixed in the preceding commit. Co-authored-by: Claude --- test/correctness/multiramp.cpp | 140 ++++++++++++++++++ test/correctness/transposed_vector_reduce.cpp | 115 ++++++++++++++ 2 files changed, 255 insertions(+) diff --git a/test/correctness/multiramp.cpp b/test/correctness/multiramp.cpp index e69c79f6256f..c4854986be4f 100644 --- a/test/correctness/multiramp.cpp +++ b/test/correctness/multiramp.cpp @@ -1,6 +1,7 @@ #include "Halide.h" #include +#include #include using namespace Halide; @@ -434,6 +435,135 @@ void check_shuffle_from_slice_3d() { } } +// ---- MultiRamp::mul ------------------------------------------------------ + +void check_mul_basic() { + MultiRamp A{3, {1, 10}, {2, 3}}; // 3, 4, 13, 14, 23, 24 + A.mul(5); + CHECK_SEQ_LIT(expand(A), "mul values", 15, 20, 65, 70, 115, 120); +} + +// ---- MultiRamp::operator== ---------------------------------------------- + +void check_equality_same() { + MultiRamp A{0, {1, 10}, {2, 3}}; + Expr e = simplify(A == A); + CHECK(is_const_one(e), "multiramp equals itself"); +} + +void check_equality_different() { + MultiRamp A{0, {1, 10}, {2, 3}}; + MultiRamp B{0, {1, 10}, {3, 2}}; // same total lanes, different shape + // A.to_expr() == [0,1,10,11,20,21], B.to_expr() = [0,1,2,10,11,12]; + // so they are not equal in every lane. + Expr e = simplify(A == B); + CHECK(is_const_zero(e), "different multiramps compare false"); +} + +void check_equality_scalar() { + MultiRamp A{42, {}, {}}; + MultiRamp B{42, {}, {}}; + MultiRamp C{7, {}, {}}; + CHECK(is_const_one(simplify(A == B)), "scalar multiramp equality"); + CHECK(is_const_zero(simplify(A == C)), "scalar multiramp inequality"); +} + +// ---- MultiRamp::alias_free_slice ---------------------------------------- + +void check_alias_free_slice_all_unique() { + // All lanes of the returned slice should be unique. + MultiRamp A{5, {1, 16}, {4, 3}}; // clearly alias-free + auto peeled = A.alias_free_slice(); + CHECK(peeled.empty(), "fully alias-free: nothing peeled"); + auto seq = expand(A); + std::set unique(seq.begin(), seq.end()); + CHECK(unique.size() == seq.size(), "slice lanes are unique"); +} + +void check_alias_free_slice_peels_zero_stride() { + // Stride-zero inner dim must be peeled. + MultiRamp A{0, {0, 1}, {4, 5}}; + auto peeled = A.alias_free_slice(); + CHECK(peeled.size() == 1, "peeled the stride-zero dim"); + if (peeled.size() == 1) { + CHECK(peeled[0].dim == 0 && peeled[0].lanes == 4, + "peeled the right dim"); + CHECK(is_const_zero(peeled[0].stride), "peeled dim had stride zero"); + } + // Remaining is {base=0, strides=[1], lanes=[5]} — unique. + auto seq = expand(A); + std::set unique(seq.begin(), seq.end()); + CHECK(unique.size() == seq.size(), "remaining slice is unique"); +} + +void check_alias_free_slice_degenerate() { + // A 1-dim ramp with stride zero: only dim is a duplication. It should + // be peeled, leaving *this as a 0-dim scalar. 
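+    // (Lane sequence before peeling: [7, 7, 7, 7].)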
+ MultiRamp A{7, {0}, {4}}; + auto peeled = A.alias_free_slice(); + CHECK(peeled.size() == 1, "peeled the only dim"); + CHECK(A.dimensions() == 0, "remaining is scalar"); + auto seq = expand(A); + CHECK(seq.size() == 1 && seq[0] == 7, "scalar lane is base"); +} + +// ---- MultiRamp::rotate_stride_one_innermost ----------------------------- + +void check_rotate_stride_one_innermost() { + // Stride-1 dim not innermost: rotating should produce a MultiRamp + // whose expand, when transposed with cols = total / A, matches the + // original expand. + MultiRamp A{0, {10, 1}, {3, 4}}; // [0,10,20,1,11,21,2,12,22,3,13,23] + auto orig = expand(A); + int a = A.rotate_stride_one_innermost(); + CHECK(a > 0, "rotated (stride-1 was not innermost)"); + auto rotated = expand(A); + // Per the header: make_transpose(rotated, total/a) recovers orig. + // make_transpose(v, cols): output[j*rows + i] = v[i*cols + j], with + // rows = v.size()/cols. + int cols = (int)rotated.size() / a; + int rows = a; + std::vector roundtrip(rotated.size()); + for (int j = 0; j < cols; j++) { + for (int i = 0; i < rows; i++) { + roundtrip[j * rows + i] = rotated[i * cols + j]; + } + } + CHECK_SEQ(roundtrip, orig, "rotate + transpose = identity"); +} + +void check_rotate_stride_one_innermost_noop() { + // Stride-1 already innermost: no-op. + MultiRamp A{0, {1, 10}, {3, 4}}; + auto before = expand(A); + int a = A.rotate_stride_one_innermost(); + CHECK(a == 0, "no-op when stride-1 already innermost"); + CHECK_SEQ(expand(A), before, "unchanged"); +} + +// ---- is_multiramp round-trip -------------------------------------------- + +void check_roundtrip(const MultiRamp &mr, const char *msg) { + Expr e = mr.to_expr(); + MultiRamp parsed; + Scope scope; + if (e.type().is_vector()) { + CHECK(is_multiramp(e, scope, &parsed), msg); + if (parsed.dimensions() > 0 || mr.dimensions() > 0) { + auto got = expand(parsed); + auto want = expand(mr); + CHECK_SEQ(got, want, msg); + } + } +} + +void check_roundtrips() { + check_roundtrip(MultiRamp{0, {1}, {4}}, "1D ramp roundtrip"); + check_roundtrip(MultiRamp{7, {1, 10}, {2, 3}}, "2D ramp roundtrip"); + check_roundtrip(MultiRamp{0, {1, 10, 100}, {2, 3, 2}}, "3D ramp roundtrip"); + check_roundtrip(MultiRamp{0, {0, 1}, {4, 3}}, "stride-zero dim roundtrip"); +} + void check_reject_non_multiramp_sum() { // [0,1,2,100,101,102] + [0,2,100,102,200,202] = sum with shape conflict. 
Expr a_inner = Ramp::make(Expr(0), Expr(1), 3); @@ -482,6 +612,16 @@ int main(int argc, char **argv) { check_shuffle_from_slice_2d(); check_shuffle_from_slice_inner(); check_shuffle_from_slice_3d(); + check_mul_basic(); + check_equality_same(); + check_equality_different(); + check_equality_scalar(); + check_alias_free_slice_all_unique(); + check_alias_free_slice_peels_zero_stride(); + check_alias_free_slice_degenerate(); + check_rotate_stride_one_innermost(); + check_rotate_stride_one_innermost_noop(); + check_roundtrips(); check_reject_non_multiramp_sum(); if (failures) { diff --git a/test/correctness/transposed_vector_reduce.cpp b/test/correctness/transposed_vector_reduce.cpp index aa38fc4426f8..3b636f4b11ac 100644 --- a/test/correctness/transposed_vector_reduce.cpp +++ b/test/correctness/transposed_vector_reduce.cpp @@ -1,5 +1,9 @@ #include "Halide.h" +#include +#include +#include + using namespace Halide; using namespace Halide::Internal; @@ -117,6 +121,112 @@ int test(int which_case = all) { return success; } +// Generate a random quasi-affine expression in the given RVars: an affine +// combination of terms where each term is one of v, v/k, v%k, or recursively +// one of those of a nested term. All divisors are required to divide the +// corresponding RVar's extent, so the expression is representable as a +// multiramp of the RDom. Coefficients and constant terms are small ints. +struct RVarInfo { + RVar var; + int extent; +}; + +Expr random_term(std::mt19937 &rng, const RDom &rdom) { + const RVar &chosen = rdom[rng() % rdom.dimensions()]; + int extent = *as_const_int(chosen.extent()); + int op = (int)(rng() % 3); // 0: leaf, 1: /k, 2: %k + if (op == 0 || extent <= 1) { + return chosen; + } + std::vector divisors; + for (int d = 2; d <= extent; d++) { + if (extent % d == 0) { + divisors.push_back(d); + } + } + if (divisors.empty()) { + return chosen; + } + int k = divisors[rng() % divisors.size()]; + return (op == 1) ? chosen / k : chosen % k; +} + +Expr random_qa(std::mt19937 &rng, const RDom &rdom) { + int n_terms = 1 + (int)(rng() % 3); + Expr e; + for (int i = 0; i < n_terms; i++) { + int coeff = (int)(rng() % 5) - 2; // -2..2 + if (coeff == 0) continue; + Expr term = random_term(rng, rdom); + Expr part = (coeff == 1) ? term : coeff * term; + e = e.defined() ? e + part : part; + } + if (!e.defined()) e = 0; + int c0 = (int)(rng() % 7) - 3; // -3..3 + if (c0 != 0) e = e + c0; + return e; +} + +int test_random() { + std::mt19937 rng(0); + RDom r(0, 8, 0, 9, 0, 6); + + // Generous symmetric range for both the input and the output. Halide's + // bounds inference figures out what it actually needs within this. 
+ constexpr int half = 256; + constexpr int range = 2 * half; + + Buffer input_buf(range); + input_buf.set_min(-half); + + constexpr int num_cases = 1000; + int tried = 0; + while (tried < num_cases) { + Expr A = random_qa(rng, r); + Expr B = random_qa(rng, r); + int t = tried++; + + for (int i = 0; i < range; i++) { + input_buf(i - half) = (i * 31 + t * 7) & 0xff; + } + ImageParam input(Int(32), 1); + input.set(input_buf); + + auto build = [&](bool vectorized) { + Func f{"f_rand"}; + Var x{"x"}; + f(x) = 0; + f(A) += input(B) + 0 * r.x; // Force a dependence on the RDom + if (vectorized) { + f.update().atomic().vectorize(r.x).vectorize(r.y).vectorize(r.z); + } + return f; + }; + + auto realize = [&](Func f) { + Buffer buf(range); + buf.set_min(-half); + f.realize(buf); + return buf; + }; + + Buffer correct = realize(build(false)); + Buffer out = realize(build(true)); + + for (int i = -half; i < half; i++) { + if (out(i) != correct(i)) { + std::cout << "Random case " << t << " failed:\n" + << " A = " << A << "\n" + << " B = " << B << "\n" + << " out(" << i << ") = " << out(i) + << " instead of " << correct(i) << "\n"; + return bad_output; + } + } + } + return success; +} + int main(int argc, char **argv) { int result = test(all); @@ -132,6 +242,11 @@ int main(int argc, char **argv) { return result; } + int rand_result = test_random(); + if (rand_result != success) { + return rand_result; + } + printf("Success!\n"); return 0; } From d46819559b8977a567ac2f7b237a511b753cab21 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Wed, 22 Apr 2026 14:52:44 -0700 Subject: [PATCH 47/55] Cut down on number of transposed_vector_reduce test cases --- test/correctness/transposed_vector_reduce.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/correctness/transposed_vector_reduce.cpp b/test/correctness/transposed_vector_reduce.cpp index 3b636f4b11ac..191c46102130 100644 --- a/test/correctness/transposed_vector_reduce.cpp +++ b/test/correctness/transposed_vector_reduce.cpp @@ -179,7 +179,7 @@ int test_random() { Buffer input_buf(range); input_buf.set_min(-half); - constexpr int num_cases = 1000; + constexpr int num_cases = 200; int tried = 0; while (tried < num_cases) { Expr A = random_qa(rng, r); From 38f965fe0d520949953d3dbd1e4835b368dfb0fe Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Thu, 23 Apr 2026 12:25:31 -0700 Subject: [PATCH 48/55] Clarify comments flagged by a weak-model review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ran a weak subagent (Haiku) over the MultiRamp PR as an adversarial comprehension test — asking it to explain the code in detail, then fixing whatever it got wrong. The theory: if a weaker model misreads something, the comment is probably unclear, not the model. Fixes prompted by the review: - Simplify_Exprs.cpp / Simplify_Stmts.cpp: stale "outermost" wording from before rotate_stride_one_outermost was renamed to rotate_stride_one_innermost. The comments contradicted the function name and Haiku echoed the contradiction. - MultiRamp.h alias_free: state explicitly that the returned Expr is a sufficient (not necessary) condition for lane uniqueness. - MultiRamp.h alias_free_slice: clarify that kept dims are a subset preserving order, not necessarily a prefix. - VectorizeLoops.cpp: rename ContainingLoop -> UnrolledLoop and note that the peeled dims are fully unrolled into a flat Block, not a runtime loop nest (despite the old name). 
- MultiRamp.h alias_free_slice: note that stride-zero and purely
  symbolic strides always peel (added by Andrew directly).

A second Haiku pass after these edits answered every question
correctly, including the ones it got wrong the first time.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 src/MultiRamp.h        | 42 ++++++++++++++++++++++++++----------------
 src/Simplify_Exprs.cpp |  7 ++++---
 src/Simplify_Stmts.cpp | 12 ++++++------
 src/VectorizeLoops.cpp | 30 ++++++++++++++++++------------
 4 files changed, 54 insertions(+), 37 deletions(-)

diff --git a/src/MultiRamp.h b/src/MultiRamp.h
index c0ae2709dca0..c84c90194f02 100644
--- a/src/MultiRamp.h
+++ b/src/MultiRamp.h
@@ -80,14 +80,15 @@ struct MultiRamp {
      * slice. */
     void slice(int d, Expr v);

-    /** Construct an Expr telling us whether the lanes are all unique. A
-     * sufficient condition is that there is an ordering of the dims along
-     * which each stride is greater than the sum of the spans of earlier
-     * dims (span of dim k = |strides[k]| * (lanes[k] - 1)). We check all
-     * dim orderings and OR the resulting conditions together (ignoring
-     * base, since base is a uniform offset and doesn't affect uniqueness
-     * within the ramp). If this simplifies to const_false the lanes may
-     * or may not actually alias. */
+    /** Construct an Expr that is a *sufficient* condition for the lanes to
+     * all be unique — i.e. if it evaluates to true the lanes don't alias,
+     * but if it evaluates to false the lanes may or may not alias. The
+     * implication only goes one way. The condition checked is: there
+     * exists an ordering of the dims along which each stride is greater
+     * than the sum of the spans of earlier dims (span of dim k =
+     * |strides[k]| * (lanes[k] - 1)). We OR that condition over all dim
+     * orderings (base is ignored since it's a uniform offset and doesn't
+     * affect within-ramp uniqueness). */
     Expr alias_free() const;

     /** Information about one peeled dim, produced by alias_free_slice.
@@ -98,14 +99,23 @@ struct MultiRamp {
         int dim;
     };

-    /** Build an alias-free slice of *this by greedily adding dims innermost
-     * to outermost, keeping a dim only if the slice remains alias-free
-     * after it's added. Replace *this with the resulting slice, and return
-     * a description of the dims that weren't kept (innermost first).
-     * Always succeeds; *this may be reduced to a 0-dim scalar if no prefix
-     * of dims is alias-free. The omitted dims' contributions are NOT
-     * folded into base — callers usually want to add back
-     * `var * omitted.stride` per omitted dim before using *this. */
+    /** Build an alias-free slice of *this by walking the dims innermost to
+     * outermost and keeping each one only if the slice is still alias-free
+     * after adding it. The kept dims are a *subset* of the original dims
+     * (preserving their relative order), not necessarily a prefix — e.g. a
+     * middle dim may be dropped while both inner and outer dims are kept.
+     * Replace *this with the resulting slice, and return a description of
+     * the dims that weren't kept (innermost first). Always succeeds; *this
+     * may be reduced to a 0-dim scalar if no dim can be kept. The omitted
+     * dims' contributions are NOT folded into base — callers usually want
+     * to add back `var * omitted.stride` per omitted dim before using
+     * *this.
+     *
+     * All dimensions with stride zero or purely symbolic strides will be
+     * peeled, and some constant stride dimensions may also be peeled if
+     * they produce values that overlap other dimensions. E.g.
if there are + * two nested ramps that both have stride 1 the outer one will be + * peeled. */ std::vector alias_free_slice(); /** No-op returning 0 if the stride-1 dim is already innermost (or diff --git a/src/Simplify_Exprs.cpp b/src/Simplify_Exprs.cpp index 5fb2245cdda2..7c07aba1a42f 100644 --- a/src/Simplify_Exprs.cpp +++ b/src/Simplify_Exprs.cpp @@ -387,9 +387,10 @@ Expr Simplify::visit(const Load *op, ExprInfo *info) { is_multiramp(index, Scope::empty_scope(), &mr) && mr.dimensions() > 1) { // If the index is a multi-dimensional ramp with a stride-1 dim that - // isn't already innermost, rotate it (together with all subsequent - // dims) to the outermost position so the resulting load is dense, - // and restore the original lane order with a single make_transpose. + // isn't already innermost, rotate it to the innermost position (with + // the previously-inner dims moved to the outermost end) so the + // resulting load is dense, and restore the original lane order with a + // single make_transpose. MultiRamp permuted = mr; int A = permuted.rotate_stride_one_innermost(); if (A > 0) { diff --git a/src/Simplify_Stmts.cpp b/src/Simplify_Stmts.cpp index e949b3f443f2..77ae0e5848cf 100644 --- a/src/Simplify_Stmts.cpp +++ b/src/Simplify_Stmts.cpp @@ -399,12 +399,12 @@ Stmt Simplify::visit(const Store *op) { is_multiramp(index, Scope::empty_scope(), &mr) && mr.dimensions() > 1) { // If the index is a multi-dimensional ramp with a stride-1 dim that - // isn't already innermost, rotate it (together with all subsequent - // dims) to the outermost position so the resulting store is dense. - // Permute the value and predicate to match the new lane order using - // a single make_transpose. Later in lowering, after flattening the - // nested ramps, this turns into a concat of dense ramps and hits the - // case above. + // isn't already innermost, rotate it to the innermost position (with + // the previously-inner dims moved to the outermost end) so the + // resulting store is dense. Permute the value and predicate to match + // the new lane order using a single make_transpose. Later in + // lowering, after flattening the nested ramps, this turns into a + // concat of dense ramps and hits the case above. MultiRamp permuted = mr; int A = permuted.rotate_stride_one_innermost(); if (A > 0) { diff --git a/src/VectorizeLoops.cpp b/src/VectorizeLoops.cpp index 47c839d1c5ad..8634f116a733 100644 --- a/src/VectorizeLoops.cpp +++ b/src/VectorizeLoops.cpp @@ -967,7 +967,7 @@ class VectorSubs : public IRMutator { // We may partially succeed, in which case we'll have (unrolled) loops // to rewrap. - struct ContainingLoop { + struct UnrolledLoop { std::string name; int extent; // Index of this loop's dim in the pre-alias-peel MultiRamp. Used @@ -975,7 +975,7 @@ class VectorSubs : public IRMutator { // corresponding slice of the reduced value vector. 
diff --git a/src/VectorizeLoops.cpp b/src/VectorizeLoops.cpp
index 47c839d1c5ad..8634f116a733 100644
--- a/src/VectorizeLoops.cpp
+++ b/src/VectorizeLoops.cpp
@@ -967,7 +967,7 @@ class VectorSubs : public IRMutator {
         // We may partially succeed, in which case we'll have (unrolled) loops
         // to rewrap.
-        struct ContainingLoop {
+        struct UnrolledLoop {
             std::string name;
             int extent;
             // Index of this loop's dim in the pre-alias-peel MultiRamp. Used
             // to take the corresponding slice of the reduced value vector.
             int dim;
         };
-        std::vector<ContainingLoop> containing_loops;
+        std::vector<UnrolledLoop> unrolled_loops;
 
         do {
             if (!op->mutex_name.empty()) {
@@ -1266,8 +1266,8 @@
                     }
                 }
                 std::string name = unique_name('t');
-                containing_loops.emplace_back(
-                    ContainingLoop{name, p.lanes, pos});
+                unrolled_loops.emplace_back(
+                    UnrolledLoop{name, p.lanes, pos});
                 store_mr.base += Variable::make(Int(32), name) * p.stride;
             }
             output_lanes = store_mr.total_lanes();
@@ -1282,7 +1282,7 @@
 
             Expr lhs = cast(b.type().with_lanes(output_lanes), new_load);
             Stmt s;
-            if (containing_loops.empty()) {
+            if (unrolled_loops.empty()) {
                 b = binop(lhs, b);
                 b = cast(new_load.type(), b);
                 s = Store::make(store->name, b, store_index, store->param,
@@ -1302,22 +1302,28 @@
                 Expr full_b_var = Variable::make(b.type(), full_b_var_name);
 
                 std::vector<int> peeled_dims, loop_extents;
-                peeled_dims.reserve(containing_loops.size());
-                loop_extents.reserve(containing_loops.size());
-                for (const auto &loop : containing_loops) {
+                peeled_dims.reserve(unrolled_loops.size());
+                loop_extents.reserve(unrolled_loops.size());
+                for (const auto &loop : unrolled_loops) {
                     peeled_dims.push_back(loop.dim);
                     loop_extents.push_back(loop.extent);
                 }
+                // Fully unroll the peeled dims into a flat Block of stores:
+                // we enumerate every multi-index v in the Cartesian product of
+                // loop_extents and emit one store per iteration, substituting
+                // the loop variable with the corresponding constant. There is
+                // no runtime loop nest — UnrolledLoop describes the dims we
+                // peeled off of b, not loops that survive in the output.
                 std::vector<Stmt> block;
                 block.reserve(b_shape_mr.total_lanes() / output_lanes);
                 for_each_coordinate(loop_extents, [&](const std::vector<int> &v) {
-                    // v is the loop iteration multi-index (innermost first,
-                    // matching the order in containing_loops).
+                    // v is the iteration multi-index (innermost first,
+                    // matching the order in unrolled_loops).
                     Expr b_slice = Shuffle::make({full_b_var},
                                                  b_shape_mr.shuffle_from_slice(peeled_dims, v));
                     Stmt this_store = store_template;
-                    for (size_t j = 0; j < containing_loops.size(); j++) {
-                        this_store = substitute(containing_loops[j].name, v[j], this_store);
+                    for (size_t j = 0; j < unrolled_loops.size(); j++) {
+                        this_store = substitute(unrolled_loops[j].name, v[j], this_store);
                     }
                     this_store = substitute(b_var_name, b_slice, this_store);
                     block.push_back(this_store);
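The Cartesian-product enumeration above is easy to picture with a
stand-alone sketch. The signature of for_each_coordinate is assumed from its
call site here; the in-tree helper may differ, so treat this as an
illustration rather than the actual implementation:

#include <functional>
#include <vector>

// Visit every multi-index in the Cartesian product of `extents` (all
// assumed positive), innermost dimension varying fastest: extents {2, 3}
// yields (0,0) (1,0) (0,1) (1,1) (0,2) (1,2).
void for_each_coordinate(const std::vector<int> &extents,
                         const std::function<void(const std::vector<int> &)> &f) {
    std::vector<int> v(extents.size(), 0);
    while (true) {
        f(v);
        size_t i = 0;
        while (i < extents.size() && ++v[i] == extents[i]) {
            v[i++] = 0;  // wrapped this dim; carry into the next one
        }
        if (i == extents.size()) {
            return;  // carried off the end: every coordinate visited
        }
    }
}

With loop_extents = {2, 3} the callback therefore fires six times, producing
the six stores of the flat Block in innermost-first order.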
From 98ff7c3ddecfe5af209743038ee1e3b4c5d8015a Mon Sep 17 00:00:00 2001
From: Andrew Adams
Date: Wed, 29 Apr 2026 10:31:56 -0700
Subject: [PATCH 49/55] Remove holdover from transpose branch

---
 .github/workflows/testing-make.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/testing-make.yml b/.github/workflows/testing-make.yml
index 9c4172cee556..d65c8607b5ab 100644
--- a/.github/workflows/testing-make.yml
+++ b/.github/workflows/testing-make.yml
@@ -77,7 +77,6 @@ jobs:
               "lld-${LLVM_VERSION}" \
               "liblld-${LLVM_VERSION}-dev"
             echo "LLVM_CONFIG=llvm-config-${LLVM_VERSION}" | tee -a "$GITHUB_ENV"
-            cat /proc/cpuinfo
           elif [ "$RUNNER_OS" = "macOS" ]; then
             brew install libjpeg-turbo libpng pkgconf protobuf "llvm@${LLVM_VERSION}" "lld@${LLVM_VERSION}"
             echo "LLVM_CONFIG=$(brew --prefix "llvm@${LLVM_VERSION}")/bin/llvm-config" | tee -a "$GITHUB_ENV"

From 7a1fe5ea7cb99ac6f608b6443171c0a5525593e8 Mon Sep 17 00:00:00 2001
From: "halide-ci[bot]" <266445882+halide-ci[bot]@users.noreply.github.com>
Date: Wed, 29 Apr 2026 17:40:47 +0000
Subject: [PATCH 50/55] Apply pre-commit auto-fixes

---
 src/VectorizeLoops.cpp          |  2 +-
 test/correctness/CMakeLists.txt |  2 +-
 test/correctness/multiramp.cpp  | 24 ++++++++++++++++++----------
 3 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/src/VectorizeLoops.cpp b/src/VectorizeLoops.cpp
index 8634f116a733..e4d4def684f3 100644
--- a/src/VectorizeLoops.cpp
+++ b/src/VectorizeLoops.cpp
@@ -1203,7 +1203,7 @@ class VectorSubs : public IRMutator {
                 std::vector<int> perm(d);
                 std::iota(perm.begin(), perm.end(), 0);
                 auto mid = std::stable_partition(perm.begin(), perm.end(),
-                    [&](int i) { return !is_const_zero(b_shape_mr.strides[i]); });
+                                                 [&](int i) { return !is_const_zero(b_shape_mr.strides[i]); });
                 int n_kept = mid - perm.begin();
                 // shuffle_from_permuted gives us idx such that
                 // Shuffle(<permuted>, idx) == <original>. Here we have
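To unpack that contract with a toy case (mine, not the patch's): if the
original lanes are {a, b, c, d} and the permuted vector is {c, d, a, b},
then idx = {2, 3, 0, 1}, because Shuffle emits element idx[i] of its input
as output lane i, recovering {a, b, c, d}.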
diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt
index 6ec7a2cfacf9..1df954dfb0ce 100644
--- a/test/correctness/CMakeLists.txt
+++ b/test/correctness/CMakeLists.txt
@@ -89,9 +89,9 @@ tests(GROUPS correctness
     device_crop.cpp
     device_slice.cpp
     dilate3x3.cpp
-    downsampling_reduce.cpp
     div_by_zero.cpp
     div_round_to_zero.cpp
+    downsampling_reduce.cpp
     dynamic_allocation_in_gpu_kernel.cpp
     dynamic_reduction_bounds.cpp
     early_out.cpp
diff --git a/test/correctness/multiramp.cpp b/test/correctness/multiramp.cpp
index c4854986be4f..4e38fa3bd8f6 100644
--- a/test/correctness/multiramp.cpp
+++ b/test/correctness/multiramp.cpp
@@ -24,7 +24,8 @@ std::vector<int> expand(const MultiRamp &m) {
         strides.push_back(*cs);
     }
     int total = 1;
-    for (int n : m.lanes) total *= n;
+    for (int n : m.lanes)
+        total *= n;
     std::vector<int> result;
     result.reserve(total);
     for (int flat = 0; flat < total; flat++) {
@@ -60,15 +61,16 @@ void check_seq(const std::vector<int> &got, const std::vector<int> &want,
     }
 }
 
-#define CHECK(cond, msg) do { \
-    if (!(cond)) { \
-        printf("FAIL at %d: %s\n", __LINE__, msg); \
-        failures++; \
-    } \
-} while (0)
+#define CHECK(cond, msg)                               \
+    do {                                               \
+        if (!(cond)) {                                 \
+            printf("FAIL at %d: %s\n", __LINE__, msg); \
+            failures++;                                \
+        }                                              \
+    } while (0)
 
 #define CHECK_SEQ_LIT(got, msg, ...) check_seq((got), std::vector<int>{__VA_ARGS__}, (msg), __LINE__)
-#define CHECK_SEQ(got, want, msg) check_seq((got), (want), (msg), __LINE__)
+#define CHECK_SEQ(got, want, msg)    check_seq((got), (want), (msg), __LINE__)
 
 // ---- MultiRamp::add ------------------------------------------------------
 
@@ -89,7 +91,8 @@ void check_add_same_shape() {
     auto a_seq = expand(A), b_seq = expand(B);
     CHECK(A.add(B), "same-shape add");
     std::vector<int> want(8);
-    for (size_t i = 0; i < a_seq.size(); i++) want[i] = a_seq[i] + b_seq[i];
+    for (size_t i = 0; i < a_seq.size(); i++)
+        want[i] = a_seq[i] + b_seq[i];
     CHECK_SEQ(expand(A), want, "same-shape add values");
 }
 
@@ -139,7 +142,8 @@ void check_div_pure_carry_const() {
     auto a_seq = expand(A);
    CHECK(A.div(4), "pure-carry div (const k)");
     std::vector<int> want(a_seq.size());
-    for (size_t i = 0; i < a_seq.size(); i++) want[i] = a_seq[i] / 4;
+    for (size_t i = 0; i < a_seq.size(); i++)
+        want[i] = a_seq[i] / 4;
     CHECK_SEQ(expand(A), want, "pure-carry div values");
 }

From 2f8e976e0910786289400baadca08d26dc3149c2 Mon Sep 17 00:00:00 2001
From: Andrew Adams
Date: Wed, 29 Apr 2026 10:44:58 -0700
Subject: [PATCH 51/55] Fixes for the pre-commit auto fixes

---
 src/VectorizeLoops.cpp         | 7 +++++--
 test/correctness/multiramp.cpp | 9 ++++++---
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/src/VectorizeLoops.cpp b/src/VectorizeLoops.cpp
index e4d4def684f3..f97099287003 100644
--- a/src/VectorizeLoops.cpp
+++ b/src/VectorizeLoops.cpp
@@ -1202,8 +1202,11 @@ class VectorSubs : public IRMutator {
                 int d = b_shape_mr.dimensions();
                 std::vector<int> perm(d);
                 std::iota(perm.begin(), perm.end(), 0);
-                auto mid = std::stable_partition(perm.begin(), perm.end(),
-                                                 [&](int i) { return !is_const_zero(b_shape_mr.strides[i]); });
+                auto stride_not_zero = [&](int i) {
+                    return !is_const_zero(b_shape_mr.strides[i]);
+                };
+                auto mid = std::stable_partition(perm.begin(), perm.end(), stride_not_zero);
+
                 int n_kept = mid - perm.begin();
                 // shuffle_from_permuted gives us idx such that
                 // Shuffle(<permuted>, idx) == <original>. Here we have
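A quick worked instance of that partition (my numbers, not the patch's):
with d = 4 and constant strides {2, 0, 1, 0}, stride_not_zero holds for dims
0 and 2, so std::stable_partition rearranges perm from {0, 1, 2, 3} into
{0, 2, 1, 3} with n_kept == 2, keeping the relative order within both the
kept and the dropped groups.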
diff --git a/test/correctness/multiramp.cpp b/test/correctness/multiramp.cpp
index 4e38fa3bd8f6..c7b8aae195db 100644
--- a/test/correctness/multiramp.cpp
+++ b/test/correctness/multiramp.cpp
@@ -24,8 +24,9 @@ std::vector<int> expand(const MultiRamp &m) {
         strides.push_back(*cs);
     }
     int total = 1;
-    for (int n : m.lanes)
+    for (int n : m.lanes) {
         total *= n;
+    }
     std::vector<int> result;
     result.reserve(total);
     for (int flat = 0; flat < total; flat++) {
@@ -91,8 +92,9 @@ void check_add_same_shape() {
     auto a_seq = expand(A), b_seq = expand(B);
     CHECK(A.add(B), "same-shape add");
     std::vector<int> want(8);
-    for (size_t i = 0; i < a_seq.size(); i++)
+    for (size_t i = 0; i < a_seq.size(); i++) {
         want[i] = a_seq[i] + b_seq[i];
+    }
     CHECK_SEQ(expand(A), want, "same-shape add values");
 }
 
@@ -142,8 +144,9 @@ void check_div_pure_carry_const() {
     auto a_seq = expand(A);
     CHECK(A.div(4), "pure-carry div (const k)");
     std::vector<int> want(a_seq.size());
-    for (size_t i = 0; i < a_seq.size(); i++)
+    for (size_t i = 0; i < a_seq.size(); i++) {
         want[i] = a_seq[i] / 4;
+    }
     CHECK_SEQ(expand(A), want, "pure-carry div values");
 }

From fcf96467deeafe929c8958ad26c532cab1e450cb Mon Sep 17 00:00:00 2001
From: Andrew Adams
Date: Wed, 29 Apr 2026 11:54:32 -0700
Subject: [PATCH 52/55] clang-tidy fix

---
 src/MultiRamp.cpp | 2 +-
 src/MultiRamp.h   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/MultiRamp.cpp b/src/MultiRamp.cpp
index 2c277b3c32a3..6ecbb4d3fcd0 100644
--- a/src/MultiRamp.cpp
+++ b/src/MultiRamp.cpp
@@ -438,7 +438,7 @@ Expr MultiRamp::operator==(const MultiRamp &other) const {
     return simplify(c);
 }
 
-void MultiRamp::slice(int d, Expr v) {
+void MultiRamp::slice(int d, const Expr &v) {
     internal_assert(d >= 0 && d < (int)strides.size());
     internal_assert(v.type() == base.type());
     base += v * strides[d];
diff --git a/src/MultiRamp.h b/src/MultiRamp.h
index c84c90194f02..35daef494b8e 100644
--- a/src/MultiRamp.h
+++ b/src/MultiRamp.h
@@ -78,7 +78,7 @@ struct MultiRamp {
     /** Remove dim `d`, adding `v * strides[d]` to base. Pass v = 0 for the
      * first slice along that dim, or a Variable to get a parameterized
      * slice. */
-    void slice(int d, Expr v);
+    void slice(int d, const Expr &v);
 
     /** Construct an Expr that is a *sufficient* condition for the lanes to
      * all be unique — i.e.
if it evaluates to true the lanes don't alias, From 3a97c3b03d393e14936cd4b4f3428af871d1b844 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Fri, 1 May 2026 09:36:44 -0700 Subject: [PATCH 53/55] Skip test under SVE --- test/correctness/transposed_vector_reduce.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/correctness/transposed_vector_reduce.cpp b/test/correctness/transposed_vector_reduce.cpp index 191c46102130..ae349c70719b 100644 --- a/test/correctness/transposed_vector_reduce.cpp +++ b/test/correctness/transposed_vector_reduce.cpp @@ -228,6 +228,10 @@ int test_random() { } int main(int argc, char **argv) { + if (get_jit_target_from_environment().has_feature(Target::SVE2)) { + printf("[SKIP] LLVM's SVE backend chokes on the vector shuffles in this test.\n"); + return 0; + } int result = test(all); From 198c3e323469b63fb65b31f3b635724ab600cf5a Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Fri, 1 May 2026 15:13:25 -0700 Subject: [PATCH 54/55] This level of staging now unnecessary --- apps/iir_blur/Makefile | 4 ++-- apps/iir_blur/iir_blur_generator.cpp | 5 ----- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/apps/iir_blur/Makefile b/apps/iir_blur/Makefile index 92ed5d2a5b0b..5dd3b1200cc6 100644 --- a/apps/iir_blur/Makefile +++ b/apps/iir_blur/Makefile @@ -10,11 +10,11 @@ $(GENERATOR_BIN)/iir_blur.generator: iir_blur_generator.cpp $(GENERATOR_DEPS) $(BIN)/%/iir_blur.a: $(GENERATOR_BIN)/iir_blur.generator @mkdir -p $(@D) - $< -g iir_blur -f iir_blur -o $(BIN)/$* target=$*-no_runtime + $< -g iir_blur -f iir_blur -e $(GENERATOR_OUTPUTS) -o $(BIN)/$* target=$*-no_runtime $(BIN)/%/iir_blur_auto_schedule.a: $(GENERATOR_BIN)/iir_blur.generator @mkdir -p $(@D) - $< -g iir_blur -f iir_blur_auto_schedule -o $(BIN)/$* target=$*-no_runtime autoscheduler=Mullapudi2016 + $< -g iir_blur -f iir_blur_auto_schedule -e $(GENERATOR_OUTPUTS) -o $(BIN)/$* target=$*-no_runtime autoscheduler=Mullapudi2016 $(BIN)/%/runtime.a: $(GENERATOR_BIN)/iir_blur.generator @mkdir -p $(@D) diff --git a/apps/iir_blur/iir_blur_generator.cpp b/apps/iir_blur/iir_blur_generator.cpp index 7f411d7e8fef..146e15fb24b5 100644 --- a/apps/iir_blur/iir_blur_generator.cpp +++ b/apps/iir_blur/iir_blur_generator.cpp @@ -48,11 +48,6 @@ Func blur_cols_transpose(Func input, Expr height, Expr alpha, bool skip_schedule .fuse(yo, c, t) .parallel(t); - blur.in(transpose) - .compute_at(transpose, y) - .vectorize(x) - .unroll(y); - // Run the filter on each row of tiles (which corresponds to a strip of // columns in the input). 
     blur.compute_at(transpose, t);

From 5f5c756e3ac93038a1e7f86b401a8996b0fda70b Mon Sep 17 00:00:00 2001
From: Andrew Adams
Date: Sun, 3 May 2026 14:55:48 -0700
Subject: [PATCH 55/55] Simplify new simplifier rules

---
 src/Simplify_Exprs.cpp | 54 +++++++++++++++++-----------------
 src/Simplify_Stmts.cpp | 56 ++++++++++++++++++++----------------------
 2 files changed, 50 insertions(+), 60 deletions(-)

diff --git a/src/Simplify_Exprs.cpp b/src/Simplify_Exprs.cpp
index f045ae37a14e..e82eb82acdf5 100644
--- a/src/Simplify_Exprs.cpp
+++ b/src/Simplify_Exprs.cpp
@@ -380,6 +380,7 @@ Expr Simplify::visit(const Load *op, ExprInfo *info) {
     base_info.alignment = ModulusRemainder::intersect(base_info.alignment, index_info.alignment);
 
     ModulusRemainder align = ModulusRemainder::intersect(op->alignment, base_info.alignment);
+    int A;
 
     const Broadcast *b_index = index.as<Broadcast>();
     const Shuffle *s_index = index.as<Shuffle>();
@@ -417,39 +418,30 @@ Expr Simplify::visit(const Load *op, ExprInfo *info) {
         index.type().is_vector() &&
         // Don't do expensive analysis in the common case of a load of a ramp of scalars.
         !(r_index && r_index->base.type().is_scalar()) &&
+        // It's a multi-dimensional multiramp.
         is_multiramp(index, Scope<>::empty_scope(), &mr) &&
+        mr.dimensions() > 1 &&
+        // The innermost stride isn't already one.
+        !is_const_one(mr.strides[0]) &&
+        // We can successfully rotate a stride one dimension innermost.
+        (A = mr.rotate_stride_one_innermost()) > 0) {
+        // Rotating the stride one dimension innermost made the load dense, but
+        // we must now transpose the predicate to match the transposed index,
+        // and inverse-transpose the loaded value to restore the original lane
+        // ordering.
+        Expr permuted_predicate;
+        const Broadcast *b_pred = predicate.as<Broadcast>();
+        if (b_pred && b_pred->value.type().is_scalar()) {
+            permuted_predicate = predicate;
+        } else {
+            permuted_predicate = Shuffle::make_transpose(predicate, A);
+        }
+
+        Expr permuted_load =
+            Load::make(op->type, op->name, mr.to_expr(), op->image,
+                       op->param, permuted_predicate, align);
+        int B = op->type.lanes() / A;
+        return mutate(Shuffle::make_transpose(permuted_load, B), info);
     } else if (predicate.same_as(op->predicate) && index.same_as(op->index) && align == op->alignment) {
         return op;
     } else {
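Between the load and store halves of this patch, it helps to pin down the
shuffle that make_transpose denotes. Below is a minimal sketch of the lane
indices, assuming make_transpose(x, a) views x's lanes as a row-major
(lanes / a) x a matrix and transposes it. transpose_indices is a
hypothetical name, and the in-tree convention may take the other factor; the
round-trip property is the same either way:

#include <vector>

// Indices idx such that shuffling x by idx transposes a row-major
// (lanes / a) x a interpretation of x's lanes: output lane (i, j) reads
// original lane (j, i).
std::vector<int> transpose_indices(int lanes, int a) {
    int b = lanes / a;
    std::vector<int> idx(lanes);
    for (int i = 0; i < a; i++) {
        for (int j = 0; j < b; j++) {
            idx[i * b + j] = j * a + i;
        }
    }
    return idx;
}

Applying the shuffle once with factor A and again with factor B = lanes / A
composes to the identity, which is the pairing the load rule above relies
on: the rotated index produces a dense load in permuted lane order, and
make_transpose(permuted_load, B) undoes that reordering.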
diff --git a/src/Simplify_Stmts.cpp b/src/Simplify_Stmts.cpp
index 77ae0e5848cf..c8f9d3effc6d 100644
--- a/src/Simplify_Stmts.cpp
+++ b/src/Simplify_Stmts.cpp
@@ -361,6 +361,7 @@ Stmt Simplify::visit(const Store *op) {
     }
 
     ModulusRemainder align = ModulusRemainder::intersect(op->alignment, base_info.alignment);
+    int A;
 
     if (is_const_zero(predicate)) {
         // Predicate is always false
@@ -396,37 +397,34 @@ Stmt Simplify::visit(const Store *op) {
         index.type().is_vector() &&
         // Don't do expensive analysis in the common case of a load of a ramp of scalars.
         !(r_index && r_index->base.type().is_scalar()) &&
+        // It's a multi-dimensional multiramp.
         is_multiramp(index, Scope<>::empty_scope(), &mr) &&
+        mr.dimensions() > 1 &&
+        // The innermost stride isn't already one.
+        !is_const_one(mr.strides[0]) &&
+        // We can successfully rotate a stride one dimension innermost.
+        (A = mr.rotate_stride_one_innermost()) > 0) {
+
+        // Rotating the stride one dimension innermost in the index made the
+        // resulting store dense. Now permute the value and predicate to match
+        // the new lane order using a single make_transpose. Later in lowering,
+        // after flattening the nested ramps, this turns into a concat of dense
+        // ramps and hits the case above.
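        // Editorial aside (not a line of the patch): a Broadcast of a
        // scalar is invariant under any lane permutation, since shuffling N
        // copies of the same value yields the same N copies. That is why the
        // scalar-broadcast predicate below skips the make_transpose that the
        // value always receives.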
+
+        Expr permuted_value = Shuffle::make_transpose(value, A);
+        Expr permuted_predicate;
+        const Broadcast *b_pred = predicate.as<Broadcast>();
+        if (b_pred && b_pred->value.type().is_scalar()) {
+            permuted_predicate = predicate;
+        } else {
+            permuted_predicate = Shuffle::make_transpose(predicate, A);
+        }
+        return mutate(Store::make(op->name, permuted_value, mr.to_expr(),
+                                  op->param, permuted_predicate, align));
+    } else if (predicate.same_as(op->predicate) &&
+               value.same_as(op->value) &&
+               index.same_as(op->index) &&
+               align == op->alignment) {
+        return op;
     } else {
         return Store::make(op->name, value, index, op->param, predicate, align);
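Finally, a self-contained toy of mine (plain arrays standing in for vectors;
not Halide code) that checks the equivalence the store rule relies on, for
lanes {2, 4}, strides {4, 1}, base 0:

#include <cassert>
#include <vector>

int main() {
    const int A = 2, B = 4, N = A * B;  // inner dim: stride 4, extent A;
                                        // outer dim: stride 1, extent B
    std::vector<int> value(N);
    for (int k = 0; k < N; k++) {
        value[k] = 100 + k;
    }

    // Reference: scatter through the original index. Lane l = (i, j), with
    // the inner dim varying fastest, writes value[l] to offset 4*i + j.
    std::vector<int> ref(N, -1), out(N, -1);
    for (int l = 0; l < N; l++) {
        int i = l % A, j = l / A;
        ref[4 * i + j] = value[l];
    }

    // Rewritten form: transpose the value vector, then store densely at
    // offsets 0..N-1 in order (the rotated, dense-innermost index).
    for (int o = 0; o < N; o++) {
        out[o] = value[(o / B) + A * (o % B)];  // transposed lane order
    }

    for (int k = 0; k < N; k++) {
        assert(ref[k] == out[k]);  // both forms write identical data
    }
    return 0;
}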