diff --git a/backends/cadence/fused_quant/op_add.cpp b/backends/cadence/fused_quant/op_add.cpp
index 114039410f0..62e58c71c83 100644
--- a/backends/cadence/fused_quant/op_add.cpp
+++ b/backends/cadence/fused_quant/op_add.cpp
@@ -43,19 +43,16 @@ Tensor& add_out(
     ScalarType inp_dtype,
     int64_t inp_quant_min,
     int64_t inp_quant_max,
-    optional<int64_t> inp_axis,
     const optional<Tensor>& other_scale,
     const optional<Tensor>& other_zero_point,
     ScalarType other_dtype,
     int64_t other_quant_min,
     int64_t other_quant_max,
-    optional<int64_t> other_axis,
     const optional<Tensor>& out_scale,
     const optional<Tensor>& out_zero_point,
     ScalarType out_dtype,
     int64_t out_quant_min,
     int64_t out_quant_max,
-    optional<int64_t> out_axis,
     double alpha,
     Tensor& out) {
   int64_t numel = inp.numel();
@@ -72,7 +69,7 @@ Tensor& add_out(
     }
     inp_buf.resize(numel);
     QParams qp = extract_qparams(
-        inp_scale, inp_zero_point, inp_quant_min, inp_quant_max, inp_axis, inp);
+        inp_scale, inp_zero_point, inp_quant_min, inp_quant_max, inp);
     FUSED_QUANT_DTYPE_SWITCH(
         inp.scalar_type(),
         scalar_t,
@@ -88,12 +85,7 @@ Tensor& add_out(
     }
     other_buf.resize(numel);
     QParams qp = extract_qparams(
-        other_scale,
-        other_zero_point,
-        other_quant_min,
-        other_quant_max,
-        other_axis,
-        other);
+        other_scale, other_zero_point, other_quant_min, other_quant_max, other);
     FUSED_QUANT_DTYPE_SWITCH(
         other.scalar_type(),
         scalar_t,
@@ -107,7 +99,7 @@ Tensor& add_out(
     add_kernel(inp_float, other_float, result_float.data(), numel, alpha_f);
 
     QParams qp = extract_qparams(
-        out_scale, out_zero_point, out_quant_min, out_quant_max, out_axis, out);
+        out_scale, out_zero_point, out_quant_min, out_quant_max, out);
     FUSED_QUANT_DTYPE_SWITCH(
         out.scalar_type(),
         scalar_t,
diff --git a/backends/cadence/fused_quant/op_add.h b/backends/cadence/fused_quant/op_add.h
index 2da4ce80798..9db1e907294 100644
--- a/backends/cadence/fused_quant/op_add.h
+++ b/backends/cadence/fused_quant/op_add.h
@@ -24,20 +24,17 @@ executorch::aten::Tensor& add_out(
     executorch::aten::ScalarType inp_dtype,
     int64_t inp_quant_min,
     int64_t inp_quant_max,
-    executorch::aten::optional<int64_t> inp_axis,
     const executorch::aten::optional<executorch::aten::Tensor>& other_scale,
     const executorch::aten::optional<executorch::aten::Tensor>&
         other_zero_point,
     executorch::aten::ScalarType other_dtype,
     int64_t other_quant_min,
     int64_t other_quant_max,
-    executorch::aten::optional<int64_t> other_axis,
     const executorch::aten::optional<executorch::aten::Tensor>& out_scale,
     const executorch::aten::optional<executorch::aten::Tensor>& out_zero_point,
     executorch::aten::ScalarType out_dtype,
     int64_t out_quant_min,
     int64_t out_quant_max,
-    executorch::aten::optional<int64_t> out_axis,
     double alpha,
     executorch::aten::Tensor& out);
 
diff --git a/backends/cadence/fused_quant/op_bmm.cpp b/backends/cadence/fused_quant/op_bmm.cpp
index 2c79bcb6a59..7204ab6c88f 100644
--- a/backends/cadence/fused_quant/op_bmm.cpp
+++ b/backends/cadence/fused_quant/op_bmm.cpp
@@ -53,19 +53,16 @@ Tensor& bmm_out(
     ScalarType inp_dtype,
     int64_t inp_quant_min,
     int64_t inp_quant_max,
-    optional<int64_t> inp_axis,
     const optional<Tensor>& other_scale,
     const optional<Tensor>& other_zero_point,
     ScalarType other_dtype,
     int64_t other_quant_min,
     int64_t other_quant_max,
-    optional<int64_t> other_axis,
     const optional<Tensor>& out_scale,
     const optional<Tensor>& out_zero_point,
     ScalarType out_dtype,
     int64_t out_quant_min,
     int64_t out_quant_max,
-    optional<int64_t> out_axis,
     Tensor& out) {
   int64_t batch = inp.size(0);
   int64_t M = inp.size(1);
@@ -87,7 +84,7 @@ Tensor& bmm_out(
     }
     inp_buf.resize(inp_numel);
     QParams qp = extract_qparams(
-        inp_scale, inp_zero_point, inp_quant_min, inp_quant_max, inp_axis, inp);
+        inp_scale, inp_zero_point, inp_quant_min, inp_quant_max, inp);
     FUSED_QUANT_DTYPE_SWITCH(
         inp.scalar_type(),
         scalar_t,
@@ -104,12 +101,7 @@ Tensor& bmm_out(
     }
     other_buf.resize(other_numel);
     QParams qp = extract_qparams(
-        other_scale,
-        other_zero_point,
-        other_quant_min,
-        other_quant_max,
-        other_axis,
-        other);
+        other_scale, other_zero_point, other_quant_min, other_quant_max, other);
     FUSED_QUANT_DTYPE_SWITCH(other.scalar_type(),
                              scalar_t,
                              dequantize_buffer(
@@ -126,7 +118,7 @@ Tensor& bmm_out(
     bmm_kernel(inp_float, other_float, result_float.data(), batch, M, K, N);
 
     QParams qp = extract_qparams(
-        out_scale, out_zero_point, out_quant_min, out_quant_max, out_axis, out);
+        out_scale, out_zero_point, out_quant_min, out_quant_max, out);
     FUSED_QUANT_DTYPE_SWITCH(out.scalar_type(),
                              scalar_t,
                              quantize_buffer(
diff --git a/backends/cadence/fused_quant/op_bmm.h b/backends/cadence/fused_quant/op_bmm.h
index f814b46b481..ef9598eac98 100644
--- a/backends/cadence/fused_quant/op_bmm.h
+++ b/backends/cadence/fused_quant/op_bmm.h
@@ -24,20 +24,17 @@ executorch::aten::Tensor& bmm_out(
     executorch::aten::ScalarType inp_dtype,
     int64_t inp_quant_min,
     int64_t inp_quant_max,
-    executorch::aten::optional<int64_t> inp_axis,
     const executorch::aten::optional<executorch::aten::Tensor>& other_scale,
     const executorch::aten::optional<executorch::aten::Tensor>&
         other_zero_point,
     executorch::aten::ScalarType other_dtype,
     int64_t other_quant_min,
     int64_t other_quant_max,
-    executorch::aten::optional<int64_t> other_axis,
     const executorch::aten::optional<executorch::aten::Tensor>& out_scale,
     const executorch::aten::optional<executorch::aten::Tensor>& out_zero_point,
     executorch::aten::ScalarType out_dtype,
     int64_t out_quant_min,
     int64_t out_quant_max,
-    executorch::aten::optional<int64_t> out_axis,
     executorch::aten::Tensor& out);
 
 } // namespace native
diff --git a/backends/cadence/fused_quant/op_hardswish.cpp b/backends/cadence/fused_quant/op_hardswish.cpp
index 0d653a1bfae..452ea90a405 100644
--- a/backends/cadence/fused_quant/op_hardswish.cpp
+++ b/backends/cadence/fused_quant/op_hardswish.cpp
@@ -40,13 +40,11 @@ Tensor& hardswish_out(
     ScalarType inp_dtype,
     int64_t inp_quant_min,
     int64_t inp_quant_max,
-    optional<int64_t> inp_axis,
     const optional<Tensor>& out_scale,
     const optional<Tensor>& out_zero_point,
     ScalarType out_dtype,
     int64_t out_quant_min,
     int64_t out_quant_max,
-    optional<int64_t> out_axis,
     Tensor& out) {
   int64_t numel = inp.numel();
 
@@ -60,7 +58,7 @@ Tensor& hardswish_out(
     }
     inp_buf.resize(numel);
     QParams qp = extract_qparams(
-        inp_scale, inp_zero_point, inp_quant_min, inp_quant_max, inp_axis, inp);
+        inp_scale, inp_zero_point, inp_quant_min, inp_quant_max, inp);
     FUSED_QUANT_DTYPE_SWITCH(
         inp.scalar_type(),
         scalar_t,
@@ -74,7 +72,7 @@ Tensor& hardswish_out(
     hardswish_kernel(inp_float, result_float.data(), numel);
 
     QParams qp = extract_qparams(
-        out_scale, out_zero_point, out_quant_min, out_quant_max, out_axis, out);
+        out_scale, out_zero_point, out_quant_min, out_quant_max, out);
     FUSED_QUANT_DTYPE_SWITCH(
         out.scalar_type(),
         scalar_t,
diff --git a/backends/cadence/fused_quant/op_hardswish.h b/backends/cadence/fused_quant/op_hardswish.h
index 7cba5b07788..ba9e09da23c 100644
--- a/backends/cadence/fused_quant/op_hardswish.h
+++ b/backends/cadence/fused_quant/op_hardswish.h
@@ -23,13 +23,11 @@ executorch::aten::Tensor& hardswish_out(
     executorch::aten::ScalarType inp_dtype,
     int64_t inp_quant_min,
     int64_t inp_quant_max,
-    executorch::aten::optional<int64_t> inp_axis,
     const executorch::aten::optional<executorch::aten::Tensor>& out_scale,
     const executorch::aten::optional<executorch::aten::Tensor>& out_zero_point,
     executorch::aten::ScalarType out_dtype,
     int64_t out_quant_min,
     int64_t out_quant_max,
-    executorch::aten::optional<int64_t> out_axis,
     executorch::aten::Tensor& out);
 
 } // namespace native
diff --git a/backends/cadence/fused_quant/op_mul.cpp b/backends/cadence/fused_quant/op_mul.cpp
index 59b0254a0f0..3d071f7c2da 100644
--- a/backends/cadence/fused_quant/op_mul.cpp
+++ b/backends/cadence/fused_quant/op_mul.cpp
@@ -42,19 +42,16 @@ Tensor& mul_out(
     ScalarType inp_dtype,
     int64_t inp_quant_min,
     int64_t inp_quant_max,
-    optional<int64_t> inp_axis,
     const optional<Tensor>& other_scale,
     const optional<Tensor>& other_zero_point,
     ScalarType other_dtype,
     int64_t other_quant_min,
     int64_t other_quant_max,
-    optional<int64_t> other_axis,
     const optional<Tensor>& out_scale,
     const optional<Tensor>& out_zero_point,
     ScalarType out_dtype,
     int64_t out_quant_min,
     int64_t out_quant_max,
-    optional<int64_t> out_axis,
     Tensor& out) {
   (void)ctx;
   (void)inp_dtype;
@@ -74,7 +71,7 @@ Tensor& mul_out(
     }
     inp_buf.resize(numel);
     QParams qp = extract_qparams(
-        inp_scale, inp_zero_point, inp_quant_min, inp_quant_max, inp_axis, inp);
+        inp_scale, inp_zero_point, inp_quant_min, inp_quant_max, inp);
     FUSED_QUANT_DTYPE_SWITCH(
         inp.scalar_type(),
         scalar_t,
@@ -90,12 +87,7 @@ Tensor& mul_out(
     }
     other_buf.resize(numel);
     QParams qp = extract_qparams(
-        other_scale,
-        other_zero_point,
-        other_quant_min,
-        other_quant_max,
-        other_axis,
-        other);
+        other_scale, other_zero_point, other_quant_min, other_quant_max, other);
     FUSED_QUANT_DTYPE_SWITCH(
         other.scalar_type(),
         scalar_t,
@@ -109,7 +101,7 @@ Tensor& mul_out(
     mul_kernel(inp_float, other_float, result_float.data(), numel);
 
     QParams qp = extract_qparams(
-        out_scale, out_zero_point, out_quant_min, out_quant_max, out_axis, out);
+        out_scale, out_zero_point, out_quant_min, out_quant_max, out);
     FUSED_QUANT_DTYPE_SWITCH(
         out.scalar_type(),
         scalar_t,
diff --git a/backends/cadence/fused_quant/op_mul.h b/backends/cadence/fused_quant/op_mul.h
index 402e39bd379..f7afa016b79 100644
--- a/backends/cadence/fused_quant/op_mul.h
+++ b/backends/cadence/fused_quant/op_mul.h
@@ -24,20 +24,17 @@ executorch::aten::Tensor& mul_out(
     executorch::aten::ScalarType inp_dtype,
     int64_t inp_quant_min,
     int64_t inp_quant_max,
-    executorch::aten::optional<int64_t> inp_axis,
     const executorch::aten::optional<executorch::aten::Tensor>& other_scale,
     const executorch::aten::optional<executorch::aten::Tensor>&
         other_zero_point,
     executorch::aten::ScalarType other_dtype,
     int64_t other_quant_min,
     int64_t other_quant_max,
-    executorch::aten::optional<int64_t> other_axis,
     const executorch::aten::optional<executorch::aten::Tensor>& out_scale,
     const executorch::aten::optional<executorch::aten::Tensor>& out_zero_point,
     executorch::aten::ScalarType out_dtype,
     int64_t out_quant_min,
     int64_t out_quant_max,
-    executorch::aten::optional<int64_t> out_axis,
     executorch::aten::Tensor& out);
 
 } // namespace native
diff --git a/backends/cadence/fused_quant/op_relu.cpp b/backends/cadence/fused_quant/op_relu.cpp
index 3fb8d92aff1..ebe7933a7b9 100644
--- a/backends/cadence/fused_quant/op_relu.cpp
+++ b/backends/cadence/fused_quant/op_relu.cpp
@@ -39,13 +39,11 @@ Tensor& relu_out(
     ScalarType inp_dtype,
     int64_t inp_quant_min,
     int64_t inp_quant_max,
-    optional<int64_t> inp_axis,
     const optional<Tensor>& out_scale,
     const optional<Tensor>& out_zero_point,
     ScalarType out_dtype,
     int64_t out_quant_min,
     int64_t out_quant_max,
-    optional<int64_t> out_axis,
     Tensor& out) {
   int64_t numel = inp.numel();
 
@@ -59,7 +57,7 @@ Tensor& relu_out(
     }
     inp_buf.resize(numel);
     QParams qp = extract_qparams(
-        inp_scale, inp_zero_point, inp_quant_min, inp_quant_max, inp_axis, inp);
+        inp_scale, inp_zero_point, inp_quant_min, inp_quant_max, inp);
     FUSED_QUANT_DTYPE_SWITCH(
         inp.scalar_type(),
         scalar_t,
@@ -73,7 +71,7 @@ Tensor& relu_out(
     relu_kernel(inp_float, result_float.data(), numel);
 
     QParams qp = extract_qparams(
-        out_scale, out_zero_point, out_quant_min, out_quant_max, out_axis, out);
+        out_scale, out_zero_point, out_quant_min, out_quant_max, out);
     FUSED_QUANT_DTYPE_SWITCH(
         out.scalar_type(),
         scalar_t,
diff --git a/backends/cadence/fused_quant/op_relu.h b/backends/cadence/fused_quant/op_relu.h
index 1a9d986ccce..e8527c7633f 100644
--- a/backends/cadence/fused_quant/op_relu.h
+++ b/backends/cadence/fused_quant/op_relu.h
@@ -23,13 +23,11 @@ executorch::aten::Tensor& relu_out(
     executorch::aten::ScalarType inp_dtype,
     int64_t inp_quant_min,
     int64_t inp_quant_max,
-    executorch::aten::optional<int64_t> inp_axis,
     const executorch::aten::optional<executorch::aten::Tensor>& out_scale,
     const executorch::aten::optional<executorch::aten::Tensor>& out_zero_point,
     executorch::aten::ScalarType out_dtype,
     int64_t out_quant_min,
     int64_t out_quant_max,
-    executorch::aten::optional<int64_t> out_axis,
     executorch::aten::Tensor& out);
 
 } // namespace native
diff --git a/backends/cadence/fused_quant/quant_utils.h b/backends/cadence/fused_quant/quant_utils.h
index a7f24432ab6..fff669a9e0e 100644
--- a/backends/cadence/fused_quant/quant_utils.h
+++ b/backends/cadence/fused_quant/quant_utils.h
@@ -17,28 +17,49 @@
 namespace cadence {
 namespace fused_quant {
 
+// Upper bound on tensor rank for affine block indexing. Reference quant kernels
+// operate on small ranks (linear rank 2, conv rank 4); 8 leaves headroom.
+static constexpr int kMaxAffineDim = 8;
+
+// Affine quantization params. Scale/zero_point are either a singleton
+// (per-tensor) or a full-rank tensor whose shape encodes the affine block
+// layout: ``block_size[d] = data.size(d) / scale.size(d)``. This single
+// representation covers per-tensor, per-channel, per-group, and blockwise. The
+// scale element for a data element at flat index ``i`` is found by decomposing
+// ``i`` into per-dim coordinates, mapping each to its block (``coord /
+// block_size[d]``), and re-linearizing through the scale strides.
 struct QParams {
   const float* scales;
   const int64_t* zero_points;
   int32_t quant_min;
   int32_t quant_max;
-  int64_t num_channels;
-  int64_t axis_stride;
+  bool per_tensor;
+  int64_t ndim;
+  int64_t data_strides[kMaxAffineDim];
+  int64_t scale_strides[kMaxAffineDim];
+  int64_t block_size[kMaxAffineDim];
 
   float scale_at(int64_t i) const {
-    return scales[channel_idx(i)];
+    return scales[scale_idx(i)];
   }
 
   int32_t zero_point_at(int64_t i) const {
-    return static_cast<int32_t>(zero_points[channel_idx(i)]);
+    return static_cast<int32_t>(zero_points[scale_idx(i)]);
   }
 
  private:
-  int64_t channel_idx(int64_t i) const {
-    if (num_channels == 1) {
+  int64_t scale_idx(int64_t i) const {
+    if (per_tensor) {
       return 0;
     }
-    return (i / axis_stride) % num_channels;
+    int64_t idx = 0;
+    int64_t rem = i;
+    for (int64_t d = 0; d < ndim; ++d) {
+      const int64_t coord = rem / data_strides[d];
+      rem -= coord * data_strides[d];
+      idx += (coord / block_size[d]) * scale_strides[d];
+    }
+    return idx;
   }
 };
 
@@ -47,27 +68,66 @@ inline QParams extract_qparams(
     const executorch::aten::optional<executorch::aten::Tensor>& zp_tensor,
     int64_t quant_min,
     int64_t quant_max,
-    executorch::aten::optional<int64_t> axis,
     const executorch::aten::Tensor& data_tensor) {
   const auto& scale = scale_tensor.value();
   const auto& zp = zp_tensor.value();
 
-  int64_t num_channels = scale.numel();
-  int64_t axis_stride = 1;
-  if (axis.has_value()) {
-    for (int64_t d = axis.value() + 1; d < data_tensor.dim(); ++d) {
-      axis_stride *= data_tensor.size(d);
-    }
+  QParams qp{};
+  qp.scales = scale.const_data_ptr<float>();
+  qp.zero_points = zp.const_data_ptr<int64_t>();
+  qp.quant_min = static_cast<int32_t>(quant_min);
+  qp.quant_max = static_cast<int32_t>(quant_max);
+
+  // A singleton scale broadcasts across the whole tensor (per-tensor); no block
+  // layout to derive, and the scale rank need not match the data rank.
+  if (scale.numel() == 1) {
+    qp.per_tensor = true;
+    return qp;
   }
 
-  return {
-      scale.const_data_ptr<float>(),
-      zp.const_data_ptr<int64_t>(),
-      static_cast<int32_t>(quant_min),
-      static_cast<int32_t>(quant_max),
-      num_channels,
-      axis_stride,
-  };
+  const int64_t ndim = data_tensor.dim();
+  ET_CHECK_MSG(
+      scale.dim() == ndim,
+      "per-channel/group scale must be full-rank (rank %d) to match data rank %d",
+      static_cast<int>(scale.dim()),
+      static_cast<int>(ndim));
+  ET_CHECK_MSG(
+      ndim <= kMaxAffineDim,
+      "tensor rank %d exceeds kMaxAffineDim %d",
+      static_cast<int>(ndim),
+      static_cast<int>(kMaxAffineDim));
+  // zero_points is indexed with the same offsets as scales, so it must hold
+  // exactly as many elements or zero_point_at() would read out of bounds.
+  ET_CHECK_MSG(
+      zp.numel() == scale.numel(),
+      "zero_point numel %d must match scale numel %d",
+      static_cast<int>(zp.numel()),
+      static_cast<int>(scale.numel()));
+
+  qp.per_tensor = false;
+  qp.ndim = ndim;
+  int64_t data_stride = 1;
+  int64_t scale_stride = 1;
+  // Walk dims innermost-first, accumulating row-major strides from the sizes.
+  for (int64_t d = ndim - 1; d >= 0; --d) {
+    const int64_t data_size = data_tensor.size(d);
+    const int64_t scale_size = scale.size(d);
+    // Every scale dim must evenly tile the matching data dim. Otherwise the
+    // integer division below truncates (or yields 0), and scale_idx() would
+    // index past the end of the scale buffer or divide by zero.
+    ET_CHECK_MSG(
+        scale_size > 0 && data_size % scale_size == 0,
+        "scale dim %d size %d must evenly divide data dim size %d",
+        static_cast<int>(d),
+        static_cast<int>(scale_size),
+        static_cast<int>(data_size));
+    qp.data_strides[d] = data_stride;
+    qp.scale_strides[d] = scale_stride;
+    qp.block_size[d] = data_size / scale_size;
+    data_stride *= data_size;
+    scale_stride *= scale_size;
+  }
+  return qp;
 }
 
 template <typename T>
diff --git a/backends/cadence/fused_quant/tests/test_op_add.cpp b/backends/cadence/fused_quant/tests/test_op_add.cpp
index e88932cc6ef..dca110cf0e1 100644
--- a/backends/cadence/fused_quant/tests/test_op_add.cpp
+++ b/backends/cadence/fused_quant/tests/test_op_add.cpp
@@ -25,10 +25,6 @@ optional<Tensor> none_tensor() {
   return optional<Tensor>();
 }
 
-optional<int64_t> none_axis() {
-  return optional<int64_t>();
-}
-
 } // namespace
 
 class FusedQuantAddTest : public OperatorTest {};
@@ -66,19 +62,16 @@ TEST_F(FusedQuantAddTest, AllQuantizedPerTensor) {
       ScalarType::Float,
       -128,
       127,
-      none_axis(),
       optional<Tensor>(other_scale),
       optional<Tensor>(other_zp),
       ScalarType::Float,
       -128,
       127,
-      none_axis(),
       optional<Tensor>(out_scale),
       optional<Tensor>(out_zp),
       ScalarType::Char,
       -128,
       127,
-      none_axis(),
       1.0,
       out);
 
@@ -112,19 +105,16 @@ TEST_F(FusedQuantAddTest, FloatInputsQuantizedOutput) {
       ScalarType::Float,
       0,
       0,
-      none_axis(),
       none_tensor(),
       none_tensor(),
       ScalarType::Float,
       0,
       0,
-      none_axis(),
       optional<Tensor>(out_scale),
       optional<Tensor>(out_zp),
       ScalarType::Char,
       -128,
       127,
-      none_axis(),
       1.0,
       out);
 
@@ -161,19 +151,16 @@ TEST_F(FusedQuantAddTest, QuantizedInpFloatOther) {
       ScalarType::Float,
       -128,
       127,
-      none_axis(),
       none_tensor(),
       none_tensor(),
       ScalarType::Float,
       0,
       0,
-      none_axis(),
       optional<Tensor>(out_scale),
       optional<Tensor>(out_zp),
       ScalarType::Char,
       -128,
       127,
-      none_axis(),
       1.0,
       out);
 
@@ -210,19 +197,16 @@ TEST_F(FusedQuantAddTest, FloatInpQuantizedOther) {
       ScalarType::Float,
       0,
       0,
-      none_axis(),
       optional<Tensor>(other_scale),
       optional<Tensor>(other_zp),
       ScalarType::Float,
       -128,
       127,
-      none_axis(),
       optional<Tensor>(out_scale),
       optional<Tensor>(out_zp),
       ScalarType::Char,
       -128,
       127,
-      none_axis(),
       1.0,
       out);
 
@@ -259,19 +243,16 @@ TEST_F(FusedQuantAddTest, QuantizedInputsFloatOutput) {
       ScalarType::Float,
       -128,
       127,
-      none_axis(),
       optional<Tensor>(other_scale),
       optional<Tensor>(other_zp),
       ScalarType::Float,
       -128,
       127,
-      none_axis(),
       none_tensor(),
       none_tensor(),
       ScalarType::Float,
       0,
       0,
-      none_axis(),
       1.0,
       out);
 
@@ -305,19 +286,16 @@ TEST_F(FusedQuantAddTest, QuantizedInpFloatOtherFloatOutput) {
       ScalarType::Float,
       -128,
       127,
-      none_axis(),
       none_tensor(),
       none_tensor(),
       ScalarType::Float,
       0,
       0,
-      none_axis(),
       none_tensor(),
       none_tensor(),
       ScalarType::Float,
       0,
       0,
-      none_axis(),
       1.0,
       out);
 
@@ -351,19 +329,16 @@ TEST_F(FusedQuantAddTest, FloatInpQuantizedOtherFloatOutput) {
       ScalarType::Float,
       0,
       0,
-      none_axis(),
       optional<Tensor>(other_scale),
       optional<Tensor>(other_zp),
       ScalarType::Float,
       -128,
       127,
-      none_axis(),
       none_tensor(),
       none_tensor(),
       ScalarType::Float,
       0,
       0,
-      none_axis(),
       1.0,
       out);
 
@@ -376,15 +351,16 @@ TEST_F(FusedQuantAddTest, PerChannelInput) {
   TensorFactory<ScalarType::Float> tf_float;
   TensorFactory<ScalarType::Long> tf_long;
 
-  // Shape [2, 2], axis=0 → 2 channels, axis_stride=2
+  // Per-channel along axis 0: full-rank scale shape [2, 1] encodes the block
+  // layout (block_size = [2/2, 2/1] = [1, 2]).
   const std::vector<int> sizes{2, 2};
 
   Tensor inp = tf_int8.make(sizes, {2, 4, 6, 8});
   Tensor other = tf_float.make(sizes, {1.0, 1.0, 1.0, 1.0});
 
   // Per-channel: channel 0 scale=0.5, channel 1 scale=1.0
-  Tensor inp_scale = tf_float.make({2}, {0.5, 1.0});
-  Tensor inp_zp = tf_long.make({2}, {0, 0});
+  Tensor inp_scale = tf_float.make({2, 1}, {0.5, 1.0});
+  Tensor inp_zp = tf_long.make({2, 1}, {0, 0});
   Tensor out_scale = tf_float.make({1}, {0.5});
   Tensor out_zp = tf_long.make({1}, {0});
 
@@ -403,19 +379,16 @@ TEST_F(FusedQuantAddTest, PerChannelInput) {
       ScalarType::Float,
       -128,
       127,
-      optional<int64_t>(0),
       none_tensor(),
       none_tensor(),
       ScalarType::Float,
       0,
       0,
-      none_axis(),
       optional<Tensor>(out_scale),
       optional<Tensor>(out_zp),
       ScalarType::Char,
       -128,
       127,
-      none_axis(),
       1.0,
       out);
 
@@ -428,15 +401,15 @@ TEST_F(FusedQuantAddTest, PerChannelOutput) {
   TensorFactory<ScalarType::Float> tf_float;
   TensorFactory<ScalarType::Long> tf_long;
 
-  // Shape [2, 2], axis=0 → 2 channels
+  // Per-channel along axis 0: full-rank scale shape [2, 1] (block_size [1, 2])
   const std::vector<int> sizes{2, 2};
 
   Tensor inp = tf_float.make(sizes, {2.0, 3.0, 7.0, 9.0});
   Tensor other = tf_float.make(sizes, {0.0, 0.0, 0.0, 0.0});
 
   // Per-channel output: channel 0 scale=0.5, channel 1 scale=1.0
-  Tensor out_scale = tf_float.make({2}, {0.5, 1.0});
-  Tensor out_zp = tf_long.make({2}, {0, 0});
+  Tensor out_scale = tf_float.make({2, 1}, {0.5, 1.0});
+  Tensor out_zp = tf_long.make({2, 1}, {0, 0});
 
   Tensor out = tf_int8.zeros(sizes);
 
@@ -452,19 +425,16 @@ TEST_F(FusedQuantAddTest, PerChannelOutput) {
       ScalarType::Float,
       0,
       0,
-      none_axis(),
       none_tensor(),
       none_tensor(),
       ScalarType::Float,
       0,
       0,
-      none_axis(),
       optional<Tensor>(out_scale),
       optional<Tensor>(out_zp),
       ScalarType::Char,
       -128,
       127,
-      optional<int64_t>(0),
       1.0,
       out);
 
@@ -504,19 +474,16 @@ TEST_F(FusedQuantAddTest, AlphaScaling) {
       ScalarType::Float,
       -128,
       127,
-      none_axis(),
       optional<Tensor>(other_scale),
       optional<Tensor>(other_zp),
       ScalarType::Float,
       -128,
       127,
-      none_axis(),
       optional<Tensor>(out_scale),
       optional<Tensor>(out_zp),
       ScalarType::Char,
       -128,
       127,
-      none_axis(),
       2.0,
       out);
 
@@ -559,19 +526,16 @@ TEST_F(FusedQuantAddTest, NonZeroZeroPoint) {
       ScalarType::Float,
       -128,
       127,
-      none_axis(),
       optional<Tensor>(other_scale),
       optional<Tensor>(other_zp),
       ScalarType::Float,
       -128,
       127,
-      none_axis(),
       optional<Tensor>(out_scale),
       optional<Tensor>(out_zp),
       ScalarType::Char,
       -128,
       127,
-      none_axis(),
       1.0,
       out);
 
diff --git a/backends/cadence/fused_quant/tests/test_op_bmm.cpp b/backends/cadence/fused_quant/tests/test_op_bmm.cpp
index 93c511a10d5..5ede47ea8a9 100644
--- a/backends/cadence/fused_quant/tests/test_op_bmm.cpp
+++ b/backends/cadence/fused_quant/tests/test_op_bmm.cpp
@@ -25,10 +25,6 @@ optional<Tensor> none_tensor() {
   return optional<Tensor>();
 }
 
-optional<int64_t> none_axis() {
-  return optional<int64_t>();
-}
-
 } // namespace
 
 class FusedQuantBmmTest : public OperatorTest {};
@@ -73,19 +69,16 @@ TEST_F(FusedQuantBmmTest, AllQuantizedPerTensor) {
       ScalarType::Float,
       -128,
       127,
-      none_axis(),
       optional<Tensor>(other_scale),
       optional<Tensor>(other_zp),
       ScalarType::Float,
       -128,
       127,
-      none_axis(),
       optional<Tensor>(out_scale),
       optional<Tensor>(out_zp),
       ScalarType::Char,
       -128,
       127,
-      none_axis(),
       out);
 
   EXPECT_TENSOR_EQ(out, tf_int8.make(out_sizes, {2, 4, 6, 8}));
@@ -121,19 +114,16 @@ TEST_F(FusedQuantBmmTest, FloatInputsQuantizedOutput) {
       ScalarType::Float,
       0,
       0,
-      none_axis(),
       none_tensor(),
       none_tensor(),
       ScalarType::Float,
       0,
       0,
-      none_axis(),
       optional<Tensor>(out_scale),
       optional<Tensor>(out_zp),
       ScalarType::Char,
       -128,
       127,
-      none_axis(),
       out);
 
   EXPECT_TENSOR_EQ(out, tf_int8.make(out_sizes, {2, 4, 6, 8}));
@@ -171,19 +161,16 @@ TEST_F(FusedQuantBmmTest, QuantizedInputsFloatOutput) {
       ScalarType::Float,
       -128,
       127,
-      none_axis(),
       optional<Tensor>(other_scale),
       optional<Tensor>(other_zp),
       ScalarType::Float,
       -128,
       127,
-      none_axis(),
       none_tensor(),
       none_tensor(),
       ScalarType::Float,
       0,
       0,
-      none_axis(),
       out);
 
   EXPECT_TENSOR_EQ(out, tf_float.make(out_sizes, {1.0, 2.0, 3.0, 4.0}));
@@ -221,19 +208,16 @@ TEST_F(FusedQuantBmmTest, QuantizedInpFloatOther) {
       ScalarType::Float,
       -128,
       127,
-      none_axis(),
       none_tensor(),
       none_tensor(),
       ScalarType::Float,
       0,
       0,
-      none_axis(),
       optional<Tensor>(out_scale),
       optional<Tensor>(out_zp),
       ScalarType::Char,
       -128,
       127,
-      none_axis(),
       out);
 
   EXPECT_TENSOR_EQ(out, tf_int8.make(out_sizes, {2, 4, 6, 8}));
@@ -284,19 +268,16 @@ TEST_F(FusedQuantBmmTest, NonZeroZeroPoint) {
       ScalarType::Float,
       -128,
       127,
-      none_axis(),
       optional<Tensor>(other_scale),
       optional<Tensor>(other_zp),
       ScalarType::Float,
       -128,
       127,
-      none_axis(),
       optional<Tensor>(out_scale),
       optional<Tensor>(out_zp),
       ScalarType::Char,
       -128,
       127,
-      none_axis(),
       out);
 
   EXPECT_TENSOR_EQ(out, tf_int8.make(out_sizes, {3, 6, 2, 5}));
@@ -341,19 +322,16 @@ TEST_F(FusedQuantBmmTest, LargerBatch) {
       ScalarType::Float,
       -128,
       127,
-      none_axis(),
       optional<Tensor>(other_scale),
       optional<Tensor>(other_zp),
       ScalarType::Float,
       -128,
       127,
-      none_axis(),
       optional<Tensor>(out_scale),
       optional<Tensor>(out_zp),
       ScalarType::Char,
       -128,
       127,
-      none_axis(),
       out);
 
   EXPECT_TENSOR_EQ(out, tf_int8.make(out_sizes, {2, 4, 6, 8, 10, 12, 14, 16}));
diff --git a/backends/cadence/fused_quant/tests/test_op_hardswish.cpp b/backends/cadence/fused_quant/tests/test_op_hardswish.cpp
index e92989c64d2..502d680d2e3 100644
--- a/backends/cadence/fused_quant/tests/test_op_hardswish.cpp
+++ b/backends/cadence/fused_quant/tests/test_op_hardswish.cpp
@@ -25,10 +25,6 @@ optional<Tensor> none_tensor() {
   return optional<Tensor>();
 }
 
-optional<int64_t> none_axis() {
-  return optional<int64_t>();
-}
-
 } // namespace
 
 class FusedQuantHardswishTest : public OperatorTest {};
@@ -66,13 +62,11 @@ TEST_F(FusedQuantHardswishTest, AllQuantizedPerTensor) {
       ScalarType::Float,
       -128,
       127,
-      none_axis(),
       optional<Tensor>(out_scale),
       optional<Tensor>(out_zp),
       ScalarType::Char,
       -128,
       127,
-      none_axis(),
       out);
 
   EXPECT_TENSOR_EQ(out, tf_int8.make(sizes, {0, 0, 0, 3, 6, 10}));
@@ -103,13 +97,11 @@ TEST_F(FusedQuantHardswishTest, FloatInputQuantizedOutput) {
       ScalarType::Float,
       0,
       0,
-      none_axis(),
       optional<Tensor>(out_scale),
       optional<Tensor>(out_zp),
       ScalarType::Char,
       -128,
       127,
-      none_axis(),
       out);
 
   EXPECT_TENSOR_EQ(out, tf_int8.make(sizes, {0, 0, 0, 3, 6, 10}));
@@ -140,13 +132,11 @@ TEST_F(FusedQuantHardswishTest, QuantizedInputFloatOutput) {
       ScalarType::Float,
       -128,
       127,
-      none_axis(),
       none_tensor(),
       none_tensor(),
       ScalarType::Float,
       0,
       0,
-      none_axis(),
       out);
 
   EXPECT_TENSOR_EQ(out, tf_float.make(sizes, {0.0, 0.0, 0.0, 3.0, 6.0, 10.0}));
@@ -158,14 +148,15 @@ TEST_F(FusedQuantHardswishTest, PerChannelInput) {
   TensorFactory<ScalarType::Float> tf_float;
   TensorFactory<ScalarType::Long> tf_long;
 
-  // Shape [2, 3], axis=0 → 2 channels, axis_stride=3
+  // Per-channel along axis 0: full-rank scale shape [2, 1] encodes the block
+  // layout (block_size = [2/2, 3/1] = [1, 3]).
   const std::vector<int> sizes{2, 3};
 
   Tensor inp = tf_int8.make(sizes, {-6, -3, 0, 3, 6, 10});
 
   // Per-channel: channel 0 scale=1.0, channel 1 scale=0.5
-  Tensor inp_scale = tf_float.make({2}, {1.0, 0.5});
-  Tensor inp_zp = tf_long.make({2}, {0, 0});
+  Tensor inp_scale = tf_float.make({2, 1}, {1.0, 0.5});
+  Tensor inp_zp = tf_long.make({2, 1}, {0, 0});
   Tensor out_scale = tf_float.make({1}, {0.5});
   Tensor out_zp = tf_long.make({1}, {0});
 
@@ -187,13 +178,11 @@ TEST_F(FusedQuantHardswishTest, PerChannelInput) {
       ScalarType::Float,
       -128,
       127,
-      optional<int64_t>(0),
       optional<Tensor>(out_scale),
       optional<Tensor>(out_zp),
       ScalarType::Char,
       -128,
       127,
-      none_axis(),
       out);
 
   EXPECT_TENSOR_EQ(out, tf_int8.make(sizes, {0, 0, 0, 2, 6, 10}));
@@ -205,14 +194,14 @@ TEST_F(FusedQuantHardswishTest, PerChannelOutput) {
   TensorFactory<ScalarType::Float> tf_float;
   TensorFactory<ScalarType::Long> tf_long;
 
-  // Shape [2, 3], axis=0 → 2 channels
+  // Per-channel along axis 0: full-rank scale shape [2, 1] (block_size [1, 3])
   const std::vector<int> sizes{2, 3};
 
   Tensor inp = tf_float.make(sizes, {-6.0, 0.0, 3.0, 6.0, 10.0, 12.0});
 
   // Per-channel output: channel 0 scale=1.0, channel 1 scale=0.5
-  Tensor out_scale = tf_float.make({2}, {1.0, 0.5});
-  Tensor out_zp = tf_long.make({2}, {0, 0});
+  Tensor out_scale = tf_float.make({2, 1}, {1.0, 0.5});
+  Tensor out_zp = tf_long.make({2, 1}, {0, 0});
 
   Tensor out = tf_int8.zeros(sizes);
 
@@ -229,13 +218,11 @@ TEST_F(FusedQuantHardswishTest, PerChannelOutput) {
       ScalarType::Float,
       0,
       0,
-      none_axis(),
       optional<Tensor>(out_scale),
       optional<Tensor>(out_zp),
       ScalarType::Char,
       -128,
       127,
-      optional<int64_t>(0),
       out);
 
   EXPECT_TENSOR_EQ(out, tf_int8.make(sizes, {0, 0, 3, 12, 20, 24}));
@@ -272,13 +259,11 @@ TEST_F(FusedQuantHardswishTest, NonZeroZeroPoint) {
       ScalarType::Float,
       -128,
       127,
-      none_axis(),
       optional<Tensor>(out_scale),
       optional<Tensor>(out_zp),
       ScalarType::Char,
       -128,
       127,
-      none_axis(),
       out);
 
   EXPECT_TENSOR_EQ(out, tf_int8.make(sizes, {1, 1, 1, 4, 7, 11}));
@@ -312,13 +297,11 @@ TEST_F(FusedQuantHardswishTest, NegativeRegion) {
       ScalarType::Float,
       0,
       0,
-      none_axis(),
       optional<Tensor>(out_scale),
       optional<Tensor>(out_zp),
       ScalarType::Char,
       -128,
       127,
-      none_axis(),
       out);
 
   EXPECT_TENSOR_EQ(out, tf_int8.make(sizes, {0, 0, 0, 0}));
@@ -346,13 +329,11 @@ TEST_F(FusedQuantHardswishTest, LinearRegion) {
       ScalarType::Float,
       0,
       0,
-      none_axis(),
       none_tensor(),
       none_tensor(),
       ScalarType::Float,
       0,
       0,
-      none_axis(),
       out);
 
   EXPECT_TENSOR_EQ(out, tf_float.make(sizes, {3.0, 4.0, 6.0, 10.0}));
@@ -392,13 +373,11 @@ TEST_F(FusedQuantHardswishTest, TransitionRegion) {
       ScalarType::Float,
       -128,
       127,
-      none_axis(),
       optional<Tensor>(out_scale),
       optional<Tensor>(out_zp),
       ScalarType::Char,
       -128,
       127,
-      none_axis(),
       out);
 
   EXPECT_TENSOR_EQ(out, tf_int8.make(sizes, {0, -3, 0, 9, 24}));
diff --git a/backends/cadence/fused_quant/tests/test_op_mul.cpp b/backends/cadence/fused_quant/tests/test_op_mul.cpp
index 77983155015..0b9addabc5e 100644
--- a/backends/cadence/fused_quant/tests/test_op_mul.cpp
+++ b/backends/cadence/fused_quant/tests/test_op_mul.cpp
@@ -25,10 +25,6 @@ optional<Tensor> none_tensor() {
   return optional<Tensor>();
 }
 
-optional<int64_t> none_axis() {
-  return optional<int64_t>();
-}
-
 } // namespace
 
 class FusedQuantMulTest : public OperatorTest {};
@@ -66,19 +62,16 @@ TEST_F(FusedQuantMulTest, AllQuantizedPerTensor) {
       ScalarType::Float,
       -128,
       127,
-      none_axis(),
       optional<Tensor>(other_scale),
       optional<Tensor>(other_zp),
       ScalarType::Float,
       -128,
       127,
-      none_axis(),
       optional<Tensor>(out_scale),
       optional<Tensor>(out_zp),
       ScalarType::Char,
       -128,
       127,
-      none_axis(),
       out);
 
   EXPECT_TENSOR_EQ(out, tf_int8.make(sizes, {2, 4, 6, 8}));
@@ -111,19 +104,16 @@ TEST_F(FusedQuantMulTest, FloatInputsQuantizedOutput) {
       ScalarType::Float,
       0,
       0,
-      none_axis(),
       none_tensor(),
       none_tensor(),
       ScalarType::Float,
       0,
       0,
-      none_axis(),
       optional<Tensor>(out_scale),
       optional<Tensor>(out_zp),
       ScalarType::Char,
       -128,
       127,
-      none_axis(),
       out);
 
   EXPECT_TENSOR_EQ(out, tf_int8.make(sizes, {4, 8, 12, 16}));
@@ -159,19 +149,16 @@ TEST_F(FusedQuantMulTest, QuantizedInpFloatOther) {
       ScalarType::Float,
       -128,
       127,
-      none_axis(),
       none_tensor(),
       none_tensor(),
       ScalarType::Float,
       0,
       0,
-      none_axis(),
       optional<Tensor>(out_scale),
       optional<Tensor>(out_zp),
       ScalarType::Char,
       -128,
       127,
-      none_axis(),
       out);
 
   EXPECT_TENSOR_EQ(out, tf_int8.make(sizes, {4, 8, 12, 16}));
@@ -207,19 +194,16 @@ TEST_F(FusedQuantMulTest, FloatInpQuantizedOther) {
       ScalarType::Float,
       0,
       0,
-      none_axis(),
       optional<Tensor>(other_scale),
       optional<Tensor>(other_zp),
       ScalarType::Float,
       -128,
       127,
-      none_axis(),
       optional<Tensor>(out_scale),
       optional<Tensor>(out_zp),
       ScalarType::Char,
       -128,
       127,
-      none_axis(),
       out);
 
   EXPECT_TENSOR_EQ(out, tf_int8.make(sizes, {2, 4, 6, 8}));
@@ -255,19 +239,16 @@ TEST_F(FusedQuantMulTest, QuantizedInputsFloatOutput) {
       ScalarType::Float,
       -128,
       127,
-      none_axis(),
       optional<Tensor>(other_scale),
       optional<Tensor>(other_zp),
       ScalarType::Float,
       -128,
       127,
-      none_axis(),
       none_tensor(),
       none_tensor(),
       ScalarType::Float,
       0,
       0,
-      none_axis(),
       out);
 
   EXPECT_TENSOR_EQ(out, tf_float.make(sizes, {1.0, 2.0, 3.0, 4.0}));
@@ -300,19 +281,16 @@ TEST_F(FusedQuantMulTest, QuantizedInpFloatOtherFloatOutput) {
       ScalarType::Float,
       -128,
       127,
-      none_axis(),
       none_tensor(),
       none_tensor(),
       ScalarType::Float,
       0,
       0,
-      none_axis(),
       none_tensor(),
       none_tensor(),
       ScalarType::Float,
       0,
       0,
-      none_axis(),
       out);
 
   EXPECT_TENSOR_EQ(out, tf_float.make(sizes, {2.0, 4.0, 6.0, 8.0}));
@@ -345,19 +323,16 @@ TEST_F(FusedQuantMulTest, FloatInpQuantizedOtherFloatOutput) {
       ScalarType::Float,
       0,
       0,
-      none_axis(),
       optional<Tensor>(other_scale),
       optional<Tensor>(other_zp),
       ScalarType::Float,
       -128,
       127,
-      none_axis(),
       none_tensor(),
       none_tensor(),
       ScalarType::Float,
       0,
       0,
-      none_axis(),
       out);
 
   EXPECT_TENSOR_EQ(out, tf_float.make(sizes, {1.0, 2.0, 3.0, 4.0}));
@@ -369,15 +344,16 @@ TEST_F(FusedQuantMulTest, PerChannelInput) {
   TensorFactory<ScalarType::Float> tf_float;
   TensorFactory<ScalarType::Long> tf_long;
 
-  // Shape [2, 2], axis=0 -> 2 channels, axis_stride=2
   const std::vector<int> sizes{2, 2};
 
   Tensor inp = tf_int8.make(sizes, {2, 4, 6, 8});
   Tensor other = tf_float.make(sizes, {2.0, 2.0, 2.0, 2.0});
 
-  // Per-channel: channel 0 scale=0.5, channel 1 scale=1.0
-  Tensor inp_scale = tf_float.make({2}, {0.5, 1.0});
-  Tensor inp_zp = tf_long.make({2}, {0, 0});
+  // Per-channel along axis 0: full-rank scale shape [2, 1] encodes the block
+  // layout (block_size = [2/2, 2/1] = [1, 2]). Channel 0 scale=0.5, channel 1
+  // scale=1.0.
+  Tensor inp_scale = tf_float.make({2, 1}, {0.5, 1.0});
+  Tensor inp_zp = tf_long.make({2, 1}, {0, 0});
   Tensor out_scale = tf_float.make({1}, {0.5});
   Tensor out_zp = tf_long.make({1}, {0});
 
@@ -396,19 +372,16 @@ TEST_F(FusedQuantMulTest, PerChannelInput) {
       ScalarType::Float,
       -128,
       127,
-      optional<int64_t>(0),
       none_tensor(),
       none_tensor(),
       ScalarType::Float,
       0,
       0,
-      none_axis(),
       optional<Tensor>(out_scale),
       optional<Tensor>(out_zp),
       ScalarType::Char,
       -128,
       127,
-      none_axis(),
       out);
 
   EXPECT_TENSOR_EQ(out, tf_int8.make(sizes, {4, 8, 24, 32}));
@@ -420,15 +393,15 @@ TEST_F(FusedQuantMulTest, PerChannelOutput) {
   TensorFactory<ScalarType::Float> tf_float;
   TensorFactory<ScalarType::Long> tf_long;
 
-  // Shape [2, 2], axis=0 -> 2 channels
+  // Per-channel along axis 0: full-rank scale shape [2, 1] (block_size [1, 2])
   const std::vector<int> sizes{2, 2};
 
   Tensor inp = tf_float.make(sizes, {2.0, 3.0, 7.0, 9.0});
   Tensor other = tf_float.make(sizes, {1.0, 1.0, 1.0, 1.0});
 
   // Per-channel output: channel 0 scale=0.5, channel 1 scale=1.0
-  Tensor out_scale = tf_float.make({2}, {0.5, 1.0});
-  Tensor out_zp = tf_long.make({2}, {0, 0});
+  Tensor out_scale = tf_float.make({2, 1}, {0.5, 1.0});
+  Tensor out_zp = tf_long.make({2, 1}, {0, 0});
 
   Tensor out = tf_int8.zeros(sizes);
 
@@ -444,19 +417,16 @@ TEST_F(FusedQuantMulTest, PerChannelOutput) {
       ScalarType::Float,
       0,
       0,
-      none_axis(),
       none_tensor(),
       none_tensor(),
       ScalarType::Float,
       0,
       0,
-      none_axis(),
       optional<Tensor>(out_scale),
       optional<Tensor>(out_zp),
       ScalarType::Char,
       -128,
       127,
-      optional<int64_t>(0),
       out);
 
   EXPECT_TENSOR_EQ(out, tf_int8.make(sizes, {4, 6, 7, 9}));
@@ -498,19 +468,16 @@ TEST_F(FusedQuantMulTest, NonZeroZeroPoint) {
       ScalarType::Float,
       -128,
       127,
-      none_axis(),
       optional<Tensor>(other_scale),
       optional<Tensor>(other_zp),
       ScalarType::Float,
       -128,
       127,
-      none_axis(),
       optional<Tensor>(out_scale),
       optional<Tensor>(out_zp),
       ScalarType::Char,
       -128,
       127,
-      none_axis(),
       out);
 
   EXPECT_TENSOR_EQ(out, tf_int8.make(sizes, {2, 3, 3, 4}));
diff --git a/backends/cadence/fused_quant/tests/test_op_relu.cpp b/backends/cadence/fused_quant/tests/test_op_relu.cpp
index 6a35e36dfbf..6b83551fd2b 100644
--- a/backends/cadence/fused_quant/tests/test_op_relu.cpp
+++ b/backends/cadence/fused_quant/tests/test_op_relu.cpp
@@ -25,10 +25,6 @@ optional<Tensor> none_tensor() {
   return optional<Tensor>();
 }
 
-optional<int64_t> none_axis() {
-  return optional<int64_t>();
-}
-
 } // namespace
 
 class FusedQuantReluTest : public OperatorTest {};
@@ -61,13 +57,11 @@ TEST_F(FusedQuantReluTest, AllQuantizedPerTensor) {
       ScalarType::Float,
       -128,
       127,
-      none_axis(),
       optional<Tensor>(out_scale),
       optional<Tensor>(out_zp),
       ScalarType::Char,
       -128,
       127,
-      none_axis(),
       out);
 
   EXPECT_TENSOR_EQ(out, tf_int8.make(sizes, {0, 0, 2, 4}));
@@ -98,13 +92,11 @@ TEST_F(FusedQuantReluTest, FloatInputQuantizedOutput) {
       ScalarType::Float,
       0,
       0,
-      none_axis(),
       optional<Tensor>(out_scale),
       optional<Tensor>(out_zp),
       ScalarType::Char,
       -128,
       127,
-      none_axis(),
       out);
 
   EXPECT_TENSOR_EQ(out, tf_int8.make(sizes, {0, 0, 2, 4}));
@@ -135,13 +127,11 @@ TEST_F(FusedQuantReluTest, QuantizedInputFloatOutput) {
       ScalarType::Float,
       -128,
       127,
-      none_axis(),
       none_tensor(),
       none_tensor(),
       ScalarType::Float,
       0,
       0,
-      none_axis(),
       out);
 
   EXPECT_TENSOR_EQ(out, tf_float.make(sizes, {0.0, 0.0, 1.0, 2.0}));
@@ -153,14 +143,15 @@ TEST_F(FusedQuantReluTest, PerChannelInput) {
   TensorFactory<ScalarType::Float> tf_float;
   TensorFactory<ScalarType::Long> tf_long;
 
-  // Shape [2, 2], axis=0 → 2 channels, axis_stride=2
+  // Per-channel along axis 0: full-rank scale shape [2, 1] encodes the block
+  // layout (block_size = [2/2, 2/1] = [1, 2]).
   const std::vector<int> sizes{2, 2};
 
   Tensor inp = tf_int8.make(sizes, {-4, 2, -3, 6});
 
   // Per-channel: channel 0 scale=0.5, channel 1 scale=1.0
-  Tensor inp_scale = tf_float.make({2}, {0.5, 1.0});
-  Tensor inp_zp = tf_long.make({2}, {0, 0});
+  Tensor inp_scale = tf_float.make({2, 1}, {0.5, 1.0});
+  Tensor inp_zp = tf_long.make({2, 1}, {0, 0});
   Tensor out_scale = tf_float.make({1}, {0.5});
   Tensor out_zp = tf_long.make({1}, {0});
 
@@ -178,13 +169,11 @@ TEST_F(FusedQuantReluTest, PerChannelInput) {
       ScalarType::Float,
       -128,
       127,
-      optional<int64_t>(0),
       optional<Tensor>(out_scale),
       optional<Tensor>(out_zp),
       ScalarType::Char,
       -128,
       127,
-      none_axis(),
       out);
 
   EXPECT_TENSOR_EQ(out, tf_int8.make(sizes, {0, 2, 0, 12}));
@@ -196,14 +185,14 @@ TEST_F(FusedQuantReluTest, PerChannelOutput) {
   TensorFactory<ScalarType::Char> tf_int8;
   TensorFactory<ScalarType::Long> tf_long;
 
-  // Shape [2, 2], axis=0 → 2 channels
+  // Per-channel along axis 0: full-rank scale shape [2, 1] (block_size [1, 2])
   const std::vector<int> sizes{2, 2};
 
   Tensor inp = tf_float.make(sizes, {-1.0, 3.0, -2.0, 9.0});
 
   // Per-channel output: channel 0 scale=0.5, channel 1 scale=1.0
-  Tensor out_scale = tf_float.make({2}, {0.5, 1.0});
-  Tensor out_zp = tf_long.make({2}, {0, 0});
+  Tensor out_scale = tf_float.make({2, 1}, {0.5, 1.0});
+  Tensor out_zp = tf_long.make({2, 1}, {0, 0});
 
   Tensor out = tf_int8.zeros(sizes);
 
@@ -218,13 +207,11 @@ TEST_F(FusedQuantReluTest, PerChannelOutput) {
       ScalarType::Float,
       0,
       0,
-      none_axis(),
       optional<Tensor>(out_scale),
       optional<Tensor>(out_zp),
       ScalarType::Char,
       -128,
       127,
-      optional<int64_t>(0),
       out);
 
   EXPECT_TENSOR_EQ(out, tf_int8.make(sizes, {0, 6, 0, 9}));
@@ -261,13 +248,11 @@ TEST_F(FusedQuantReluTest, NonZeroZeroPoint) {
       ScalarType::Float,
       -128,
       127,
-      none_axis(),
       optional<Tensor>(out_scale),
       optional<Tensor>(out_zp),
       ScalarType::Char,
       -128,
       127,
-      none_axis(),
       out);
 
   EXPECT_TENSOR_EQ(out, tf_int8.make(sizes, {1, 1, 2, 3}));
@@ -301,13 +286,11 @@ TEST_F(FusedQuantReluTest, AllNegativeInputs) {
       ScalarType::Float,
       -128,
       127,
-      none_axis(),
       optional<Tensor>(out_scale),
       optional<Tensor>(out_zp),
       ScalarType::Char,
       -128,
       127,
-      none_axis(),
       out);
 
   EXPECT_TENSOR_EQ(out, tf_int8.make(sizes, {0, 0, 0, 0}));
@@ -341,13 +324,11 @@ TEST_F(FusedQuantReluTest, AllPositiveInputs) {
       ScalarType::Float,
       -128,
       127,
-      none_axis(),
       optional<Tensor>(out_scale),
       optional<Tensor>(out_zp),
       ScalarType::Char,
       -128,
       127,
-      none_axis(),
       out);
 
   EXPECT_TENSOR_EQ(out, tf_int8.make(sizes, {2, 4, 6, 8}));