From d797509b53ef4b1fb6004e4e680a514fd644b069 Mon Sep 17 00:00:00 2001
From: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com>
Date: Fri, 1 May 2026 17:06:14 -0700
Subject: [PATCH 1/7] fixes for fused moe (qwen3.6, GLM5.1) + MSE calibration

Signed-off-by: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com>
---
 examples/llm_ptq/hf_ptq.py                    |   7 +-
 modelopt/torch/quantization/model_quant.py    |   1 +
 .../nn/modules/tensor_quantizer.py            |   4 +-
 .../quantization/qtensor/nvfp4_tensor.py      |   9 +-
 .../general/ptq/nvfp4_experts_only_mse.yaml   | 130 ++++++++++++++++++
 5 files changed, 147 insertions(+), 4 deletions(-)
 create mode 100644 modelopt_recipes/general/ptq/nvfp4_experts_only_mse.yaml

diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py
index 875e78ceea6..0c1c914ce69 100755
--- a/examples/llm_ptq/hf_ptq.py
+++ b/examples/llm_ptq/hf_ptq.py
@@ -828,6 +828,11 @@ def pre_quantize(
     preview_input_ids = next(iter(calib_dataloader))[
         "input_features" if model_type == "whisper" else "input_ids"
     ][0:1]
+    # Strip leading padding tokens so the preview input shows real content
+    if model_type not in ("whisper",) and tokenizer is not None and tokenizer.pad_token_id is not None:
+        first_non_pad = (preview_input_ids[0] != tokenizer.pad_token_id).nonzero(as_tuple=True)[0]
+        if first_non_pad.numel() > 0:
+            preview_input_ids = preview_input_ids[:, first_non_pad[0]:]
 
     # Generate preview before quantization
     if args.skip_generate:
@@ -928,7 +933,7 @@ def input_decode(input_ids):
         if processor is not None and isinstance(processor, WhisperProcessor):
             return first_text_speech_dataset
         elif tokenizer is not None:
-            return tokenizer.batch_decode(input_ids)
+            return tokenizer.batch_decode(input_ids, skip_special_tokens=True)
         else:
             raise ValueError("The processor or tokenizer must be set")
 
diff --git a/modelopt/torch/quantization/model_quant.py b/modelopt/torch/quantization/model_quant.py
index 5e65f9cc1d4..3582223c4d3 100644
--- a/modelopt/torch/quantization/model_quant.py
+++ b/modelopt/torch/quantization/model_quant.py
@@ -595,6 +595,7 @@ def print_quant_summary(model: nn.Module, output_dir: str | None = None):
     lines.append(f"{len(lines)} TensorQuantizers found in model")
 
     if output_dir:
+        os.makedirs(output_dir, exist_ok=True)
         path = os.path.join(output_dir, ".quant_summary.txt")
         with open(path, "w", encoding="utf-8") as f:
             f.write("\n".join(lines) + "\n")
diff --git a/modelopt/torch/quantization/nn/modules/tensor_quantizer.py b/modelopt/torch/quantization/nn/modules/tensor_quantizer.py
index fa540b8fdf5..5e3cea44c2a 100644
--- a/modelopt/torch/quantization/nn/modules/tensor_quantizer.py
+++ b/modelopt/torch/quantization/nn/modules/tensor_quantizer.py
@@ -1122,7 +1122,7 @@ def forward(self, inputs):
 
         return outputs
 
-    def _short_amax(self, fmt=".4f"):
+    def _short_amax(self, fmt=".2e"):
         """Short description of amax.
 
         Returns:
@@ -1140,7 +1140,7 @@ def _short_amax(self, fmt=".4f"):
             return "meta"
         return self._short_tensor(self._amax, fmt)
 
-    def _short_tensor(self, tensor: torch.Tensor, fmt=".4f"):
+    def _short_tensor(self, tensor: torch.Tensor, fmt=".2e"):
         """Short description of tensor."""
         if tensor.numel() == 1:
             return f"{tensor.item():{fmt}}"
diff --git a/modelopt/torch/quantization/qtensor/nvfp4_tensor.py b/modelopt/torch/quantization/qtensor/nvfp4_tensor.py
index bb39c8a81e3..209a8e778be 100644
--- a/modelopt/torch/quantization/qtensor/nvfp4_tensor.py
+++ b/modelopt/torch/quantization/qtensor/nvfp4_tensor.py
@@ -128,9 +128,10 @@ def get_weights_scaling_factor_from_quantizer(
             # explodes to ``1.0 * 448 / (global_amax/6)``. fp8_e4m3fn has no Inf, so any
             # value >= 480 casts to NaN — clamp first to keep the stored byte finite.
             if not keep_high_precision:
+                _FP8_E4M3FN_MIN = 2**-9  # 0.001953125 — smallest positive subnormal
                 per_block_scale = (
                     (per_block_scale * 448.0 / per_block_scale_max)
-                    .clamp_(max=448.0)
+                    .clamp(min=_FP8_E4M3FN_MIN, max=448.0)
                     .to(torch.float8_e4m3fn)
                 )
             return per_block_scale, weights_scaling_factor_2
@@ -173,6 +174,12 @@ def get_weights_scaling_factor(
         per_block_scale[per_block_scale == 0] = 1.0
         # Convert to torch.float8_e4m3fn
         if not keep_high_precision:
+            # Clamp to the minimum positive FP8 E4M3FN subnormal (~0.00195 = 2^-9) before
+            # casting.  Without this, blocks whose scale falls below the FP8 representable
+            # range silently underflow to 0, causing those blocks to produce zero output at
+            # inference even when the weights are non-trivial.
+            _FP8_E4M3FN_MIN = 2**-9  # 0.001953125 — smallest positive subnormal
+            per_block_scale = per_block_scale.clamp(min=_FP8_E4M3FN_MIN)
             per_block_scale = per_block_scale.to(torch.float8_e4m3fn)
         return per_block_scale, weights_scaling_factor_2
 
diff --git a/modelopt_recipes/general/ptq/nvfp4_experts_only_mse.yaml b/modelopt_recipes/general/ptq/nvfp4_experts_only_mse.yaml
new file mode 100644
index 00000000000..0fbdb421075
--- /dev/null
+++ b/modelopt_recipes/general/ptq/nvfp4_experts_only_mse.yaml
@@ -0,0 +1,130 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+metadata:
+  recipe_type: ptq
+  description: >
+    NVFP4 W4A4 for MoE routed experts only. Static weight scales via MSE + FP8 scale sweep;
+    dynamic activation scales. Supports sequential experts (nn.Linear-based) and fused experts
+    (_QuantFusedExperts, HF transformers 5.0+ 3D nn.Parameter style).
+quantize:
+  algorithm:
+    method: mse
+    fp8_scale_sweep: true
+    layerwise: false
+  quant_cfg:
+    # ── Disable everything first ─────────────────────────────────────────────
+    - quantizer_name: '*'
+      enable: false
+
+    # ── Sequential experts (nn.Linear per expert) ────────────────────────────
+    - quantizer_name: '*mlp.experts*weight_quantizer'
+      enable: true
+      cfg:
+        block_sizes:
+          -1: 16
+          type: static
+          scale_bits: e4m3
+        num_bits: e2m1
+    - quantizer_name: '*mlp.experts*input_quantizer'
+      enable: true
+      cfg:
+        block_sizes:
+          -1: 16
+          type: dynamic
+          scale_bits: e4m3
+        num_bits: e2m1
+
+    # ── Sequential experts: Mixtral / block_sparse_moe style ────────────────
+    - quantizer_name: '*block_sparse_moe*weight_quantizer'
+      enable: true
+      cfg:
+        block_sizes:
+          -1: 16
+          type: static
+          scale_bits: e4m3
+        num_bits: e2m1
+    - quantizer_name: '*block_sparse_moe*input_quantizer'
+      enable: true
+      cfg:
+        block_sizes:
+          -1: 16
+          type: dynamic
+          scale_bits: e4m3
+        num_bits: e2m1
+
+    # ── Fused experts (_QuantFusedExperts, HF transformers 5.0+ 3D nn.Parameter style) ──
+    - quantizer_name: '*gate_up_proj_weight_quantizers*'
+      enable: true
+      cfg:
+        block_sizes:
+          -1: 16
+          type: static
+          scale_bits: e4m3
+        num_bits: e2m1
+    - quantizer_name: '*gate_up_proj_input_quantizer*'
+      enable: true
+      cfg:
+        block_sizes:
+          -1: 16
+          type: dynamic
+          scale_bits: e4m3
+        num_bits: e2m1
+    - quantizer_name: '*down_proj_weight_quantizers*'
+      enable: true
+      cfg:
+        block_sizes:
+          -1: 16
+          type: static
+          scale_bits: e4m3
+        num_bits: e2m1
+    - quantizer_name: '*down_proj_input_quantizer*'
+      enable: true
+      cfg:
+        block_sizes:
+          -1: 16
+          type: dynamic
+          scale_bits: e4m3
+        num_bits: e2m1
+
+    # ── Exclusions: shared experts, attention, routers, lm_head ─────────────
+    - quantizer_name: '*block_sparse_moe.gate*'
+      enable: false
+    - quantizer_name: '*linear_attn.conv1d*'
+      enable: false
+    - quantizer_name: '*lm_head*'
+      enable: false
+    - quantizer_name: '*mlp.gate.*'
+      enable: false
+    - quantizer_name: '*mlp.shared_expert*'
+      enable: false
+    - quantizer_name: '*mlp.shared_expert_gate.*'
+      enable: false
+    - quantizer_name: '*router*'
+      enable: false
+    - quantizer_name: 'output.*'
+      enable: false
+    - parent_class: 'nn.BatchNorm1d'
+      quantizer_name: '*'
+      enable: false
+    - parent_class: 'nn.BatchNorm2d'
+      quantizer_name: '*'
+      enable: false
+    - parent_class: 'nn.BatchNorm3d'
+      quantizer_name: '*'
+      enable: false
+    - parent_class: 'nn.LeakyReLU'
+      quantizer_name: '*'
+      enable: false

From 9aac0fbac08b3106cf407640dfed250c959b1224 Mon Sep 17 00:00:00 2001
From: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com>
Date: Mon, 4 May 2026 22:49:00 +0000
Subject: [PATCH 2/7] address reviewers' feedback

Signed-off-by: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com>
---
 .../general/ptq/nvfp4_experts_only_mse.yaml   |   2 +-
 .../plugins/test_fused_experts.py             | 102 +++++++++++++++---
 .../torch/quantization/test_nvfp4_tensor.py   |  68 ++++++++++++
 3 files changed, 154 insertions(+), 18 deletions(-)
 create mode 100644 tests/unit/torch/quantization/test_nvfp4_tensor.py

diff --git a/modelopt_recipes/general/ptq/nvfp4_experts_only_mse.yaml b/modelopt_recipes/general/ptq/nvfp4_experts_only_mse.yaml
index 0fbdb421075..76d50b760f0 100644
--- a/modelopt_recipes/general/ptq/nvfp4_experts_only_mse.yaml
+++ b/modelopt_recipes/general/ptq/nvfp4_experts_only_mse.yaml
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/tests/unit/torch/quantization/plugins/test_fused_experts.py b/tests/unit/torch/quantization/plugins/test_fused_experts.py
index 19e1ed49197..5ae6e0c2e5c 100644
--- a/tests/unit/torch/quantization/plugins/test_fused_experts.py
+++ b/tests/unit/torch/quantization/plugins/test_fused_experts.py
@@ -256,27 +256,51 @@ def test_expert_index_recovery(self):
 # Tests for export
 # ---------------------------------------------------------------------------
 class TestExportFusedExperts:
+    @staticmethod
+    def _cleanup_registry(mod_type):
+        if QuantModuleRegistry.get(mod_type) is not None:
+            QuantModuleRegistry.unregister(mod_type)
+
     def test_export_creates_per_expert_submodules(self):
         """_export_fused_experts should create per-expert submodules with standard naming."""
+        import modelopt.torch.quantization as mtq
         from modelopt.torch.export.moe_utils import _export_fused_experts
 
-        experts = _SyntheticFusedExperts()
-        expert_type = type(experts)
+        model = _TinyMoEModel()
+        expert_type = type(model.moe.experts)
+        self._cleanup_registry(expert_type)
 
-        # Manually register and convert
-        if QuantModuleRegistry.get(expert_type) is None:
-            QuantModuleRegistry.register({expert_type: "test.SyntheticFusedExperts"})(
-                _QuantFusedExperts
-            )
-        converted = QuantModuleRegistry.convert(experts)
+        quant_cfg = {
+            "quant_cfg": [
+                {"quantizer_name": "*", "enable": False},
+                {
+                    "quantizer_name": "*gate_up_proj_input_quantizer",
+                    "cfg": {"num_bits": 8, "axis": None},
+                },
+                {
+                    "quantizer_name": "*down_proj_input_quantizer",
+                    "cfg": {"num_bits": 8, "axis": None},
+                },
+                {
+                    "quantizer_name": "*gate_up_proj_weight_quantizer",
+                    "cfg": {"num_bits": 8, "axis": 0},
+                },
+                {
+                    "quantizer_name": "*down_proj_weight_quantizer",
+                    "cfg": {"num_bits": 8, "axis": 0},
+                },
+            ],
+            "algorithm": "max",
+        }
 
-        # Run a forward pass to calibrate (set amaxes)
-        seq_len = 16
-        hidden_states = torch.randn(seq_len, HIDDEN_DIM)
-        top_k_index = torch.randint(0, NUM_EXPERTS, (seq_len, TOP_K))
-        top_k_weights = torch.softmax(torch.randn(seq_len, TOP_K), dim=-1)
-        with torch.no_grad():
-            converted(hidden_states, top_k_index, top_k_weights)
+        def forward_loop(m):
+            torch.manual_seed(0)
+            for _ in range(2):
+                x = torch.randn(1, 4, HIDDEN_DIM)
+                m(x)
+
+        mtq.quantize(model, quant_cfg, forward_loop=forward_loop)
+        converted = model.moe.experts
 
         _export_fused_experts(converted, torch.float16)
 
@@ -297,8 +321,7 @@ def test_export_creates_per_expert_submodules(self):
         assert not hasattr(converted, "down_proj")
         assert not hasattr(converted, "gate_up_proj_weight_quantizers")
 
-        if QuantModuleRegistry.get(expert_type) is not None:
-            QuantModuleRegistry.unregister(expert_type)
+        self._cleanup_registry(expert_type)
 
     def test_uncalibrated_expert_gate_up_share_amax(self, monkeypatch):
         """gate_proj and up_proj must share weight_scale_2 even when an expert
@@ -899,3 +922,48 @@ def test_unrelated_dotted_number_unchanged(self):
             _normalize_fused_experts_quantizer_name("moe.layers.3.gate.weight")
             == "moe.layers.3.gate.weight"
         )
+
+
+# Verifies that MSE calibration discovers and calibrates every per-expert weight quantizer
+# inside a fused-expert ModuleList (both gate_up_proj and down_proj, for all experts).
+class TestFusedExpertsMSECalibration:
+    @staticmethod
+    def _cleanup_registry(mod_type):
+        if QuantModuleRegistry.get(mod_type) is not None:
+            QuantModuleRegistry.unregister(mod_type)
+
+    def test_mse_calibration_populates_all_expert_quantizers(self):
+        import modelopt.torch.quantization as mtq
+
+        model = _TinyMoEModel()
+        expert_type = type(model.moe.experts)
+        self._cleanup_registry(expert_type)
+
+        mtq.quantize(
+            model,
+            {
+                "quant_cfg": [
+                    {"quantizer_name": "*", "enable": False},
+                    {
+                        "quantizer_name": "*gate_up_proj_weight_quantizer",
+                        "cfg": {"num_bits": 8, "axis": None},
+                    },
+                    {
+                        "quantizer_name": "*down_proj_weight_quantizer",
+                        "cfg": {"num_bits": 8, "axis": None},
+                    },
+                ],
+                "algorithm": "mse",
+            },
+            forward_loop=lambda m: [m(torch.randn(1, 4, HIDDEN_DIM)) for _ in range(2)],
+        )
+
+        experts = model.moe.experts
+        for idx in range(NUM_EXPERTS):
+            assert experts.gate_up_proj_weight_quantizers[idx].amax is not None, (
+                f"gate_up_proj_weight_quantizers[{idx}] not calibrated — Bug 1 regression"
+            )
+            assert experts.down_proj_weight_quantizers[idx].amax is not None, (
+                f"down_proj_weight_quantizers[{idx}] not calibrated"
+            )
+        self._cleanup_registry(expert_type)
diff --git a/tests/unit/torch/quantization/test_nvfp4_tensor.py b/tests/unit/torch/quantization/test_nvfp4_tensor.py
new file mode 100644
index 00000000000..e4f0e7a8d7a
--- /dev/null
+++ b/tests/unit/torch/quantization/test_nvfp4_tensor.py
@@ -0,0 +1,68 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for NVFP4QTensor per-block FP8 scale underflow clamping."""
+
+import torch
+
+from modelopt.torch.quantization.qtensor.nvfp4_tensor import NVFP4QTensor
+
+_FP8_E4M3FN_MIN = 2**-9  # 0.001953125 — smallest positive FP8 E4M3FN subnormal
+
+
+class TestNVFP4ScaleClamping:
+    """Per-block weight scales below the FP8 E4M3FN minimum must be clamped, not rounded to zero."""
+
+    def test_no_zero_scales_for_tiny_weights(self):
+        """Tiny per-block amax (<<FP8 min) must not underflow to zero after FP8 cast."""
+        block_size = 16
+        tiny_weight = torch.full((4, block_size), 1e-10)
+        wsf2 = torch.tensor(1e-10 / (6.0 * 448.0))
+
+        per_block_scale, _ = NVFP4QTensor.get_weights_scaling_factor(tiny_weight, block_size, wsf2)
+        per_block_scale_f32 = per_block_scale.float()
+
+        assert (per_block_scale_f32 > 0).all(), (
+            f"Zero per-block scales found after FP8 cast: {per_block_scale_f32.tolist()}. "
+            "FP8 scale underflow clamping likely regressed."
+        )
+        assert (per_block_scale_f32 >= _FP8_E4M3FN_MIN).all(), (
+            "Per-block scales below FP8 minimum subnormal found after cast."
+        )
+
+    def test_normal_weights_unaffected_by_clamp(self):
+        """Weights with typical magnitudes must not be affected by the underflow clamp."""
+        block_size = 16
+        torch.manual_seed(42)
+        normal_weight = torch.randn(8, block_size)
+
+        per_block_scale, _ = NVFP4QTensor.get_weights_scaling_factor(normal_weight, block_size)
+        assert (per_block_scale.float() > 0).all(), "Normal weights produced zero scales."
+
+    def test_mixed_weight_no_zeros(self):
+        """Mixed-magnitude tensor (normal + tiny blocks) must have no zero scales."""
+        block_size = 16
+        weight = torch.cat(
+            [
+                torch.randn(4, block_size),
+                torch.full((4, block_size), 1e-12),
+            ],
+            dim=0,
+        )
+
+        per_block_scale, _ = NVFP4QTensor.get_weights_scaling_factor(weight, block_size)
+        assert (per_block_scale.float() > 0).all(), (
+            "Zero scales in mixed-magnitude tensor after FP8 cast."
+        )

From 60e185109758b77d20270edd18c0d43f9887e593 Mon Sep 17 00:00:00 2001
From: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com>
Date: Mon, 4 May 2026 23:03:54 +0000
Subject: [PATCH 3/7] code quality

Signed-off-by: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com>
---
 examples/llm_ptq/hf_ptq.py                          | 4 ++--
 modelopt/torch/quantization/model_calib.py          | 2 +-
 modelopt/torch/quantization/qtensor/nvfp4_tensor.py | 8 ++++----
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py
index 0c1c914ce69..ac7a583831f 100755
--- a/examples/llm_ptq/hf_ptq.py
+++ b/examples/llm_ptq/hf_ptq.py
@@ -829,10 +829,10 @@ def pre_quantize(
         "input_features" if model_type == "whisper" else "input_ids"
     ][0:1]
     # Strip leading padding tokens so the preview input shows real content
-    if model_type not in ("whisper",) and tokenizer is not None and tokenizer.pad_token_id is not None:
+    if model_type != "whisper" and tokenizer is not None and tokenizer.pad_token_id is not None:
         first_non_pad = (preview_input_ids[0] != tokenizer.pad_token_id).nonzero(as_tuple=True)[0]
         if first_non_pad.numel() > 0:
-            preview_input_ids = preview_input_ids[:, first_non_pad[0]:]
+            preview_input_ids = preview_input_ids[:, first_non_pad[0] :]
 
     # Generate preview before quantization
     if args.skip_generate:
diff --git a/modelopt/torch/quantization/model_calib.py b/modelopt/torch/quantization/model_calib.py
index bce49786077..78b237847b1 100644
--- a/modelopt/torch/quantization/model_calib.py
+++ b/modelopt/torch/quantization/model_calib.py
@@ -864,7 +864,7 @@ def finish_stats_collection(model: nn.Module, method: str | None = None, **kwarg
 
         cal = getattr(module, "_calibrator", None)
         if cal and not getattr(module, "_dynamic", False):
-            if method in {"entropy"}:
+            if method == "entropy":
                 if cal.compute_amax(method) is not None:
                     module.load_calib_amax("entropy", **kwargs)
             elif cal.compute_amax(**kwargs) is not None:
diff --git a/modelopt/torch/quantization/qtensor/nvfp4_tensor.py b/modelopt/torch/quantization/qtensor/nvfp4_tensor.py
index 209a8e778be..e2e14cb4a74 100644
--- a/modelopt/torch/quantization/qtensor/nvfp4_tensor.py
+++ b/modelopt/torch/quantization/qtensor/nvfp4_tensor.py
@@ -128,10 +128,10 @@ def get_weights_scaling_factor_from_quantizer(
             # explodes to ``1.0 * 448 / (global_amax/6)``. fp8_e4m3fn has no Inf, so any
             # value >= 480 casts to NaN — clamp first to keep the stored byte finite.
             if not keep_high_precision:
-                _FP8_E4M3FN_MIN = 2**-9  # 0.001953125 — smallest positive subnormal
+                fp8_e4m3fn_min = 2**-9  # 0.001953125 — smallest positive subnormal
                 per_block_scale = (
                     (per_block_scale * 448.0 / per_block_scale_max)
-                    .clamp(min=_FP8_E4M3FN_MIN, max=448.0)
+                    .clamp(min=fp8_e4m3fn_min, max=448.0)
                     .to(torch.float8_e4m3fn)
                 )
             return per_block_scale, weights_scaling_factor_2
@@ -178,8 +178,8 @@ def get_weights_scaling_factor(
             # casting.  Without this, blocks whose scale falls below the FP8 representable
             # range silently underflow to 0, causing those blocks to produce zero output at
             # inference even when the weights are non-trivial.
-            _FP8_E4M3FN_MIN = 2**-9  # 0.001953125 — smallest positive subnormal
-            per_block_scale = per_block_scale.clamp(min=_FP8_E4M3FN_MIN)
+            fp8_e4m3fn_min = 2**-9  # 0.001953125 — smallest positive subnormal
+            per_block_scale = per_block_scale.clamp(min=fp8_e4m3fn_min)
             per_block_scale = per_block_scale.to(torch.float8_e4m3fn)
         return per_block_scale, weights_scaling_factor_2
 

From ab8a162c9be9600726063a8939dd9c74a52a5168 Mon Sep 17 00:00:00 2001
From: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com>
Date: Mon, 4 May 2026 23:59:29 +0000
Subject: [PATCH 4/7] more reviewers' feedback

Signed-off-by: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com>
---
 tests/unit/torch/quantization/test_nvfp4_tensor.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/unit/torch/quantization/test_nvfp4_tensor.py b/tests/unit/torch/quantization/test_nvfp4_tensor.py
index e4f0e7a8d7a..d66809a3cfc 100644
--- a/tests/unit/torch/quantization/test_nvfp4_tensor.py
+++ b/tests/unit/torch/quantization/test_nvfp4_tensor.py
@@ -29,7 +29,8 @@ def test_no_zero_scales_for_tiny_weights(self):
         """Tiny per-block amax (<<FP8 min) must not underflow to zero after FP8 cast."""
         block_size = 16
         tiny_weight = torch.full((4, block_size), 1e-10)
-        wsf2 = torch.tensor(1e-10 / (6.0 * 448.0))
+        # wsf2=1.0 → per_block_scale = amax/(6*wsf2) ≈ 1.7e-11 << 2^-9, exercises FP8-min clamp
+        wsf2 = torch.tensor(1.0)
 
         per_block_scale, _ = NVFP4QTensor.get_weights_scaling_factor(tiny_weight, block_size, wsf2)
         per_block_scale_f32 = per_block_scale.float()

From b161f3b379a3cecd2a3e70376ac1058d102decce Mon Sep 17 00:00:00 2001
From: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com>
Date: Tue, 12 May 2026 22:52:26 +0000
Subject: [PATCH 5/7] more reviewers feedback

Signed-off-by: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com>
---
 .../quantization/qtensor/nvfp4_tensor.py      |  31 ++---
 .../general/ptq/nvfp4_experts_only_mse.yaml   | 112 ++----------------
 .../torch/quantization/test_nvfp4_tensor.py   |  46 ++++++-
 3 files changed, 70 insertions(+), 119 deletions(-)

diff --git a/modelopt/torch/quantization/qtensor/nvfp4_tensor.py b/modelopt/torch/quantization/qtensor/nvfp4_tensor.py
index e2e14cb4a74..ede540a16ef 100644
--- a/modelopt/torch/quantization/qtensor/nvfp4_tensor.py
+++ b/modelopt/torch/quantization/qtensor/nvfp4_tensor.py
@@ -78,6 +78,16 @@ def get_weights_scaling_factor_2_from_quantizer(cls, weight_quantizer):
             )
             return weight_quantizer._amax.float() / (6.0 * 448.0)
 
+    @classmethod
+    def _cast_per_block_scale_to_fp8(cls, per_block_scale: torch.Tensor) -> torch.Tensor:
+        """Clamp to FP8 E4M3FN representable range, then cast.
+
+        FP8 E4M3FN has no Inf and a smallest positive subnormal of ``2**-9`` (~0.00195).
+        Values below the min silently underflow to 0 (zero outputs at inference); values
+        above 448 cast to NaN.
+        """
+        return per_block_scale.clamp(min=2**-9, max=448.0).to(torch.float8_e4m3fn)
+
     @classmethod
     def get_weights_scaling_factor_from_quantizer(
         cls,
@@ -122,17 +132,9 @@ def get_weights_scaling_factor_from_quantizer(
             expected_shape = (*weight.shape[:-1], num_blocks_per_row)
             per_block_scale = per_block_scale.view(expected_shape)
 
-            # Quantize scales to FP8. Saturate to the fp8_e4m3fn max (448) before the
-            # cast: when the [==0]=1.0 safety net above fires (per_block_amax was zero
-            # for an all-zero weight block) and global_amax is small, the pre-cast value
-            # explodes to ``1.0 * 448 / (global_amax/6)``. fp8_e4m3fn has no Inf, so any
-            # value >= 480 casts to NaN — clamp first to keep the stored byte finite.
             if not keep_high_precision:
-                fp8_e4m3fn_min = 2**-9  # 0.001953125 — smallest positive subnormal
-                per_block_scale = (
-                    (per_block_scale * 448.0 / per_block_scale_max)
-                    .clamp(min=fp8_e4m3fn_min, max=448.0)
-                    .to(torch.float8_e4m3fn)
+                per_block_scale = cls._cast_per_block_scale_to_fp8(
+                    per_block_scale * 448.0 / per_block_scale_max
                 )
             return per_block_scale, weights_scaling_factor_2
         else:
@@ -172,15 +174,8 @@ def get_weights_scaling_factor(
         )
         # Set all zero values in scale to 1.0
         per_block_scale[per_block_scale == 0] = 1.0
-        # Convert to torch.float8_e4m3fn
         if not keep_high_precision:
-            # Clamp to the minimum positive FP8 E4M3FN subnormal (~0.00195 = 2^-9) before
-            # casting.  Without this, blocks whose scale falls below the FP8 representable
-            # range silently underflow to 0, causing those blocks to produce zero output at
-            # inference even when the weights are non-trivial.
-            fp8_e4m3fn_min = 2**-9  # 0.001953125 — smallest positive subnormal
-            per_block_scale = per_block_scale.clamp(min=fp8_e4m3fn_min)
-            per_block_scale = per_block_scale.to(torch.float8_e4m3fn)
+            per_block_scale = cls._cast_per_block_scale_to_fp8(per_block_scale)
         return per_block_scale, weights_scaling_factor_2
 
     @classmethod
diff --git a/modelopt_recipes/general/ptq/nvfp4_experts_only_mse.yaml b/modelopt_recipes/general/ptq/nvfp4_experts_only_mse.yaml
index 76d50b760f0..fbd066b80e1 100644
--- a/modelopt_recipes/general/ptq/nvfp4_experts_only_mse.yaml
+++ b/modelopt_recipes/general/ptq/nvfp4_experts_only_mse.yaml
@@ -13,118 +13,32 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+imports:
+  base_disable_all: configs/ptq/units/base_disable_all
+  default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers
+  nvfp4: configs/numerics/nvfp4
+  nvfp4_static: configs/numerics/nvfp4_static
+
 metadata:
   recipe_type: ptq
-  description: >
-    NVFP4 W4A4 for MoE routed experts only. Static weight scales via MSE + FP8 scale sweep;
-    dynamic activation scales. Supports sequential experts (nn.Linear-based) and fused experts
-    (_QuantFusedExperts, HF transformers 5.0+ 3D nn.Parameter style).
+  description: NVFP4 static weight (MSE FP8-scale sweep) and dynamic activation for expert layers only (W4A4), no KV-cache quantization.
 quantize:
   algorithm:
     method: mse
     fp8_scale_sweep: true
     layerwise: false
   quant_cfg:
-    # ── Disable everything first ─────────────────────────────────────────────
-    - quantizer_name: '*'
-      enable: false
-
-    # ── Sequential experts (nn.Linear per expert) ────────────────────────────
+    - $import: base_disable_all
     - quantizer_name: '*mlp.experts*weight_quantizer'
-      enable: true
       cfg:
-        block_sizes:
-          -1: 16
-          type: static
-          scale_bits: e4m3
-        num_bits: e2m1
+        $import: nvfp4_static
     - quantizer_name: '*mlp.experts*input_quantizer'
-      enable: true
       cfg:
-        block_sizes:
-          -1: 16
-          type: dynamic
-          scale_bits: e4m3
-        num_bits: e2m1
-
-    # ── Sequential experts: Mixtral / block_sparse_moe style ────────────────
+        $import: nvfp4
     - quantizer_name: '*block_sparse_moe*weight_quantizer'
-      enable: true
       cfg:
-        block_sizes:
-          -1: 16
-          type: static
-          scale_bits: e4m3
-        num_bits: e2m1
+        $import: nvfp4_static
     - quantizer_name: '*block_sparse_moe*input_quantizer'
-      enable: true
-      cfg:
-        block_sizes:
-          -1: 16
-          type: dynamic
-          scale_bits: e4m3
-        num_bits: e2m1
-
-    # ── Fused experts (_QuantFusedExperts, HF transformers 5.0+ 3D nn.Parameter style) ──
-    - quantizer_name: '*gate_up_proj_weight_quantizers*'
-      enable: true
       cfg:
-        block_sizes:
-          -1: 16
-          type: static
-          scale_bits: e4m3
-        num_bits: e2m1
-    - quantizer_name: '*gate_up_proj_input_quantizer*'
-      enable: true
-      cfg:
-        block_sizes:
-          -1: 16
-          type: dynamic
-          scale_bits: e4m3
-        num_bits: e2m1
-    - quantizer_name: '*down_proj_weight_quantizers*'
-      enable: true
-      cfg:
-        block_sizes:
-          -1: 16
-          type: static
-          scale_bits: e4m3
-        num_bits: e2m1
-    - quantizer_name: '*down_proj_input_quantizer*'
-      enable: true
-      cfg:
-        block_sizes:
-          -1: 16
-          type: dynamic
-          scale_bits: e4m3
-        num_bits: e2m1
-
-    # ── Exclusions: shared experts, attention, routers, lm_head ─────────────
-    - quantizer_name: '*block_sparse_moe.gate*'
-      enable: false
-    - quantizer_name: '*linear_attn.conv1d*'
-      enable: false
-    - quantizer_name: '*lm_head*'
-      enable: false
-    - quantizer_name: '*mlp.gate.*'
-      enable: false
-    - quantizer_name: '*mlp.shared_expert*'
-      enable: false
-    - quantizer_name: '*mlp.shared_expert_gate.*'
-      enable: false
-    - quantizer_name: '*router*'
-      enable: false
-    - quantizer_name: 'output.*'
-      enable: false
-    - parent_class: 'nn.BatchNorm1d'
-      quantizer_name: '*'
-      enable: false
-    - parent_class: 'nn.BatchNorm2d'
-      quantizer_name: '*'
-      enable: false
-    - parent_class: 'nn.BatchNorm3d'
-      quantizer_name: '*'
-      enable: false
-    - parent_class: 'nn.LeakyReLU'
-      quantizer_name: '*'
-      enable: false
+        $import: nvfp4
+    - $import: default_disabled_quantizers
diff --git a/tests/unit/torch/quantization/test_nvfp4_tensor.py b/tests/unit/torch/quantization/test_nvfp4_tensor.py
index d66809a3cfc..c65fe5cdc1c 100644
--- a/tests/unit/torch/quantization/test_nvfp4_tensor.py
+++ b/tests/unit/torch/quantization/test_nvfp4_tensor.py
@@ -13,17 +13,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""Tests for NVFP4QTensor per-block FP8 scale underflow clamping."""
+"""Tests for NVFP4QTensor per-block FP8 scale clamping (underflow + overflow)."""
+
+from types import SimpleNamespace
 
 import torch
 
 from modelopt.torch.quantization.qtensor.nvfp4_tensor import NVFP4QTensor
 
 _FP8_E4M3FN_MIN = 2**-9  # 0.001953125 — smallest positive FP8 E4M3FN subnormal
+_FP8_E4M3FN_MAX = 448.0
 
 
 class TestNVFP4ScaleClamping:
-    """Per-block weight scales below the FP8 E4M3FN minimum must be clamped, not rounded to zero."""
+    """Per-block weight scales outside the FP8 E4M3FN range must be clamped, not turned into 0/NaN."""
 
     def test_no_zero_scales_for_tiny_weights(self):
         """Tiny per-block amax (<<FP8 min) must not underflow to zero after FP8 cast."""
@@ -67,3 +70,42 @@ def test_mixed_weight_no_zeros(self):
         assert (per_block_scale.float() > 0).all(), (
             "Zero scales in mixed-magnitude tensor after FP8 cast."
         )
+
+    def test_helper_clamps_overflow_to_max(self):
+        """Values above 448 must saturate to 448, not cast to NaN (fp8_e4m3fn has no Inf)."""
+        oversized = torch.tensor([100.0, 448.0, 1e3, 1e6])
+        out = NVFP4QTensor._cast_per_block_scale_to_fp8(oversized).float()
+        assert torch.isfinite(out).all(), f"FP8 cast produced non-finite values: {out.tolist()}"
+        assert (out <= _FP8_E4M3FN_MAX).all(), f"FP8 cast values exceed 448: {out.tolist()}"
+
+    def test_helper_clamps_underflow_to_min(self):
+        """Values below the FP8 subnormal must clamp up, not collapse to 0."""
+        tiny = torch.tensor([0.0, 1e-12, 1e-6, _FP8_E4M3FN_MIN / 2])
+        out = NVFP4QTensor._cast_per_block_scale_to_fp8(tiny).float()
+        assert (out > 0).all(), f"FP8 cast produced zero scales: {out.tolist()}"
+
+    def test_static_path_no_nan_when_block_amax_zero(self):
+        """Static path: when a block's amax is 0 (all-zero weights), the `[==0]=1.0` safety net
+        and a small global_amax push the pre-cast value above 448. Without the max clamp,
+        fp8_e4m3fn would cast it to NaN — regression for the export-time NaN reported on PR #1397.
+        """
+        block_size = 16
+        # global_amax small enough that 1.0 * 448 / (global_amax/6) >> 448.
+        global_amax = torch.tensor(0.01)
+        # One block with amax=0 (triggers safety net), three normal blocks.
+        per_block_amax = torch.tensor([[0.0, 0.005, 0.008, 0.01]])
+        weight = torch.randn(1, 4 * block_size)
+        q = SimpleNamespace(
+            global_amax=global_amax,
+            _amax=per_block_amax,
+            block_sizes={-1: block_size},
+        )
+
+        per_block_scale, _ = NVFP4QTensor.get_weights_scaling_factor_from_quantizer(q, weight)
+        per_block_scale_f32 = per_block_scale.float()
+        assert torch.isfinite(per_block_scale_f32).all(), (
+            f"NaN/Inf in exported static per-block scale: {per_block_scale_f32.tolist()}"
+        )
+        assert (per_block_scale_f32 <= _FP8_E4M3FN_MAX).all(), (
+            f"Static per-block scale exceeds FP8 max 448: {per_block_scale_f32.tolist()}"
+        )

From 5dcda40f11ddc4fd6f1f76c7ef79ac08552d2f03 Mon Sep 17 00:00:00 2001
From: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com>
Date: Tue, 12 May 2026 23:08:23 +0000
Subject: [PATCH 6/7] remove duplicated yaml

Signed-off-by: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com>
---
 .../general/ptq/nvfp4_experts_only_mse.yaml   | 44 -------------------
 1 file changed, 44 deletions(-)
 delete mode 100644 modelopt_recipes/general/ptq/nvfp4_experts_only_mse.yaml

diff --git a/modelopt_recipes/general/ptq/nvfp4_experts_only_mse.yaml b/modelopt_recipes/general/ptq/nvfp4_experts_only_mse.yaml
deleted file mode 100644
index fbd066b80e1..00000000000
--- a/modelopt_recipes/general/ptq/nvfp4_experts_only_mse.yaml
+++ /dev/null
@@ -1,44 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-imports:
-  base_disable_all: configs/ptq/units/base_disable_all
-  default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers
-  nvfp4: configs/numerics/nvfp4
-  nvfp4_static: configs/numerics/nvfp4_static
-
-metadata:
-  recipe_type: ptq
-  description: NVFP4 static weight (MSE FP8-scale sweep) and dynamic activation for expert layers only (W4A4), no KV-cache quantization.
-quantize:
-  algorithm:
-    method: mse
-    fp8_scale_sweep: true
-    layerwise: false
-  quant_cfg:
-    - $import: base_disable_all
-    - quantizer_name: '*mlp.experts*weight_quantizer'
-      cfg:
-        $import: nvfp4_static
-    - quantizer_name: '*mlp.experts*input_quantizer'
-      cfg:
-        $import: nvfp4
-    - quantizer_name: '*block_sparse_moe*weight_quantizer'
-      cfg:
-        $import: nvfp4_static
-    - quantizer_name: '*block_sparse_moe*input_quantizer'
-      cfg:
-        $import: nvfp4
-    - $import: default_disabled_quantizers

From 4de8abf1f7f59c199a671633b00fef6f2472e846 Mon Sep 17 00:00:00 2001
From: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com>
Date: Wed, 13 May 2026 22:57:51 +0000
Subject: [PATCH 7/7] minor update for reviewer feedback

Signed-off-by: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com>
---
 examples/llm_ptq/hf_ptq.py                    |  6 ++
 .../quantization/qtensor/nvfp4_tensor.py      | 20 +++---
 .../plugins/test_fused_experts.py             | 63 +++++++++++++------
 .../torch/quantization/test_nvfp4_tensor.py   |  9 ++-
 4 files changed, 65 insertions(+), 33 deletions(-)

diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py
index ac7a583831f..a6089e4b3d3 100755
--- a/examples/llm_ptq/hf_ptq.py
+++ b/examples/llm_ptq/hf_ptq.py
@@ -833,6 +833,12 @@ def pre_quantize(
         first_non_pad = (preview_input_ids[0] != tokenizer.pad_token_id).nonzero(as_tuple=True)[0]
         if first_non_pad.numel() > 0:
             preview_input_ids = preview_input_ids[:, first_non_pad[0] :]
+        else:
+            warnings.warn(
+                "Preview calibration sample is entirely padding; generated preview will be "
+                "degenerate. Check tokenizer padding side / dataset preprocessing.",
+                stacklevel=2,
+            )
 
     # Generate preview before quantization
     if args.skip_generate:
diff --git a/modelopt/torch/quantization/qtensor/nvfp4_tensor.py b/modelopt/torch/quantization/qtensor/nvfp4_tensor.py
index ede540a16ef..a7a2a64cfac 100644
--- a/modelopt/torch/quantization/qtensor/nvfp4_tensor.py
+++ b/modelopt/torch/quantization/qtensor/nvfp4_tensor.py
@@ -28,6 +28,11 @@
 __all__ = ["NVFP4QTensor"]
 
 
+def _cast_per_block_scale_to_fp8(per_block_scale: torch.Tensor) -> torch.Tensor:
+    """Clamp to FP8 E4M3FN range [2**-9, 448] and cast — avoids underflow→0 / overflow→NaN."""
+    return per_block_scale.clamp(min=2**-9, max=448.0).to(torch.float8_e4m3fn)
+
+
 class NVFP4QTensor(BaseQuantizedTensor):
     """Implements the INT4 quantization on tensors for more efficient storage or computation.
 
@@ -78,16 +83,6 @@ def get_weights_scaling_factor_2_from_quantizer(cls, weight_quantizer):
             )
             return weight_quantizer._amax.float() / (6.0 * 448.0)
 
-    @classmethod
-    def _cast_per_block_scale_to_fp8(cls, per_block_scale: torch.Tensor) -> torch.Tensor:
-        """Clamp to FP8 E4M3FN representable range, then cast.
-
-        FP8 E4M3FN has no Inf and a smallest positive subnormal of ``2**-9`` (~0.00195).
-        Values below the min silently underflow to 0 (zero outputs at inference); values
-        above 448 cast to NaN.
-        """
-        return per_block_scale.clamp(min=2**-9, max=448.0).to(torch.float8_e4m3fn)
-
     @classmethod
     def get_weights_scaling_factor_from_quantizer(
         cls,
@@ -133,7 +128,8 @@ def get_weights_scaling_factor_from_quantizer(
             per_block_scale = per_block_scale.view(expected_shape)
 
             if not keep_high_precision:
-                per_block_scale = cls._cast_per_block_scale_to_fp8(
+                # The [==0]=1.0 safety net + small global_amax can drive the pre-cast value above 448 (PR #1397).
+                per_block_scale = _cast_per_block_scale_to_fp8(
                     per_block_scale * 448.0 / per_block_scale_max
                 )
             return per_block_scale, weights_scaling_factor_2
@@ -175,7 +171,7 @@ def get_weights_scaling_factor(
         # Set all zero values in scale to 1.0
         per_block_scale[per_block_scale == 0] = 1.0
         if not keep_high_precision:
-            per_block_scale = cls._cast_per_block_scale_to_fp8(per_block_scale)
+            per_block_scale = _cast_per_block_scale_to_fp8(per_block_scale)
         return per_block_scale, weights_scaling_factor_2
 
     @classmethod
diff --git a/tests/unit/torch/quantization/plugins/test_fused_experts.py b/tests/unit/torch/quantization/plugins/test_fused_experts.py
index 5ae6e0c2e5c..73a4f741768 100644
--- a/tests/unit/torch/quantization/plugins/test_fused_experts.py
+++ b/tests/unit/torch/quantization/plugins/test_fused_experts.py
@@ -15,6 +15,8 @@
 
 """Tests for _QuantFusedExperts: generic fused MoE quantization and export."""
 
+from unittest.mock import patch
+
 import pytest
 import torch
 import torch.nn as nn
@@ -933,32 +935,46 @@ def _cleanup_registry(mod_type):
             QuantModuleRegistry.unregister(mod_type)
 
     def test_mse_calibration_populates_all_expert_quantizers(self):
+        # Strong assertion: every per-expert weight quantizer must be touched by the MSE
+        # search loop (mse_calibrate Step 3), not just have _amax set by max-calibrate or
+        # the dead-expert bootstrap. Spy on MseCalibrator.collect — that method is only
+        # invoked from Step 3, after Step 2 installs MseCalibrator on each quantizer.
         import modelopt.torch.quantization as mtq
+        from modelopt.torch.quantization.calib.mse import MseCalibrator
 
         model = _TinyMoEModel()
         expert_type = type(model.moe.experts)
         self._cleanup_registry(expert_type)
 
-        mtq.quantize(
-            model,
-            {
-                "quant_cfg": [
-                    {"quantizer_name": "*", "enable": False},
-                    {
-                        "quantizer_name": "*gate_up_proj_weight_quantizer",
-                        "cfg": {"num_bits": 8, "axis": None},
-                    },
-                    {
-                        "quantizer_name": "*down_proj_weight_quantizer",
-                        "cfg": {"num_bits": 8, "axis": None},
-                    },
-                ],
-                "algorithm": "mse",
-            },
-            forward_loop=lambda m: [m(torch.randn(1, 4, HIDDEN_DIM)) for _ in range(2)],
-        )
+        collected_calib_ids: set[int] = set()
+        original_collect = MseCalibrator.collect
+
+        def _spy_collect(self, x):
+            collected_calib_ids.add(id(self))
+            return original_collect(self, x)
+
+        with patch.object(MseCalibrator, "collect", _spy_collect):
+            mtq.quantize(
+                model,
+                {
+                    "quant_cfg": [
+                        {"quantizer_name": "*", "enable": False},
+                        {
+                            "quantizer_name": "*gate_up_proj_weight_quantizer",
+                            "cfg": {"num_bits": 8, "axis": None},
+                        },
+                        {
+                            "quantizer_name": "*down_proj_weight_quantizer",
+                            "cfg": {"num_bits": 8, "axis": None},
+                        },
+                    ],
+                    "algorithm": "mse",
+                },
+                forward_loop=lambda m: [m(torch.randn(1, 4, HIDDEN_DIM)) for _ in range(2)],
+            )
 
         experts = model.moe.experts
+        missed = []
         for idx in range(NUM_EXPERTS):
             assert experts.gate_up_proj_weight_quantizers[idx].amax is not None, (
                 f"gate_up_proj_weight_quantizers[{idx}] not calibrated — Bug 1 regression"
@@ -966,4 +982,15 @@ def test_mse_calibration_populates_all_expert_quantizers(self):
             assert experts.down_proj_weight_quantizers[idx].amax is not None, (
                 f"down_proj_weight_quantizers[{idx}] not calibrated"
             )
+            if (
+                id(experts.gate_up_proj_weight_quantizers[idx]._calibrator)
+                not in collected_calib_ids
+            ):
+                missed.append(f"gate_up_proj_weight_quantizers[{idx}]")
+            if id(experts.down_proj_weight_quantizers[idx]._calibrator) not in collected_calib_ids:
+                missed.append(f"down_proj_weight_quantizers[{idx}]")
+        assert not missed, (
+            f"MSE search loop skipped these per-expert quantizers: {missed}. "
+            "mse_calibrate Step 3 did not iterate them via iter_weights_for_calibration."
+        )
         self._cleanup_registry(expert_type)
diff --git a/tests/unit/torch/quantization/test_nvfp4_tensor.py b/tests/unit/torch/quantization/test_nvfp4_tensor.py
index c65fe5cdc1c..9ae0c34e235 100644
--- a/tests/unit/torch/quantization/test_nvfp4_tensor.py
+++ b/tests/unit/torch/quantization/test_nvfp4_tensor.py
@@ -19,7 +19,10 @@
 
 import torch
 
-from modelopt.torch.quantization.qtensor.nvfp4_tensor import NVFP4QTensor
+from modelopt.torch.quantization.qtensor.nvfp4_tensor import (
+    NVFP4QTensor,
+    _cast_per_block_scale_to_fp8,
+)
 
 _FP8_E4M3FN_MIN = 2**-9  # 0.001953125 — smallest positive FP8 E4M3FN subnormal
 _FP8_E4M3FN_MAX = 448.0
@@ -74,14 +77,14 @@ def test_mixed_weight_no_zeros(self):
     def test_helper_clamps_overflow_to_max(self):
         """Values above 448 must saturate to 448, not cast to NaN (fp8_e4m3fn has no Inf)."""
         oversized = torch.tensor([100.0, 448.0, 1e3, 1e6])
-        out = NVFP4QTensor._cast_per_block_scale_to_fp8(oversized).float()
+        out = _cast_per_block_scale_to_fp8(oversized).float()
         assert torch.isfinite(out).all(), f"FP8 cast produced non-finite values: {out.tolist()}"
         assert (out <= _FP8_E4M3FN_MAX).all(), f"FP8 cast values exceed 448: {out.tolist()}"
 
     def test_helper_clamps_underflow_to_min(self):
         """Values below the FP8 subnormal must clamp up, not collapse to 0."""
         tiny = torch.tensor([0.0, 1e-12, 1e-6, _FP8_E4M3FN_MIN / 2])
-        out = NVFP4QTensor._cast_per_block_scale_to_fp8(tiny).float()
+        out = _cast_per_block_scale_to_fp8(tiny).float()
         assert (out > 0).all(), f"FP8 cast produced zero scales: {out.tolist()}"
 
     def test_static_path_no_nan_when_block_amax_zero(self):