From d797509b53ef4b1fb6004e4e680a514fd644b069 Mon Sep 17 00:00:00 2001 From: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com> Date: Fri, 1 May 2026 17:06:14 -0700 Subject: [PATCH 1/7] fixes for fused moe (qwen3.6, GLM5.1 + MSE calibration Signed-off-by: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com> --- examples/llm_ptq/hf_ptq.py | 7 +- modelopt/torch/quantization/model_quant.py | 1 + .../nn/modules/tensor_quantizer.py | 4 +- .../quantization/qtensor/nvfp4_tensor.py | 9 +- .../general/ptq/nvfp4_experts_only_mse.yaml | 130 ++++++++++++++++++ 5 files changed, 147 insertions(+), 4 deletions(-) create mode 100644 modelopt_recipes/general/ptq/nvfp4_experts_only_mse.yaml diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index 875e78ceea6..0c1c914ce69 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -828,6 +828,11 @@ def pre_quantize( preview_input_ids = next(iter(calib_dataloader))[ "input_features" if model_type == "whisper" else "input_ids" ][0:1] + # Strip leading padding tokens so the preview input shows real content + if model_type not in ("whisper",) and tokenizer is not None and tokenizer.pad_token_id is not None: + first_non_pad = (preview_input_ids[0] != tokenizer.pad_token_id).nonzero(as_tuple=True)[0] + if first_non_pad.numel() > 0: + preview_input_ids = preview_input_ids[:, first_non_pad[0]:] # Generate preview before quantization if args.skip_generate: @@ -928,7 +933,7 @@ def input_decode(input_ids): if processor is not None and isinstance(processor, WhisperProcessor): return first_text_speech_dataset elif tokenizer is not None: - return tokenizer.batch_decode(input_ids) + return tokenizer.batch_decode(input_ids, skip_special_tokens=True) else: raise ValueError("The processor or tokenizer must be set") diff --git a/modelopt/torch/quantization/model_quant.py b/modelopt/torch/quantization/model_quant.py index 5e65f9cc1d4..3582223c4d3 100644 --- a/modelopt/torch/quantization/model_quant.py +++ 
b/modelopt/torch/quantization/model_quant.py @@ -595,6 +595,7 @@ def print_quant_summary(model: nn.Module, output_dir: str | None = None): lines.append(f"{len(lines)} TensorQuantizers found in model") if output_dir: + os.makedirs(output_dir, exist_ok=True) path = os.path.join(output_dir, ".quant_summary.txt") with open(path, "w", encoding="utf-8") as f: f.write("\n".join(lines) + "\n") diff --git a/modelopt/torch/quantization/nn/modules/tensor_quantizer.py b/modelopt/torch/quantization/nn/modules/tensor_quantizer.py index fa540b8fdf5..5e3cea44c2a 100644 --- a/modelopt/torch/quantization/nn/modules/tensor_quantizer.py +++ b/modelopt/torch/quantization/nn/modules/tensor_quantizer.py @@ -1122,7 +1122,7 @@ def forward(self, inputs): return outputs - def _short_amax(self, fmt=".4f"): + def _short_amax(self, fmt=".2e"): """Short description of amax. Returns: @@ -1140,7 +1140,7 @@ def _short_amax(self, fmt=".4f"): return "meta" return self._short_tensor(self._amax, fmt) - def _short_tensor(self, tensor: torch.Tensor, fmt=".4f"): + def _short_tensor(self, tensor: torch.Tensor, fmt=".2e"): """Short description of tensor.""" if tensor.numel() == 1: return f"{tensor.item():{fmt}}" diff --git a/modelopt/torch/quantization/qtensor/nvfp4_tensor.py b/modelopt/torch/quantization/qtensor/nvfp4_tensor.py index bb39c8a81e3..209a8e778be 100644 --- a/modelopt/torch/quantization/qtensor/nvfp4_tensor.py +++ b/modelopt/torch/quantization/qtensor/nvfp4_tensor.py @@ -128,9 +128,10 @@ def get_weights_scaling_factor_from_quantizer( # explodes to ``1.0 * 448 / (global_amax/6)``. fp8_e4m3fn has no Inf, so any # value >= 480 casts to NaN — clamp first to keep the stored byte finite. 
if not keep_high_precision: + _FP8_E4M3FN_MIN = 2**-9 # 0.001953125 — smallest positive subnormal per_block_scale = ( (per_block_scale * 448.0 / per_block_scale_max) - .clamp_(max=448.0) + .clamp(min=_FP8_E4M3FN_MIN, max=448.0) .to(torch.float8_e4m3fn) ) return per_block_scale, weights_scaling_factor_2 @@ -173,6 +174,12 @@ def get_weights_scaling_factor( per_block_scale[per_block_scale == 0] = 1.0 # Convert to torch.float8_e4m3fn if not keep_high_precision: + # Clamp to the minimum positive FP8 E4M3FN subnormal (~0.00195 = 2^-9) before + # casting. Without this, blocks whose scale falls below the FP8 representable + # range silently underflow to 0, causing those blocks to produce zero output at + # inference even when the weights are non-trivial. + _FP8_E4M3FN_MIN = 2**-9 # 0.001953125 — smallest positive subnormal + per_block_scale = per_block_scale.clamp(min=_FP8_E4M3FN_MIN) per_block_scale = per_block_scale.to(torch.float8_e4m3fn) return per_block_scale, weights_scaling_factor_2 diff --git a/modelopt_recipes/general/ptq/nvfp4_experts_only_mse.yaml b/modelopt_recipes/general/ptq/nvfp4_experts_only_mse.yaml new file mode 100644 index 00000000000..0fbdb421075 --- /dev/null +++ b/modelopt_recipes/general/ptq/nvfp4_experts_only_mse.yaml @@ -0,0 +1,130 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +metadata: + recipe_type: ptq + description: > + NVFP4 W4A4 for MoE routed experts only. Static weight scales via MSE + FP8 scale sweep; + dynamic activation scales. Supports sequential experts (nn.Linear-based) and fused experts + (_QuantFusedExperts, HF transformers 5.0+ 3D nn.Parameter style). +quantize: + algorithm: + method: mse + fp8_scale_sweep: true + layerwise: false + quant_cfg: + # ── Disable everything first ───────────────────────────────────────────── + - quantizer_name: '*' + enable: false + + # ── Sequential experts (nn.Linear per expert) ──────────────────────────── + - quantizer_name: '*mlp.experts*weight_quantizer' + enable: true + cfg: + block_sizes: + -1: 16 + type: static + scale_bits: e4m3 + num_bits: e2m1 + - quantizer_name: '*mlp.experts*input_quantizer' + enable: true + cfg: + block_sizes: + -1: 16 + type: dynamic + scale_bits: e4m3 + num_bits: e2m1 + + # ── Sequential experts: Mixtral / block_sparse_moe style ──────────────── + - quantizer_name: '*block_sparse_moe*weight_quantizer' + enable: true + cfg: + block_sizes: + -1: 16 + type: static + scale_bits: e4m3 + num_bits: e2m1 + - quantizer_name: '*block_sparse_moe*input_quantizer' + enable: true + cfg: + block_sizes: + -1: 16 + type: dynamic + scale_bits: e4m3 + num_bits: e2m1 + + # ── Fused experts (_QuantFusedExperts, HF transformers 5.0+ 3D nn.Parameter style) ── + - quantizer_name: '*gate_up_proj_weight_quantizers*' + enable: true + cfg: + block_sizes: + -1: 16 + type: static + scale_bits: e4m3 + num_bits: e2m1 + - quantizer_name: '*gate_up_proj_input_quantizer*' + enable: true + cfg: + block_sizes: + -1: 16 + type: dynamic + scale_bits: e4m3 + num_bits: e2m1 + - quantizer_name: '*down_proj_weight_quantizers*' + enable: true + cfg: + block_sizes: + -1: 16 + type: static + scale_bits: e4m3 + num_bits: e2m1 + - quantizer_name: '*down_proj_input_quantizer*' + enable: true + cfg: + block_sizes: + -1: 16 + type: dynamic + scale_bits: e4m3 + num_bits: e2m1 + + # ── Exclusions: shared 
experts, attention, routers, lm_head ───────────── + - quantizer_name: '*block_sparse_moe.gate*' + enable: false + - quantizer_name: '*linear_attn.conv1d*' + enable: false + - quantizer_name: '*lm_head*' + enable: false + - quantizer_name: '*mlp.gate.*' + enable: false + - quantizer_name: '*mlp.shared_expert*' + enable: false + - quantizer_name: '*mlp.shared_expert_gate.*' + enable: false + - quantizer_name: '*router*' + enable: false + - quantizer_name: 'output.*' + enable: false + - parent_class: 'nn.BatchNorm1d' + quantizer_name: '*' + enable: false + - parent_class: 'nn.BatchNorm2d' + quantizer_name: '*' + enable: false + - parent_class: 'nn.BatchNorm3d' + quantizer_name: '*' + enable: false + - parent_class: 'nn.LeakyReLU' + quantizer_name: '*' + enable: false From 9aac0fbac08b3106cf407640dfed250c959b1224 Mon Sep 17 00:00:00 2001 From: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com> Date: Mon, 4 May 2026 22:49:00 +0000 Subject: [PATCH 2/7] address reviewers' feedback Signed-off-by: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com> --- .../general/ptq/nvfp4_experts_only_mse.yaml | 2 +- .../plugins/test_fused_experts.py | 102 +++++++++++++++--- .../torch/quantization/test_nvfp4_tensor.py | 68 ++++++++++++ 3 files changed, 154 insertions(+), 18 deletions(-) create mode 100644 tests/unit/torch/quantization/test_nvfp4_tensor.py diff --git a/modelopt_recipes/general/ptq/nvfp4_experts_only_mse.yaml b/modelopt_recipes/general/ptq/nvfp4_experts_only_mse.yaml index 0fbdb421075..76d50b760f0 100644 --- a/modelopt_recipes/general/ptq/nvfp4_experts_only_mse.yaml +++ b/modelopt_recipes/general/ptq/nvfp4_experts_only_mse.yaml @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tests/unit/torch/quantization/plugins/test_fused_experts.py b/tests/unit/torch/quantization/plugins/test_fused_experts.py index 19e1ed49197..5ae6e0c2e5c 100644 --- a/tests/unit/torch/quantization/plugins/test_fused_experts.py +++ b/tests/unit/torch/quantization/plugins/test_fused_experts.py @@ -256,27 +256,51 @@ def test_expert_index_recovery(self): # Tests for export # --------------------------------------------------------------------------- class TestExportFusedExperts: + @staticmethod + def _cleanup_registry(mod_type): + if QuantModuleRegistry.get(mod_type) is not None: + QuantModuleRegistry.unregister(mod_type) + def test_export_creates_per_expert_submodules(self): """_export_fused_experts should create per-expert submodules with standard naming.""" + import modelopt.torch.quantization as mtq from modelopt.torch.export.moe_utils import _export_fused_experts - experts = _SyntheticFusedExperts() - expert_type = type(experts) + model = _TinyMoEModel() + expert_type = type(model.moe.experts) + self._cleanup_registry(expert_type) - # Manually register and convert - if QuantModuleRegistry.get(expert_type) is None: - QuantModuleRegistry.register({expert_type: "test.SyntheticFusedExperts"})( - _QuantFusedExperts - ) - converted = QuantModuleRegistry.convert(experts) + quant_cfg = { + "quant_cfg": [ + {"quantizer_name": "*", "enable": False}, + { + "quantizer_name": "*gate_up_proj_input_quantizer", + "cfg": {"num_bits": 8, "axis": None}, + }, + { + "quantizer_name": "*down_proj_input_quantizer", + "cfg": {"num_bits": 8, "axis": None}, + }, + { + "quantizer_name": "*gate_up_proj_weight_quantizer", + "cfg": {"num_bits": 8, "axis": 0}, + }, + { + "quantizer_name": "*down_proj_weight_quantizer", + "cfg": {"num_bits": 8, "axis": 0}, + }, + ], + "algorithm": "max", + } - # Run a forward pass to calibrate (set amaxes) - seq_len = 16 - hidden_states = 
torch.randn(seq_len, HIDDEN_DIM) - top_k_index = torch.randint(0, NUM_EXPERTS, (seq_len, TOP_K)) - top_k_weights = torch.softmax(torch.randn(seq_len, TOP_K), dim=-1) - with torch.no_grad(): - converted(hidden_states, top_k_index, top_k_weights) + def forward_loop(m): + torch.manual_seed(0) + for _ in range(2): + x = torch.randn(1, 4, HIDDEN_DIM) + m(x) + + mtq.quantize(model, quant_cfg, forward_loop=forward_loop) + converted = model.moe.experts _export_fused_experts(converted, torch.float16) @@ -297,8 +321,7 @@ def test_export_creates_per_expert_submodules(self): assert not hasattr(converted, "down_proj") assert not hasattr(converted, "gate_up_proj_weight_quantizers") - if QuantModuleRegistry.get(expert_type) is not None: - QuantModuleRegistry.unregister(expert_type) + self._cleanup_registry(expert_type) def test_uncalibrated_expert_gate_up_share_amax(self, monkeypatch): """gate_proj and up_proj must share weight_scale_2 even when an expert @@ -899,3 +922,48 @@ def test_unrelated_dotted_number_unchanged(self): _normalize_fused_experts_quantizer_name("moe.layers.3.gate.weight") == "moe.layers.3.gate.weight" ) + + +# Verifies that MSE calibration discovers and calibrates every per-expert weight quantizer +# inside a fused-expert ModuleList (both gate_up_proj and down_proj, for all experts). 
+class TestFusedExpertsMSECalibration: + @staticmethod + def _cleanup_registry(mod_type): + if QuantModuleRegistry.get(mod_type) is not None: + QuantModuleRegistry.unregister(mod_type) + + def test_mse_calibration_populates_all_expert_quantizers(self): + import modelopt.torch.quantization as mtq + + model = _TinyMoEModel() + expert_type = type(model.moe.experts) + self._cleanup_registry(expert_type) + + mtq.quantize( + model, + { + "quant_cfg": [ + {"quantizer_name": "*", "enable": False}, + { + "quantizer_name": "*gate_up_proj_weight_quantizer", + "cfg": {"num_bits": 8, "axis": None}, + }, + { + "quantizer_name": "*down_proj_weight_quantizer", + "cfg": {"num_bits": 8, "axis": None}, + }, + ], + "algorithm": "mse", + }, + forward_loop=lambda m: [m(torch.randn(1, 4, HIDDEN_DIM)) for _ in range(2)], + ) + + experts = model.moe.experts + for idx in range(NUM_EXPERTS): + assert experts.gate_up_proj_weight_quantizers[idx].amax is not None, ( + f"gate_up_proj_weight_quantizers[{idx}] not calibrated — Bug 1 regression" + ) + assert experts.down_proj_weight_quantizers[idx].amax is not None, ( + f"down_proj_weight_quantizers[{idx}] not calibrated" + ) + self._cleanup_registry(expert_type) diff --git a/tests/unit/torch/quantization/test_nvfp4_tensor.py b/tests/unit/torch/quantization/test_nvfp4_tensor.py new file mode 100644 index 00000000000..e4f0e7a8d7a --- /dev/null +++ b/tests/unit/torch/quantization/test_nvfp4_tensor.py @@ -0,0 +1,68 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for NVFP4QTensor per-block FP8 scale underflow clamping.""" + +import torch + +from modelopt.torch.quantization.qtensor.nvfp4_tensor import NVFP4QTensor + +_FP8_E4M3FN_MIN = 2**-9 # 0.001953125 — smallest positive FP8 E4M3FN subnormal + + +class TestNVFP4ScaleClamping: + """Per-block weight scales below the FP8 E4M3FN minimum must be clamped, not rounded to zero.""" + + def test_no_zero_scales_for_tiny_weights(self): + """Tiny per-block amax (< 0).all(), ( + f"Zero per-block scales found after FP8 cast: {per_block_scale_f32.tolist()}. " + "FP8 scale underflow clamping likely regressed." + ) + assert (per_block_scale_f32 >= _FP8_E4M3FN_MIN).all(), ( + "Per-block scales below FP8 minimum subnormal found after cast." + ) + + def test_normal_weights_unaffected_by_clamp(self): + """Weights with typical magnitudes must not be affected by the underflow clamp.""" + block_size = 16 + torch.manual_seed(42) + normal_weight = torch.randn(8, block_size) + + per_block_scale, _ = NVFP4QTensor.get_weights_scaling_factor(normal_weight, block_size) + assert (per_block_scale.float() > 0).all(), "Normal weights produced zero scales." 
+ + def test_mixed_weight_no_zeros(self): + """Mixed-magnitude tensor (normal + tiny blocks) must have no zero scales.""" + block_size = 16 + weight = torch.cat( + [ + torch.randn(4, block_size), + torch.full((4, block_size), 1e-12), + ], + dim=0, + ) + + per_block_scale, _ = NVFP4QTensor.get_weights_scaling_factor(weight, block_size) + assert (per_block_scale.float() > 0).all(), ( + "Zero scales in mixed-magnitude tensor after FP8 cast." + ) From 60e185109758b77d20270edd18c0d43f9887e593 Mon Sep 17 00:00:00 2001 From: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com> Date: Mon, 4 May 2026 23:03:54 +0000 Subject: [PATCH 3/7] code quality Signed-off-by: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com> --- examples/llm_ptq/hf_ptq.py | 4 ++-- modelopt/torch/quantization/model_calib.py | 2 +- modelopt/torch/quantization/qtensor/nvfp4_tensor.py | 8 ++++---- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index 0c1c914ce69..ac7a583831f 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -829,10 +829,10 @@ def pre_quantize( "input_features" if model_type == "whisper" else "input_ids" ][0:1] # Strip leading padding tokens so the preview input shows real content - if model_type not in ("whisper",) and tokenizer is not None and tokenizer.pad_token_id is not None: + if model_type != "whisper" and tokenizer is not None and tokenizer.pad_token_id is not None: first_non_pad = (preview_input_ids[0] != tokenizer.pad_token_id).nonzero(as_tuple=True)[0] if first_non_pad.numel() > 0: - preview_input_ids = preview_input_ids[:, first_non_pad[0]:] + preview_input_ids = preview_input_ids[:, first_non_pad[0] :] # Generate preview before quantization if args.skip_generate: diff --git a/modelopt/torch/quantization/model_calib.py b/modelopt/torch/quantization/model_calib.py index bce49786077..78b237847b1 100644 --- a/modelopt/torch/quantization/model_calib.py +++ 
b/modelopt/torch/quantization/model_calib.py @@ -864,7 +864,7 @@ def finish_stats_collection(model: nn.Module, method: str | None = None, **kwarg cal = getattr(module, "_calibrator", None) if cal and not getattr(module, "_dynamic", False): - if method in {"entropy"}: + if method == "entropy": if cal.compute_amax(method) is not None: module.load_calib_amax("entropy", **kwargs) elif cal.compute_amax(**kwargs) is not None: diff --git a/modelopt/torch/quantization/qtensor/nvfp4_tensor.py b/modelopt/torch/quantization/qtensor/nvfp4_tensor.py index 209a8e778be..e2e14cb4a74 100644 --- a/modelopt/torch/quantization/qtensor/nvfp4_tensor.py +++ b/modelopt/torch/quantization/qtensor/nvfp4_tensor.py @@ -128,10 +128,10 @@ def get_weights_scaling_factor_from_quantizer( # explodes to ``1.0 * 448 / (global_amax/6)``. fp8_e4m3fn has no Inf, so any # value >= 480 casts to NaN — clamp first to keep the stored byte finite. if not keep_high_precision: - _FP8_E4M3FN_MIN = 2**-9 # 0.001953125 — smallest positive subnormal + fp8_e4m3fn_min = 2**-9 # 0.001953125 — smallest positive subnormal per_block_scale = ( (per_block_scale * 448.0 / per_block_scale_max) - .clamp(min=_FP8_E4M3FN_MIN, max=448.0) + .clamp(min=fp8_e4m3fn_min, max=448.0) .to(torch.float8_e4m3fn) ) return per_block_scale, weights_scaling_factor_2 @@ -178,8 +178,8 @@ def get_weights_scaling_factor( # casting. Without this, blocks whose scale falls below the FP8 representable # range silently underflow to 0, causing those blocks to produce zero output at # inference even when the weights are non-trivial. 
- _FP8_E4M3FN_MIN = 2**-9 # 0.001953125 — smallest positive subnormal - per_block_scale = per_block_scale.clamp(min=_FP8_E4M3FN_MIN) + fp8_e4m3fn_min = 2**-9 # 0.001953125 — smallest positive subnormal + per_block_scale = per_block_scale.clamp(min=fp8_e4m3fn_min) per_block_scale = per_block_scale.to(torch.float8_e4m3fn) return per_block_scale, weights_scaling_factor_2 From ab8a162c9be9600726063a8939dd9c74a52a5168 Mon Sep 17 00:00:00 2001 From: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com> Date: Mon, 4 May 2026 23:59:29 +0000 Subject: [PATCH 4/7] more reviewers feedback Signed-off-by: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com> --- tests/unit/torch/quantization/test_nvfp4_tensor.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/unit/torch/quantization/test_nvfp4_tensor.py b/tests/unit/torch/quantization/test_nvfp4_tensor.py index e4f0e7a8d7a..d66809a3cfc 100644 --- a/tests/unit/torch/quantization/test_nvfp4_tensor.py +++ b/tests/unit/torch/quantization/test_nvfp4_tensor.py @@ -29,7 +29,8 @@ def test_no_zero_scales_for_tiny_weights(self): """Tiny per-block amax (< Date: Tue, 12 May 2026 22:52:26 +0000 Subject: [PATCH 5/7] more reviewers feedback Signed-off-by: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com> --- .../quantization/qtensor/nvfp4_tensor.py | 31 ++--- .../general/ptq/nvfp4_experts_only_mse.yaml | 112 ++---------------- .../torch/quantization/test_nvfp4_tensor.py | 46 ++++++- 3 files changed, 70 insertions(+), 119 deletions(-) diff --git a/modelopt/torch/quantization/qtensor/nvfp4_tensor.py b/modelopt/torch/quantization/qtensor/nvfp4_tensor.py index e2e14cb4a74..ede540a16ef 100644 --- a/modelopt/torch/quantization/qtensor/nvfp4_tensor.py +++ b/modelopt/torch/quantization/qtensor/nvfp4_tensor.py @@ -78,6 +78,16 @@ def get_weights_scaling_factor_2_from_quantizer(cls, weight_quantizer): ) return weight_quantizer._amax.float() / (6.0 * 448.0) + @classmethod + def _cast_per_block_scale_to_fp8(cls, 
per_block_scale: torch.Tensor) -> torch.Tensor: + """Clamp to FP8 E4M3FN representable range, then cast. + + FP8 E4M3FN has no Inf and a smallest positive subnormal of ``2**-9`` (~0.00195). + Values below the min silently underflow to 0 (zero outputs at inference); values + above 448 cast to NaN. + """ + return per_block_scale.clamp(min=2**-9, max=448.0).to(torch.float8_e4m3fn) + @classmethod def get_weights_scaling_factor_from_quantizer( cls, @@ -122,17 +132,9 @@ def get_weights_scaling_factor_from_quantizer( expected_shape = (*weight.shape[:-1], num_blocks_per_row) per_block_scale = per_block_scale.view(expected_shape) - # Quantize scales to FP8. Saturate to the fp8_e4m3fn max (448) before the - # cast: when the [==0]=1.0 safety net above fires (per_block_amax was zero - # for an all-zero weight block) and global_amax is small, the pre-cast value - # explodes to ``1.0 * 448 / (global_amax/6)``. fp8_e4m3fn has no Inf, so any - # value >= 480 casts to NaN — clamp first to keep the stored byte finite. if not keep_high_precision: - fp8_e4m3fn_min = 2**-9 # 0.001953125 — smallest positive subnormal - per_block_scale = ( - (per_block_scale * 448.0 / per_block_scale_max) - .clamp(min=fp8_e4m3fn_min, max=448.0) - .to(torch.float8_e4m3fn) + per_block_scale = cls._cast_per_block_scale_to_fp8( + per_block_scale * 448.0 / per_block_scale_max ) return per_block_scale, weights_scaling_factor_2 else: @@ -172,15 +174,8 @@ def get_weights_scaling_factor( ) # Set all zero values in scale to 1.0 per_block_scale[per_block_scale == 0] = 1.0 - # Convert to torch.float8_e4m3fn if not keep_high_precision: - # Clamp to the minimum positive FP8 E4M3FN subnormal (~0.00195 = 2^-9) before - # casting. Without this, blocks whose scale falls below the FP8 representable - # range silently underflow to 0, causing those blocks to produce zero output at - # inference even when the weights are non-trivial. 
- fp8_e4m3fn_min = 2**-9 # 0.001953125 — smallest positive subnormal - per_block_scale = per_block_scale.clamp(min=fp8_e4m3fn_min) - per_block_scale = per_block_scale.to(torch.float8_e4m3fn) + per_block_scale = cls._cast_per_block_scale_to_fp8(per_block_scale) return per_block_scale, weights_scaling_factor_2 @classmethod diff --git a/modelopt_recipes/general/ptq/nvfp4_experts_only_mse.yaml b/modelopt_recipes/general/ptq/nvfp4_experts_only_mse.yaml index 76d50b760f0..fbd066b80e1 100644 --- a/modelopt_recipes/general/ptq/nvfp4_experts_only_mse.yaml +++ b/modelopt_recipes/general/ptq/nvfp4_experts_only_mse.yaml @@ -13,118 +13,32 @@ # See the License for the specific language governing permissions and # limitations under the License. +imports: + base_disable_all: configs/ptq/units/base_disable_all + default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers + nvfp4: configs/numerics/nvfp4 + nvfp4_static: configs/numerics/nvfp4_static + metadata: recipe_type: ptq - description: > - NVFP4 W4A4 for MoE routed experts only. Static weight scales via MSE + FP8 scale sweep; - dynamic activation scales. Supports sequential experts (nn.Linear-based) and fused experts - (_QuantFusedExperts, HF transformers 5.0+ 3D nn.Parameter style). + description: NVFP4 static weight (MSE FP8-scale sweep) and dynamic activation for expert layers only (W4A4), no KV-cache quantization. 
quantize: algorithm: method: mse fp8_scale_sweep: true layerwise: false quant_cfg: - # ── Disable everything first ───────────────────────────────────────────── - - quantizer_name: '*' - enable: false - - # ── Sequential experts (nn.Linear per expert) ──────────────────────────── + - $import: base_disable_all - quantizer_name: '*mlp.experts*weight_quantizer' - enable: true cfg: - block_sizes: - -1: 16 - type: static - scale_bits: e4m3 - num_bits: e2m1 + $import: nvfp4_static - quantizer_name: '*mlp.experts*input_quantizer' - enable: true cfg: - block_sizes: - -1: 16 - type: dynamic - scale_bits: e4m3 - num_bits: e2m1 - - # ── Sequential experts: Mixtral / block_sparse_moe style ──────────────── + $import: nvfp4 - quantizer_name: '*block_sparse_moe*weight_quantizer' - enable: true cfg: - block_sizes: - -1: 16 - type: static - scale_bits: e4m3 - num_bits: e2m1 + $import: nvfp4_static - quantizer_name: '*block_sparse_moe*input_quantizer' - enable: true - cfg: - block_sizes: - -1: 16 - type: dynamic - scale_bits: e4m3 - num_bits: e2m1 - - # ── Fused experts (_QuantFusedExperts, HF transformers 5.0+ 3D nn.Parameter style) ── - - quantizer_name: '*gate_up_proj_weight_quantizers*' - enable: true cfg: - block_sizes: - -1: 16 - type: static - scale_bits: e4m3 - num_bits: e2m1 - - quantizer_name: '*gate_up_proj_input_quantizer*' - enable: true - cfg: - block_sizes: - -1: 16 - type: dynamic - scale_bits: e4m3 - num_bits: e2m1 - - quantizer_name: '*down_proj_weight_quantizers*' - enable: true - cfg: - block_sizes: - -1: 16 - type: static - scale_bits: e4m3 - num_bits: e2m1 - - quantizer_name: '*down_proj_input_quantizer*' - enable: true - cfg: - block_sizes: - -1: 16 - type: dynamic - scale_bits: e4m3 - num_bits: e2m1 - - # ── Exclusions: shared experts, attention, routers, lm_head ───────────── - - quantizer_name: '*block_sparse_moe.gate*' - enable: false - - quantizer_name: '*linear_attn.conv1d*' - enable: false - - quantizer_name: '*lm_head*' - enable: false - - 
quantizer_name: '*mlp.gate.*' - enable: false - - quantizer_name: '*mlp.shared_expert*' - enable: false - - quantizer_name: '*mlp.shared_expert_gate.*' - enable: false - - quantizer_name: '*router*' - enable: false - - quantizer_name: 'output.*' - enable: false - - parent_class: 'nn.BatchNorm1d' - quantizer_name: '*' - enable: false - - parent_class: 'nn.BatchNorm2d' - quantizer_name: '*' - enable: false - - parent_class: 'nn.BatchNorm3d' - quantizer_name: '*' - enable: false - - parent_class: 'nn.LeakyReLU' - quantizer_name: '*' - enable: false + $import: nvfp4 + - $import: default_disabled_quantizers diff --git a/tests/unit/torch/quantization/test_nvfp4_tensor.py b/tests/unit/torch/quantization/test_nvfp4_tensor.py index d66809a3cfc..c65fe5cdc1c 100644 --- a/tests/unit/torch/quantization/test_nvfp4_tensor.py +++ b/tests/unit/torch/quantization/test_nvfp4_tensor.py @@ -13,17 +13,20 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Tests for NVFP4QTensor per-block FP8 scale underflow clamping.""" +"""Tests for NVFP4QTensor per-block FP8 scale clamping (underflow + overflow).""" + +from types import SimpleNamespace import torch from modelopt.torch.quantization.qtensor.nvfp4_tensor import NVFP4QTensor _FP8_E4M3FN_MIN = 2**-9 # 0.001953125 — smallest positive FP8 E4M3FN subnormal +_FP8_E4M3FN_MAX = 448.0 class TestNVFP4ScaleClamping: - """Per-block weight scales below the FP8 E4M3FN minimum must be clamped, not rounded to zero.""" + """Per-block weight scales outside the FP8 E4M3FN range must be clamped, not turned into 0/NaN.""" def test_no_zero_scales_for_tiny_weights(self): """Tiny per-block amax (< 0).all(), ( "Zero scales in mixed-magnitude tensor after FP8 cast." 
) + + def test_helper_clamps_overflow_to_max(self): + """Values above 448 must saturate to 448, not cast to NaN (fp8_e4m3fn has no Inf).""" + oversized = torch.tensor([100.0, 448.0, 1e3, 1e6]) + out = NVFP4QTensor._cast_per_block_scale_to_fp8(oversized).float() + assert torch.isfinite(out).all(), f"FP8 cast produced non-finite values: {out.tolist()}" + assert (out <= _FP8_E4M3FN_MAX).all(), f"FP8 cast values exceed 448: {out.tolist()}" + + def test_helper_clamps_underflow_to_min(self): + """Values below the FP8 subnormal must clamp up, not collapse to 0.""" + tiny = torch.tensor([0.0, 1e-12, 1e-6, _FP8_E4M3FN_MIN / 2]) + out = NVFP4QTensor._cast_per_block_scale_to_fp8(tiny).float() + assert (out > 0).all(), f"FP8 cast produced zero scales: {out.tolist()}" + + def test_static_path_no_nan_when_block_amax_zero(self): + """Static path: when a block's amax is 0 (all-zero weights), the `[==0]=1.0` safety net + and a small global_amax push the pre-cast value above 448. Without the max clamp, + fp8_e4m3fn would cast it to NaN — regression for the export-time NaN reported on this PR. + """ + block_size = 16 + # global_amax small enough that 1.0 * 448 / (global_amax/6) >> 448. + global_amax = torch.tensor(0.01) + # One block with amax=0 (triggers safety net), three normal blocks. 
+ per_block_amax = torch.tensor([[0.0, 0.005, 0.008, 0.01]]) + weight = torch.randn(1, 4 * block_size) + q = SimpleNamespace( + global_amax=global_amax, + _amax=per_block_amax, + block_sizes={-1: block_size}, + ) + + per_block_scale, _ = NVFP4QTensor.get_weights_scaling_factor_from_quantizer(q, weight) + per_block_scale_f32 = per_block_scale.float() + assert torch.isfinite(per_block_scale_f32).all(), ( + f"NaN/Inf in exported static per-block scale: {per_block_scale_f32.tolist()}" + ) + assert (per_block_scale_f32 <= _FP8_E4M3FN_MAX).all(), ( + f"Static per-block scale exceeds FP8 max 448: {per_block_scale_f32.tolist()}" + ) From 5dcda40f11ddc4fd6f1f76c7ef79ac08552d2f03 Mon Sep 17 00:00:00 2001 From: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com> Date: Tue, 12 May 2026 23:08:23 +0000 Subject: [PATCH 6/7] remove duplicated yaml Signed-off-by: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com> --- .../general/ptq/nvfp4_experts_only_mse.yaml | 44 ------------------- 1 file changed, 44 deletions(-) delete mode 100644 modelopt_recipes/general/ptq/nvfp4_experts_only_mse.yaml diff --git a/modelopt_recipes/general/ptq/nvfp4_experts_only_mse.yaml b/modelopt_recipes/general/ptq/nvfp4_experts_only_mse.yaml deleted file mode 100644 index fbd066b80e1..00000000000 --- a/modelopt_recipes/general/ptq/nvfp4_experts_only_mse.yaml +++ /dev/null @@ -1,44 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -imports: - base_disable_all: configs/ptq/units/base_disable_all - default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers - nvfp4: configs/numerics/nvfp4 - nvfp4_static: configs/numerics/nvfp4_static - -metadata: - recipe_type: ptq - description: NVFP4 static weight (MSE FP8-scale sweep) and dynamic activation for expert layers only (W4A4), no KV-cache quantization. -quantize: - algorithm: - method: mse - fp8_scale_sweep: true - layerwise: false - quant_cfg: - - $import: base_disable_all - - quantizer_name: '*mlp.experts*weight_quantizer' - cfg: - $import: nvfp4_static - - quantizer_name: '*mlp.experts*input_quantizer' - cfg: - $import: nvfp4 - - quantizer_name: '*block_sparse_moe*weight_quantizer' - cfg: - $import: nvfp4_static - - quantizer_name: '*block_sparse_moe*input_quantizer' - cfg: - $import: nvfp4 - - $import: default_disabled_quantizers From 4de8abf1f7f59c199a671633b00fef6f2472e846 Mon Sep 17 00:00:00 2001 From: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com> Date: Wed, 13 May 2026 22:57:51 +0000 Subject: [PATCH 7/7] minor update for reviewer feedback Signed-off-by: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com> --- examples/llm_ptq/hf_ptq.py | 6 ++ .../quantization/qtensor/nvfp4_tensor.py | 20 +++--- .../plugins/test_fused_experts.py | 63 +++++++++++++------ .../torch/quantization/test_nvfp4_tensor.py | 9 ++- 4 files changed, 65 insertions(+), 33 deletions(-) diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index ac7a583831f..a6089e4b3d3 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -833,6 +833,12 @@ def pre_quantize( first_non_pad = (preview_input_ids[0] != tokenizer.pad_token_id).nonzero(as_tuple=True)[0] if first_non_pad.numel() > 0: preview_input_ids = preview_input_ids[:, first_non_pad[0] :] + else: + warnings.warn( + "Preview calibration sample is 
entirely padding; generated preview will be " + "degenerate. Check tokenizer padding side / dataset preprocessing.", + stacklevel=2, + ) # Generate preview before quantization if args.skip_generate: diff --git a/modelopt/torch/quantization/qtensor/nvfp4_tensor.py b/modelopt/torch/quantization/qtensor/nvfp4_tensor.py index ede540a16ef..a7a2a64cfac 100644 --- a/modelopt/torch/quantization/qtensor/nvfp4_tensor.py +++ b/modelopt/torch/quantization/qtensor/nvfp4_tensor.py @@ -28,6 +28,11 @@ __all__ = ["NVFP4QTensor"] +def _cast_per_block_scale_to_fp8(per_block_scale: torch.Tensor) -> torch.Tensor: + """Clamp to FP8 E4M3FN range [2**-9, 448] and cast — avoids underflow→0 / overflow→NaN.""" + return per_block_scale.clamp(min=2**-9, max=448.0).to(torch.float8_e4m3fn) + + class NVFP4QTensor(BaseQuantizedTensor): """Implements the INT4 quantization on tensors for more efficient storage or computation. @@ -78,16 +83,6 @@ def get_weights_scaling_factor_2_from_quantizer(cls, weight_quantizer): ) return weight_quantizer._amax.float() / (6.0 * 448.0) - @classmethod - def _cast_per_block_scale_to_fp8(cls, per_block_scale: torch.Tensor) -> torch.Tensor: - """Clamp to FP8 E4M3FN representable range, then cast. - - FP8 E4M3FN has no Inf and a smallest positive subnormal of ``2**-9`` (~0.00195). - Values below the min silently underflow to 0 (zero outputs at inference); values - above 448 cast to NaN. - """ - return per_block_scale.clamp(min=2**-9, max=448.0).to(torch.float8_e4m3fn) - @classmethod def get_weights_scaling_factor_from_quantizer( cls, @@ -133,7 +128,8 @@ def get_weights_scaling_factor_from_quantizer( per_block_scale = per_block_scale.view(expected_shape) if not keep_high_precision: - per_block_scale = cls._cast_per_block_scale_to_fp8( + # The [==0]=1.0 safety net + small global_amax can drive the pre-cast value above 448 (PR #1397). 
+ per_block_scale = _cast_per_block_scale_to_fp8( per_block_scale * 448.0 / per_block_scale_max ) return per_block_scale, weights_scaling_factor_2 @@ -175,7 +171,7 @@ def get_weights_scaling_factor( # Set all zero values in scale to 1.0 per_block_scale[per_block_scale == 0] = 1.0 if not keep_high_precision: - per_block_scale = cls._cast_per_block_scale_to_fp8(per_block_scale) + per_block_scale = _cast_per_block_scale_to_fp8(per_block_scale) return per_block_scale, weights_scaling_factor_2 @classmethod diff --git a/tests/unit/torch/quantization/plugins/test_fused_experts.py b/tests/unit/torch/quantization/plugins/test_fused_experts.py index 5ae6e0c2e5c..73a4f741768 100644 --- a/tests/unit/torch/quantization/plugins/test_fused_experts.py +++ b/tests/unit/torch/quantization/plugins/test_fused_experts.py @@ -15,6 +15,8 @@ """Tests for _QuantFusedExperts: generic fused MoE quantization and export.""" +from unittest.mock import patch + import pytest import torch import torch.nn as nn @@ -933,32 +935,46 @@ def _cleanup_registry(mod_type): QuantModuleRegistry.unregister(mod_type) def test_mse_calibration_populates_all_expert_quantizers(self): + # Strong assertion: every per-expert weight quantizer must be touched by the MSE + # search loop (mse_calibrate Step 3), not just have _amax set by max-calibrate or + # the dead-expert bootstrap. Spy on MseCalibrator.collect — that method is only + # invoked from Step 3, after Step 2 installs MseCalibrator on each quantizer. 
import modelopt.torch.quantization as mtq + from modelopt.torch.quantization.calib.mse import MseCalibrator model = _TinyMoEModel() expert_type = type(model.moe.experts) self._cleanup_registry(expert_type) - mtq.quantize( - model, - { - "quant_cfg": [ - {"quantizer_name": "*", "enable": False}, - { - "quantizer_name": "*gate_up_proj_weight_quantizer", - "cfg": {"num_bits": 8, "axis": None}, - }, - { - "quantizer_name": "*down_proj_weight_quantizer", - "cfg": {"num_bits": 8, "axis": None}, - }, - ], - "algorithm": "mse", - }, - forward_loop=lambda m: [m(torch.randn(1, 4, HIDDEN_DIM)) for _ in range(2)], - ) + collected_calib_ids: set[int] = set() + original_collect = MseCalibrator.collect + + def _spy_collect(self, x): + collected_calib_ids.add(id(self)) + return original_collect(self, x) + + with patch.object(MseCalibrator, "collect", _spy_collect): + mtq.quantize( + model, + { + "quant_cfg": [ + {"quantizer_name": "*", "enable": False}, + { + "quantizer_name": "*gate_up_proj_weight_quantizer", + "cfg": {"num_bits": 8, "axis": None}, + }, + { + "quantizer_name": "*down_proj_weight_quantizer", + "cfg": {"num_bits": 8, "axis": None}, + }, + ], + "algorithm": "mse", + }, + forward_loop=lambda m: [m(torch.randn(1, 4, HIDDEN_DIM)) for _ in range(2)], + ) experts = model.moe.experts + missed = [] for idx in range(NUM_EXPERTS): assert experts.gate_up_proj_weight_quantizers[idx].amax is not None, ( f"gate_up_proj_weight_quantizers[{idx}] not calibrated — Bug 1 regression" @@ -966,4 +982,15 @@ def test_mse_calibration_populates_all_expert_quantizers(self): assert experts.down_proj_weight_quantizers[idx].amax is not None, ( f"down_proj_weight_quantizers[{idx}] not calibrated" ) + if ( + id(experts.gate_up_proj_weight_quantizers[idx]._calibrator) + not in collected_calib_ids + ): + missed.append(f"gate_up_proj_weight_quantizers[{idx}]") + if id(experts.down_proj_weight_quantizers[idx]._calibrator) not in collected_calib_ids: + 
missed.append(f"down_proj_weight_quantizers[{idx}]") + assert not missed, ( + f"MSE search loop skipped these per-expert quantizers: {missed}. " + "mse_calibrate Step 3 did not iterate them via iter_weights_for_calibration." + ) self._cleanup_registry(expert_type) diff --git a/tests/unit/torch/quantization/test_nvfp4_tensor.py b/tests/unit/torch/quantization/test_nvfp4_tensor.py index c65fe5cdc1c..9ae0c34e235 100644 --- a/tests/unit/torch/quantization/test_nvfp4_tensor.py +++ b/tests/unit/torch/quantization/test_nvfp4_tensor.py @@ -19,7 +19,10 @@ import torch -from modelopt.torch.quantization.qtensor.nvfp4_tensor import NVFP4QTensor +from modelopt.torch.quantization.qtensor.nvfp4_tensor import ( + NVFP4QTensor, + _cast_per_block_scale_to_fp8, +) _FP8_E4M3FN_MIN = 2**-9 # 0.001953125 — smallest positive FP8 E4M3FN subnormal _FP8_E4M3FN_MAX = 448.0 @@ -74,14 +77,14 @@ def test_mixed_weight_no_zeros(self): def test_helper_clamps_overflow_to_max(self): """Values above 448 must saturate to 448, not cast to NaN (fp8_e4m3fn has no Inf).""" oversized = torch.tensor([100.0, 448.0, 1e3, 1e6]) - out = NVFP4QTensor._cast_per_block_scale_to_fp8(oversized).float() + out = _cast_per_block_scale_to_fp8(oversized).float() assert torch.isfinite(out).all(), f"FP8 cast produced non-finite values: {out.tolist()}" assert (out <= _FP8_E4M3FN_MAX).all(), f"FP8 cast values exceed 448: {out.tolist()}" def test_helper_clamps_underflow_to_min(self): """Values below the FP8 subnormal must clamp up, not collapse to 0.""" tiny = torch.tensor([0.0, 1e-12, 1e-6, _FP8_E4M3FN_MIN / 2]) - out = NVFP4QTensor._cast_per_block_scale_to_fp8(tiny).float() + out = _cast_per_block_scale_to_fp8(tiny).float() assert (out > 0).all(), f"FP8 cast produced zero scales: {out.tolist()}" def test_static_path_no_nan_when_block_amax_zero(self):