3 changes: 2 additions & 1 deletion lmdeploy/cli/utils.py
@@ -407,7 +407,8 @@ def calib_samples(parser):
return parser.add_argument('--calib-samples',
type=int,
default=128,
help='The number of samples for calibration')
help='The number of samples for calibration. '
'Set it to 0 to enable data-free quantization.')

@staticmethod
def calib_seqlen(parser):
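The flag also reaches the Python API, since --calib-samples maps to the calib_samples argument of auto_awq. A minimal sketch of the data-free path, assuming defaults for the remaining keyword arguments (the model id below is purely illustrative; see the auto_awq.py diff below for the actual branch):

from lmdeploy.lite.apis.auto_awq import auto_awq

# calib_samples=0 skips calibrate() and loads the model via load_model_and_tokenizer()
# before weight-only quantization.
auto_awq('Qwen/Qwen3-30B-A3B',   # hypothetical model id, for illustration only
         work_dir='./work_dir',
         calib_samples=0,
         w_bits=4,
         w_group_size=128)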
58 changes: 35 additions & 23 deletions lmdeploy/lite/apis/auto_awq.py
@@ -8,7 +8,8 @@
import torch
from torch import nn

from lmdeploy.lite.apis.calibrate import LAYER_TYPE_MAP, calibrate
from lmdeploy.lite.apis.calibrate import LAYER_TYPE_MAP, MOE_MODEL_LIST, calibrate, load_model_and_tokenizer
from lmdeploy.lite.moe_mlp_modules import CONVERT_MOE_MODELS
from lmdeploy.lite.quantization.awq import FC_FCS_MAP, NORM_FCS_MAP, awq_layers, quant_weights, smooth_layers
from lmdeploy.lite.utils import collect_target_modules
from lmdeploy.utils import try_import_deeplink
@@ -59,6 +60,7 @@ def auto_awq(model: str,
calib_dataset (str): The calibration dataset name.
Defaults to 'wikitext2'.
calib_samples (int): The number of samples for calibration.
Set it to 0 to enable data-free quantization.
batch_size (int): The batch size for running the calib samples.
Low GPU mem requires small batch_size. Large batch_size
reduces the calibration time while costs more VRAM.
@@ -83,41 +85,51 @@
from lmdeploy.utils import get_model
model = get_model(model, revision=revision, download_dir=download_dir)
model_path = model
vl_model, model, tokenizer, work_dir = calibrate(model,
calib_dataset,
calib_samples,
calib_seqlen,
work_dir,
device,
w_bits=w_bits,
w_group_size=w_group_size,
search_scale=search_scale,
dtype=dtype,
batch_size=batch_size)
if calib_samples == 0:
arch, vl_model, model, tokenizer, _, _ = load_model_and_tokenizer(model, dtype=dtype, work_dir=work_dir)
else:
arch, vl_model, model, tokenizer = calibrate(model,
calib_dataset,
calib_samples,
calib_seqlen,
work_dir,
device,
w_bits=w_bits,
w_group_size=w_group_size,
search_scale=search_scale,
dtype=dtype,
batch_size=batch_size)

layer_type = LAYER_TYPE_MAP[type(model).__name__]
Copilot AI Apr 28, 2026
When calib_ds_req=False, this code bypasses calibrate()'s supported-model validation and then does layer_type = LAYER_TYPE_MAP[type(model).__name__], which will raise a raw KeyError for unsupported/renamed model classes. Consider adding the same explicit check and user-facing RuntimeError message that calibrate() uses (or reusing calibrate()'s model-type validation) so failures are actionable.

Suggested change
layer_type = LAYER_TYPE_MAP[type(model).__name__]
model_type = type(model).__name__
if model_type not in LAYER_TYPE_MAP:
    supported_model_types = ', '.join(sorted(LAYER_TYPE_MAP.keys()))
    raise RuntimeError(
        f'Unsupported model type: {model_type}. '
        f'Supported model types are: {supported_model_types}.')
layer_type = LAYER_TYPE_MAP[model_type]

fc2fcs = FC_FCS_MAP[layer_type]
norm2fcs = NORM_FCS_MAP[layer_type]
input_stats = torch.load(osp.join(work_dir, 'inputs_stats.pth'), weights_only=True)
layers = collect_target_modules(model, layer_type)
fcs = {}
for l_name, layer in layers.items():
if arch in MOE_MODEL_LIST:
CONVERT_MOE_MODELS.get(arch)(layer)
name2fc = collect_target_modules(layer, nn.Linear, prefix=l_name)
fcs.update(name2fc)

if search_scale:
awq_ratios = input_stats['ratios']
act_scales = input_stats['absmean']
awq_layers(layers, fc2fcs, norm2fcs, act_scales, awq_ratios, w_group_size, device)
else:
act_scales = input_stats['absmax']
smooth_layers(layers, fc2fcs, norm2fcs, act_scales, w_group_size, device)
quant_weights(model, fcs, w_bits, w_sym, w_group_size, device)
if calib_samples != 0:
fc2fcs = FC_FCS_MAP[layer_type]
norm2fcs = NORM_FCS_MAP[layer_type]
input_stats = torch.load(osp.join(work_dir, 'inputs_stats.pth'), weights_only=True)
if search_scale:
awq_ratios = input_stats['ratios']
act_scales = input_stats['absmean']
awq_layers(layers, fc2fcs, norm2fcs, act_scales, awq_ratios, w_group_size, device)
else:
act_scales = input_stats['absmax']
smooth_layers(layers, fc2fcs, norm2fcs, act_scales, w_group_size, device)

matched_exclude_modules = quant_weights(model, fcs, w_bits, w_sym, w_group_size, device,
arch=arch)
quantization_config = dict(quant_method='awq',
version='gemm',
bits=w_bits,
group_size=w_group_size,
zero_point=not w_sym)
if matched_exclude_modules:
quantization_config['modules_to_not_convert'] = matched_exclude_modules
model.config.update(dict(quantization_config=quantization_config))

if vl_model:
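For a concrete instance of the config update above: with w_bits=4, w_group_size=128 and w_sym=False, the dict written into model.config would look like the sketch below. modules_to_not_convert only appears when quant_weights returns excluded modules, and the module name shown is a hypothetical placeholder:

# Sketch of the exported quantization_config for w_bits=4, w_group_size=128, w_sym=False.
quantization_config = dict(quant_method='awq',
                           version='gemm',
                           bits=4,
                           group_size=128,
                           zero_point=True)       # zero_point = not w_sym
# Only when quant_weights reports modules kept in full precision (e.g. MoE gates):
quantization_config['modules_to_not_convert'] = ['mlp.gate']   # hypothetical module name
model.config.update(dict(quantization_config=quantization_config))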
156 changes: 116 additions & 40 deletions lmdeploy/lite/apis/calibrate.py
@@ -6,7 +6,7 @@
from torch import nn
from transformers import AutoTokenizer

from lmdeploy.archs import get_model_arch, get_task
from lmdeploy.archs import get_model_arch
from lmdeploy.lite.quantization import CalibrationContext, CalibrationContextV2
from lmdeploy.lite.utils import collect_target_modules, get_calib_loaders, load_hf_from_pretrained
from lmdeploy.vl.model.builder import load_vl_model
@@ -18,6 +18,9 @@
'QWenLMHeadModel': 'QWenBlock',
'Qwen2ForCausalLM': 'Qwen2DecoderLayer',
'Qwen3ForCausalLM': 'Qwen3DecoderLayer',
'Qwen3MoeForCausalLM': 'Qwen3MoeDecoderLayer',
'Qwen3_5ForConditionalGeneration': 'Qwen3_5DecoderLayer',
'Qwen3_5MoeForConditionalGeneration': 'Qwen3_5MoeDecoderLayer',
'BaiChuanForCausalLM': 'DecoderLayer', # Baichuan 7B
'BaichuanForCausalLM': 'DecoderLayer', # Baichuan2 7B
'LlamaForCausalLM': 'LlamaDecoderLayer',
@@ -39,6 +42,9 @@
'QWenLMHeadModel': 'RMSNorm',
'Qwen2ForCausalLM': 'Qwen2RMSNorm',
'Qwen3ForCausalLM': 'Qwen3RMSNorm',
'Qwen3MoeForCausalLM': 'Qwen3MoeRMSNorm',
'Qwen3_5ForConditionalGeneration': 'Qwen3_5RMSNorm',
'Qwen3_5MoeForConditionalGeneration': 'Qwen3_5MoeRMSNorm',
'BaiChuanForCausalLM': 'RMSNorm', # Baichuan 7B
'BaichuanForCausalLM': 'RMSNorm', # Baichuan2 7B
'LlamaForCausalLM': 'LlamaRMSNorm',
@@ -60,6 +66,9 @@
'QWenLMHeadModel': 'lm_head',
'Qwen2ForCausalLM': 'lm_head',
'Qwen3ForCausalLM': 'lm_head',
'Qwen3MoeForCausalLM': 'lm_head',
'Qwen3_5ForConditionalGeneration': 'lm_head',
'Qwen3_5MoeForConditionalGeneration': 'lm_head',
'BaiChuanForCausalLM': 'lm_head', # Baichuan 7B
'BaichuanForCausalLM': 'lm_head', # Baichuan2 7B
'LlamaForCausalLM': 'lm_head',
@@ -74,6 +83,60 @@
'MistralForCausalLM': 'lm_head',
}

MOE_MODEL_LIST = [
'Qwen3MoeForCausalLM',
'Qwen3_5MoeForConditionalGeneration',
'MixtralForCausalLM'
]
def check_vl_llm(backend: str, config: dict) -> bool:
"""Check if the model is a vl model from model config."""
if 'auto_map' in config:
for _, v in config['auto_map'].items():
if 'InternLMXComposer2ForCausalLM' in v:
return True

if 'language_config' in config and 'vision_config' in config and config['language_config'].get(
'architectures', [None])[0] == 'DeepseekV2ForCausalLM':
return True

arch = config['architectures'][0]
supported_archs = set([
'LlavaLlamaForCausalLM', 'LlavaMistralForCausalLM', 'CogVLMForCausalLM', 'InternLMXComposer2ForCausalLM',
'InternVLChatModel', 'MiniCPMV', 'LlavaForConditionalGeneration', 'LlavaNextForConditionalGeneration',
'Phi3VForCausalLM', 'Qwen2VLForConditionalGeneration', 'Qwen2_5_VLForConditionalGeneration',
'Qwen3VLForConditionalGeneration', 'Qwen3VLMoeForConditionalGeneration', 'Qwen3_5ForConditionalGeneration',
'Qwen3_5MoeForConditionalGeneration', 'MllamaForConditionalGeneration', 'MolmoForCausalLM',
'Gemma3ForConditionalGeneration', 'Llama4ForConditionalGeneration', 'InternVLForConditionalGeneration',
'InternS1ForConditionalGeneration', 'InternS1ProForConditionalGeneration',
'InternS1_1_ForConditionalGeneration', 'Glm4vForConditionalGeneration'
])
if arch == 'QWenLMHeadModel' and 'visual' in config:
return True
elif arch == 'MultiModalityCausalLM' and 'language_config' in config:
return True
elif arch in ['ChatGLMModel', 'ChatGLMForConditionalGeneration'] and 'vision_config' in config:
return True
elif arch in supported_archs:
return True
return False


def get_task(backend: str, model_path: str):
"""Get pipeline type and pipeline class from model config."""
import os

if os.path.exists(os.path.join(model_path, 'triton_models', 'weights')):
# workspace model
return 'llm'
Comment on lines +128 to +132
Collaborator
This block can be safely removed

_, config = get_model_arch(model_path)
if check_vl_llm(backend, config.to_dict()):
return 'vlm'

# default task
return 'llm'


def _prepare_for_calibrate(model: nn.Module,
layer_type: str | type,
@@ -195,6 +258,55 @@ def update_moe_mapping(model, model_type):
NORM_FCS_MAP[LAYER_TYPE_MAP[model_type]] = updated_norm2fcs


def load_model_and_tokenizer(model: str,
dtype: Literal['float16', 'bfloat16', 'auto'] = 'auto',
work_dir: str = './work_dir'):
"""Load model and tokenizer."""
model_type = get_task(backend='turbomind', model_path=model)
make_compatible_internvl_config(model)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)

# get model arch and config
arch, original_config = get_model_arch(model)

if model_type == 'llm':
model = load_hf_from_pretrained(model, dtype=dtype, trust_remote_code=True)
Collaborator
Please don't hard code "trust_remote_code"

vl_model = None
elif model_type == 'vlm':
vl_model = load_vl_model(model, backend=None, with_llm=True).vl_model
model = vl_model
if hasattr(vl_model, 'language_model'): # deepseek-vl, ...
model = vl_model.language_model
if hasattr(vl_model, 'llm'): # MiniCPMV, ...
model = vl_model.llm
model.config.use_cache = False
if hasattr(model.config, 'text_config'):
model.config.text_config.use_cache = False
elif hasattr(model.config, 'llm_config'):
model.config.llm_config.use_cache = False
if dtype == 'float16' or (dtype == 'auto' and original_config.torch_dtype == torch.float16):
model.half()
elif dtype == 'bfloat16' or (dtype == 'auto' and original_config.torch_dtype == torch.bfloat16):
assert torch.cuda.is_bf16_supported(
), 'your device does not support bfloat16 please set --dtype float16' # noqa
model.to(torch.bfloat16)
model.eval()

model_type = type(model).__name__
if model_type not in LAYER_TYPE_MAP or model_type not in NORM_TYPE_MAP:
raise RuntimeError(f'Currently, quantification and calibration of {model_type} are '
f'not supported. The supported model types are '
f"{', '.join(LAYER_TYPE_MAP.keys())}.")

# Create work directory if not exists
work_dir = Path(work_dir)
work_dir.mkdir(parents=True, exist_ok=True)

return arch, vl_model, model, tokenizer, model_type, work_dir


def calibrate(model: str,
calib_dataset: str = 'wikitext2',
calib_samples: int = 128,
@@ -241,41 +353,8 @@ def calibrate(model: str,
'Support only `wikitext2`, `c4`, `pileval`, `gsm8k`, ' \
'`neuralmagic_calibration`, `open-platypus`, `openwebtext`.'

model_type, _ = get_task(backend='turbomind', model_path=model)
make_compatible_internvl_config(model)

# Load tokenizer and configuration
tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)

if model_type == 'llm':
model = load_hf_from_pretrained(model, dtype=dtype, trust_remote_code=True)
vl_model = None
elif model_type == 'vlm':
_, original_config = get_model_arch(model)
vl_model = load_vl_model(model, backend=None, with_llm=True).vl_model
model = vl_model
if hasattr(vl_model, 'language_model'): # deepseek-vl, ...
model = vl_model.language_model
if hasattr(vl_model, 'llm'): # MiniCPMV, ...
model = vl_model.llm
model.config.use_cache = False
if hasattr(model.config, 'text_config'):
model.config.text_config.use_cache = False
elif hasattr(model.config, 'llm_config'):
model.config.llm_config.use_cache = False
if dtype == 'float16' or (dtype == 'auto' and original_config.torch_dtype == torch.float16):
model.half()
elif dtype == 'bfloat16' or (dtype == 'auto' and original_config.torch_dtype == torch.bfloat16):
assert torch.cuda.is_bf16_supported(
), 'your device does not support bfloat16 please set --dtype float16' # noqa
model.to(torch.bfloat16)
model.eval()

model_type = type(model).__name__
if model_type not in LAYER_TYPE_MAP or model_type not in NORM_TYPE_MAP:
raise RuntimeError(f'Currently, quantification and calibration of {model_type} are '
f'not supported. The supported model types are '
f"{', '.join(LAYER_TYPE_MAP.keys())}.")
arch, vl_model, model, tokenizer, model_type, work_dir = load_model_and_tokenizer(
model, dtype=dtype, work_dir=work_dir)

if model_type in ['MixtralForCausalLM']:
update_moe_mapping(model, model_type)
@@ -319,12 +398,9 @@ def calibrate(model: str,
all_data = torch.cat(calib_loader).to(device)
calib_ctx.calibrate(all_data)

# Create work directory if not exists
work_dir = Path(work_dir)
work_dir.mkdir(parents=True, exist_ok=True)
calib_ctx.export(work_dir)

return vl_model, model, tokenizer, work_dir
return arch, vl_model, model, tokenizer


if __name__ == '__main__':
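Callers of calibrate() should note the changed return contract: it now returns (arch, vl_model, model, tokenizer) and no longer returns work_dir, while the new load_model_and_tokenizer() additionally returns model_type and work_dir. A minimal sketch of both call sites, using an illustrative model id and keyword values taken from the defaults shown in this diff:

from lmdeploy.lite.apis.calibrate import calibrate, load_model_and_tokenizer

# Calibrated path (calib_samples > 0): input statistics are exported to work_dir.
arch, vl_model, model, tokenizer = calibrate('internlm/internlm2-chat-7b',  # hypothetical id
                                             calib_dataset='wikitext2',
                                             calib_samples=128,
                                             calib_seqlen=2048,
                                             work_dir='./work_dir')

# Data-free path (calib_samples == 0): the model is loaded without any calibration run.
arch, vl_model, model, tokenizer, model_type, work_dir = load_model_and_tokenizer(
    'internlm/internlm2-chat-7b', dtype='auto', work_dir='./work_dir')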
4 changes: 2 additions & 2 deletions lmdeploy/lite/apis/smooth_quant.py
@@ -96,7 +96,7 @@ def smooth_quant(model: str,
rmsnorms = collect_target_modules(model, norm_type)

for name, linear in fcs.items():
if skipped_module(name):
if skipped_module(name, model.config.architectures[0])[0]:
continue
linear.to(device)
q_linear = QLinear.from_float(linear, quant_dtype=quant_dtype)
@@ -108,7 +108,7 @@
torch.cuda.empty_cache()

for name, norm in rmsnorms.items():
if skipped_module(name):
if skipped_module(name, model.config.architectures[0])[0]:
continue
norm.to(device)
q_norm = QRMSNorm.from_float(norm, quant_dtype=quant_dtype)
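The smooth_quant hunks imply a signature change in skipped_module: it now takes the architecture name and returns a tuple whose first element is the skip decision (the remaining elements are not visible in this diff and are left aside here). A minimal sketch of the updated call under that assumption:

arch = model.config.architectures[0]        # e.g. 'Qwen3MoeForCausalLM'
for name, linear in fcs.items():
    if skipped_module(name, arch)[0]:       # index [0] -> boolean skip flag
        continue                            # leave this module unquantized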
7 changes: 7 additions & 0 deletions lmdeploy/lite/moe_mlp_modules/__init__.py
@@ -0,0 +1,7 @@
# Copyright (c) OpenMMLab. All rights reserved.

from .base import CONVERT_MOE_MODELS
from .mixtral import MixtralMoeMLP
from .qwen import QwenMoeMLP

__all__ = ['CONVERT_MOE_MODELS', 'MixtralMoeMLP', 'QwenMoeMLP']
5 changes: 5 additions & 0 deletions lmdeploy/lite/moe_mlp_modules/base.py
@@ -0,0 +1,5 @@
# Copyright (c) OpenMMLab. All rights reserved.

from mmengine import Registry

CONVERT_MOE_MODELS = Registry('moe_mlp_module', locations=['lmdeploy.lite.moe_mlp_modules'])
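CONVERT_MOE_MODELS is the registry that auto_awq queries via CONVERT_MOE_MODELS.get(arch)(layer) for architectures in MOE_MODEL_LIST. A hedged sketch of how a converter could be registered and invoked — the class name, registry key, and body below are illustrative stand-ins, not the actual MixtralMoeMLP or QwenMoeMLP implementations:

from torch import nn

from lmdeploy.lite.moe_mlp_modules.base import CONVERT_MOE_MODELS


@CONVERT_MOE_MODELS.register_module(name='SomeMoeForCausalLM')   # hypothetical arch key
class DummyMoeConverter:
    """Illustrative converter: auto_awq calls it once per decoder layer."""

    def __init__(self, layer: nn.Module):
        # A real converter would rewrite layer.mlp so that the expert projections are
        # exposed as plain nn.Linear modules that collect_target_modules can find.
        self.layer = layer

# Usage in auto_awq (as shown in the diff above):
#     if arch in MOE_MODEL_LIST:
#         CONVERT_MOE_MODELS.get(arch)(layer)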