Add Qwen3.5 Moe lite awq #4561

base: main
Changes from 1 commit
```diff
@@ -39,6 +39,18 @@ def add_parser_auto_awq():
                             type=int,
                             default=128,
                             help='Group size for weight quantization statistics')
+        parser.add_argument('--no-calib-ds-req',
+                            dest='calib_ds_req',
+                            action='store_false',
+                            default=True,
+                            help='Require calibration dataset before quantizing weights. '
+                            'Default to True. Set to False to skip calibration and directly quantize weights')
+        parser.add_argument('--mod-skip-quant',
```
Collaborator:

```python
parser.add_argument('--exclude-modules',
                    nargs='+',
                    metavar='PATTERN',
                    default=None,
                    help='One or more module name patterns (glob-style) to exclude from quantization. '
                    'Example: --exclude-modules "*.lm_head" "transformer.layers.*.ffn"')
```
```diff
+                            dest='mod_skip_quant',
+                            nargs='+',
+                            metavar='PATTERN',
+                            default=None,
+                            help='Module name patterns to skip during quantization')

     @staticmethod
     def add_parser_auto_gptq():
```
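As a quick sanity check of the flag semantics (a `store_false` flag with `default=True`, plus an `nargs='+'` pattern list), a stripped-down parser with just these two options behaves as follows; the pattern values are placeholders:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--no-calib-ds-req', dest='calib_ds_req', action='store_false', default=True)
parser.add_argument('--mod-skip-quant', dest='mod_skip_quant', nargs='+', metavar='PATTERN', default=None)

# Defaults: calibration required, no extra skip patterns.
args = parser.parse_args([])
assert args.calib_ds_req is True and args.mod_skip_quant is None

# Data-free quantization with two user-supplied skip patterns.
args = parser.parse_args(['--no-calib-ds-req', '--mod-skip-quant', 'mlp.experts', 'lm_head'])
assert args.calib_ds_req is False
assert args.mod_skip_quant == ['mlp.experts', 'lm_head']
```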
```diff
@@ -10,10 +10,46 @@
 from lmdeploy.lite.apis.calibrate import LAYER_TYPE_MAP, calibrate
 from lmdeploy.lite.quantization.awq import FC_FCS_MAP, NORM_FCS_MAP, awq_layers, quant_weights, smooth_layers
-from lmdeploy.lite.utils import collect_target_modules
+from lmdeploy.lite.utils import collect_target_modules, convert_moe_parameters
 from lmdeploy.utils import try_import_deeplink


 def load_model(model: str, dtype: Literal['float16', 'bfloat16', 'auto'] = 'auto', work_dir: str = './work_dir'):
     from pathlib import Path

     from transformers import AutoTokenizer

     from lmdeploy.lite.utils import load_hf_from_pretrained
-    tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
-    model = load_hf_from_pretrained(model, dtype=dtype, trust_remote_code=True)
-    vl_model = None
-    work_dir = Path(work_dir)
-    work_dir.mkdir(parents=True, exist_ok=True)
-    return vl_model, model, tokenizer, work_dir
+    work_dir = Path(work_dir)
+    work_dir.mkdir(parents=True, exist_ok=True)
+    vl_model = None
+    tokenizer = None
+    llm_model = None
+    # Reuse the task-aware VLM loading path when available so the no-calib
+    # flow can handle conditional-generation / multimodal architectures.
+    try:
+        from lmdeploy.archs import get_task
+        from lmdeploy.vl.model.builder import load_vl_model
+        _, pipeline_class = get_task(model)
+        is_vl_task = pipeline_class is not None and hasattr(pipeline_class, 'is_vl') and pipeline_class.is_vl
+        if is_vl_task:
+            vl_model = load_vl_model(model, backend='huggingface')
+            if hasattr(vl_model, 'language_model'):
+                llm_model = vl_model.language_model
+            elif hasattr(vl_model, 'llm'):
+                llm_model = vl_model.llm
+            else:
+                raise AttributeError('Cannot find language model in loaded VLM.')
+            if hasattr(vl_model, 'tokenizer') and vl_model.tokenizer is not None:
+                tokenizer = vl_model.tokenizer
+            elif hasattr(vl_model, 'processor') and hasattr(vl_model.processor, 'tokenizer'):
+                tokenizer = vl_model.processor.tokenizer
+    except Exception:
+        # Fall back to the original text-only loading path if task-aware VLM
+        # loading is unavailable or the model is not a VLM.
+        vl_model = None
+        llm_model = None
+        tokenizer = None
+    if tokenizer is None:
+        tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
+    if llm_model is None:
+        llm_model = load_hf_from_pretrained(model, dtype=dtype, trust_remote_code=True)
+    return vl_model, llm_model, tokenizer, work_dir
```
Copilot AI (Apr 28, 2026):

config_contains_keyword claims to recursively search config keys or string values, but the implementation only recurses into dict values and ignores lists/tuples/strings entirely. For HF configs, to_dict() commonly contains nested lists/dicts, so this can incorrectly return False and prevent MoE detection/conversion. Update search() to handle dict, list/tuple, and str (and optionally other primitive types via str(obj)), consistent with the docstring.
Suggested change:

```diff
-            return False
+            return False
+        if isinstance(obj, (list, tuple)):
+            for item in obj:
+                if search(item):
+                    return True
+            return False
+        if isinstance(obj, str):
+            return keyword in obj.lower()
+        if obj is None:
+            return False
+        return keyword in str(obj).lower()
```
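Put together, a version of the helper along those lines could look like the sketch below; the surrounding function body is reconstructed for illustration, since the PR's original config_contains_keyword implementation is not shown in this excerpt:

```python
def config_contains_keyword(config_dict: dict, keyword: str) -> bool:
    """Recursively search config keys and string values for a keyword (case-insensitive)."""
    keyword = keyword.lower()

    def search(obj) -> bool:
        if isinstance(obj, dict):
            # Check both keys and nested values.
            return any(keyword in str(key).lower() or search(value) for key, value in obj.items())
        if isinstance(obj, (list, tuple)):
            return any(search(item) for item in obj)
        if isinstance(obj, str):
            return keyword in obj.lower()
        if obj is None:
            return False
        return keyword in str(obj).lower()

    return search(config_dict)
```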
Copilot AI (Apr 28, 2026):

input_stats is loaded here but never used (it gets reloaded inside the later `if calib_ds_req:` block). This is unnecessary IO and can noticeably slow down quantization for large stats files; consider removing this load or using the already-loaded input_stats later.

```python
input_stats = torch.load(osp.join(work_dir, 'inputs_stats.pth'), weights_only=True)
```
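A minimal sketch of that fix, assuming the calib_ds_req flag introduced in this PR; the helper name is illustrative:

```python
import os.path as osp

import torch


def maybe_load_input_stats(work_dir: str, calib_ds_req: bool):
    """Read activation statistics only when calibration was actually run."""
    if not calib_ds_req:
        # Data-free path: nothing was calibrated, so inputs_stats.pth does not exist.
        return None
    return torch.load(osp.join(work_dir, 'inputs_stats.pth'), weights_only=True)
```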
Copilot AI (Apr 28, 2026):

When calib_ds_req=False, this code bypasses calibrate()'s supported-model validation and then does layer_type = LAYER_TYPE_MAP[type(model).__name__], which will raise a raw KeyError for unsupported/renamed model classes. Consider adding the same explicit check and user-facing RuntimeError message that calibrate() uses (or reusing calibrate()'s model-type validation) so failures are actionable.

Suggested change:

```diff
-    layer_type = LAYER_TYPE_MAP[type(model).__name__]
+    model_type = type(model).__name__
+    if model_type not in LAYER_TYPE_MAP:
+        supported_model_types = ', '.join(sorted(LAYER_TYPE_MAP.keys()))
+        raise RuntimeError(
+            f'Unsupported model type: {model_type}. '
+            f'Supported model types are: {supported_model_types}.')
+    layer_type = LAYER_TYPE_MAP[model_type]
```
Copilot AI (Apr 28, 2026):

convert_moe_parameters(model_path, layer) is called once per decoder layer, but convert_moe_parameters recomputes the registered model name from model_path each time (which calls get_model_arch(...) and reads config). This becomes O(num_layers) config parsing overhead. Consider computing model_name once in auto_awq (or once in convert_moe_parameters via caching) and passing it down so conversion stays cheap.
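One possible shape for that fix is to resolve the registered model name once per checkpoint path and cache it; the helper below is illustrative, and the import path and the exact mapping to the CONVERT_MOE_MODELS key are assumptions rather than code from the PR:

```python
from functools import lru_cache

from lmdeploy.archs import get_model_arch  # helper mentioned in the comment above; import path assumed


@lru_cache(maxsize=None)
def _resolve_moe_model_name(model_path: str) -> str:
    """Parse the checkpoint config once and cache the registered MoE model name."""
    arch, _ = get_model_arch(model_path)
    # Map the HF architecture name to the CONVERT_MOE_MODELS registry key here.
    return arch


# Inside convert_moe_parameters (or once in auto_awq), repeated per-layer calls
# for the same model_path now hit the cache instead of re-reading the config:
# model_name = _resolve_moe_model_name(model_path)
```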
Review comment:

May remove the unused code.
```diff
@@ -18,6 +18,9 @@
     'QWenLMHeadModel': 'QWenBlock',
     'Qwen2ForCausalLM': 'Qwen2DecoderLayer',
     'Qwen3ForCausalLM': 'Qwen3DecoderLayer',
+    'Qwen3MoeForCausalLM': 'Qwen3MoeDecoderLayer',
+    'Qwen3_5ForCausalLM': 'Qwen3_5DecoderLayer',
+    'Qwen3_5MoeForCausalLM': 'Qwen3_5MoeDecoderLayer',
     'BaiChuanForCausalLM': 'DecoderLayer',  # Baichuan 7B
     'BaichuanForCausalLM': 'DecoderLayer',  # Baichuan2 7B
     'LlamaForCausalLM': 'LlamaDecoderLayer',
@@ -39,6 +42,9 @@
     'QWenLMHeadModel': 'RMSNorm',
     'Qwen2ForCausalLM': 'Qwen2RMSNorm',
     'Qwen3ForCausalLM': 'Qwen3RMSNorm',
+    'Qwen3MoeForCausalLM': 'Qwen3MoeRMSNorm',
+    'Qwen3_5ForCausalLM': 'Qwen3_5RMSNorm',
+    'Qwen3_5MoeForCausalLM': 'Qwen3_5MoeRMSNorm',
     'BaiChuanForCausalLM': 'RMSNorm',  # Baichuan 7B
     'BaichuanForCausalLM': 'RMSNorm',  # Baichuan2 7B
     'LlamaForCausalLM': 'LlamaRMSNorm',
@@ -60,6 +66,9 @@
     'QWenLMHeadModel': 'lm_head',
     'Qwen2ForCausalLM': 'lm_head',
     'Qwen3ForCausalLM': 'lm_head',
+    'Qwen3MoeForCausalLM': 'lm_head',
+    'Qwen3_5ForCausalLM': 'lm_head',
+    'Qwen3_5MoeForCausalLM': 'lm_head',
     'BaiChuanForCausalLM': 'lm_head',  # Baichuan 7B
     'BaichuanForCausalLM': 'lm_head',  # Baichuan2 7B
     'LlamaForCausalLM': 'lm_head',
```
```diff
@@ -0,0 +1,3 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .mixtral import MixtralMoeMLP  # noqa: F401
+from .qwen import QwenMoeMLP  # noqa: F401
```

Collaborator:

"mlp_moe_modules" -> "moe_mlp_modules"
```diff
@@ -0,0 +1,5 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+from mmengine import Registry
+
+CONVERT_MOE_MODELS = Registry('mlp moe module', locations=['lmdeploy.lite.mlp_moe_modules.base'])
```

Suggested change:

```diff
-CONVERT_MOE_MODELS = Registry('mlp moe module', locations=['lmdeploy.lite.mlp_moe_modules.base'])
+CONVERT_MOE_MODELS = Registry('mlp moe module', locations=['lmdeploy.lite.mlp_moe_modules'])
```
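For context, looking up one of the registered expert-MLP classes from this registry would go roughly as follows; the call style follows mmengine's Registry API, and the sizes are placeholders:

```python
import lmdeploy.lite.mlp_moe_modules  # noqa: F401  # runs the register_module decorators in mixtral.py / qwen.py

from lmdeploy.lite.mlp_moe_modules.base import CONVERT_MOE_MODELS

# Fetch the class registered under 'qwen3-moe' and build an expert MLP from it.
mlp_cls = CONVERT_MOE_MODELS.get('qwen3-moe')
mlp = mlp_cls(hidden_size=2048, intermediate_size=768)
```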
New file (@@ -0,0 +1,23 @@):

```python
# Copyright (c) OpenMMLab. All rights reserved.

import torch
import torch.nn as nn

from .base import CONVERT_MOE_MODELS


@CONVERT_MOE_MODELS.register_module(name='mixtral')
class MixtralMoeMLP(nn.Module):
    """Use unfused MoE expert MLP after splitting fused expert weights."""

    def __init__(self, hidden_size, intermediate_size, dtype=None, device=None):
        super().__init__()
        self.w1 = nn.Linear(hidden_size, intermediate_size, bias=False, dtype=dtype, device=device)
        self.w3 = nn.Linear(hidden_size, intermediate_size, bias=False, dtype=dtype, device=device)
        self.w2 = nn.Linear(intermediate_size, hidden_size, bias=False, dtype=dtype, device=device)

    def load_weight(self, w1_weight: torch.Tensor, w2_weight: torch.Tensor, w3_weight: torch.Tensor):
        """Load weights for the MoE expert MLP."""
        self.w1.weight = nn.Parameter(w1_weight.detach(), requires_grad=False)
        self.w2.weight = nn.Parameter(w2_weight.detach(), requires_grad=False)
        self.w3.weight = nn.Parameter(w3_weight.detach(), requires_grad=False)
```
New file (@@ -0,0 +1,24 @@):

```python
# Copyright (c) OpenMMLab. All rights reserved.

import torch
import torch.nn as nn

from .base import CONVERT_MOE_MODELS


@CONVERT_MOE_MODELS.register_module(name='qwen3-moe')
@CONVERT_MOE_MODELS.register_module(name='qwen3_5-moe')
class QwenMoeMLP(nn.Module):
    """Use unfused MoE expert MLP after splitting fused expert weights."""

    def __init__(self, hidden_size, intermediate_size, dtype=None, device=None):
        super().__init__()
        self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False, dtype=dtype, device=device)
        self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False, dtype=dtype, device=device)
        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False, dtype=dtype, device=device)

    def load_weight(self, gate_proj_weight: torch.Tensor, down_proj_weight: torch.Tensor, up_proj_weight: torch.Tensor):
        """Load weights for the MoE expert MLP."""
        self.gate_proj.weight = nn.Parameter(gate_proj_weight.detach(), requires_grad=False)
        self.up_proj.weight = nn.Parameter(up_proj_weight.detach(), requires_grad=False)
        self.down_proj.weight = nn.Parameter(down_proj_weight.detach(), requires_grad=False)
```
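To illustrate how these unfused expert MLPs are meant to be used, here is a rough sketch of splitting a fused expert weight tensor and loading each expert; the fused [num_experts, out, in] layout and the sizes are assumptions, not taken from the PR's convert_moe_parameters:

```python
import torch

from lmdeploy.lite.mlp_moe_modules.qwen import QwenMoeMLP

hidden, inter, num_experts = 2048, 768, 4
# Assumed fused layout: all experts stacked on dim 0.
fused_gate = torch.randn(num_experts, inter, hidden, dtype=torch.bfloat16)
fused_up = torch.randn(num_experts, inter, hidden, dtype=torch.bfloat16)
fused_down = torch.randn(num_experts, hidden, inter, dtype=torch.bfloat16)

experts = []
for e in range(num_experts):
    mlp = QwenMoeMLP(hidden, inter, dtype=torch.bfloat16)
    # Note the (gate_proj, down_proj, up_proj) argument order of load_weight above.
    mlp.load_weight(fused_gate[e], fused_down[e], fused_up[e])
    experts.append(mlp)
```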
```diff
@@ -32,6 +32,10 @@
         'input_layernorm': ['self_attn.k_proj', 'self_attn.q_proj', 'self_attn.v_proj'],
         'post_attention_layernorm': ['mlp.gate_proj', 'mlp.up_proj']
     },
+    'Qwen3MoeDecoderLayer': {
+        'input_layernorm': ['self_attn.k_proj', 'self_attn.q_proj', 'self_attn.v_proj'],
+        'post_attention_layernorm': ['mlp.gate_proj', 'mlp.up_proj']
+    },
     'DecoderLayer': {
         'input_layernorm': ['self_attn.W_pack'],
         'post_attention_layernorm': ['mlp.gate_proj', 'mlp.up_proj']
@@ -121,14 +125,26 @@
     }
 }

-SKIPPED_MODULE = ['lora', 'block_sparse_moe.gate']
+SKIPPED_MODULE = ['lora', 'block_sparse_moe.gate', 'mlp.gate']


+def match_builtin_skipped_pattern(name: str, pattern: str):
+    if pattern == 'lora':
+        return pattern in name
+    return name == pattern or name.endswith(f'.{pattern}') or f'.{pattern}.' in name
+
+
-def skipped_module(name: str):
-    """Whether the module should be skipped from quantization."""
-    for m in SKIPPED_MODULE:
-        if m in name:
-            return True
+def skipped_module(name: str, extra_patterns=None):
+    """Whether the module should be skipped from quantization.
+
+    Args:
+        name: The fully-qualified module name.
+        extra_patterns: Optional iterable of additional substring patterns
+            (e.g. user-provided ``mod_skip_quant``). Merged with the
+            built-in ``SKIPPED_MODULE`` list.
+    """
+    if any(match_builtin_skipped_pattern(name, pattern) for pattern in SKIPPED_MODULE):
+        return True
+    if extra_patterns and any(pattern in name for pattern in extra_patterns):
+        return True
     return False


@@ -294,7 +310,7 @@ def check_awq_supported(layer_type):
         raise NotImplementedError


-def quant_weights(model, fcs, bits, symmetry, group_size=-1, device='cuda'):
+def quant_weights(model, fcs, bits, symmetry, group_size=-1, device='cuda', mod_skip_quant=None):
     """Quantize the weights of the target model's linear layers."""
     from lmdeploy.lite.quantization import WeightQuantizer
     from lmdeploy.lite.quantization.modules import WeightOnlyQLinear

@@ -304,7 +320,7 @@ def quant_weights(model, fcs, bits, symmetry, group_size=-1, device='cuda'):
         parent_name, _, child_name = name.rpartition('.')
         parent = model.get_submodule(parent_name)
         pack_or_skip = 'packed'
-        if skipped_module(name):
+        if skipped_module(name, mod_skip_quant):
             q_linear = fc
             pack_or_skip = 'skipped'
         else:
```
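A quick illustration of why the segment-aware matching matters, assuming the skipped_module shown above; the module names are illustrative:

```python
# 'mlp.gate' should skip the MoE router gate but not the dense gate_proj weight.
assert skipped_module('model.layers.0.mlp.gate') is True
assert skipped_module('model.layers.0.mlp.gate_proj') is False  # plain substring matching would skip this too

# Patterns from --mod-skip-quant are merged in as extra substring patterns.
assert skipped_module('model.layers.0.self_attn.q_proj', ['self_attn']) is True
```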
Review comment:

It's unnecessary to define another option. We can use calib_samples=0 to indicate data-free quantization.
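A sketch of what that alternative could look like in auto_awq, assuming calibrate() and this PR's load_model() both return a (vl_model, model, tokenizer, work_dir) tuple; the wiring is an illustration of the reviewer's proposal, not code from the PR:

```python
from lmdeploy.lite.apis.calibrate import calibrate


def auto_awq(model: str, work_dir: str = './work_dir', calib_samples: int = 128, **calib_kwargs):
    if calib_samples > 0:
        # Normal path: run calibration on calib_samples samples first.
        vl_model, model, tokenizer, work_dir = calibrate(model, work_dir=work_dir,
                                                         calib_samples=calib_samples, **calib_kwargs)
    else:
        # calib_samples == 0 signals data-free quantization: load weights and
        # quantize directly, with no extra --no-calib-ds-req flag needed.
        vl_model, model, tokenizer, work_dir = load_model(model, work_dir=work_dir)
    ...
```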