diff --git a/.gitignore b/.gitignore index 3e9609afe1..5a8a9866a4 100644 --- a/.gitignore +++ b/.gitignore @@ -45,6 +45,7 @@ htmlcov/ .coverage.* .cache *build*/ +!lmdeploy/turbomind/builders !builder/ lmdeploy/lib/ lmdeploy/bin/ @@ -83,3 +84,6 @@ work_dir*/ !CMakeLists.txt proxy_config.yml + +# Claude Code local config +CLAUDE.local.md diff --git a/lmdeploy/turbomind/builders/__init__.py b/lmdeploy/turbomind/builders/__init__.py new file mode 100644 index 0000000000..922386d9c4 --- /dev/null +++ b/lmdeploy/turbomind/builders/__init__.py @@ -0,0 +1,31 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""Builder sub-package — spec-driven module loading for TurboMind.""" +from __future__ import annotations + +from ._base import Builder, BuiltModule, SplitSide, _act_type_id, _cpp_dtype, _torch_dtype_to_cpp +from .attention import AttentionBuilder +from .decoder_layer import DecoderLayerBuilder, DecoderLayerConfig +from .deltanet import DeltaNetBuilder +from .ffn import FfnBuilder, fuse_w1w3 +from .mla import MLABuilder +from .module_list import ModuleListBuilder, ModuleListConfig +from .moe import MoeBuilder +from .norm import NormBuilder, make_norm_config +from .text_model import TextModelBuilder + +__all__ = [ + # Base + 'Builder', 'BuiltModule', 'TextModelBuilder', 'SplitSide', + '_cpp_dtype', '_act_type_id', '_torch_dtype_to_cpp', + # Builders + 'AttentionBuilder', 'FfnBuilder', 'MoeBuilder', + 'DeltaNetBuilder', 'MLABuilder', + 'DecoderLayerBuilder', 'ModuleListBuilder', + 'NormBuilder', + # Primitive config wrappers + 'make_norm_config', + # C++ config re-exports + 'DecoderLayerConfig', 'ModuleListConfig', + # Helper functions + 'fuse_w1w3', +] diff --git a/lmdeploy/turbomind/builders/_base.py b/lmdeploy/turbomind/builders/_base.py new file mode 100644 index 0000000000..f2c8d08d5a --- /dev/null +++ b/lmdeploy/turbomind/builders/_base.py @@ -0,0 +1,425 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +import enum + +import _turbomind as _tm +import torch + +from ..linear import Linear + +# --------------------------------------------------------------------------- +# SplitSide enum (internal -- not exposed to specs) +# --------------------------------------------------------------------------- + + +class SplitSide(enum.Enum): + """Semantic TP split direction for commit operations. + + OUTPUT -- column-parallel: split along the output dimension (axis -1) + INPUT -- row-parallel: split along the input dimension (axis 0) + """ + + OUTPUT = 'output' + INPUT = 'input' + + +# --------------------------------------------------------------------------- +# Canonical dtype mappings (moved from commit.py) +# --------------------------------------------------------------------------- + +_STR_TO_DTYPE: dict[str, _tm.DataType] = { + 'float32': _tm.DataType.TYPE_FP32, + 'float16': _tm.DataType.TYPE_FP16, + 'bfloat16': _tm.DataType.TYPE_BF16, +} + +_TORCH_TO_CPP: dict[torch.dtype, _tm.DataType] = { + torch.float32: _tm.DataType.TYPE_FP32, + torch.float16: _tm.DataType.TYPE_FP16, + torch.bfloat16: _tm.DataType.TYPE_BF16, + torch.int32: _tm.DataType.TYPE_INT32, + torch.int64: _tm.DataType.TYPE_INT64, + torch.int8: _tm.DataType.TYPE_INT8, + torch.uint8: _tm.DataType.TYPE_UINT8, +} + +_CPP_TO_TORCH: dict[_tm.DataType, torch.dtype] = {v: k for k, v in _TORCH_TO_CPP.items()} + +_SPLIT_SIDE_TO_DIM: dict[SplitSide, int] = {SplitSide.OUTPUT: -1, SplitSide.INPUT: 0} + + +# --------------------------------------------------------------------------- +# Dtype / format helpers (moved from commit.py) +# --------------------------------------------------------------------------- + + +def _cpp_dtype(dtype_str: str): + """Convert a model-config data_type string to C++ DataType enum.""" + return _STR_TO_DTYPE[dtype_str] + + +def _act_type_id(act_str: str) -> int: + """Convert activation_type string to C++ ActivationType enum value.""" + return {'silu': 0, 'gpt-oss': 1}.get(act_str, 0) + + +def _torch_dtype_to_cpp(dtype: torch.dtype): + """Convert a torch dtype to the C++ ``DataType`` enum, or ``None``.""" + return _TORCH_TO_CPP.get(dtype) + + +def _cast_shard_for_tm(shard: torch.Tensor, tm_tensor) -> torch.Tensor: + """Cast *shard* dtype to match *tm_tensor*'s C++ dtype when needed.""" + if tm_tensor.type == _tm.DataType.TYPE_FP32 and shard.dtype in (torch.float16, torch.bfloat16): + return shard.float() + if tm_tensor.type == _tm.DataType.TYPE_FP16 and shard.dtype != torch.float16: + return shard.half() + if tm_tensor.type == _tm.DataType.TYPE_BF16 and shard.dtype != torch.bfloat16: + return shard.to(torch.bfloat16) + return shard + + + +def _copy_shard_to_param(handle, param_name: str, shard: torch.Tensor, *, + alloc_shape: list[int] | None = None, + alloc_dtype=None) -> None: + """Move shard to GPU, allocate the C++ param slot, cast, and copy. + + Invariant: ``dst.byte_size == shard.nbytes`` after the cast. Upstream + is responsible for any padding/reshape needed to satisfy this. A + mismatch raises immediately. + + ``alloc_shape`` / ``alloc_dtype`` default to the shard's own shape / + dtype. Override only to express shape/dtype *relabels* where byte + size is preserved (e.g. quantized weight: physical int32 + [in, out/8] stored in a logical UINT4 [in, out] C++ slot). + """ + if not shard.is_cuda: + shard = shard.cuda(0).contiguous() + elif not shard.is_contiguous(): + shard = shard.contiguous() + + if alloc_shape is None: + alloc_shape = list(shard.shape) + if alloc_dtype is None: + alloc_dtype = _torch_dtype_to_cpp(shard.dtype) + + dst = handle.param(param_name).alloc(alloc_shape, alloc_dtype) + shard = _cast_shard_for_tm(shard, dst) + assert dst.byte_size == shard.nbytes, ( + f'{param_name}: alloc byte_size={dst.byte_size} != ' + f'shard.nbytes={shard.nbytes}') + dst.copy_from(shard) + + +def _shard(tensor: torch.Tensor, split_dim: int | None, tp: int, + rank: int) -> torch.Tensor: + """Return the ``rank``-th split along ``split_dim``, or the tensor + unchanged. + + Used wherever a TP shard is selected from a broadcast-by-default + tensor. A ``split_dim`` of ``None`` or ``tp <= 1`` returns the tensor + untouched. + """ + if split_dim is None or tp <= 1: + return tensor + return tensor.split(tensor.shape[split_dim] // tp, dim=split_dim)[rank] + + +# --------------------------------------------------------------------------- +# Builder base class +# --------------------------------------------------------------------------- + + +class BuiltModule: + """Opaque handle bundle returned by ``Builder.build()``. + + Wraps a list of per-GPU C++ module handles. Iteration and len delegate + to the underlying list so callers can ``zip(BuiltModule, contexts)`` etc. + """ + + __slots__ = ('handles',) + + def __init__(self, handles): + self.handles = handles + + def __iter__(self): + return iter(self.handles) + + def __len__(self): + return len(self.handles) + + +class Context: + """Bundle of per-GPU contexts and the model compute dtype.""" + def __init__(self, devices, data_type): + self.devices = devices + self.data_type = data_type + + +class ParallelGroup: + """Bundle a parallelism size with per-GPU TP ranks.""" + def __init__(self, size, ranks): + self.size = size + self.ranks = ranks + + +class Builder: + """Wraps N GPU handles for a single logical module. + + Distributes module creation, child binding, and weight commits + across all GPUs with bound TP configuration. + + Subclasses specialize for particular module types (e.g. attention, + FFN, MoE). + + Lifecycle: stage commits -> build() -> BuiltModule (frozen). + After ``build()`` the Builder is inert — further commits or child + attachments raise. + """ + + def __init__(self, config, ctx): + """Initialise the builder with staging dicts. + + Parameters + ---------- + config : C++ config struct + Config with ``clone()`` method. + ctx : Context + Per-GPU context handles + model compute dtype. + """ + # `_built` must be set first: __setattr__ reads it inside the + # BuiltModule branch. Bool is not a BuiltModule, so the normal + # fall-through assigns it via object.__setattr__ at the end of + # __setattr__. + self._built = False + self._ctx = ctx + self.tp = ParallelGroup(1, None) # default: no TP + self.config = config + if hasattr(self.config, 'data_type'): + self.config.data_type = ctx.data_type + self._pending_tensors = {} + self._pending_children = {} + self._handles = None + + # ------------------------------------------------------------------ + # Child binding via attribute assignment + # ------------------------------------------------------------------ + + def __setattr__(self, name: str, value): + if isinstance(value, Builder): + raise TypeError( + f'{type(self).__name__}.{name}: assign .build() output ' + f'(BuiltModule), not the Builder itself') + if isinstance(value, BuiltModule): + if self._built: + raise RuntimeError( + f'{type(self).__name__} is built; ' + f'cannot assign {name!r}') + self._add_child(name, value.handles) + return + object.__setattr__(self, name, value) + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + @property + def tp_size(self): + return self.tp.size + + def _rank_for(self, gpu_idx: int) -> int: + if self.tp.ranks and self.tp.size > 1: + return self.tp.ranks[gpu_idx] + return 0 + + # ------------------------------------------------------------------ + # Add methods — stage into pending dicts (pre-build only) + # ------------------------------------------------------------------ + + def _add_linear(self, name: str, linear: Linear, + split_side: SplitSide | None = None): + """Create standalone LinearWeight modules and copy tensor data. + + Creates per-GPU LinearWeight modules via ``_tm.create_module`` + at commit time. Attachment to the parent module is deferred to + ``build()`` via ``_commit_child``. + """ + assert not self._built, ( + f"{type(self).__name__} is built; commit '{name}' rejected") + + w = linear.tensors.get('weight') + if w is None: + return + + # --- GPU-invariant preparation ------------------------------------- + fmt = linear.weight_format + + tp = self.tp.size if split_side else 1 + split_dim = _SPLIT_SIDE_TO_DIM.get(split_side) if split_side else None + + in_dim, out_dim = w.shape[0], w.shape[-1] + if split_side == SplitSide.OUTPUT: + out_dim //= tp + elif split_side == SplitSide.INPUT: + in_dim //= tp + + compute_dtype = self.config.data_type + lin_cfg = _tm.LinearConfig() + lin_cfg.input_dim = in_dim + lin_cfg.output_dim = out_dim + lin_cfg.data_type = compute_dtype or _tm.DataType.TYPE_INVALID + lin_cfg.format = linear.weight_format.make_data_format(compute_dtype) + lin_cfg.has_bias = 'bias' in linear.tensors + + packed = {k: fmt.pack(t, k) for k, t in linear.tensors.items()} + tensors = {k: p.tensor for k, p in packed.items()} + + kind_split_dims = { + kind: None if (kind == 'bias' and split_side == SplitSide.INPUT) + else split_dim + for kind in tensors + } + + if tp > 1 and split_dim is not None: + for kind, tensor in tensors.items(): + kind_split_dim = kind_split_dims[kind] + if kind_split_dim is not None: + d = tensor.shape[kind_split_dim] + assert d % tp == 0, ( + f'TP split: {name}.{kind} dim {kind_split_dim} ' + f'has size {d}, not divisible by tp={tp}.') + + # --- Per-GPU: standalone creation + tensor copy -------------------- + handles = [] + for i, ctx in enumerate(self._ctx.devices): + with ctx: + rank = self._rank_for(i) if tp > 1 else 0 + + mod = _tm.create_module(lin_cfg) + + for kind, tensor in tensors.items(): + shard = _shard(tensor, kind_split_dims[kind], tp, rank) + + alloc_shape, alloc_dtype = packed[kind].alloc_shape, \ + packed[kind].alloc_dtype + if alloc_shape is not None and split_dim is not None \ + and tp > 1: + alloc_shape = list(alloc_shape) + alloc_shape[split_dim] //= tp + if alloc_dtype is None and kind == 'weight': + alloc_dtype = self.config.data_type + + _copy_shard_to_param(mod, kind, shard, + alloc_shape=alloc_shape, + alloc_dtype=alloc_dtype) + + handles.append(mod) + + self._add_child(name, handles) + + def _add_tensor(self, name: str, tensor: torch.Tensor | None, + split_side: SplitSide | None = None): + """Stage a raw-tensor commit under ``name``. + + Applied during + ``build()`` in ``_commit_tensor``. + """ + assert not self._built, ( + f"{type(self).__name__} is built; commit '{name}' rejected") + if tensor is not None: + self._pending_tensors[name] = (tensor, split_side) + + # ------------------------------------------------------------------ + # Add helpers + # ------------------------------------------------------------------ + + def _add_child(self, name: str, handles: list): + """Stage pre-created per-GPU ``Module*`` handles under ``name``. + + Applied during ``build()`` in ``_commit_child``. + """ + assert not self._built, ( + f"{type(self).__name__} is built; commit '{name}' rejected") + assert name not in self._pending_children, ( + f"{type(self).__name__}: duplicate child commit '{name}'") + self._pending_children[name] = handles + + # ------------------------------------------------------------------ + # build() — create handles, drain staged state, return BuiltModule + # ------------------------------------------------------------------ + + def build(self) -> BuiltModule: + """Create C++ module handles and drain all staged state. + + Idempotent on second call — returns the same ``BuiltModule``. + """ + if self._built: + return BuiltModule(self._handles) + + self._create_handles() + + # True is not BuiltModule; falls through to plain assignment. + self._built = True + + # Drain staged children (linear weights + sub-builder output) + for name, handles in self._pending_children.items(): + self._commit_child(name, handles) + + # Drain staged tensors + for name, (tensor, split_side) in self._pending_tensors.items(): + self._commit_tensor(name, tensor, split_side) + + return BuiltModule(self._handles) + + def _create_handles(self): + """Create one C++ module per context via ``_tm.create_module(cfg)``.""" + handles = [] + for i, ctx in enumerate(self._ctx.devices): + with ctx: + cfg = self._cfg_for_rank(i) + handle = _tm.create_module(cfg) + handles.append(handle) + self._handles = handles + + def _cfg_for_rank(self, gpu_idx: int): + """Clone config and set tp_rank if tp > 1.""" + if self.tp.size > 1 and hasattr(self.config, 'tp_rank'): + cfg = self.config.clone() + cfg.tp_rank = self.tp.ranks[gpu_idx] + return cfg + return self.config + + def _commit_child(self, name: str, handles: list): + """Attach pre-created per-GPU child handles to parent handles.""" + for i, (parent_h, child_h) in enumerate( + zip(self._handles, handles)): + with self._ctx.devices[i]: + parent_h.add_child_raw(name, child_h) + + # ------------------------------------------------------------------ + # Commit methods — drain pending dicts to C++ modules + # ------------------------------------------------------------------ + + def _commit_tensor(self, name: str, tensor: torch.Tensor, + split_side: SplitSide | None = None): + """Commit a raw tensor to a named parameter on all GPUs. + + Parameters + ---------- + name : str + Parameter name within the module. + tensor : torch.Tensor + The tensor data. + split_side : SplitSide | None + TP split semantics. ``None`` means broadcast. + """ + tp = self.tp.size if split_side else 1 + split_dim = _SPLIT_SIDE_TO_DIM.get(split_side) if split_side else None + + for i, handle in enumerate(self._handles): + with self._ctx.devices[i]: + rank = self._rank_for(i) if tp > 1 else 0 + shard = _shard(tensor, split_dim, tp, rank) + _copy_shard_to_param(handle, name, shard, + alloc_dtype=None) diff --git a/lmdeploy/turbomind/builders/attention.py b/lmdeploy/turbomind/builders/attention.py new file mode 100644 index 0000000000..91f38821fe --- /dev/null +++ b/lmdeploy/turbomind/builders/attention.py @@ -0,0 +1,120 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""Attention weight loading builder and QKV fusion pipeline. + +Provides ``AttentionBuilder`` for committing attention weights (QKV fusion, +O-proj, QK-norm, direct params) and pipeline functions (``dequant_mixed``, +``repeat_kv_for_tp``, ``split_output_gate``, ``fuse_qkv``) for fusing Q/K/V +Linear bundles into a single interleaved w_qkv with KV head padding and +output-gate splitting. +""" +from __future__ import annotations + +import torch + +from ..linear import Linear, dequant_mixed, transform_output_dim +from ._base import Builder, ParallelGroup, SplitSide + +# --------------------------------------------------------------------------- +# New pipeline functions (replacing merge_qkv_linear) +# --------------------------------------------------------------------------- + + +def _infer_heads(linear: Linear, head_dim: int) -> int: + """Derive head count from the weight tensor's output dimension.""" + w = linear.tensors.get('weight') + if w is None: + return 0 + return w.size(-1) // head_dim + + +@transform_output_dim +def _repeat_kv_heads(tensor: torch.Tensor, *, tp: int, + heads: int) -> torch.Tensor: + """Repeat KV heads to reach a TP-divisible count.""" + if heads % tp == 0: + return tensor + target_heads = ((heads + tp - 1) // tp) * tp + assert target_heads % heads == 0, ( + f'target_heads={target_heads} must be divisible by heads={heads}') + n_repeat = target_heads // heads + per_head = tensor.size(-1) // heads + t = tensor.view(tensor.size(0), heads, per_head) + return t.repeat(1, n_repeat, 1).reshape(tensor.size(0), target_heads * per_head) + + +def repeat_kv_for_tp(k: Linear, v: Linear, *, + tp: int, head_dim: int) -> tuple[Linear, Linear]: + """Repeat KV heads to reach a TP-divisible count.""" + k = _repeat_kv_heads(k, tp=tp, heads=_infer_heads(k, head_dim)) + v = _repeat_kv_heads(v, tp=tp, heads=_infer_heads(v, head_dim)) + return k, v + + +@transform_output_dim +def split_output_gate(tensor: torch.Tensor, *, head_num: int + ) -> tuple[torch.Tensor, torch.Tensor]: + """Split output gate from Q projection (Qwen3.5). + + Q's output dim is 2 * head_num * head_dim. Reshape to [batch, head_num, 2, head_dim], split into q_real and gate. + """ + per_head = tensor.size(-1) // (head_num * 2) + q, gate = tensor.view(-1, head_num, 2, per_head).unbind(2) + return q.reshape(-1, head_num * per_head), gate.reshape(-1, head_num * per_head) + + +@transform_output_dim +def fuse_qkv(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, + *, tp: int, gate: torch.Tensor | None = None) -> torch.Tensor: + """Fuse Q, K, V (and optionally gate) into a single w_qkv Linear. + + Concatenates output channels with TP interleaving. + Layout per tp-shard: [Q | K | V] or [Q | K | V | Gate]. + """ + tensors = [t for t in (q, k, v, gate) if t is not None] + parts = [t.view(t.size(0), tp, -1) for t in tensors] + merged = torch.cat(parts, dim=-1) + return merged.view(-1, merged.size(-1) * tp) + + +# --------------------------------------------------------------------------- +# AttentionBuilder +# --------------------------------------------------------------------------- + + +class AttentionBuilder(Builder): + """Attention weight loading builder.""" + + _PARAM_TP_RULES: dict[str, SplitSide] = { + 'sinks': SplitSide.OUTPUT, + } + + def __init__(self, config, ctx, tp: ParallelGroup): + super().__init__(config, ctx) + self.tp = tp + self.config.tp_size = tp.size + + def add_qkv_proj(self, q, k, v, *, gate=None): + """Fuse Q/K/V into a single w_qkv with TP interleave, commit. + + Pipeline: dequant_mixed -> repeat_kv_for_tp -> fuse_qkv -> commit. + """ + q, k, v, gate = dequant_mixed(q, k, v, gate, data_type=self.config.data_type) + k, v = repeat_kv_for_tp(k, v, tp=self.tp.size, + head_dim=self.config.head_dim) + # After KV head repeat, push the padded-global kv_head_num onto + # config so that C++ module creation sees the correct head count. + self.config.kv_head_num = _infer_heads(k, self.config.head_dim) + merged = fuse_qkv(q, k, v, tp=self.tp.size, gate=gate) + self._add_linear('w_qkv', merged, SplitSide.OUTPUT) + + def add_o_proj(self, o): + """Shard along input dim, commit.""" + self._add_linear('wo', o, SplitSide.INPUT) + + def add_param(self, name, tensor): + """Commit a direct parameter. + + Builder determines split side. + """ + split_side = self._PARAM_TP_RULES.get(name) + self._add_tensor(name, tensor, split_side) diff --git a/lmdeploy/turbomind/builders/decoder_layer.py b/lmdeploy/turbomind/builders/decoder_layer.py new file mode 100644 index 0000000000..9d3ab8fc69 --- /dev/null +++ b/lmdeploy/turbomind/builders/decoder_layer.py @@ -0,0 +1,11 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import _turbomind as _tm + +from ._base import Builder + +DecoderLayerConfig = _tm.DecoderLayerConfig + + +class DecoderLayerBuilder(Builder): + """Pure container builder for decoder layers.""" + pass diff --git a/lmdeploy/turbomind/builders/deltanet.py b/lmdeploy/turbomind/builders/deltanet.py new file mode 100644 index 0000000000..2e1e5ad63a --- /dev/null +++ b/lmdeploy/turbomind/builders/deltanet.py @@ -0,0 +1,146 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""DeltaNet weight loading builder and GDN input-projection fusion helpers. + +Provides ``DeltaNetBuilder`` for committing DeltaNet weights (GDN input +projections, scalar params, conv1d) and helper functions ``split_qkv`` and +``fuse_gdn`` for merging in_proj_qkv/z/b/a into a single ``in_proj_all`` +with TP interleaving. +""" +from __future__ import annotations + +import torch + +from ..linear import Linear, concat_out_dim, dequant_mixed +from ._base import Builder, ParallelGroup, SplitSide + + +def tp_interleave_tensor(t: torch.Tensor, tp: int, d: int) -> torch.Tensor: + """Reshape dim *d* as [tp, per_tp] for TP-rank interleaving.""" + shape = list(t.shape) + return t.reshape(shape[:d] + [tp, shape[d] // tp] + shape[d + 1:]) + + +def split_qkv(linear: Linear, + qkv_split: tuple[int, int, int]) -> tuple[Linear, Linear, Linear]: + """Split combined QKV linear into Q, K, V linears along output dim.""" + wfmt = linear.weight_format + block_out = (wfmt.block_out or 0) if wfmt is not None else 0 + new_linears = [] + offset = 0 + for dim in qkv_split: + tensors = {} + for kind, t in linear.tensors.items(): + out_dim = t.dim() - 1 + if kind in ('scales', 'zeros') and block_out > 0: + block_offset = offset // block_out + block_len = dim // block_out + tensors[kind] = t.narrow(out_dim, block_offset, block_len).contiguous() + else: + tensors[kind] = t.narrow(out_dim, offset, dim).contiguous() + new_linears.append(Linear(tensors=tensors, + weight_format=linear.weight_format)) + offset += dim + return tuple(new_linears) + + +def fuse_gdn(q: Linear, k: Linear, v: Linear, + z: Linear, b: Linear, a: Linear, *, + tp: int) -> Linear: + """Fuse GDN input projections with TP interleaving. + + Layout per tp-shard: [Q | K | V | Z | B | A]. + For tp=1 reduces to simple concat along output dim. + """ + components = [q, k, v, z, b, a] + + if tp <= 1: + return concat_out_dim(components) + + first = components[0] + fused_tensors: dict[str, torch.Tensor] = {} + for kind in first.tensors: + parts = [] + all_1d = True + d = -1 + for lin in components: + t = lin.tensors.get(kind) + if t is None: + continue + if t.dim() > 1: + this_d = t.dim() - 1 + if d >= 0 and this_d != d: + raise ValueError( + f'Inconsistent tensor dims for kind={kind}: ' + f'{this_d} vs {d}') + d = this_d + all_1d = False + parts.append(tp_interleave_tensor(t, tp, d)) + else: + # 1-D tensors (bias): simple concat + parts.append(t) + if not parts: + continue + if all_1d: + fused_tensors[kind] = torch.cat(parts, dim=0) + else: + fused = torch.cat(parts, dim=d + 1) + shape = list(fused.shape) + final = shape[:d] + [shape[d] * shape[d + 1]] + shape[d + 2:] + fused_tensors[kind] = fused.reshape(final) + + return Linear(tensors=fused_tensors, weight_format=first.weight_format) + + +def fuse_qkv_conv1d(t: torch.Tensor, qkv_split: tuple[int, int, int], + tp: int) -> torch.Tensor: + """Split conv1d into Q/K/V parts, TP-interleave each, concatenate back.""" + q_dim, k_dim, _ = qkv_split + d_conv = t.shape[0] + q_part = tp_interleave_tensor(t[:, :q_dim], tp, 1) + k_part = tp_interleave_tensor(t[:, q_dim:q_dim + k_dim], tp, 1) + v_part = tp_interleave_tensor(t[:, q_dim + k_dim:], tp, 1) + return torch.cat([q_part, k_part, v_part], dim=2).reshape(d_conv, -1).contiguous() + + +# --------------------------------------------------------------------------- +# DeltaNetBuilder -- Gated Delta Net input projections, scalar params, conv1d +# --------------------------------------------------------------------------- + + +class DeltaNetBuilder(Builder): + """DeltaNet (Gated Delta Net) weight loading builder.""" + + def __init__(self, config, ctx, tp: ParallelGroup): + super().__init__(config, ctx) + self.tp = tp + self.config.tp_size = tp.size + + def add_input_projections(self, *, in_proj_qkv, in_proj_z=None, + in_proj_b=None, in_proj_a=None, out_proj=None, + qkv_split): + """Fuse GDN input projections via pipeline, commit all linears. + + Pipeline: split_qkv -> dequant_mixed -> fuse_gdn -> commit. + """ + q, k, v = split_qkv(in_proj_qkv, qkv_split) + q, k, v, z, b, a = dequant_mixed(q, k, v, in_proj_z, in_proj_b, in_proj_a, + data_type=self.config.data_type) + fused = fuse_gdn(q, k, v, z, b, a, tp=self.tp.size) + self._add_linear('in_proj_all', fused, SplitSide.OUTPUT) + if out_proj is not None: + self._add_linear('out_proj', out_proj, SplitSide.INPUT) + + def add_scalar_params(self, a_log=None, dt_bias=None): + """Commit A_log and dt_bias as OUTPUT-split tensors.""" + if a_log is not None: + self._add_tensor('A_log', a_log, split_side=SplitSide.OUTPUT) + if dt_bias is not None: + self._add_tensor('dt_bias', dt_bias, split_side=SplitSide.OUTPUT) + + def add_conv1d(self, conv1d, qkv_split): + """Transpose HF layout to TM layout, TP-interleave Q/K/V, commit.""" + if conv1d.ndim == 3 and conv1d.shape[1] == 1: + conv1d = conv1d.squeeze(1) + conv1d = conv1d.t().contiguous() + conv1d = fuse_qkv_conv1d(conv1d, qkv_split, self.tp.size) + self._add_tensor('conv1d', conv1d, split_side=SplitSide.OUTPUT) diff --git a/lmdeploy/turbomind/builders/ffn.py b/lmdeploy/turbomind/builders/ffn.py new file mode 100644 index 0000000000..90f6f46104 --- /dev/null +++ b/lmdeploy/turbomind/builders/ffn.py @@ -0,0 +1,201 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""FFN weight loading builder and w1+w3 fusion helpers. + +Provides ``FfnBuilder`` for committing FFN weights (w1/w2/w3 with optional +w1+w3 fusion) and helper functions for determining whether SiLU fusion +(interleave vs chunk) should be used and whether w1+w3 fusion is safe for +the given TP configuration. +""" +from __future__ import annotations + +import math + +import torch + +from ..linear import (Linear, round_up_input_groups, round_up_output_groups, + transform_output_dim) +from ._base import Builder, ParallelGroup, SplitSide + +__all__ = [ + 'FfnBuilder', + 'fuse_w1w3', +] + +# --------------------------------------------------------------------------- +# @transform_output_dim / @transform_input_dim helpers +# --------------------------------------------------------------------------- + + +@transform_output_dim +def _interleave_w1w3(w1: torch.Tensor, w3: torch.Tensor) -> torch.Tensor: + """Interleave w1 and w3 along output dim for fused SiLU epilogue.""" + return torch.stack([w1, w3], dim=-1).reshape(w1.shape[:-1] + (-1,)).contiguous() + + +@transform_output_dim +def _chunk_w1w3(w1: torch.Tensor, w3: torch.Tensor, *, + tp: int) -> torch.Tensor: + """Concatenate w1 and w3 along output dim with TP interleaving.""" + if tp <= 1: + return torch.cat([w1, w3], dim=-1).contiguous() + d = w1.dim() - 1 + r1 = w1.reshape(w1.shape[:d] + (tp, w1.shape[d] // tp)) + r3 = w3.reshape(w3.shape[:d] + (tp, w3.shape[d] // tp)) + combined = torch.cat([r1, r3], dim=d + 1) + return combined.reshape(w1.shape[:d] + (-1,)).contiguous() + + +# --------------------------------------------------------------------------- +# FFN fusion helpers +# --------------------------------------------------------------------------- + + +def _should_fuse_silu(w1_linear: Linear, act_type: str, is_moe: bool = False) -> bool: + """Determine if fused SiLU (interleave) should be used for w1+w3 fusion. + + Gold standard condition (from GEMM kernel constraints — trust it): + + act_type == SiLU && (int4 || mxfp4 || fp8 || moe) && !(fp8 && SM90) + """ + if act_type not in ('', 'silu', 'SiLU'): + return False + + # Dense bf16/fp16 without MoE -> chunk, not interleave + weight = w1_linear.tensors.get('weight') + is_quantized = weight is not None and weight.element_size() < 2 + if not is_quantized and not is_moe: + return False + + # FP8 on SM90 -> chunk + fmt = w1_linear.weight_format + if fmt is not None and fmt.name == 'fp8': + if torch.cuda.is_available(): + cap = torch.cuda.get_device_capability() + if cap == (9, 0): + return False + + return True + + +def _can_fuse_w1w3(w1: Linear, tp: int) -> bool: + """Check whether w1+w3 fusion is safe for the given TP. + + Fusion (interleave or chunk) concatenates w1 and w3 along the output dim. + For block-quantized formats (e.g. FP8 with block_out=128), the fused + scale count ``2 * cdiv(N/tp, block_out)`` must equal + ``cdiv(2*N/tp, block_out)``. This holds iff ``(N/tp) % block_out == 0``. + When it doesn't, the fused module's C++ allocation won't match the + concatenated scales and we must commit w1/w3 separately. + """ + if tp <= 1: + return True + fmt = w1.weight_format + if fmt is None or fmt.block_out is None: + return True + w = w1.tensors.get('weight') + if w is None: + return True + return (w.size(-1) // tp) % fmt.block_out == 0 + + +def fuse_w1w3( + w1: Linear, + w3: Linear, + tp: int, + act_type: str, + is_moe: bool = False, +) -> tuple[Linear | None, bool]: + """Optionally fuse w1/w3 on full (unsharded) tensors for FFN. + + Returns (fused_w1w3_or_none, fused_silu). + When fusion is possible, fused_w1w3 is set. + When block-scale boundaries prevent fusion, returns (None, fused_silu). + + TP sharding is NOT done here — the caller's commit path handles it + via split_side=SplitSide.OUTPUT. ``tp`` is only used for the + block-scale alignment check in ``_can_fuse_w1w3``. + """ + fused_silu = _should_fuse_silu(w1, act_type, is_moe) + can_fuse = _can_fuse_w1w3(w1, tp) + + if can_fuse: + if fused_silu: + w1w3 = _interleave_w1w3(w1, w3) + else: + w1w3 = _chunk_w1w3(w1, w3, tp=tp) + return (w1w3, fused_silu) + else: + return (None, fused_silu) + + +# --------------------------------------------------------------------------- +# TP padding +# --------------------------------------------------------------------------- + +# Minimum CTA_K across all registered grouped-GEMM kernels (SM75–SM90). +# Included in effective_block via lcm so the padded intermediate is always +# GEMM-aligned. +_GEMM_K_ALIGN = 32 + + +def _pad_ffn_for_tp(w1: Linear, w2: Linear, w3: Linear, + tp: int) -> tuple[Linear, Linear, Linear]: + """Pad w1/w3 output dim and w2 input dim for TP sharding.""" + raw_inter = w1.tensors['weight'].size(-1) + + if tp <= 1: + return w1, w2, w3 + + fmt = w1.weight_format + effective_block = math.lcm(fmt.block_in or 1, fmt.block_out or 1, + _GEMM_K_ALIGN) + + groups = raw_inter // effective_block + w1 = round_up_output_groups(w1, groups, tp) + w3 = round_up_output_groups(w3, groups, tp) + w2 = round_up_input_groups(w2, groups, tp) + return w1, w2, w3 + + +# --------------------------------------------------------------------------- +# FfnBuilder -- w1+w3 fusion, w2 commit +# --------------------------------------------------------------------------- + + +class FfnBuilder(Builder): + """FFN weight loading builder with w1+w3 fusion.""" + + def __init__(self, config, ctx, tp: ParallelGroup): + super().__init__(config, ctx) + self.tp = tp + self.config.tp_size = tp.size + + def add_ffn(self, w1, w2, w3): + """Pad weights for TP alignment, fuse w1+w3 if possible, then shard and + commit. + + The fusion result determines ``fuse_silu`` on the C++ module config. + Updating ``self.config.fuse_silu`` **before** any ``_add_linear`` + call ensures the C++ module is lazily created with the correct flag. + """ + # Pad weights for TP alignment before any fusion or sharding. + # After padding, push the padded-global inter_size onto config so + # that C++ module creation sees the correct dimension. + w1, w2, w3 = _pad_ffn_for_tp(w1, w2, w3, self.tp.size) + self.config.inter_size = w1.tensors['weight'].size(-1) + + act_type = getattr(self.config, 'act_type', 0) + if isinstance(act_type, int): + act_type = {0: 'silu', 1: 'gpt-oss'}.get(act_type, 'silu') + fused, fused_silu = fuse_w1w3( + w1, w3, self.tp.size, act_type, + is_moe=self.config.is_expert) + + self.config.fuse_silu = fused_silu + + if fused is not None: + self._add_linear('w1w3', fused, SplitSide.OUTPUT) + else: + self._add_linear('w1', w1, SplitSide.OUTPUT) + self._add_linear('w3', w3, SplitSide.OUTPUT) + self._add_linear('w2', w2, SplitSide.INPUT) diff --git a/lmdeploy/turbomind/builders/mla.py b/lmdeploy/turbomind/builders/mla.py new file mode 100644 index 0000000000..738596c7b0 --- /dev/null +++ b/lmdeploy/turbomind/builders/mla.py @@ -0,0 +1,89 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from __future__ import annotations + +import torch + +from ..linear import Linear +from ._base import Builder, ParallelGroup, SplitSide + +# --------------------------------------------------------------------------- +# MLA fold+pad pipeline (standalone functions) +# --------------------------------------------------------------------------- + + +def fold_kv_b(q_b: Linear, kv_b: Linear, wo: Linear, *, + cfg) -> tuple[Linear, Linear]: + """Fold kv_b into q_b and wo. Returns (q_b_folded, wo_folded). + + Splits kv_b into key-compressed (kc) and value-compressed (vc) parts. Folds kc into q_b via matmul (q_nope @ kc^T + per head). Folds vc into wo via matmul (vc @ wo per head). All arithmetic in TM layout [in, out]. + """ + H = cfg.head_num + P = cfg.qk_nope_dim + S = cfg.qk_rope_dim + R_q = cfg.q_lora_rank # q_b input dim + R = cfg.kv_lora_rank # kv_b input dim, also fold expansion target + V = wo.tensors['weight'].shape[0] // H # v_head_dim (cfg value overridden) + + q_b_h = q_b.tensors['weight'].reshape(R_q, H, P + S) + kc, vc = kv_b.tensors['weight'].reshape(R, H, P + V).split([P, V], dim=-1) + q_nope, q_rope = q_b_h.split([P, S], dim=-1) + + # q_nope @ kc^T per head: [R_q, H, P] × [R, H, P] → [R_q, H, R] + q_folded = torch.cat([ + torch.einsum('ihp,jhp->ihj', q_nope, kc), # [R_q, H, R] + q_rope, # [R_q, H, S] + ], dim=-1).reshape(R_q, H * (R + S)) + + # vc @ wo per head + o_folded = torch.einsum('rhv,hvn->hrn', vc, + wo.tensors['weight'].reshape(H, V, -1) + ).reshape(H * R, -1) + + return (Linear(tensors={'weight': q_folded.contiguous()}, + weight_format=q_b.weight_format), + Linear(tensors={'weight': o_folded.contiguous()}, + weight_format=wo.weight_format)) + + +def pad_wo_input(wo: Linear, *, cfg) -> Linear: + """Pad wo input dim from head_num * cur_dim to head_num * size_per_head.""" + head_num = cfg.head_num + size_per_head = cfg.head_dim + w = wo.tensors['weight'] + cur_dim = w.shape[0] // head_num + w = w.reshape(head_num, cur_dim, -1) + w = torch.nn.functional.pad(w, (0, 0, size_per_head - cur_dim, 0)) + w = w.reshape(head_num * size_per_head, -1) + return Linear(tensors={'weight': w.contiguous()}, + weight_format=wo.weight_format) + + +# --------------------------------------------------------------------------- +# MLABuilder -- MLA projections, fold+pad, norms +# --------------------------------------------------------------------------- + + +class MLABuilder(Builder): + """MLA (Multi-head Latent Attention) weight loading builder.""" + + def __init__(self, config, ctx, tp: ParallelGroup): + super().__init__(config, ctx) + self.tp = tp + self.config.tp_size = tp.size + if config.kv_lora_rank > 0 and config.kv_head_num < tp.size: + config.kv_head_num = tp.size + + def add_projections(self, *, q_a_proj, q_b_proj, kv_a_proj, kv_b_proj, + wo): + """Apply MLA fold+pad, then commit each projection.""" + q_b_proj, wo = fold_kv_b(q_b_proj, kv_b_proj, wo, cfg=self.config) + wo = pad_wo_input(wo, cfg=self.config) + + for name, lin, side in [ + ('q_a_proj', q_a_proj, None), + ('q_b_proj', q_b_proj, SplitSide.OUTPUT), + ('kv_a_proj', kv_a_proj, None), + ('wo', wo, SplitSide.INPUT), + ]: + self._add_linear(name, lin, split_side=side) diff --git a/lmdeploy/turbomind/builders/module_list.py b/lmdeploy/turbomind/builders/module_list.py new file mode 100644 index 0000000000..5fabf64ed6 --- /dev/null +++ b/lmdeploy/turbomind/builders/module_list.py @@ -0,0 +1,24 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import _turbomind as _tm + +from ._base import Builder, BuiltModule + +ModuleListConfig = _tm.ModuleListConfig + + +class ModuleListBuilder(Builder): + """Builder for ModuleList containers.""" + + def __setitem__(self, index: int, value): + if isinstance(value, Builder): + raise TypeError( + f'{type(self).__name__}[{index}]: call .build() first') + if isinstance(value, BuiltModule): + if self._built: + raise RuntimeError( + f'{type(self).__name__} is built; ' + f'cannot set index {index}') + self._add_child(str(index), value.handles) + return + raise TypeError( + f'{type(self).__name__}[{index}] requires a BuiltModule') diff --git a/lmdeploy/turbomind/builders/moe.py b/lmdeploy/turbomind/builders/moe.py new file mode 100644 index 0000000000..a4a1bcb36d --- /dev/null +++ b/lmdeploy/turbomind/builders/moe.py @@ -0,0 +1,22 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from __future__ import annotations + +from ._base import Builder, SplitSide + +# --------------------------------------------------------------------------- +# MoeBuilder -- gate, non-expert params +# --------------------------------------------------------------------------- + + +class MoeBuilder(Builder): + """MoE weight loading builder.""" + + def add_gate(self, name, linear): + """Commit a gate linear (broadcast, no split).""" + self._add_linear(name, linear, split_side=None) + + def add_param(self, name, tensor, split_side=None): + """Commit a non-expert MoE parameter.""" + if split_side is not None and not isinstance(split_side, SplitSide): + split_side = None # specs may pass None for broadcast + self._add_tensor(name, tensor, split_side) diff --git a/lmdeploy/turbomind/builders/norm.py b/lmdeploy/turbomind/builders/norm.py new file mode 100644 index 0000000000..ab8e4df9ad --- /dev/null +++ b/lmdeploy/turbomind/builders/norm.py @@ -0,0 +1,20 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import _turbomind as _tm +import torch + +from ._base import Builder + + +def make_norm_config(*, dim, norm_eps): + cfg = _tm.NormConfig() + cfg.dim = dim + cfg.norm_eps = norm_eps + return cfg + + +class NormBuilder(Builder): + """Builder for a single norm weight module.""" + + def set_weight(self, tensor: torch.Tensor): + """Commit the norm weight tensor to all GPU handles.""" + self._add_tensor('weight', tensor) diff --git a/lmdeploy/turbomind/builders/text_model.py b/lmdeploy/turbomind/builders/text_model.py new file mode 100644 index 0000000000..e3c4ab5598 --- /dev/null +++ b/lmdeploy/turbomind/builders/text_model.py @@ -0,0 +1,54 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +from ..linear import round_up_output_groups +from ._base import Builder, BuiltModule, ParallelGroup, SplitSide + + +class TextModelBuilder(Builder): + """Builder for the root ModelWeight. + + Constructs a ModelWeight via ``_tm.create_module(ModelWeightConfig)`` + on each context (inherited Builder machinery), then attaches it to + externally-owned ``ModelRoot`` sentinel handles as their + ``text_model`` child during ``build()``. + + Owns ``tok_embeddings`` (Tensor param) and ``output`` (LinearWeight + child) commits on the ModelWeight via ``add_token_embeds`` / + ``add_lm_head``. + """ + + def __init__(self, config, ctx, *, root_handles, + tp: ParallelGroup, vocab_size): + super().__init__(config, ctx) + self.tp = tp + self.config.tp_size = tp.size + self._root_handles = root_handles + self._vocab_size = vocab_size + + def build(self) -> BuiltModule: + """Create ModelWeight via _tm.create_module (via super), then attach + each per-GPU ModelWeight handle to its sentinel root via + add_child_raw.""" + built = super().build() + for i, (root, text_model) in enumerate( + zip(self._root_handles, built.handles)): + with self._ctx.devices[i]: + root.add_child_raw('text_model', text_model) + return built + + def add_token_embeds(self, tensor): + """Commit the raw embedding lookup as the ``tok_embeddings`` root + param. + + Shards along hidden (output) dim by ``self.tp.size``. No vocab padding — + embedding lookup never indexes past ``vocab - 1``. + """ + self._add_tensor('tok_embeddings', tensor, + split_side=SplitSide.OUTPUT) + + def add_lm_head(self, linear): + """Pad output dim to ``round_up(vocab_size, tp)`` and commit to the + ``output`` LinearWeight root child.""" + linear = round_up_output_groups(linear, self._vocab_size, + self.tp.size) + self._add_linear('output', linear, split_side=SplitSide.OUTPUT) diff --git a/lmdeploy/turbomind/converter.py b/lmdeploy/turbomind/converter.py new file mode 100644 index 0000000000..56dace37bd --- /dev/null +++ b/lmdeploy/turbomind/converter.py @@ -0,0 +1,240 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +import torch + +from lmdeploy.archs import get_model_arch, search_nested_config +from lmdeploy.messages import TurbomindEngineConfig +from lmdeploy.pytorch.config import override_hf_config +from lmdeploy.utils import get_logger + +from ..utils import _get_and_verify_max_len, is_bf16_supported +from .builders import _cpp_dtype +from .models.base import INPUT_MODELS +from .models.utils import source_model_config +from .supported_models import SUPPORTED_ARCHS +from .weight_format import ( + AWQFormat, + CompressedTensorFormat, + FP8Format, + GPTQFormat, + MXFP4Format, + TrivialFormat, + WeightFormat, + WeightFormatResolver, +) + +logger = get_logger('lmdeploy') + + +def _build_resolver(model_format: str | None, + group_size: int | None, + data_type: '_tm.DataType') -> WeightFormatResolver: + """Build the active resolver: quantized format (if any) + trivial fallback. + + Called after the int4 fp16 force but before the ``compressed-tensors → + awq`` rename, so compressed-tensors models get ``CompressedTensorFormat``. + """ + formats: list[WeightFormat] = [] + if model_format in (None, 'hf'): + pass + elif model_format == 'awq': + formats.append(AWQFormat(block_in=group_size)) + elif model_format == 'gptq': + formats.append(GPTQFormat(block_in=group_size)) + elif model_format == 'compressed-tensors': + formats.append(CompressedTensorFormat(block_in=group_size)) + elif model_format == 'fp8': + formats.append(FP8Format()) + elif model_format == 'mxfp4': + formats.append(MXFP4Format()) + else: + raise ValueError(f'unknown model_format: {model_format!r}') + formats.append(TrivialFormat()) + return WeightFormatResolver(data_type=data_type, formats=formats) + + +def _deep_merge(base: dict, override: dict, path: str = '') -> dict: + """Recursively merge override into base, mutating base in-place.""" + for k, v in override.items(): + key_path = f'{path}.{k}' if path else k + if k in base and isinstance(base[k], dict) and isinstance(v, dict): + _deep_merge(base[k], v, key_path) + else: + if k not in base: + logger.warning(f'hf_overrides key "{key_path}" not found in config, applying anyway') + base[k] = v + return base + + +def _apply_hf_overrides(cfg, override: dict): + """Apply hf_overrides to a Transformers config object or nested dict.""" + override_hf_config(cfg, override) + return cfg + + +_DEFAULT_GROUP_SIZES = { + 'awq': 128, + 'gptq': 128, + 'compressed-tensors': 128, + 'fp8': 128, + 'mxfp4': 32, +} + +_SUPPORTED_GROUP_SIZES = { + 'awq': frozenset({128}), + 'gptq': frozenset({128}), + 'compressed-tensors': frozenset({32, 128}), + 'fp8': frozenset({128}), + 'mxfp4': frozenset({32}), +} + + +def _validate_quant_group_size(model_format: str | None, group_size: int | None) -> int | None: + """Normalize and validate quantized group sizes. + + The low-level int4 kernels can be shared across formats, but we only expose the format/group-size combinations that + are verified end to end. + """ + if group_size in (None, 0): + group_size = _DEFAULT_GROUP_SIZES.get(model_format, group_size) + + supported_group_sizes = _SUPPORTED_GROUP_SIZES.get(model_format) + if supported_group_sizes is not None and group_size not in supported_group_sizes: + supported = ', '.join(map(str, sorted(supported_group_sizes))) + raise ValueError(f'Unsupported group_size={group_size} for model_format="{model_format}". ' + f'Supported group_size values: {supported}.') + + return group_size + + +def get_registered_name(model_path: str, model_format: str, arch: str = None): + """Get the registered name of a model. The name will be used to access the + INPUT_MODELS registry. + + Args: + model_path (str): the path of the input model + model_format (str): the format of the model, which can be one of + ['hf', 'awq', 'gptq', 'compressed-tensors', 'fp8', 'mxfp4'] + arch (str): optional architecture string, to avoid reloading config + """ + if arch is None: + arch = get_model_arch(model_path)[0] + register_name = SUPPORTED_ARCHS[arch] + return register_name + + +def _resolve_dtype(requested: str, hf_model_cfg) -> str: + """Resolve 'auto' dtype against the HF config and the current device. + + Prefers `dtype` over the deprecated `torch_dtype` key. Falls back to + float16 on hardware that does not support bfloat16. + """ + has_bf16 = is_bf16_supported() + dtype = requested + if dtype == 'auto': + dtype = 'bfloat16' if has_bf16 else 'float16' + torch_dtype = getattr(hf_model_cfg, 'dtype', None) + if torch_dtype is None: + torch_dtype = getattr(hf_model_cfg, 'torch_dtype', None) + TORCH_DTYPE_MAP = {torch.bfloat16: 'bfloat16', torch.float16: 'float16'} + dtype = TORCH_DTYPE_MAP.get(torch_dtype, dtype) + + if dtype == 'bfloat16' and not has_bf16: + logger.warning('data type fallback to float16 since ' + 'torch.cuda.is_bf16_supported is False') + dtype = 'float16' + return dtype + + +def get_tm_config(model_path, + engine_config: TurbomindEngineConfig, + group_size: int = None): + """Resolve dtype/model_format/group_size/session_len, mutate engine_config + in place, build the text model. + + Returns: + tuple: (text_model, model_path, data_type) + """ + # 1. Load HF config once; reused for quant_config, dtype, and session_len. + arch, hf_model_cfg = get_model_arch(model_path) + + # 2. Reconcile quant_config (unchanged logic from the prior flow). + quant_config = search_nested_config( + hf_model_cfg.to_dict(), 'quantization_config') + if quant_config: + quant_method = quant_config.get('quant_method') + _group_size = int(quant_config.get('group_size', 0)) + version = quant_config.get('version') + assert engine_config.model_format is None or engine_config.model_format == quant_method, ( + f'mismatched quant method: user input "{engine_config.model_format}" ' + f'vs model quant_config "{quant_method}"') + assert not group_size or group_size == _group_size, ( + f'mismatched quant group size: user input "{group_size}" ' + f'vs model quant_config "{_group_size}"') + + if quant_method == 'awq': + assert version == 'gemm', f'unsupported quant config: {quant_config}' + elif quant_method == 'gptq': + assert not quant_config.get('desc_act', False) and quant_config.get( + 'sym', True), f'unsupported quant config: {quant_config}' + elif quant_method == 'fp8': + pass + elif quant_method == 'mxfp4': + _group_size = 32 + elif quant_method == 'compressed-tensors': + _format = quant_config['config_groups']['group_0']['format'] + assert _format == 'pack-quantized', ( + 'compressed-tensors only supports pack-quantized format, ' + f'but got {_format}') + _weights = quant_config['config_groups']['group_0']['weights'] + _group_size = _weights['group_size'] + _num_bits = _weights['num_bits'] + _type = _weights['type'] + assert _num_bits == 4 and _type == 'int', ( + 'pack-quantized requires 4-bit int, ' + f'but got {_num_bits}-bit {_type}') + else: + assert 0, f'unsupported quant_config: {quant_config}' + + engine_config.model_format = quant_method + group_size = _group_size + + group_size = _validate_quant_group_size(engine_config.model_format, group_size) + if engine_config.model_format is None: + engine_config.model_format = 'hf' + + # 3. Resolve dtype and format overrides. + dtype = _resolve_dtype(engine_config.dtype, hf_model_cfg) + if engine_config.model_format in ('awq', 'gptq', 'compressed-tensors'): + dtype = 'float16' + engine_config.dtype = dtype + + # Build resolver after dtype is finalized but before the CT→AWQ rename, + # so compressed-tensors models instantiate CompressedTensorFormat. + resolver = _build_resolver(engine_config.model_format, + group_size, _cpp_dtype(dtype)) + + # C++-side label rename (does not affect resolver). + if engine_config.model_format == 'compressed-tensors': + engine_config.model_format = 'awq' + + # 4. Resolve session_len default. + session_len_default = _get_and_verify_max_len(hf_model_cfg, None) + + # 5. Mutate engine_config with remaining resolved values. + if engine_config.session_len is None: + engine_config.session_len = session_len_default + engine_config.attn_tp_size = engine_config.attn_tp_size or 1 + engine_config.attn_cp_size = engine_config.attn_cp_size or 1 + engine_config.mlp_tp_size = engine_config.mlp_tp_size or 1 + + # 6. Build text model. + cfg = source_model_config(hf_model_cfg) + if engine_config.hf_overrides: + logger.warning(f'Overriding HF config with {engine_config.hf_overrides}') + _apply_hf_overrides(cfg, engine_config.hf_overrides) + registered_name = get_registered_name(model_path, engine_config.model_format, arch=arch) + model_cls = INPUT_MODELS.get(registered_name) + text_model = model_cls(cfg, resolver=resolver) + + return text_model, model_path, _cpp_dtype(dtype) diff --git a/lmdeploy/turbomind/deploy/__init__.py b/lmdeploy/turbomind/deploy/__init__.py deleted file mode 100644 index ef101fec61..0000000000 --- a/lmdeploy/turbomind/deploy/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. diff --git a/lmdeploy/turbomind/deploy/config.py b/lmdeploy/turbomind/deploy/config.py deleted file mode 100644 index ba6d632242..0000000000 --- a/lmdeploy/turbomind/deploy/config.py +++ /dev/null @@ -1,242 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import inspect -import json -from dataclasses import asdict, field, fields - -# use pydantic.dataclasses.dataclass to check data type -from pydantic.dataclasses import dataclass - -from lmdeploy.messages import TurbomindEngineConfig -from lmdeploy.utils import get_logger - -logger = get_logger('lmdeploy') - - -def config_from_dict(cls, env): - """Initiate an instance of a config class from a dict.""" - params = inspect.signature(cls).parameters - used = {k: v for k, v in env.items() if k in params and v is not None} - - def _remove_none(d: dict): - for k, v in d.items(): - if isinstance(v, dict): - d[k] = _remove_none(v) - return {k: v for k, v in d.items() if v is not None} - - used = _remove_none(used) - return cls(**used) - - -def config_to_dict(config): - """Export config to a dict.""" - if not config: - return dict() - assert isinstance(config, (ModelConfig, AttentionConfig, LoraConfig)), \ - f'A dataclass is expected, but got {type(config)}' - - return asdict(config) - - -@dataclass -class ModelConfig: - model_name: str = '' - chat_template: str = '' - model_arch: str = None - head_num: int = None - kv_head_num: int = None - hidden_units: int = None - vocab_size: int = None - # Turbomind used to assume token_embedding and lm_head has the same size - # at vocab dim, i.e. `vocab_size` - # But in molmo, embedding.shape is [vocab_size + 128, hidden_units] - # while lm_head shape is [hidden_units, vocab_size]. - # Therefore, we add a new attr "embedding_size" to represent the vocab dim - # of token_embedding - embedding_size: int = 0 - num_layer: int = None - inter_size: list[int] = None - norm_eps: float = None - attn_bias: int = 0 - mlp_bias: bool = False - window_size: list[int] = field(default_factory=list) - attn_sink: bool = False - qk_norm: bool = False - size_per_head: int = 128 - group_size: int = 32 - data_type: str = None - weight_type: str = None - expert_weight_type: str = None - ffn_weight_type: str = None - session_len: int = None - attn_tp_size: int = 1 - attn_cp_size: int = 1 - mlp_tp_size: int = 1 - model_format: str = 'hf' - expert_num: list[int] = field(default_factory=list) - expert_router_bias: bool = False - expert_inter_size: int = 0 - experts_per_token: int = 0 - activation_type: str = '' - moe_shared_gate: bool = False - norm_topk_prob: bool = False - routed_scale: float = 1.0 - topk_group: int = 1 - topk_method: str = 'greedy' - moe_group_num: int = 1 - scoring_func: str = 'softmax' - router_n_groups: int = -1 - # MLA - q_lora_rank: int = 0 - kv_lora_rank: int = 0 - qk_rope_dim: int = 0 - v_head_dim: int = 0 - # Qwen 3.5 - layer_types: list[str] = field(default_factory=list) - linear_key_head_dim: int = 0 - linear_value_head_dim: int = 0 - linear_conv_kernel_dim: int = 0 - linear_num_key_heads: int = 0 - linear_num_value_heads: int = 0 - attn_output_gate: bool = False - # Per-layer expert weight type override: layer indices whose - # MoE experts are unquantized (fp16) despite expert_weight_type=int4. - # Populated from modules_to_not_convert patterns like 'model.layers.0.'. - unquantized_expert_layers: list[int] = field(default_factory=list) - # tuning - tune_layer_num: int = 1 - - def verify(self): - invalid = {} - for k, v in self.__dict__.items(): - if v is None: - invalid[k] = v - assert not invalid, f'incomplete model config: {invalid}' - - -@dataclass -class RopeParam: - type: str - base: float - dim: int - factor: float = 1.0 - max_position_embeddings: int = None - attention_factor: float = 1.0 - beta_fast: float = 32 - beta_slow: float = 1 - low_freq_factor: float = None - high_freq_factor: float = None - original_max_position_embeddings: int = None - mrope_section: list[int] = None - - -@dataclass -class AttentionConfig: - softmax_scale: float = 0 - cache_block_seq_len: int = 64 - use_logn_attn: int = 0 - max_position_embeddings: int = 0 - rope_param: RopeParam = None - - -@dataclass -class LoraConfig: - lora_policy: str = '' - lora_r: int = 0 - lora_scale: float = 0.0 - lora_max_wo_r: int = 0 - lora_rank_pattern: str = '' - lora_scale_pattern: str = '' - - -@dataclass -class TurbomindModelConfig: - """Config for turbomind model.""" - model_config: ModelConfig = None - attention_config: AttentionConfig = None - lora_config: LoraConfig = None - - def update_from_engine_config(self, config: TurbomindEngineConfig): - """Update the attributes of this instance with the attributes from - TurbomindEngineConfig. - - Args: - config (TurbomindEngineConfig): The turbomind engine config - """ - if config is None: - return - for key, value in asdict(config).items(): - if not value: - continue - - if hasattr(self.model_config, key): - setattr(self.model_config, key, value) - if hasattr(self.attention_config, key): - setattr(self.attention_config, key, value) - - # update from hf_overrides - if hasattr(config, 'hf_overrides') and config.hf_overrides: - hf_overrides = config.hf_overrides - - if hf_overrides.get('rope_scaling'): - override_params = hf_overrides.get('rope_scaling') - - rope_param = self.attention_config.rope_param or RopeParam(type='', base=0, dim=0) - rope_param.type = override_params.get('rope_type', '') - if rope_param.type == 'yarn' and 'original_max_position_embeddings' in override_params: - rope_param.factor = self.attention_config.max_position_embeddings / override_params[ - 'original_max_position_embeddings'] - rope_param.max_position_embeddings = override_params['original_max_position_embeddings'] - else: - rope_param.factor = override_params.get('factor', 1.0) - rope_param.max_position_embeddings = override_params.get('original_max_position_embeddings', None) - - self.attention_config.rope_param = rope_param - logger.warning(f'Overriding HF config with {hf_overrides}') - - # use dynamic ntk - if config.rope_scaling_factor: - # some ut will create empty RopeParam, will check base/dim in src code - rope_param = self.attention_config.rope_param or RopeParam(type='', base=0, dim=0) - rope_param.type = 'dynamic' - rope_param.factor = config.rope_scaling_factor - rope_param.max_position_embeddings = self.attention_config.max_position_embeddings - - self.attention_config.rope_param = rope_param - logger.warning( - '`--rope-scaling-factor` will be removed in a future release. Please instead use `--hf-overrides`.') - - @classmethod - def from_dict(cls, config: dict | None = None): - """Construct TurbomindModelConfig instance from config in a dict.""" - if config is None: - config = {} - _cfg = {field.name: config.get(field.name, {}) for field in fields(TurbomindModelConfig)} - - return TurbomindModelConfig(model_config=config_from_dict(ModelConfig, _cfg['model_config']), - attention_config=config_from_dict(AttentionConfig, _cfg['attention_config']), - lora_config=config_from_dict(LoraConfig, _cfg['lora_config'])) - - def to_dict(self): - """Export to a dict.""" - return dict(model_config=config_to_dict(self.model_config), - attention_config=config_to_dict(self.attention_config), - lora_config=config_to_dict(self.lora_config)) - - @property - def session_len(self): - return self.model_config.session_len - - @property - def weight_type(self): - return self.model_config.weight_type - - @property - def group_size(self): - return self.model_config.group_size - - @property - def vocab_size(self): - return self.model_config.vocab_size - - def __str__(self): - return json.dumps(self.to_dict(), indent=2) diff --git a/lmdeploy/turbomind/deploy/converter.py b/lmdeploy/turbomind/deploy/converter.py deleted file mode 100644 index 05b1ba526f..0000000000 --- a/lmdeploy/turbomind/deploy/converter.py +++ /dev/null @@ -1,285 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch - -from lmdeploy.archs import get_model_arch, search_nested_config -from lmdeploy.messages import TurbomindEngineConfig -from lmdeploy.utils import get_logger - -from ...utils import _get_and_verify_max_len, is_bf16_supported -from ..supported_models import SUPPORTED_ARCHS -from .config import TurbomindModelConfig -from .module import Transformer -from .policy import get_input_policy -from .source_model.base import INPUT_MODELS -from .target_model.base import OUTPUT_MODELS, BaseOutputModel - -SUPPORTED_FORMATS = ['hf', 'awq', 'gptq', 'compressed-tensors', 'fp8', 'mxfp4', None] -logger = get_logger('lmdeploy') - -_DEFAULT_GROUP_SIZES = { - 'awq': 128, - 'gptq': 128, - 'compressed-tensors': 128, - 'fp8': 128, - 'mxfp4': 32, -} - -_SUPPORTED_GROUP_SIZES = { - 'awq': frozenset({128}), - 'gptq': frozenset({128}), - 'compressed-tensors': frozenset({32, 128}), - 'fp8': frozenset({128}), - 'mxfp4': frozenset({32}), -} - - -def _validate_quant_group_size(model_format: str | None, group_size: int | None) -> int | None: - """Normalize and validate quantized group sizes. - - The low-level int4 kernels can be shared across formats, but we only expose the format/group-size combinations that - are verified end to end. - """ - if group_size in (None, 0): - group_size = _DEFAULT_GROUP_SIZES.get(model_format, group_size) - - supported_group_sizes = _SUPPORTED_GROUP_SIZES.get(model_format) - if supported_group_sizes is not None and group_size not in supported_group_sizes: - supported = ', '.join(map(str, sorted(supported_group_sizes))) - raise ValueError(f'Unsupported group_size={group_size} for model_format="{model_format}". ' - f'Supported group_size values: {supported}.') - - return group_size - - -def get_input_model_registered_name(model_path: str, model_format: str): - """Get the registered name of a model. The name will be used to access the - INPUT_MODELS registry. - - Args: - model_path (str): the path of the input model - model_format (str): the format of the model, which can be one of - ['hf', 'awq', 'gptq', 'compressed-tensors', 'fp8', 'mxfp4'] - """ - arch = get_model_arch(model_path)[0] - register_name = SUPPORTED_ARCHS[arch] - return register_name - - -def get_output_model_registered_name_and_config(model_path: str, model_format: str, dtype: str, group_size: int): - """Get the registered name of the turbomind model and its configuration - according to the input model path, format and user-input config. The name - will be used to access the OUTPUT_MODELS registry. - - Args: - model_path (str): the path of the input model - model_format (str): the format of the model, which can be one of - ['hf', 'awq', 'gptq', 'compressed-tensors', 'fp8', 'mxfp4'] - dtype (str): the data type of the model's weights and activations - group_size (int): the quantization group size used by grouped formats - """ - register_name = 'tm' - - has_bf16 = is_bf16_supported() - - model_arch, model_config = get_model_arch(model_path) - - # infer dtype from device and model config - if dtype == 'auto': - # pick dtype by device as default - dtype = 'bfloat16' if has_bf16 else 'float16' - # dtype from model (prefer `dtype` over deprecated `torch_dtype`) - torch_dtype = getattr(model_config, 'dtype', None) - if torch_dtype is None: - torch_dtype = getattr(model_config, 'torch_dtype', None) - if not torch_dtype: - if model_arch in ['QWenLMHeadModel', 'GptOssForCausalLM']: - torch_dtype = torch.bfloat16 - TORCH_DTYPE_MAP = {torch.bfloat16: 'bfloat16', torch.float16: 'float16'} - dtype = TORCH_DTYPE_MAP.get(torch_dtype, dtype) - - if dtype == 'bfloat16' and not has_bf16: - logger.warning('data type fallback to float16 since ' - 'torch.cuda.is_bf16_supported is False') - dtype = 'float16' - - weight_type = dtype - - config = TurbomindModelConfig.from_dict() - - session_len = _get_and_verify_max_len(model_config, None) - - group_size = _validate_quant_group_size(model_format, group_size) - - if model_format in ['awq', 'gptq', 'compressed-tensors']: - weight_type = 'int4' - dtype = 'float16' # force float16 for int4 quantized weights - if model_format == 'compressed-tensors': - # TurboMind reuses the AWQ int4 export path for pack-quantized - # compressed-tensors weights after the format-specific checks above. - model_format = 'awq' - elif model_format == 'fp8': - weight_type = 'fp8' - elif model_format == 'mxfp4': - weight_type = 'e2m1' - - expert_weight_type = weight_type - - # ONLY experts are in mxfp4 - if model_arch == 'GptOssForCausalLM': - weight_type = dtype - - # Three weight types control allocation for mixed quantization: - # weight_type - attention weights - # ffn_weight_type - dense FFN / shared expert weights - # expert_weight_type - MoE routed expert weights - # - # The assignment order matters: - # 1. expert_weight_type = original weight_type (before any overrides) - # 2. GptOss override: weight_type -> dtype (attn + shared experts are fp16) - # 3. ffn_weight_type = weight_type (captures post-GptOss value) - # 4. Mixed AWQ override: weight_type -> dtype (only attn becomes fp16) - # - # weight_type ffn_weight_type expert_weight_type - # Pure fp16 float16 float16 float16 - # Full AWQ int4 int4 int4 - # Mixed AWQ float16 int4 int4 - # GptOss mxfp4 bfloat16 bfloat16 e2m1 - ffn_weight_type = weight_type - - # When attention weights are not quantized (e.g. AWQ with self_attn in - # modules_to_not_convert), weight_type becomes fp16 for attention. - # ffn_weight_type and expert_weight_type retain int4. - if model_format in ['awq', 'gptq'] and weight_type != dtype: - quant_config = getattr(model_config, 'quantization_config', None) - if quant_config is None: - quant_config = {} - if isinstance(quant_config, dict): - modules_to_not_convert = quant_config.get('modules_to_not_convert') or [] - else: - modules_to_not_convert = getattr(quant_config, 'modules_to_not_convert', None) or [] - if any('self_attn' in m for m in modules_to_not_convert): - weight_type = dtype - if any('shared_expert' in m for m in modules_to_not_convert): - ffn_weight_type = dtype - # Detect per-layer exclusions like 'model.layers.0.' which mean - # ALL weights in that layer (including MoE experts) are fp16. - import re as _re - unquantized_expert_layers = [] - for m in modules_to_not_convert: - _m = _re.match(r'model\.layers\.(\d+)\.?$', m) - if _m: - unquantized_expert_layers.append(int(_m.group(1))) - config.model_config.unquantized_expert_layers = unquantized_expert_layers - - config.model_config.model_arch = model_arch - config.model_config.data_type = dtype - config.model_config.weight_type = weight_type - config.model_config.expert_weight_type = expert_weight_type - config.model_config.ffn_weight_type = ffn_weight_type - config.model_config.model_format = model_format - config.model_config.group_size = group_size - config.model_config.session_len = session_len - - return register_name, config - - -def get_tm_model(model_path, - model_name, - chat_template_name, - engine_config: TurbomindEngineConfig, - group_size: int = None, - out_dir: str = None) -> BaseOutputModel: - """Create turbomind model. - - Args: - model_path (str): the path of the input model, which is supposed - to be a local path, or huggingface hub repo_id, or modelscope - hub repo_id - model_name (str): user customized model name - chat_template_name (str): the name of the chat template of - the input model - engine_config(TurbomindEngineConfig): user input engine config - group_size(int): refers to the group_size if the input model - is a grouped quantized model - out_dir(str): the output directory where to save to turbomind model. - If it is None, the turbomind model won't be saved - """ - _, cfg = get_model_arch(model_path) - quant_config = search_nested_config(cfg.to_dict(), 'quantization_config') - mixed_awq = False - if quant_config: - quant_method = quant_config.get('quant_method') - _group_size = int(quant_config.get('group_size', 0)) - version = quant_config.get('version') - assert engine_config.model_format is None or engine_config.model_format == quant_method, ( - f'mismatched quant method: user input "{engine_config.model_format}" ' - f'vs model quant_config "{quant_method}"') - assert not group_size or group_size == _group_size, (f'mismatched quant group size: user input "{group_size}" ' - f'vs model quant_config "{_group_size}"') - - if quant_method == 'awq': - assert version == 'gemm', f'unsupported quant config: {quant_config}' - modules_to_not_convert = quant_config.get('modules_to_not_convert') or [] - if any('self_attn' in name for name in modules_to_not_convert): - mixed_awq = True - elif quant_method == 'gptq': - assert not quant_config.get('desc_act', False) and quant_config.get( - 'sym', True), f'unsupported quant config: {quant_config}' - elif quant_method == 'fp8': - pass - elif quant_method == 'mxfp4': - _group_size = 32 - elif quant_method == 'compressed-tensors': - _format = quant_config['config_groups']['group_0']['format'] - assert _format == 'pack-quantized', ('compressed-tennsors only supports pack-quantized format, ' - f'but got {_format}') - _weights = quant_config['config_groups']['group_0']['weights'] - _group_size = _weights['group_size'] - _num_bits = _weights['num_bits'] - _type = _weights['type'] - assert _num_bits == 4 and _type == 'int', ('pack-quantized requires 4-bit int, ' - f'but got {_num_bits}-bit {_type}') - else: - assert 0, f'unsupported quant_config: {quant_config}' - - engine_config.model_format = quant_method - group_size = _group_size - - group_size = _validate_quant_group_size(engine_config.model_format, group_size) - - input_model_name = get_input_model_registered_name(model_path, engine_config.model_format) - - fp8_quant = (engine_config.model_format == 'fp8' and not quant_config) - input_policy = get_input_policy(engine_config.model_format) - input_model = INPUT_MODELS.get(input_model_name)(model_path=model_path, - tokenizer_path=model_path, - input_policy=input_policy, - fp8_quant=fp8_quant) - - output_model_name, tm_cfg = get_output_model_registered_name_and_config(model_path=model_path, - model_format=engine_config.model_format, - dtype=engine_config.dtype, - group_size=group_size) - - if mixed_awq: - # Mixed-precision AWQ: attention weights are fp16 (not quantized), - # but expert weights remain as int4 AWQ for efficient inference. - tm_cfg.model_config.weight_type = tm_cfg.model_config.data_type - # expert_weight_type stays as 'int4' (set by get_output_model_registered_name_and_config) - - tm_cfg.model_config.chat_template = chat_template_name - tm_cfg.model_config.model_name = model_name - - if engine_config.attn_tp_size is not None: - tm_cfg.model_config.attn_tp_size = engine_config.attn_tp_size - if engine_config.attn_cp_size is not None: - tm_cfg.model_config.attn_cp_size = engine_config.attn_cp_size - if engine_config.mlp_tp_size is not None: - tm_cfg.model_config.mlp_tp_size = engine_config.mlp_tp_size - - output_model = OUTPUT_MODELS.get(output_model_name)(input_model=input_model, - cfg=tm_cfg, - model_cls=Transformer, - out_dir=out_dir) - - return output_model diff --git a/lmdeploy/turbomind/deploy/module.py b/lmdeploy/turbomind/deploy/module.py deleted file mode 100644 index b62d9aead0..0000000000 --- a/lmdeploy/turbomind/deploy/module.py +++ /dev/null @@ -1,639 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from abc import ABC, abstractmethod -from functools import partial - -import torch - -from .parameter import get_params -from .source_model.base import BaseReader -from .target_model.base import BaseOutputModel - - -def permute_v2(x: torch.Tensor, size_per_head: int = 128): - """ - Contract: x.size(-1) is output dims - """ - - assert x.size(-1) > 1 - - output_dims = x.size(-1) - head_num = output_dims // size_per_head - - return x.view(-1, head_num, 2, size_per_head // 2).transpose(2, 3).reshape(x.shape) - - -def permute_v2_partial(x: torch.Tensor, size_per_head: int, rotary_dim: int): - """Permute only the first rotary_dim elements of each head. - - Used when partial_rotary_factor < 1.0: only the rotary portion needs interleaving for TurboMind's RoPE kernel - layout. - """ - assert x.size(-1) > 1 - assert rotary_dim % 2 == 0, f'rotary_dim must be even, got {rotary_dim}' - assert rotary_dim <= size_per_head, f'rotary_dim ({rotary_dim}) must be <= size_per_head ({size_per_head})' - output_dims = x.size(-1) - assert output_dims % size_per_head == 0, (f'output_dims ({output_dims}) must be divisible by ' - f'size_per_head ({size_per_head})') - head_num = output_dims // size_per_head - orig_shape = x.shape - if x.dim() == 1: - x = x.unsqueeze(0) - x = x.view(x.size(0), head_num, size_per_head) - rotary = x[:, :, :rotary_dim] - passthrough = x[:, :, rotary_dim:] - # Interleave rotary part: [2, rotary_dim//2] -> [rotary_dim//2, 2] - rotary = rotary.view(x.size(0), head_num, 2, rotary_dim // 2).transpose(2, 3).contiguous() - rotary = rotary.view(x.size(0), head_num, rotary_dim) - x = torch.cat([rotary, passthrough], dim=-1) - return x.reshape(orig_shape) - - -def merge_qkv_v2(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, tp: int): - """ - Contract: x.size(-1) is output dims - """ - - def reshape(x): - return x.view(x.size(0), tp, -1) if q.dim() == 2 else x.view(tp, -1) - - qkv = torch.cat(tuple(map(reshape, (q, k, v))), dim=-1) - - qkv = qkv.view(-1, qkv.size(-1) * tp) - if q.dim() == 1: - qkv.squeeze_() - - return qkv - - -def merge_qkvg_v2(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, gate: torch.Tensor, tp: int): - """Merge Q, K, V, and Gate with gate appended after V. - - Layout per tp-shard: [Q | K | V | Gate]. - """ - - def reshape(x): - return x.view(x.size(0), tp, -1) if q.dim() == 2 else x.view(tp, -1) - - qkvg = torch.cat(tuple(map(reshape, (q, k, v, gate))), dim=-1) - - qkvg = qkvg.view(-1, qkvg.size(-1) * tp) - if q.dim() == 1: - qkvg.squeeze_() - - return qkvg - - -def transpose(x): - return x.t() if x is not None else x - - -def pad_out_dims(x: torch.Tensor, dims: int): - pad = dims - x.size(-1) - assert pad >= 0 - return torch.nn.functional.pad(x, (0, pad), 'constant', 0) - - -def pad_in_dims(x: torch.Tensor, dims: int): - if x.dim() == 1: # 1-dim object does not have input dim (e.g. bias) - return x - pad = dims - x.size(0) - assert x.dim() == 2 - assert pad >= 0 - return torch.nn.functional.pad(x, (0, 0, 0, pad), 'constant', 0) - - -# split out dims -> copy A, split-out-dims B (qkv, w1, w3) -# split in dims -> split-in-dims A, copy B ( o, w2) -def get_lora_flags(kind: str): - return ('lora_a' in kind, 'lora_b' in kind) - - -class Module(ABC): - - def __init__(self, model: BaseOutputModel): - self.model = model - - def __call__(self, *args, **kwargs): - return self.apply(*args, **kwargs) - - @abstractmethod - def apply(self, idx: int, r: BaseReader): - pass - - -class LayerNorm(Module): - - def apply(self, i: int, r: BaseReader): - attn_norm = r.attn_norm(i) - ffn_norm = r.ffn_norm(i) - self.model.save_split(attn_norm, f'layers.{i}.attention_norm.weight') - self.model.save_split(ffn_norm, f'layers.{i}.ffn_norm.weight') - - -class Ffn(Module): - """ - requires: - r.ffn(i, kind) - """ - - _ffn = 'layers.{0}.feed_forward.{1}.{2}' - - def __init__(self, model: BaseOutputModel): - self.model = model - self.tp = model.mlp_tp_size - # inter_sizes in config are padded and may be different from what's - # in the weights - self.inter_size = model.model_config.inter_size - self.group_size = max(1, model.model_config.group_size) - - def _export(self, inter_size: int, fmt: str, idx: int, w123, kind: str, pack_fn, apply_gs=None, **kwargs): - if apply_gs is None: - apply_gs = [] - is_lora_a, is_lora_b = get_lora_flags(kind) - w1, w2, w3 = map(transpose, w123) - - gs1 = self.group_size if 'w1' in apply_gs else 1 - w1 = pad_out_dims(w1, inter_size // gs1) - - gs3 = self.group_size if 'w3' in apply_gs else 1 - w3 = pad_out_dims(w3, inter_size // gs3) - - gs2 = self.group_size if 'w2' in apply_gs else 1 - w2 = pad_in_dims(w2, inter_size // gs2) - - w1, w2, w3 = map(pack_fn, (w1, w2, w3)) - self.model.save_split(w1, fmt.format(idx, 'w1', kind), split_dim=-1, split_num=self.tp, copy=is_lora_a) - self.model.save_split(w3, fmt.format(idx, 'w3', kind), split_dim=-1, split_num=self.tp, copy=is_lora_a) - self.model.save_split(w2, fmt.format(idx, 'w2', kind), split_dim=0, split_num=self.tp, copy=is_lora_b) - - def apply(self, i: int, r: BaseReader): - if i >= len(self.inter_size) or not self.inter_size[i]: - return - keys = r.ffn(i, None) - - for e in get_params(keys): - e(partial(self._export, self.inter_size[i], self._ffn), partial(r.ffn, i), i) - - -class MoeFfn(Ffn): - """ - requires: - r.moe_ffn_expert(e, i, kind) - r.moe_ffn_gate(i) - r.moe_ffn_shared_gate(i) - """ - - _moe_ffn_expert = 'layers.{0}.moe_ffn.experts.E.{1}.{2}' - _moe_ffn_gate = 'layers.{0}.moe_ffn.gate.{1}' - _moe_ffn_shared_gate = 'layers.{0}.moe_ffn.shared_gate.weight' - - def __init__(self, model: BaseOutputModel): - super().__init__(model) - self.expert_num = model.model_config.expert_num - self.inter_size = model.model_config.expert_inter_size - self.shared_gate = model.model_config.moe_shared_gate - - def apply(self, i: int, r: BaseReader): - if i >= len(self.expert_num) or self.expert_num[i] == 0: - return - - # Export expert weights with outer loop over experts (not params) - # to ensure each expert's full weight set is grouped together - for e in range(self.expert_num[i]): - for p in get_params(r.moe_ffn_expert(), 1): - fmt = self._moe_ffn_expert.replace('E', str(e)) - p(partial(self._export, self.inter_size, fmt), partial(r.moe_ffn_expert, e, i), i) - - # router - gate = transpose(r.moe_ffn_gate(i, 'weight')) - self.model.save_split(gate, self._moe_ffn_gate.format(i, 'weight')) - bias = r.moe_ffn_gate(i, 'bias') - if bias is not None: - self.model.save_split(bias, self._moe_ffn_gate.format(i, 'bias')) - - # Export score_correction_bias for noaux_tc routing (GLM 4.7 Flash) - correction_bias = getattr(r, 'moe_ffn_gate_correction_bias', None) - if callable(correction_bias): - correction = correction_bias(i) - if correction is not None: - self.model.save_split(correction, self._moe_ffn_gate.format(i, 'score_correction_bias')) - - if self.shared_gate: - shared_gate = transpose(r.moe_ffn_shared_gate(i)) - self.model.save_split(shared_gate, self._moe_ffn_shared_gate.format(i)) - - -class Attn(Module): - """ - requires: - r.attn(i, kind) - """ - - _attn = 'layers.{0}.attention.{1}.{2}' - - def __init__(self, model: BaseOutputModel): - self.model = model - self.tp = model.attn_tp_size - self.head_dim = model.model_config.size_per_head - self.attn_bias = model.model_config.attn_bias - self.qk_norm = model.model_config.qk_norm - self.attn_sink = model.model_config.attn_sink - self.group_size = max(1, model.model_config.group_size) - self.attn_output_gate = model.model_config.attn_output_gate - rope_param = model.attention_config.rope_param - self.rope_dim = rope_param.dim if rope_param else self.head_dim - self.head_num = model.model_config.head_num - - def _split_q_gate(self, q): - """Split interleaved Q+gate tensor into separate Q and gate. - - HF layout: [Q_head0, Gate_head0, Q_head1, Gate_head1, ...] - Returns: (q_real, gate) each with shape [..., num_heads * head_dim] - """ - output_dims = q.size(-1) - head_num = output_dims // (self.head_dim * 2) - orig_shape = list(q.shape) - if q.dim() == 1: - q = q.unsqueeze(0) - q = q.view(q.size(0), head_num, 2, self.head_dim) - q_real = q[:, :, 0, :].contiguous() - gate = q[:, :, 1, :].contiguous() - new_last_dim = head_num * self.head_dim - q_real = q_real.reshape(-1, new_last_dim) - gate = gate.reshape(-1, new_last_dim) - if len(orig_shape) == 1: - q_real = q_real.squeeze(0) - gate = gate.squeeze(0) - return q_real, gate - - def _reorder_and_merge(self, qkvo, gs: int): - q, k, v, o = qkvo - gate = None - # When attn_output_gate, Q is interleaved [Q0, G0, Q1, G1, ...] - # Split into separate Q and gate before permuting - if self.attn_output_gate and q is not None: - q, gate = self._split_q_gate(q) - # reorder output dim for tm's rotary embedding layout - if self.model.permute_qk: - if gs == 1: - if self.rope_dim < self.head_dim: - q = permute_v2_partial(q, self.head_dim, self.rope_dim) - k = permute_v2_partial(k, self.head_dim, self.rope_dim) - else: - q = permute_v2(q, self.head_dim) - k = permute_v2(k, self.head_dim) - else: - assert gs % self.head_dim == 0 - # Merge QKV with gate appended at end if present - if gate is not None: - qkv = merge_qkvg_v2(q, k, v, gate, self.tp) - else: - qkv = merge_qkv_v2(q, k, v, self.tp) - # zero bias for `wo` when `w_qkv` has bias but `wo` doesn't - if o is None and q.dim() == 1: - o = torch.zeros_like(q) - return qkv, o - - def _repeat_kv(self, qkvo, gs: int, kind: str): - """Replicate kv.""" - q, k, v, o = qkvo - head_dim = self.model.model_config.size_per_head // gs - kv_head_num = self.model.model_config.kv_head_num // self.model.repeat_kv - hidden_dim = self.model.model_config.hidden_units - - def _repeat(x): - n = self.model.repeat_kv - - x = x.reshape(-1, kv_head_num, head_dim) - x = x.repeat(1, 1, n) - x = x.reshape(-1, kv_head_num * n * head_dim) - - return x - - k, v = map(_repeat, (k, v)) - - if kind == 'bias': - if o is None: - o = torch.zeros(hidden_dim, dtype=q.dtype, device=q.device) - q, k, v, o = map(torch.squeeze, (q, k, v, o)) - - return (q, k, v, o) - - def _export(self, idx: int, qkvo, kind: str, pack_fn, apply_gs=None, **kwargs): - if apply_gs is None: - apply_gs = [] - if all(x is None for x in qkvo): - return - is_lora_a, is_lora_b = get_lora_flags(kind) - assert not (is_lora_a or is_lora_b) - - qkvo = tuple(map(transpose, qkvo)) - - gs = self.group_size if ('w1' in apply_gs) else 1 - - if self.model.repeat_kv: - qkvo = self._repeat_kv(qkvo, gs, kind) - - qkv, o = self._reorder_and_merge(qkvo, gs) - - self.model.save_split(pack_fn(qkv), - self._attn.format(idx, 'w_qkv', kind), - split_dim=-1, - split_num=self.tp, - copy=is_lora_a) - self.model.save_split(pack_fn(o), - self._attn.format(idx, 'wo', kind), - split_dim=0, - split_num=self.tp, - copy=is_lora_b) - - def apply(self, i: int, r: BaseReader): - for e in get_params(r.attn(i, None), bias=self.attn_bias): - e(self._export, partial(r.attn, i), i) - if self.qk_norm: - q, k = r.qk_norm(i) - if q is not None and k is not None: - if self.model.permute_qk: - if self.rope_dim < self.head_dim: - q = permute_v2_partial(q, self.head_dim, self.rope_dim) - k = permute_v2_partial(k, self.head_dim, self.rope_dim) - else: - q = permute_v2(q, self.head_dim) - k = permute_v2(k, self.head_dim) - self.model.save_split(q, self._attn.format(i, 'q_norm', '')[:-1]) - self.model.save_split(k, self._attn.format(i, 'k_norm', '')[:-1]) - if self.attn_sink: - sinks = r.attn_sinks(i) - self.model.save_split(sinks, self._attn.format(i, 'sinks', '')[:-1], split_dim=-1, split_num=self.tp) - - -class MLA(Module): - """ - requires: - r.mla(i, kind) - r.mla_norm(i) - """ - - _mla = 'layers.{0}.attention.{1}.{2}' - - def __init__(self, model: BaseOutputModel): - self.model = model - - def _export(self, idx: int, xs, kind: str, pack_fn, **kwargs): - if all(x is None for x in xs): - return - q_a, q_b, q, kv_a, kv_b, o = xs - - cfg = self.model.model_config - head_num = cfg.head_num - kv_lora_rank = cfg.kv_lora_rank - qk_rope_dim = cfg.qk_rope_dim - size_per_head = cfg.size_per_head - v_head_dim = cfg.v_head_dim - - # ========== MLA Weight Folding for Dimension Mismatch ========== - # When kv_lora_rank != qk_nope_dim (e.g., GLM 4.7 Flash: 512 != 512+64=576), - # fold the kc/vc compression/decompression BMMs into q_b_proj/o_proj weights - # at conversion time to avoid runtime overhead. - if kind == 'weight' and kv_lora_rank and q is None and q_b is not None and kv_b is not None and o is not None: - if not (torch.is_floating_point(q_b) and torch.is_floating_point(kv_b) and torch.is_floating_point(o)): - raise ValueError('MLA weight folding requires floating-point attention weights.') - - orig_q_head_dim = q_b.size(0) // head_num - orig_qk_nope_dim = orig_q_head_dim - qk_rope_dim - orig_kv_dim_total = kv_b.size(0) // head_num - orig_v_head_dim = o.size(1) // head_num - actual_orig_qk_nope_dim = orig_kv_dim_total - orig_v_head_dim - - if abs(orig_qk_nope_dim - actual_orig_qk_nope_dim) > 1: - raise ValueError(f'Dimension mismatch: inferred qk_nope from q_b ({orig_qk_nope_dim}) != ' - f'inferred from kv_b ({actual_orig_qk_nope_dim})') - - orig_qk_nope_dim = actual_orig_qk_nope_dim - target_nope_dim = size_per_head - qk_rope_dim - target_v_head_dim = v_head_dim - - if orig_qk_nope_dim != target_nope_dim or orig_v_head_dim != target_v_head_dim: - if target_nope_dim != kv_lora_rank or target_v_head_dim != kv_lora_rank: - raise ValueError(f'MLA folding expects v_head_dim and nope_dim to equal kv_lora_rank, ' - f'got nope={target_nope_dim}, v_head={target_v_head_dim}, rank={kv_lora_rank}') - - if kv_b.size(1) != kv_lora_rank: - raise ValueError(f'kv_b_proj second dim must equal kv_lora_rank for MLA folding, ' - f'got {kv_b.size(1)} != {kv_lora_rank}') - - # Split kv_b into kc and vc - kv_b_per_head = kv_b.reshape(head_num, orig_qk_nope_dim + orig_v_head_dim, kv_lora_rank) - kc_w = kv_b_per_head[:, :orig_qk_nope_dim, :] - vc_w = kv_b_per_head[:, orig_qk_nope_dim:, :] - - # Fold kc into q_b_proj - q_b_per_head = q_b.reshape(head_num, orig_q_head_dim, q_b.size(1)) - q_nope_w = q_b_per_head[:, :orig_qk_nope_dim, :] - q_rope_w = q_b_per_head[:, orig_qk_nope_dim:, :] - q_nope_expanded = torch.bmm(kc_w.transpose(1, 2), q_nope_w) - q_b_folded = torch.cat([q_nope_expanded, q_rope_w], dim=1) - q_b = q_b_folded.reshape(head_num * size_per_head, q_b.size(1)) - - # Fold vc into o_proj - o_per_head = o.reshape(o.size(0), head_num, orig_v_head_dim) - o_folded = torch.bmm(o_per_head.permute(1, 0, 2), vc_w) - o = o_folded.permute(1, 0, 2).reshape(o.size(0), head_num * kv_lora_rank) - - # Set kv_b to identity (kc/vc are now absorbed) - eye = torch.eye(kv_lora_rank, dtype=kv_b.dtype, device=kv_b.device) - kv_b = torch.cat([eye, eye], dim=0).repeat(head_num, 1) - # ========== End MLA Weight Folding ========== - - # Transpose after folding - q_a, q_b, q, kv_a, kv_b, o = map(transpose, (q_a, q_b, q, kv_a, kv_b, o)) - - if q is not None: - q_b = q - - # Pad o_proj to size_per_head if present - if o is not None: - o = o.reshape(head_num, v_head_dim, -1) - o = torch.nn.functional.pad(o, (0, 0, size_per_head - v_head_dim, 0, 0, 0)) - o = o.view(head_num * size_per_head, cfg.hidden_units) - - tp = self.model.attn_tp_size - - # Export MLA weights (handle None for folded-away tensors) - if q_a is not None: - self.model.save_split(pack_fn(q_a), self._mla.format(idx, 'q_a_proj', kind)) - q_b_name = 'q_proj' if q_a is None else 'q_b_proj' - if q_b is not None: - self.model.save_split(pack_fn(q_b), self._mla.format(idx, q_b_name, kind), split_dim=-1, split_num=tp) - if kv_a is not None: - self.model.save_split(pack_fn(kv_a), self._mla.format(idx, 'kv_a_proj', kind)) - # if kv_b is not None: - # self.model.save_split(pack_fn(kv_b), self._mla.format(idx, 'kv_b_proj', kind), split_dim=-1, split_num=tp) - if o is not None: - self.model.save_split(pack_fn(o), self._mla.format(idx, 'wo', kind), split_dim=0, split_num=tp) - - _layernorm = 'layers.{0}.attention.{1}_a_layernorm' - - def apply(self, i: int, r: BaseReader): - - for f in get_params(r.attn(i, None), bias=False): - f(self._export, partial(r.mla, i), i) - - q, k = r.mla_norm(i) - if q is not None: - self.model.save_split(q, self._layernorm.format(i, 'q')) - self.model.save_split(k, self._layernorm.format(i, 'kv')) - - -class LinearAttn(Module): - _linear_attn = 'layers.{0}.linear_attn.{1}.{2}' - - def __init__(self, model: BaseOutputModel): - self.model = model - self.tp = model.attn_tp_size - cfg = model.model_config - self.key_dim = cfg.linear_num_key_heads * cfg.linear_key_head_dim - self.value_dim = cfg.linear_num_value_heads * cfg.linear_value_head_dim - - def _tp_interleave_qkv(self, tensor, dim): - """Split a concatenated [Q, K, V] tensor into components, reshape each - for TP interleaving, and re-concatenate. - - in_proj_qkv layout along ``dim``: Q(key_dim) | K(key_dim) | V(value_dim). - A naive split doesn't respect component boundaries when key_dim and - value_dim differ. This method splits Q/K/V, reshapes each to - ``(tp, -1)`` along ``dim``, concatenates per-TP-shard, then flattens - so that a subsequent ``save_split(split_dim=dim)`` gives each rank the - correct portion. - """ - if dim < 0: - dim = tensor.dim() + dim - q, k, v = torch.split(tensor, [self.key_dim, self.key_dim, self.value_dim], dim=dim) - - def reshape(x): - # Move TP axis to a new dimension right after ``dim`` - shape = list(x.shape) - d = shape[dim] - new_shape = shape[:dim] + [self.tp, d // self.tp] + shape[dim + 1:] - return x.view(new_shape) - - parts = torch.cat([reshape(q), reshape(k), reshape(v)], dim=dim + 1) - # Collapse tp and per-shard dims back - shape = list(parts.shape) - final_shape = shape[:dim] + [shape[dim] * shape[dim + 1]] + shape[dim + 2:] - return parts.reshape(final_shape) - - def apply(self, i: int, r: BaseReader): - layer_types = getattr(self.model.model_config, 'layer_types', []) - if i >= len(layer_types) or layer_types[i] != 'linear_attention': - return - - for kind in ['weight', 'bias']: - weights = r.linear_attn(i, kind) - if not weights: - continue - - names = ['conv1d', 'in_proj_qkv', 'in_proj_z', 'in_proj_b', 'in_proj_a', 'out_proj', 'A_log', 'dt_bias'] - for name, tensor in zip(names, weights): - if tensor is None: - continue - if name == 'conv1d': - # conv1d shape: (conv_dim, 1, d_conv) where - # conv_dim = key_dim*2 + value_dim. Interleave Q/K/V - # portions along dim 0 before splitting for TP. - tensor = self._tp_interleave_qkv(tensor, dim=0) - self.model.save_split(tensor, - self._linear_attn.format(i, name, kind), - split_dim=0, - split_num=self.tp) - elif name in ['A_log', 'dt_bias']: - # Split per-head params across TP ranks (use -1 to - # avoid the 1-D copy shortcut in save_split). - self.model.save_split(tensor, - self._linear_attn.format(i, name, kind), - split_dim=-1, - split_num=self.tp) - elif name == 'out_proj': - self.model.save_split(transpose(tensor), - self._linear_attn.format(i, name, kind), - split_dim=0, - split_num=self.tp) - elif name == 'in_proj_qkv': - # in_proj_qkv: (conv_dim, hidden) where conv_dim = - # key_dim*2 + value_dim. After transpose the QKV - # components are along dim -1. Interleave for TP so - # each shard gets the correct Q/K/V slice. - t = transpose(tensor) - t = self._tp_interleave_qkv(t, dim=-1) - self.model.save_split(t, self._linear_attn.format(i, name, kind), split_dim=-1, split_num=self.tp) - else: - self.model.save_split(transpose(tensor), - self._linear_attn.format(i, name, kind), - split_dim=-1, - split_num=self.tp) - - norm = r.linear_norm(i, 'weight') - if norm is not None: - self.model.export_weight(norm, f'layers.{i}.linear_attn.norm.weight') - - -class Misc(Module): - """ - requires: - r.tok_embeddings() - r.norm_weight() - r.output_weight() - """ - - def apply(self, i: int, r: BaseReader): - """Export embedding, norm, output weight.""" - emb = r.tok_embeddings() - norm_weight = r.norm_weight() - output_weight = r.output_weight() - - def pad_weight(tensor: torch.Tensor, tp: int): - pad_size = None - vocab_size = self.model.model_config.vocab_size - if vocab_size % tp != 0: - pad_size = (vocab_size + tp - 1) // tp * tp - vocab_size - if pad_size is None: - return tensor - return torch.nn.functional.pad(tensor, (0, 0, 0, pad_size), 'constant', 0) - - tp = self.model.attn_tp_size * self.model.attn_cp_size - if emb is not None: - emb = pad_weight(emb, tp=tp) - self.model.save_split(emb, 'tok_embeddings.weight', split_dim=1, split_num=tp) - if norm_weight is not None: - self.model.export_weight(norm_weight, 'norm.weight') - if output_weight is not None: - output_weight = pad_weight(output_weight, tp=tp) - # transpose - self.model.save_split(output_weight.t(), 'output.weight', split_dim=1, split_num=tp) - - -class Transformer: - - def __init__(self, model: BaseOutputModel): - self.model = model - modules = [LayerNorm] - if model.model_config.kv_lora_rank: - modules.append(MLA) - else: - modules.append(Attn) - if getattr(model.model_config, 'layer_types', []): - modules.append(LinearAttn) - if model.model_config.inter_size: - modules.append(Ffn) - if model.model_config.expert_num: - modules.append(MoeFfn) - self.modules = [c(model) for c in modules] - self.misc = Misc(model) - - def __call__(self, i: int, r: BaseReader): - if i >= 0: - for m in self.modules: - m(i, r) - return 1 - else: - self.misc(i, r) diff --git a/lmdeploy/turbomind/deploy/parameter.py b/lmdeploy/turbomind/deploy/parameter.py deleted file mode 100644 index 59c6f0158f..0000000000 --- a/lmdeploy/turbomind/deploy/parameter.py +++ /dev/null @@ -1,160 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from abc import abstractmethod - -import torch - - -def identity(x): - return x - - -def to_half(x: torch.Tensor): - return x.to(torch.half) - - -def to_float(x: torch.Tensor): - return x.to(torch.float) - - -def to_fp8(x: torch.Tensor): - assert x.dtype == torch.uint8 - return x.view(dtype=torch.float8_e4m3fn) - - -def pack_u4_row(x: torch.Tensor) -> torch.Tensor: - assert x.dtype == torch.uint8, f'x.dtype: {x.dtype}' - xs = x.view(*x.shape[:-1], -1, 8).split(1, dim=-1) - a = torch.zeros(xs[0].shape, dtype=torch.int32, device=x.device) - for t in reversed(xs): - a = (a << 4) | t - return a.squeeze(dim=-1) - - -def generate_zero_point(scales): - """Synthesize symmetric int4 zero-points from exported scale shapes.""" - return tuple(torch.full(s.shape, 8, dtype=torch.uint8) for s in scales) - - -class Parameter: - KEY = () - - @classmethod - def take(cls, keys: list[str]): - if not any(k.endswith(cls.KEYS[0]) for k in keys): - return False - xs = [] - for k in keys: - if any(k.endswith(p) for p in cls.KEYS): - xs.append(k) - for x in xs: - keys.remove(x) - return xs - - @abstractmethod - def __call__(cls, f, g, i): - pass - - -class QuantWeightOnly(Parameter): - AWQ_KEYS = '.qweight', '.scales', '.qzeros' - COMPRESSED_KEYS = '.weight_packed', '.weight_scale', '.weight_zero_point' - KEYS = AWQ_KEYS + COMPRESSED_KEYS - - @classmethod - def take(cls, keys: list[str]): - if any(k.endswith(cls.AWQ_KEYS[0]) for k in keys): - suffixes = cls.AWQ_KEYS - elif any(k.endswith(cls.COMPRESSED_KEYS[0]) for k in keys): - suffixes = cls.COMPRESSED_KEYS - else: - return False - - xs = [] - for k in keys: - if any(k.endswith(p) for p in suffixes): - xs.append(k) - for x in xs: - keys.remove(x) - return xs - - def __init__(self, xs): - self.compressed_tensors = any(key.endswith(self.COMPRESSED_KEYS[0]) for key in xs) - self.has_zero_point = any(key.endswith(self.COMPRESSED_KEYS[2]) for key in xs) - - def _get(self, g, kind: str): - if not self.compressed_tensors: - return g(kind) - - mapping = { - 'qweight': 'weight_packed', - 'scales': 'weight_scale', - 'qzeros': 'weight_zero_point', - } - return g(mapping[kind]) - - def __call__(self, f, g, i): - f(i, self._get(g, 'qweight'), 'qweight', pack_u4_row) - scales = self._get(g, 'scales') - f(i, scales, 'scales', to_half, apply_gs=['w2']) - if self.compressed_tensors and not self.has_zero_point: - zeros = generate_zero_point(scales) - else: - zeros = self._get(g, 'qzeros') - f(i, zeros, 'zeros', to_half, apply_gs=['w2']) - - -class WeightScaleInv(Parameter): - KEYS = '.weight_scale_inv', '.weight' - - # TODO: flag any operations crossing the quant blocks as illegal - def __call__(self, f, g, i): - f(i, g('weight_scale_inv'), 'scales', to_float, apply_gs=['w1', 'w3', 'w2']) - f(i, g('weight'), 'weight', identity) - - -class Mxfp4Weight(Parameter): - KEYS = '.blocks', '.scales' - - def __call__(self, f, g, i): - f(i, g('blocks'), 'weight', pack_u4_row) - f(i, g('scales'), 'scales', identity, apply_gs=['w2']) - - -class Weight(Parameter): - KEYS = '.weight', - - def __call__(self, f, g, i): - f(i, g('weight'), 'weight', identity) - - -class Bias(Parameter): - KEYS = '.bias', - - def __call__(self, f, g, i): - f(i, g('bias'), 'bias', identity) - - -class PLora(Parameter): - KEYS = '.Plora_A.weight', '.Plora_B.weight' - - def __call__(self, f, g, i): - f(i, g('Plora_A.weight'), 'lora_a.weight', identity) - f(i, g('Plora_B.weight'), 'lora_b.weight', identity) - - -def get_params(keys: list[str], bias=0): - ps = [] - if PLora.take(keys): - ps.append(PLora()) - xs = QuantWeightOnly.take(keys) - if xs: - ps.append(QuantWeightOnly(xs)) - if WeightScaleInv.take(keys): - ps.append(WeightScaleInv()) - if Mxfp4Weight.take(keys): - ps.append(Mxfp4Weight()) - if Weight.take(keys): - ps.append(Weight()) - if bias and Bias.take(keys): - ps.append(Bias()) - return ps diff --git a/lmdeploy/turbomind/deploy/policy.py b/lmdeploy/turbomind/deploy/policy.py deleted file mode 100644 index 0e4c061c0d..0000000000 --- a/lmdeploy/turbomind/deploy/policy.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. - -import torch.cuda - - -def to_cuda(x: torch.Tensor, *args): - return x.cuda() - - -def get_u4_slices(x: torch.Tensor, dtype: torch.dtype) -> list[torch.Tensor]: - MAP = {torch.int32: 8, torch.uint8: 2} - xs = [] - for _ in range(MAP[x.dtype]): - xs.append((x & 15).to(dtype)) - x = x >> 4 - return xs - - -def unpack_awq_gemm(x: torch.Tensor) -> torch.Tensor: - xs = get_u4_slices(x, torch.uint8) - order = [0, 4, 1, 5, 2, 6, 3, 7] - ys = [xs[i] for i in order] - return torch.stack(ys, dim=-1).view(*x.shape[:-1], -1) - - -def process_awq_gemm(x: torch.Tensor, kind: str): - x = x.cuda() - if x.dtype == torch.int32: - x = unpack_awq_gemm(x) - if kind in ['qweight', 'qzeros', 'scales']: - x = x.t() - return x - - -def process_gptq(x: torch.Tensor, kind: str): - x = x.cuda() - if x.dtype == torch.int32: - xs = get_u4_slices(x, torch.uint8) - if kind == 'qweight': # (k/8,n) - x = torch.stack(xs, dim=1).view(-1, x.size(-1)) - else: # 'qzeros' (k/g,n/8) - x = torch.stack(xs, dim=-1).view(x.size(0), -1) + 1 - if kind in ['qweight', 'qzeros', 'scales']: - x = x.t() - return x - - -def process_mxfp4(x: torch.Tensor, kind: str): - # print(x.shape, x.dtype, kind) - x = x.cuda() - if kind == 'blocks': - xs = get_u4_slices(torch.flatten(x, start_dim=-2), torch.uint8) - x = torch.flatten(torch.stack(xs, dim=-1), start_dim=-2) - if kind == 'scales': - pass - return x - - -def process_fp8(x: torch.Tensor, kind: str): - x = x.cuda() - if x.dtype == torch.float8_e4m3fn: - # some ops (e.g. torch.cat) for fp8 is not implemented in pytorch - return x.view(dtype=torch.uint8) - elif kind != 'weight_scale_inv' and x.dtype == torch.float: - return x.to(dtype=torch.bfloat16) - else: - return x.to(dtype=torch.bfloat16) - - -def process_compressed_tensor(x: torch.Tensor, kind: str): - x = x.cuda() - if x.dtype == torch.int32: - xs = get_u4_slices(x, torch.uint8) - if kind == 'weight_packed': # (out_channels, in_channels // 8) - x = torch.stack(xs, dim=-1).view(*x.shape[:-1], -1) - elif kind == 'weight_zero_point': # (out_channels // 8, in_channels // group_size) - x = torch.stack(xs, dim=1).view(-1, x.size(-1)) - return x - - -def get_input_policy(model_format): - if model_format == 'awq': - return process_awq_gemm - elif model_format == 'gptq': - return process_gptq - elif model_format == 'mxfp4': - return process_mxfp4 - elif model_format == 'fp8': - return process_fp8 - elif model_format == 'compressed-tensors': - return process_compressed_tensor - else: - return to_cuda diff --git a/lmdeploy/turbomind/deploy/source_model/__init__.py b/lmdeploy/turbomind/deploy/source_model/__init__.py deleted file mode 100644 index 11a17bea9d..0000000000 --- a/lmdeploy/turbomind/deploy/source_model/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .baichuan import Baichuan2Model, BaichuanModel # noqa: F401 -from .deepseek2 import DeepSeek2Model # noqa: F401 -from .deepseek_vl import DeepSeekVLModel # noqa: F401 -from .glm4 import Glm4Model # noqa: F401 -from .glm4_moe_lite import Glm4MoeLiteModel # noqa: F401 -from .gpt_oss import GptOssModel # noqa: F401 -from .internlm2 import InternLM2Model # noqa: F401 -from .internvl import InternVLModel # noqa: F401 -from .llama import LlamaModel # noqa: F401 -from .llava import LlavaModel # noqa: F401 -from .minicpmv import MiniCPMVModel # noqa: F401 -from .mixtral import MixtralModel # noqa: F401 -from .molmo import MolmoModel # noqa: F401 -from .qwen import QwenModel # noqa: F401 -from .xcomposer2 import Xcomposer2Model # noqa: F401 diff --git a/lmdeploy/turbomind/deploy/source_model/baichuan.py b/lmdeploy/turbomind/deploy/source_model/baichuan.py deleted file mode 100644 index 51ca34b55a..0000000000 --- a/lmdeploy/turbomind/deploy/source_model/baichuan.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. - -import torch - -from .base import INPUT_MODELS -from .llama import LlamaModel, LlamaReader - - -class BaichuanReader(LlamaReader): - """BaichuanReader.""" - - def _attn(self, i: int, kind: str): - """Get q, k, v, o kind for layer i.""" - q, k, v, o = (None, ) * 4 - pack_key = f'model.layers.{i}.self_attn.W_pack.{kind}' - qkv = self.transform(self.params.get(pack_key), kind) - if qkv is not None: - q, k, v = torch.split(qkv, qkv.shape[0] // 3, dim=0) - o = self.params.get(f'model.layers.{i}.self_attn.o_proj.{kind}') - o = self.transform(o, kind) - return q, k, v, o - - -@INPUT_MODELS.register_module(name='baichuan') -class BaichuanModel(LlamaModel): - """Llama model in baichuan format.""" - - Reader = BaichuanReader - - -class Baichuan2Reader(BaichuanReader): - """Baichuan2Reader.""" - - def output_weight(self): - """Get output.""" - # https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat/blob/main/modeling_baichuan.py#L507 - tensor = self.params.get('lm_head.weight', None) - if tensor is not None: - tensor = tensor.cuda() - tensor = torch.nn.functional.normalize(tensor) - return tensor - - -@INPUT_MODELS.register_module(name='baichuan2') -class Baichuan2Model(LlamaModel): - """Llama model in baichuan format.""" - - Reader = Baichuan2Reader diff --git a/lmdeploy/turbomind/deploy/source_model/base.py b/lmdeploy/turbomind/deploy/source_model/base.py deleted file mode 100644 index 9bc6ca3bbc..0000000000 --- a/lmdeploy/turbomind/deploy/source_model/base.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from abc import ABC, abstractmethod -from collections.abc import Iterator - -import torch -from mmengine import Registry - -INPUT_MODELS = Registry('source model', locations=['lmdeploy.turbomind.deploy.source_model.base']) - - -class BaseReader(ABC): - """Mapping between TM modules and source modules.""" - - def __init__(self): - pass - - def transform(self, x: torch.Tensor | None, kind: str) -> torch.Tensor | None: - return None if x is None else self._transform(x, kind) - - @abstractmethod - def _transform(self, x: torch.Tensor, kind: str): - """Transform x.""" - pass - - -class BaseInputModel(ABC): - """Base class for input model.""" - - def __init__(self, model_path: str, tokenizer_path: str, **kwargs): - """Constructor for BaseInputModel. - - Args: - model_path (str): the path of the model. - tokenizer_path (str): the path of the tokenizer model. - """ - self.model_path = model_path - self.tokenizer_path = tokenizer_path - - @abstractmethod - def model_info(self) -> dict: - """Read model info.""" - pass - - @abstractmethod - def readers(self) -> Iterator[BaseReader]: - pass diff --git a/lmdeploy/turbomind/deploy/source_model/deepseek2.py b/lmdeploy/turbomind/deploy/source_model/deepseek2.py deleted file mode 100644 index 79b6d3c354..0000000000 --- a/lmdeploy/turbomind/deploy/source_model/deepseek2.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import math -import os - -from ..config import RopeParam -from .base import INPUT_MODELS -from .llama import LlamaModel, LlamaReader - - -class DeepSeek2Reader(LlamaReader): - - def moe_ffn_gate(self, i, kind): - return self.params.get(f'model.layers.{i}.mlp.gate.{kind}') - - def moe_ffn_expert(self, e=None, i=None, kind=None): - if not kind: - return self.filter(r'experts', i) - result = [] - for key in ['gate', 'down', 'up']: - name = f'model.layers.{i}.mlp.experts.{e}.{key}_proj.{kind}' - tensor = self.params.get(name) - tensor = self.transform(tensor, kind) - result.append(tensor) - return (*result, ) - - def _ffn(self, i: int, kind: str): - """Get ffn kind for layer i.""" - if not kind: - # Filter by layer number to get only keys for this specific layer - if i == 0: - pattern = rf'model\.layers\.{i}\.mlp\.' - else: - pattern = rf'model\.layers\.{i}\.mlp\.shared_experts\.' - return self.filter(pattern, None) - result = [] - for key in ['gate', 'down', 'up']: - name = f'model.layers.{i}.mlp.shared_experts.{key}_proj.{kind}' - if i == 0: - name = name.replace('shared_experts.', '') - tensor = self.params.get(name) - tensor = self.transform(tensor, kind) - result.append(tensor) - return (*result, ) - - def ffn(self, i: int, kind: str): - return self._ffn(i, kind) - - def mla(self, i: int, kind: str): - if not kind: - return self.filter(r'self_attn.*proj', i) - result = [] - for key in ['q_a_proj', 'q_b_proj', 'q_proj', 'kv_a_proj_with_mqa', 'kv_b_proj', 'o_proj']: - tensor = self.params.get(f'{self.attn_layer_prefix}.{i}.self_attn.{key}.{kind}') - tensor = self.transform(tensor, kind) - result.append(tensor) - return (*result, ) - - def mla_norm(self, i: int): - result = [] - for k in ['q', 'kv']: - name = f'{self.attn_layer_prefix}.{i}.self_attn.{k}_a_layernorm.weight' # noqa: E501 - result.append(self.params.get(name)) - return (*result, ) - - -def get_yarn_params(rope_scaling: dict): - - scaling_factor = float(rope_scaling['factor']) - mscale = rope_scaling['mscale'] - mscale_all_dim = rope_scaling['mscale_all_dim'] - - def yarn_get_mscale(scale=1, mscale=1): - if scale <= 1: - return 1.0 - return 0.1 * mscale * math.log(scale) + 1.0 - - _mscale = float(yarn_get_mscale(scaling_factor, mscale) / yarn_get_mscale(scaling_factor, mscale_all_dim)) - - softmax_scale = 0 - if mscale_all_dim: - scale = yarn_get_mscale(scaling_factor, mscale_all_dim) - softmax_scale = scale * scale - - return _mscale, softmax_scale - - -@INPUT_MODELS.register_module(name='deepseek2') -class DeepSeek2Model(LlamaModel): - - Reader = DeepSeek2Reader - - def model_info(self): - cfg = self.model_config - info = super().model_info() - qk_nope_dim = cfg['qk_nope_head_dim'] - qk_rope_dim = cfg['qk_rope_head_dim'] - kv_lora_rank = cfg['kv_lora_rank'] - q_head_dim = qk_nope_dim + qk_rope_dim - num_layer = cfg['num_hidden_layers'] - expert_num = cfg['n_routed_experts'] - expert_num = [expert_num] * num_layer - expert_num[0] = 0 - n_shared_experts = cfg['n_shared_experts'] - expert_inter_size = cfg['moe_intermediate_size'] - experts_per_token = cfg['num_experts_per_tok'] - inter_size = [n_shared_experts * expert_inter_size] * num_layer - inter_size[0] = cfg['intermediate_size'] - norm_topk_prob = cfg['norm_topk_prob'] - size_per_head = qk_rope_dim + qk_nope_dim - v_head_dim = cfg['v_head_dim'] - softmax_scale = 0.0 - disable_mla_fold = os.getenv('LMDEPLOY_MLA_FOLD', '1').lower() in ('0', 'false', 'no') - if kv_lora_rank and kv_lora_rank != qk_nope_dim and not disable_mla_fold: - # MLA folding: remap to kv_lora_rank-based head dims and fold - # kc/vc BMMs into q_b_proj/o_proj at conversion time. - size_per_head = kv_lora_rank + qk_rope_dim - v_head_dim = kv_lora_rank - softmax_scale = q_head_dim**(-0.5) - elif kv_lora_rank and kv_lora_rank != qk_nope_dim: - softmax_scale = q_head_dim**(-0.5) - - info.update(kv_lora_rank=kv_lora_rank, - q_lora_rank=cfg['q_lora_rank'] or 0, - qk_rope_dim=qk_rope_dim, - v_head_dim=v_head_dim, - size_per_head=size_per_head, - kv_head_num=1, - expert_num=expert_num, - expert_inter_size=expert_inter_size, - experts_per_token=experts_per_token, - inter_size=inter_size, - norm_topk_prob=norm_topk_prob, - routed_scale=cfg['routed_scaling_factor'], - topk_method=cfg['topk_method'], - topk_group=cfg['topk_group'], - moe_group_num=cfg['n_group'], - scoring_func=cfg.get('scoring_func', 'softmax'), - tune_layer_num=2) - if 'router_n_groups' in cfg and cfg['router_n_groups'] > 0: - info['router_n_groups'] = cfg['router_n_groups'] - rope_param: RopeParam = info['rope_param'] - rope_param.dim = qk_rope_dim - if 'rope_parameters' in cfg: - # transformers v5.0.0 aggregates all rope-related parameters into 'rope_parameters' - rope_scaling = cfg['rope_parameters'] - else: - rope_scaling = cfg.get('rope_scaling') - if rope_scaling and rope_scaling.get('type') == 'yarn': - attention_factor, yarn_scale = get_yarn_params(rope_scaling) - yarn_scale *= q_head_dim**(-0.5) - rope_param.max_position_embeddings = rope_scaling['original_max_position_embeddings'] - rope_param.attention_factor = attention_factor - info.update(rope_param=rope_param, softmax_scale=yarn_scale) - elif softmax_scale: - info.update(softmax_scale=softmax_scale) - return info diff --git a/lmdeploy/turbomind/deploy/source_model/deepseek_vl.py b/lmdeploy/turbomind/deploy/source_model/deepseek_vl.py deleted file mode 100644 index 8fa8a4c85a..0000000000 --- a/lmdeploy/turbomind/deploy/source_model/deepseek_vl.py +++ /dev/null @@ -1,78 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import json -import os.path as osp - -from ..config import RopeParam -from .base import INPUT_MODELS -from .llama import LlamaModel, LlamaReader - - -class DeepSeekVLReader(LlamaReader): - """DeepSeekVL model reader.""" - - attn_layer_prefix = 'language_model.model.layers' - attn_layer_patten = r'language_model\.model\.layers\.([0-9]+).' - tok_embeddings_key = 'language_model.model.embed_tokens.weight' - norm_weight_key = 'language_model.model.norm.weight' - output_weight_key = 'language_model.lm_head.weight' - - def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, model_cfg: dict, **kwargs): - model_cfg = model_cfg['language_config'] - super().__init__(new_params, unused_params, last_bin, model_cfg, **kwargs) - - def attn_norm(self, i: int): - """Get attn norm for layer i.""" - return self.params[f'language_model.model.layers.{i}.input_layernorm.weight'] - - def ffn_norm(self, i: int): - """Get ffn norm for layer i.""" - return self.params[f'language_model.model.layers.{i}.post_attention_layernorm.weight'] - - -@INPUT_MODELS.register_module(name='deepseekvl') -class DeepSeekVLModel(LlamaModel): - """DeepSeekVL model in hf format.""" - - Reader = DeepSeekVLReader - - def model_info(self): - """Read model info.""" - params_path = osp.join(self.model_path, 'config.json') - with open(params_path) as f: - model_arg = json.load(f) - if 'language_config' in model_arg and model_arg['language_config'].get('model_type', None) == 'llama': - model_arg = model_arg['language_config'] # depseek-vl - num_layer = model_arg['num_hidden_layers'] - hidden_units = model_arg.get('hidden_size', 4096) - inter_size = model_arg.get('intermediate_size', 11008) - vocab_size = model_arg.get('vocab_size', 102400) - norm_eps = model_arg.get('rms_norm_eps', 1e-06) - attn_head_num = model_arg.get('num_attention_heads', 32) - if 'num_key_value_heads' in model_arg: - kv_head_num = model_arg['num_key_value_heads'] - else: - kv_head_num = model_arg.get('num_attention_heads', 32) - rope_theta = float(model_arg.get('rope_theta', 10000.0)) - max_position_embeddings = int(model_arg.get('max_position_embeddings', 0)) - rope_scaling = model_arg.get('rope_scaling', None) - scaling_factor = 0.0 - scaling_type = 'default' - if isinstance(rope_scaling, dict): - scaling_type = model_arg['rope_scaling'].get('type', 'default') - scaling_factor = model_arg['rope_scaling'].get('factor', '') - head_dim = model_arg.get('head_dim', hidden_units // attn_head_num) - rope_param = RopeParam(type=scaling_type, - base=rope_theta, - dim=head_dim, - max_position_embeddings=max_position_embeddings, - factor=scaling_factor) - - return dict(num_layer=num_layer, - norm_eps=norm_eps, - head_num=attn_head_num, - kv_head_num=kv_head_num, - hidden_units=hidden_units, - inter_size=inter_size, - vocab_size=vocab_size, - max_position_embeddings=max_position_embeddings, - rope_param=rope_param) diff --git a/lmdeploy/turbomind/deploy/source_model/glm4.py b/lmdeploy/turbomind/deploy/source_model/glm4.py deleted file mode 100644 index df6c2f574a..0000000000 --- a/lmdeploy/turbomind/deploy/source_model/glm4.py +++ /dev/null @@ -1,98 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import json -import os.path as osp - -import torch - -from ..config import RopeParam -from .base import INPUT_MODELS -from .llama import LlamaModel, LlamaReader - - -class Glm4Reader(LlamaReader): - """Glm4Reader.""" - - attn_layer_patten = r'transformer\.encoder\.layers\.([0-9]+).' - tok_embeddings_key = 'transformer.embedding.word_embeddings.weight' - norm_weight_key = 'transformer.encoder.final_layernorm.weight' - output_weight_key = 'transformer.output_layer.weight' - - attn_pattern = r'self_attention' - - def _attn(self, i: int, kind: str): - """Get q, k, v, o kind for layer i.""" - qkv = self.params[f'transformer.encoder.layers.{i}' - f'.self_attention.query_key_value.{kind}'] - qkv = self.transform(qkv, kind) - attn_head_num = self.model_cfg['num_attention_heads'] - kv_head_num = attn_head_num - if self.model_cfg.get('multi_query_attention', False): - kv_head_num = self.model_cfg['multi_query_group_num'] - HEAD_DIM = 128 - q, k, v = torch.split(qkv, [attn_head_num * HEAD_DIM, kv_head_num * HEAD_DIM, kv_head_num * HEAD_DIM], dim=0) - o = self.params.get(f'transformer.encoder.layers.{i}.self_attention.dense.{kind}') - o = self.transform(o, kind) - if o is None: # handle the case when qkv has bias but o doesn't - o = torch.zeros_like(q) - return q, k, v, o - - def attn_norm(self, i: int): - """Get attn norm for layer i.""" - return self.params[f'transformer.encoder.layers.{i}.input_layernorm.weight'] - - def _ffn(self, i: int, kind: str): - """Get ffn kind for layer i.""" - up_and_gate = self.params[f'transformer.encoder.layers.{i}.mlp.dense_h_to_4h.{kind}'] - up_and_gate = self.transform(up_and_gate, kind) - up, gate = up_and_gate.chunk(2, dim=0) - down = self.params[f'transformer.encoder.layers.{i}.mlp.dense_4h_to_h.{kind}'] - down = self.transform(down, kind) - return (up, down, gate) - - def ffn_norm(self, i: int): - """Get ffn norm for layer i.""" - return self.params[f'transformer.encoder.layers.{i}.post_attention_layernorm.weight'] - - -@INPUT_MODELS.register_module(name='glm4') -class Glm4Model(LlamaModel): - """Glm2/3/4 model in hf format.""" - - Reader = Glm4Reader - - def __init__(self, model_path: str, tokenizer_path: str, **kwargs): - super().__init__(model_path, tokenizer_path, **kwargs) - config_path = osp.join(self.model_path, 'config.json') - with open(config_path) as f: - self.config = json.load(f) - - def model_info(self): - """Read model info.""" - config = self.config - hidden_units = config.get('hidden_size', None) - num_layer = config.get('num_hidden_layers', None) - num_layer = config.get('num_layers', num_layer) - norm_eps = config['layernorm_epsilon'] - rope_theta = float(config.get('rotary_emb_base', 10000.0)) - rope_ratio = float(config.get('rope_ratio', 1.0)) - rope_theta *= rope_ratio - attn_head_num = config['num_attention_heads'] - kv_head_num = attn_head_num - inter_size = config['ffn_hidden_size'] - vocab_size = config['padded_vocab_size'] - attn_bias = config['add_qkv_bias'] - if config['multi_query_attention']: - kv_head_num = config['multi_query_group_num'] - seq_length = config['seq_length'] - rope_param = RopeParam(type='default', base=rope_theta, dim=64) - return dict(num_layer=num_layer, - norm_eps=norm_eps, - head_num=attn_head_num, - kv_head_num=kv_head_num, - hidden_units=hidden_units, - attn_bias=int(attn_bias), - inter_size=inter_size, - vocab_size=vocab_size, - rope_param=rope_param, - max_position_embeddings=seq_length, - permute_qk=False) # head layout is same as TM diff --git a/lmdeploy/turbomind/deploy/source_model/glm4_moe_lite.py b/lmdeploy/turbomind/deploy/source_model/glm4_moe_lite.py deleted file mode 100644 index 9e4eeedebd..0000000000 --- a/lmdeploy/turbomind/deploy/source_model/glm4_moe_lite.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -"""GLM-4 MoE Lite (e.g. GLM-4.7-Flash) source model for TurboMind. - -Architecture: MLA (Multi-head Latent Attention) + MoE with dense first layer. -Weight layout follows HuggingFace checkpoint with model.layers.* (same family as DeepSeek2). -""" - -from .base import INPUT_MODELS -from .deepseek2 import DeepSeek2Model, DeepSeek2Reader - - -class Glm4MoeLiteReader(DeepSeek2Reader): - """Reader for Glm4MoeLiteForCausalLM (GLM-4.7-Flash). - - Uses same key layout as DeepSeek2: model.layers.{i}.self_attn.*, model.layers.{i}.mlp.* - Supports noaux_tc via e_score_correction_bias. - """ - - attn_layer_prefix = 'model.layers' - attn_layer_patten = r'model\.layers\.([0-9]+).' - tok_embeddings_key = 'model.embed_tokens.weight' - norm_weight_key = 'model.norm.weight' - output_weight_key = 'lm_head.weight' - - def moe_ffn_gate_correction_bias(self, i: int): - """Per-expert score correction bias for noaux_tc routing.""" - return self.params.get(f'{self.attn_layer_prefix}.{i}.mlp.gate.e_score_correction_bias') - - -@INPUT_MODELS.register_module(name='glm4-moe-lite') -class Glm4MoeLiteModel(DeepSeek2Model): - """GLM-4 MoE Lite (e.g. GLM-4.7-Flash) in HF format. - - MLA + MoE with first_k_dense_replace; config mapping aligned to DeepSeek2. - """ - - Reader = Glm4MoeLiteReader - - def model_info(self): - cfg = self.model_config - # Set default MoE routing config for GLM-4 MoE Lite if not in HF config - if 'topk_method' not in cfg: - cfg['topk_method'] = 'noaux_tc' - if 'topk_group' not in cfg: - cfg['topk_group'] = 1 - if 'n_group' not in cfg: - cfg['n_group'] = 1 - if 'scoring_func' not in cfg: - cfg['scoring_func'] = 'sigmoid' - - info = super().model_info() - # GLM4 MoE Lite uses noaux_tc routing with sigmoid scoring - info['topk_method'] = 'noaux_tc' - info['scoring_func'] = 'sigmoid' - if 'router_n_groups' in cfg and cfg['router_n_groups'] > 0: - info['router_n_groups'] = cfg['router_n_groups'] - - return info diff --git a/lmdeploy/turbomind/deploy/source_model/gpt_oss.py b/lmdeploy/turbomind/deploy/source_model/gpt_oss.py deleted file mode 100644 index c6bfdb06b1..0000000000 --- a/lmdeploy/turbomind/deploy/source_model/gpt_oss.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. - -import re - -from .base import INPUT_MODELS -from .llama import LlamaModel, LlamaReader - - -def map_experts(str): - s = re.sub(r'(experts.*proj)$', r'\1.weight', str) - s = re.sub(r'(experts.*proj)_bias$', r'\1.bias', s) - s = re.sub(r'(experts.*proj)_blocks$', r'\1.blocks', s) - s = re.sub(r'(experts.*proj)_scales$', r'\1.scales', s) - return s - - -class GptOssReader(LlamaReader): - - mappings = [map_experts] - - def moe_ffn_expert(self, e=None, i=None, kind=None): - if not kind: - return self.filter(r'experts', i) - result = [] - for key in ['gate_up', 'down']: - name = f'{self.attn_layer_prefix}.{i}.mlp.experts.{key}_proj.{kind}' - tensor = self.params.get(name)[e] - if kind == 'weight': # experts in BF16 models are in M-major - tensor = tensor.cuda().t() - if key == 'gate_up': - gate, up = tensor[::2], tensor[1::2] - result.append(self.transform(gate, kind)) - result.append(self.transform(up, kind)) - else: - result.append(self.transform(tensor, kind)) - return (result[0], result[2], result[1]) - - def moe_ffn_gate(self, i, kind): - return self.transform(self.params.get(f'{self.attn_layer_prefix}.{i}.mlp.router.{kind}'), kind) - - def attn_sinks(self, i): - return self.params.get(f'{self.attn_layer_prefix}.{i}.self_attn.sinks') - - -@INPUT_MODELS.register_module(name='gpt-oss') -class GptOssModel(LlamaModel): - - Reader = GptOssReader - - def model_info(self): - cfg = self.model_config - types = cfg['layer_types'] - sliding_window = cfg['sliding_window'] - info = super().model_info() - info.update(attn_bias=int(cfg['attention_bias']), - mlp_bias=True, - expert_router_bias=True, - expert_num=cfg['num_local_experts'], - expert_inter_size=cfg['intermediate_size'], - experts_per_token=cfg['experts_per_token'], - norm_topk_prob=True, - inter_size=0, - window_size=[sliding_window if x == 'sliding_attention' else 0 for x in types], - attn_sink=True, - activation_type='gpt-oss') - return info diff --git a/lmdeploy/turbomind/deploy/source_model/internlm2.py b/lmdeploy/turbomind/deploy/source_model/internlm2.py deleted file mode 100644 index 21ccf9d9f8..0000000000 --- a/lmdeploy/turbomind/deploy/source_model/internlm2.py +++ /dev/null @@ -1,112 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. - -import re - -import torch - -from .base import INPUT_MODELS -from .llama import LlamaModel, LlamaReader - - -class InternLM2Reader(LlamaReader): - """InternLM2 model reader.""" - - attn_layer_prefix = 'model.layers' - attn_layer_patten = r'model\.layers\.([0-9]+).' - tok_embeddings_key = 'model.tok_embeddings.weight' - norm_weight_key = 'model.norm.weight' - output_weight_key = 'output.weight' - - attn_pattern = r'attention' - ffn_pattern = r'feed_forward' - - proj_pattern = 'w' - - def filter(self, pattern: str, i: int | None): - params = [] - for k in self.params.keys(): - if re.search(pattern, k): - params.append(k) - - if self.fp8_quant and pattern == self.attn_pattern: - from lmdeploy.lite.quantization.weight.quant_utils import quant_blocked_fp8 - q, k, v = (None, ) * 3 - kv_head_num = self.model_cfg['num_key_value_heads'] - gs = int(self.model_cfg['num_attention_heads'] / kv_head_num) - qkv = self.params.get(f'{self.attn_layer_prefix}.{i}.attention.wqkv.weight') - - if qkv is not None: - qkv = qkv.view(kv_head_num, gs + 2, 128, -1) - hidden_dim = qkv.shape[-1] - q, k, v = torch.split(qkv, [gs, 1, 1], dim=1) - - tensors = [q.reshape(-1, hidden_dim), k.reshape(-1, hidden_dim), v.reshape(-1, hidden_dim)] - split_sizes = [gs, 1, 1] - keys = ['q', 'k', 'v'] - qkv_weight = [] - for tensor, split_size, key in zip(tensors, split_sizes, keys): - qweight, scale = quant_blocked_fp8(tensor, torch.float8_e4m3fn, block_size=128) - qweight = qweight.reshape(kv_head_num, split_size, 128, -1) - qkv_weight.append(qweight) - - self.params[f'{self.attn_layer_prefix}.{i}.{self.attn_pattern}.w{key}.weight_scale_inv'] = scale - params.append(f'{self.attn_layer_prefix}.{i}.{self.attn_pattern}.w{key}.weight_scale_inv') - - qkv_weight = torch.cat(qkv_weight, dim=1) - qkv_weight = qkv_weight.reshape(-1, hidden_dim) - self.params[f'{self.attn_layer_prefix}.{i}.{self.attn_pattern}.wqkv.weight'] = qkv_weight - - return params - else: - return params - - def _attn(self, i: int, kind: str): - """Get q, k, v, o kind for layer i.""" - if self.fp8_quant and kind == 'weight_scale_inv': - result = [] - for key in ['q', 'k', 'v', 'o']: - tensor = self.params.get(f'{self.attn_layer_prefix}.{i}.{self.attn_pattern}.w{key}.{kind}') - tensor = self.transform(tensor, kind) - result.append(tensor) - return (*result, ) - q, k, v = (None, ) * 3 - kv_head_num = self.model_cfg['num_key_value_heads'] - gs = int(self.model_cfg['num_attention_heads'] / kv_head_num) - qkv = self.params.get(f'{self.attn_layer_prefix}.{i}.attention.wqkv.{kind}') - qkv = self.transform(qkv, kind) - if qkv is not None: - qkv = qkv.view(kv_head_num, gs + 2, 128, -1) - hidden_dim = qkv.shape[-1] - q, k, v = torch.split(qkv, [gs, 1, 1], dim=1) - q = q.reshape(-1, hidden_dim) - k = k.reshape(-1, hidden_dim) - v = v.reshape(-1, hidden_dim) - o = self.params.get(f'{self.attn_layer_prefix}.{i}.attention.wo.{kind}') - o = self.transform(o, kind) - return (q, k, v, o) - - def attn_norm(self, i: int): - """Get attn norm for layer i.""" - return self.params[f'{self.attn_layer_prefix}.{i}.attention_norm.weight'] - - def _ffn(self, i: int, kind: str): - """Get ffn kind for layer i.""" - if not kind: - return self.filter(self.ffn_pattern, i) - result = [] - for key in ['w1', 'w2', 'w3']: - tensor = self.params[f'{self.attn_layer_prefix}.{i}.feed_forward.{key}.{kind}'] - tensor = self.transform(tensor, kind) - result.append(tensor) - return (*result, ) - - def ffn_norm(self, i: int): - """Get ffn norm for layer i.""" - return self.params[f'{self.attn_layer_prefix}.{i}.ffn_norm.weight'] - - -@INPUT_MODELS.register_module(name='internlm2') -class InternLM2Model(LlamaModel): - """InternLM2 model in hf format.""" - - Reader = InternLM2Reader diff --git a/lmdeploy/turbomind/deploy/source_model/internvl.py b/lmdeploy/turbomind/deploy/source_model/internvl.py deleted file mode 100644 index 575507c2a9..0000000000 --- a/lmdeploy/turbomind/deploy/source_model/internvl.py +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .base import INPUT_MODELS -from .gpt_oss import GptOssReader -from .internlm2 import InternLM2Reader -from .llama import LlamaModel, LlamaReader -from .qwen import Qwen3MoeReader, Qwen3Reader - - -class InternVLReader(LlamaReader): - """InternVLReader for llama model.""" - - attn_layer_prefix = 'language_model.model.layers' - attn_layer_patten = r'language_model\.model\.layers\.([0-9]+).' - tok_embeddings_key = 'language_model.model.embed_tokens.weight' - norm_weight_key = 'language_model.model.norm.weight' - output_weight_key = 'language_model.lm_head.weight' - - -# Note the subtle difference in keys -class InternVL2Reader(InternLM2Reader): - """InternVLReader for InternLM2 model.""" - - attn_layer_prefix = 'language_model.model.layers' - attn_layer_patten = r'language_model\.model\.layers\.([0-9]+).' - tok_embeddings_key = 'language_model.model.tok_embeddings.weight' - norm_weight_key = 'language_model.model.norm.weight' - output_weight_key = 'language_model.output.weight' - - -class InternVL3d5Reader(Qwen3Reader): - attn_layer_prefix = 'language_model.model.layers' - attn_layer_patten = r'language_model\.model\.layers\.([0-9]+).' - tok_embeddings_key = 'language_model.model.embed_tokens.weight' - norm_weight_key = 'language_model.model.norm.weight' - output_weight_key = 'language_model.lm_head.weight' - - -class InternVL3d5Qwen3MoEReader(Qwen3MoeReader): - attn_layer_prefix = 'language_model.model.layers' - attn_layer_patten = r'language_model\.model\.layers\.([0-9]+).' - tok_embeddings_key = 'language_model.model.embed_tokens.weight' - norm_weight_key = 'language_model.model.norm.weight' - output_weight_key = 'language_model.lm_head.weight' - - -class InternVL3d5GptOSSReader(GptOssReader): - attn_layer_prefix = 'language_model.model.layers' - attn_layer_patten = r'language_model\.model\.layers\.([0-9]+).' - tok_embeddings_key = 'language_model.model.embed_tokens.weight' - norm_weight_key = 'language_model.model.norm.weight' - output_weight_key = 'language_model.lm_head.weight' - - -class InternS1Reader(Qwen3MoeReader): - """InternS1Reader for internlm/InternS1 model.""" - - attn_layer_prefix = 'model.language_model.layers' - attn_layer_patten = r'model\.language_model\.layers\.([0-9]+).' - tok_embeddings_key = 'model.language_model.embed_tokens.weight' - norm_weight_key = 'model.language_model.norm.weight' - output_weight_key = 'lm_head.weight' - - -class InternS1MiniReader(Qwen3Reader): - - attn_layer_prefix = 'model.language_model.layers' - attn_layer_patten = r'model\.language_model\.layers\.([0-9]+).' - tok_embeddings_key = 'model.language_model.embed_tokens.weight' - norm_weight_key = 'model.language_model.norm.weight' - output_weight_key = 'lm_head.weight' - - -@INPUT_MODELS.register_module(name='internvl') -class InternVLModel(LlamaModel): - """InternVL model in hf format.""" - - def __init__(self, model_path: str, tokenizer_path: str, **kwargs): - super().__init__(model_path, tokenizer_path, **kwargs) - from transformers import AutoConfig - config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) - - arch = config.architectures[0] - if arch == 'InternVLChatModel' or arch == 'InternVLForConditionalGeneration': - relations = dict(InternLM2ForCausalLM=('internlm2', InternVL2Reader), - LlamaForCausalLM=('llama', InternVLReader), - Qwen2ForCausalLM=('qwen2', InternVLReader), - Qwen3MoeForCausalLM=('qwen3-moe', InternVL3d5Qwen3MoEReader), - Qwen3ForCausalLM=('qwen3', InternVL3d5Reader), - GptOssForCausalLM=('gpt-oss', InternVL3d5GptOSSReader)) - elif arch == 'InternS1ForConditionalGeneration': - relations = dict(Qwen3MoeForCausalLM=('qwen3-moe', InternS1Reader), - Qwen3ForCausalLM=('qwen3', InternS1MiniReader)) - else: - raise ValueError('unsupported model arch {arch}') - self.llm_config = getattr(config, 'llm_config', None) or getattr(config, 'text_config', None) - arch = self.llm_config.architectures[0] - llm_model, self.Reader = relations[arch] - self.llm_model = INPUT_MODELS.get(llm_model)(model_path=model_path, tokenizer_path=tokenizer_path, **kwargs) - - def model_info(self): - """Read model info.""" - self.llm_model.model_config = self.llm_config.to_dict() - return self.llm_model.model_info() diff --git a/lmdeploy/turbomind/deploy/source_model/llama.py b/lmdeploy/turbomind/deploy/source_model/llama.py deleted file mode 100644 index 339b084f9a..0000000000 --- a/lmdeploy/turbomind/deploy/source_model/llama.py +++ /dev/null @@ -1,237 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import math -import re - -import torch - -from lmdeploy.archs import get_model_arch - -from ..config import RopeParam -from ..loader import create_loader -from .base import INPUT_MODELS, BaseInputModel, BaseReader - - -class LlamaReader(BaseReader): - """LlamaReader.""" - - attn_layer_prefix = 'model.layers' - attn_layer_patten = r'model\.layers\.([0-9]+).' - tok_embeddings_key = 'model.embed_tokens.weight' - norm_weight_key = 'model.norm.weight' - output_weight_key = 'lm_head.weight' - - attn_pattern = r'self_attn' - ffn_pattern = r'mlp' - - proj_pattern = 'proj' - scale_inv_suffix = '_scale_inv' - - def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, model_cfg: dict, policy, fp8_quant=False): - super().__init__() - self.params = unused_params - self.params.update(new_params) - self.last_bin = last_bin - self.model_cfg = model_cfg - tie_word_embeddings = self.model_cfg.get('tie_word_embeddings', False) - if tie_word_embeddings: - self.output_weight_key = self.tok_embeddings_key - self.processor = policy - self.fp8_quant = fp8_quant - if self.fp8_quant: - quant_params = self.quant_weight_fp8() - self.params.update(quant_params) - - def quant_weight_fp8(self): - from lmdeploy.lite.quantization.weight.quant_utils import quant_blocked_fp8 - pattern_str = fr'({self.attn_pattern}|{self.ffn_pattern}).*{self.proj_pattern}.*\.weight' - target_pattern = re.compile(pattern_str) - - if self.__class__.__name__ == 'InternLM2Reader': - skip_pattern = re.compile(r'wqkv.*\.weight') - else: - skip_pattern = None - - quant_params = {} - for name, weight in self.params.items(): - if target_pattern.search(name) and name.endswith('.weight'): - if skip_pattern and skip_pattern.search(name): - continue - q_weight, scale = quant_blocked_fp8(weight, torch.float8_e4m3fn, block_size=128) - quant_params[name] = q_weight - quant_params[f'{name}{self.scale_inv_suffix}'] = scale.to(weight.dtype) - - return quant_params - - def filter(self, pattern: str, i: int | None): - params = [] - for k in self.params.keys(): - if re.search(pattern, k): - params.append(k) - return params - - def tok_embeddings(self): - """Get embeddings.""" - return self.transform(self.params.get(self.tok_embeddings_key, None), 'weight') - - def norm_weight(self): - """Get norm.""" - return self.transform(self.params.get(self.norm_weight_key, None), 'weight') - - def output_weight(self): - """Get output.""" - return self.transform(self.params.get(self.output_weight_key, None), 'weight') - - def _transform(self, x: torch.Tensor, kind: str): - return self.processor(x, kind) - - def _attn(self, i: int, kind: str): - """Get q, k, v, o kind for layer i.""" - result = [] - for key in ['q', 'k', 'v', 'o']: - tensor = self.params.get(f'{self.attn_layer_prefix}.{i}.self_attn.{key}_proj.{kind}') - tensor = self.transform(tensor, kind) - result.append(tensor) - return (*result, ) - - def attn(self, i: int, kind: str): - if not kind: - return self.filter(self.attn_pattern, i) - return self._attn(i, kind) - - def attn_norm(self, i: int): - """Get attn norm for layer i.""" - return self.transform(self.params[f'{self.attn_layer_prefix}.{i}.input_layernorm.weight'], 'weight') - - def _ffn(self, i: int, kind: str): - """Get ffn kind for layer i.""" - if not kind: - return self.filter(self.ffn_pattern, i) - result = [] - for key in ['gate', 'down', 'up']: - tensor = self.params[f'{self.attn_layer_prefix}.{i}.mlp.{key}_proj.{kind}'] - tensor = self.transform(tensor, kind) - result.append(tensor) - return (*result, ) - - def ffn(self, i: int, kind: str): - if not kind: - return self.filter(self.ffn_pattern, i) - return self._ffn(i, kind) - - def ffn_norm(self, i: int): - """Get ffn norm for layer i.""" - return self.transform(self.params[f'{self.attn_layer_prefix}.{i}.post_attention_layernorm.weight'], 'weight') - - -@INPUT_MODELS.register_module(name='llama') -class LlamaModel(BaseInputModel): - """Llama model in hf format.""" - - Reader = LlamaReader - - def __init__(self, model_path: str, tokenizer_path: str, **kwargs: dict): - super().__init__(model_path, tokenizer_path) - self.policy = kwargs.get('input_policy') - _, model_config = get_model_arch(model_path) - if hasattr(model_config, 'text_config'): - model_config = model_config.text_config - elif hasattr(model_config, 'llm_config'): - model_config = model_config.llm_config - if hasattr(model_config, 'to_dict'): - self.model_config = model_config.to_dict() - else: - self.model_config = model_config - self.fp8_quant = kwargs.get('fp8_quant', False) - - def readers(self): - mappings = getattr(self.Reader, 'mappings', []) - loader = create_loader(self.model_path, self.Reader.attn_layer_patten, mappings) - for i, param in loader.items(): - reader = self.Reader(param, {}, False, self.model_config, policy=self.policy, fp8_quant=self.fp8_quant) - yield i, reader - torch.cuda.empty_cache() - - def model_info(self): - """Read model info.""" - model_arg = self.model_config - num_layer = model_arg['num_hidden_layers'] - norm_eps = model_arg['rms_norm_eps'] - attn_head_num = model_arg['num_attention_heads'] - vocab_size = model_arg['vocab_size'] - inter_size = model_arg.get('intermediate_size', 0) - if 'num_key_value_heads' in model_arg: - kv_head_num = model_arg['num_key_value_heads'] - else: - kv_head_num = model_arg['num_attention_heads'] - hidden_units = model_arg['hidden_size'] - # head_dim could be none in config - head_dim = model_arg.get('head_dim', None) - head_dim = head_dim or hidden_units // attn_head_num - # compute rope param - if 'rope_parameters' in model_arg: - # transformers v5.0.0 aggregates rope settings into rope_parameters - rope_scaling = model_arg['rope_parameters'] - rope_theta = float(rope_scaling.get('rope_theta', 10000.0)) - else: - rope_theta = float(model_arg.get('rope_theta', 10000.0)) - rope_scaling = model_arg.get('rope_scaling', None) - max_position_embeddings = int(model_arg.get('max_position_embeddings', 0)) - rope_param = RopeParam(type='default', base=rope_theta, dim=head_dim) - if isinstance(rope_scaling, dict): - rope_type = rope_scaling.get('rope_type', '') or rope_scaling.get('type', '') - if rope_scaling.get('mrope_section') is not None: - # TODO: treat mrope as an option to the common rope functions - rope_type = 'mrope' - scaling_factor = rope_scaling.get('factor', 0.0) - if rope_type == 'default': - pass - elif rope_type == 'dynamic': - rope_param.type = 'dynamic' - rope_param.factor = scaling_factor - rope_param.max_position_embeddings = max_position_embeddings - elif rope_type == 'linear': - rope_param.type = 'linear' - rope_param.factor = scaling_factor - elif rope_type == 'llama3': - low_freq_factor = rope_scaling.get('low_freq_factor', 1.0) - high_freq_factor = rope_scaling.get('high_freq_factor', 1.0) - original_max_position_embeddings = rope_scaling.get('original_max_position_embeddings', 0) - rope_param.type = 'llama3' - rope_param.factor = scaling_factor - rope_param.low_freq_factor = low_freq_factor - rope_param.high_freq_factor = high_freq_factor - rope_param.original_max_position_embeddings = original_max_position_embeddings - elif rope_type == 'yarn': - attention_factor = rope_scaling.get('attention_factor', None) - if attention_factor is None: - attention_factor = 0.1 * math.log(scaling_factor) + 1.0 - beta_fast = rope_scaling.get('beta_fast', 32.0) - beta_slow = rope_scaling.get('beta_slow', 1.0) - rope_param.type = 'yarn' - if 'original_max_position_embeddings' in rope_scaling: - original_max_position_embeddings = rope_scaling['original_max_position_embeddings'] - scaling_factor = max_position_embeddings / original_max_position_embeddings - else: - original_max_position_embeddings = max_position_embeddings - rope_param.factor = scaling_factor - rope_param.max_position_embeddings = original_max_position_embeddings - rope_param.attention_factor = attention_factor - rope_param.beta_fast = beta_fast - rope_param.beta_slow = beta_slow - elif rope_type == 'mrope': - mrope_section = rope_scaling.get('mrope_section') - rope_param.type = 'mrope' - rope_param.mrope_section = mrope_section - else: - raise RuntimeError(f'Unsupported rope type: {rope_type}') - - return dict(size_per_head=head_dim, - num_layer=num_layer, - norm_eps=norm_eps, - head_num=attn_head_num, - kv_head_num=kv_head_num, - hidden_units=hidden_units, - inter_size=inter_size, - vocab_size=vocab_size, - max_position_embeddings=max_position_embeddings, - rope_param=rope_param) diff --git a/lmdeploy/turbomind/deploy/source_model/llava.py b/lmdeploy/turbomind/deploy/source_model/llava.py deleted file mode 100644 index a305f0ac9e..0000000000 --- a/lmdeploy/turbomind/deploy/source_model/llava.py +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import json -import os.path as osp - -from ..config import RopeParam -from .base import INPUT_MODELS -from .llama import LlamaModel, LlamaReader - - -class LlavaReader(LlamaReader): - """LlavaReader for llama model.""" - - attn_layer_prefix = 'language_model.model.layers' - attn_layer_patten = r'language_model\.model\.layers\.([0-9]+).' - tok_embeddings_key = 'language_model.model.embed_tokens.weight' - norm_weight_key = 'language_model.model.norm.weight' - output_weight_key = 'language_model.lm_head.weight' - - def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, model_cfg: dict, policy): - model_cfg = model_cfg.get('text_config') - super().__init__(new_params, unused_params, last_bin, model_cfg, policy) - - -@INPUT_MODELS.register_module(name='llava') -class LlavaModel(LlamaModel): - """LlavaModel model in hf format.""" - - def __init__(self, model_path: str, tokenizer_path: str, **kwargs): - super().__init__(model_path, tokenizer_path, **kwargs) - from transformers import AutoConfig - config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) - config = getattr(config, 'text_config', config) - arch = config.architectures[0] - _readers = dict(Qwen2ForCausalLM=LlavaReader, LlamaForCausalLM=LlavaReader) - self.Reader = _readers[arch] - self.arch = arch - - def model_info(self): - """Read model info for LlavaForConditionalGeneration. - - https://huggingface.co/llava-hf/llava-interleave-qwen-7b-hf - """ - params_path = osp.join(self.model_path, 'config.json') - with open(params_path) as f: - model_arg = json.load(f)['text_config'] - num_layer = model_arg.get('num_hidden_layers', 32) - norm_eps = model_arg.get('rms_norm_eps', 1e-6) - attn_head_num = model_arg.get('num_attention_heads', 32) - if 'num_key_value_heads' in model_arg: - kv_head_num = model_arg.get('num_key_value_heads', 32) - else: - kv_head_num = model_arg.get('num_attention_heads', 32) - rope_theta = float(model_arg.get('rope_theta', 10000.0)) - max_position_embeddings = int(model_arg.get('max_position_embeddings', 0)) - rope_scaling = model_arg.get('rope_scaling', None) - scaling_factor = 0.0 - scaling_type = 'default' - - # special for the model: llava-hf/llava-interleave-qwen-7b-hf - hidden_units = model_arg.get('hidden_size', 4096) - vocab_size = model_arg.get('vocab_size', 152000) - intermediate_size = model_arg.get('intermediate_size', 11008) - attn_bias = 1 if model_arg['architectures'][0] \ - == 'Qwen2ForCausalLM' else 0 - attn_bias = int(model_arg.get('attn_bias', attn_bias)) - use_logn_attn = int(model_arg.get('use_logn_attn', 0)) - - if isinstance(rope_scaling, dict): - scaling_type = model_arg['rope_scaling'].get('type', '') - scaling_factor = model_arg['rope_scaling'].get('factor', '') - - rope_param = RopeParam(type=scaling_type, - base=rope_theta, - dim=hidden_units // attn_head_num, - max_position_embeddings=max_position_embeddings, - factor=scaling_factor) - - return dict(num_layer=num_layer, - norm_eps=norm_eps, - head_num=attn_head_num, - hidden_units=hidden_units, - kv_head_num=kv_head_num, - rope_param=rope_param, - max_position_embeddings=max_position_embeddings, - inter_size=intermediate_size, - use_logn_attn=use_logn_attn, - attn_bias=attn_bias, - vocab_size=vocab_size) diff --git a/lmdeploy/turbomind/deploy/source_model/minicpmv.py b/lmdeploy/turbomind/deploy/source_model/minicpmv.py deleted file mode 100644 index 6046dd3ac1..0000000000 --- a/lmdeploy/turbomind/deploy/source_model/minicpmv.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. - -import json -import os.path as osp - -from .base import INPUT_MODELS -from .llama import LlamaModel, LlamaReader - - -class MiniCPMVReader(LlamaReader): - """MiniCPMVReader for llama model.""" - - attn_layer_prefix = 'llm.model.layers' - attn_layer_patten = r'llm\.model\.layers\.([0-9]+).' - tok_embeddings_key = 'llm.model.embed_tokens.weight' - norm_weight_key = 'llm.model.norm.weight' - output_weight_key = 'llm.lm_head.weight' - - -@INPUT_MODELS.register_module(name='minicpmv') -class MiniCPMVModel(LlamaModel): - """MiniCPMV model in hf format.""" - Reader = MiniCPMVReader - - def model_info(self): - info = super().model_info() - with open(osp.join(self.model_path, 'config.json')) as f: - config = json.load(f) - if str(config.get('version')) == '2.6': - info['attn_bias'] = True - return info diff --git a/lmdeploy/turbomind/deploy/source_model/mixtral.py b/lmdeploy/turbomind/deploy/source_model/mixtral.py deleted file mode 100644 index 820a106956..0000000000 --- a/lmdeploy/turbomind/deploy/source_model/mixtral.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. - -from .base import INPUT_MODELS -from .llama import LlamaModel, LlamaReader - - -class MixtralReader(LlamaReader): - - def moe_ffn_expert(self, e=None, i=None, kind=None): - if not kind: - return self.filter(r'experts', i) - result = [] - for x in ['w1', 'w2', 'w3']: - name = f'model.layers.{i}.block_sparse_moe.experts.{e}.{x}.{kind}' - tensor = self.params.get(name) - tensor = self.transform(tensor, kind) - result.append(tensor) - return (*result, ) - - def moe_ffn_gate(self, i, kind): - return self.params.get(f'model.layers.{i}.block_sparse_moe.gate.{kind}') - - -@INPUT_MODELS.register_module(name='mixtral') -class MixtralModel(LlamaModel): - - Reader = MixtralReader - - def model_info(self): - cfg = self.model_config - info = super().model_info() - info['expert_num'] = cfg['num_local_experts'] - info['expert_inter_size'] = cfg['intermediate_size'] - info['experts_per_token'] = cfg['num_experts_per_tok'] - info['norm_topk_prob'] = True - info['inter_size'] = 0 - return info diff --git a/lmdeploy/turbomind/deploy/source_model/molmo.py b/lmdeploy/turbomind/deploy/source_model/molmo.py deleted file mode 100644 index 09e320e9de..0000000000 --- a/lmdeploy/turbomind/deploy/source_model/molmo.py +++ /dev/null @@ -1,115 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import json -import os.path as osp - -import torch - -from ..config import RopeParam -from .base import INPUT_MODELS -from .llama import LlamaModel, LlamaReader - - -class MolmoReader(LlamaReader): - attn_layer_prefix = 'model.transformer.blocks' - attn_layer_patten = r'model\.transformer\.blocks\.([0-9]+).' - norm_weight_key = 'model.transformer.ln_f.weight' - output_weight_key = 'model.transformer.ff_out.weight' - - # In molmo, names of attention parameters are "att_proj.bias", - # "att_proj.weight", "attn_norm.weight", "attn_out.weight", and names - # of ffn parameters are "ff_norm", "ff_out", "ff_proj", so we - # make the patterns are r'att' and r'ffn_', respectively. - attn_pattern = r'att' - ffn_pattern = r'ff_' - - def tok_embeddings(self): - embed1 = self.params.get('model.transformer.wte.embedding', None) - embed2 = self.params.get('model.transformer.wte.new_embedding', None) - if embed1 is not None and embed2 is not None: - return torch.cat((embed1, embed2), dim=0) - else: - assert embed1 is None and embed2 is None - return None - - def attn_norm(self, i: int): - """Get attn norm for layer i.""" - return self.params[f'{self.attn_layer_prefix}.{i}.attn_norm.weight'] - - def _attn(self, i: int, kind: str): - """Get q, k, v, o kind(weight, bias, qweight) for layer i. - - Args: - i (int): layer id - kind (str): can be one of ["weight", "bias", "qweight"] - """ - q, k, v = (None, ) * 3 - hidden_size = self.model_cfg['hidden_size'] - head_num = self.model_cfg['num_attention_heads'] - kv_head_num = self.model_cfg['num_key_value_heads'] - head_dim = hidden_size // head_num - assert head_dim == 128 - fused_dims = (hidden_size, kv_head_num * head_dim, kv_head_num * head_dim) - qkv = self.params.get(f'{self.attn_layer_prefix}.{i}.att_proj.{kind}') - qkv = self.transform(qkv, kind) - if qkv is not None: - q, k, v = qkv.split(fused_dims, dim=0) - o = self.params.get(f'{self.attn_layer_prefix}.{i}.attn_out.{kind}') - o = self.transform(o, kind) - if o is None: # handle the case when qkv has bias but o doesn't - o = torch.zeros_like(q) - return (q, k, v, o) - - def _ffn(self, i: int, kind: str): - """Get ffn kind(weight, qweight) for layer i.""" - up_and_gate = self.params[f'{self.attn_layer_prefix}.{i}.ff_proj.{kind}'] - up_and_gate = self.transform(up_and_gate, kind) - gate, up = up_and_gate.chunk(2, dim=0) - down = self.params[f'{self.attn_layer_prefix}.{i}.ff_out.{kind}'] - down = self.transform(down, kind) - return (up, down, gate) - - def ffn_norm(self, i: int): - """Get ffn norm for layer i.""" - return self.params[f'{self.attn_layer_prefix}.{i}.ff_norm.weight'] - - -@INPUT_MODELS.register_module(name='molmo') -class MolmoModel(LlamaModel): - - Reader = MolmoReader - - def __init__(self, model_path: str, tokenizer_path: str, **kwargs): - super().__init__(model_path, tokenizer_path, **kwargs) - config_path = osp.join(self.model_path, 'config.json') - with open(config_path) as f: - self.config = json.load(f) - - def model_info(self): - config = self.config - num_layer = config['num_hidden_layers'] - norm_eps = config['layer_norm_eps'] - attn_head_num = config['num_attention_heads'] - kv_head_num = config['num_key_value_heads'] - hidden_units = config['hidden_size'] - rope_theta = config['rope_theta'] - max_position_embeddings = config['max_position_embeddings'] - vocab_size = config['vocab_size'] - # https://huggingface.co/allenai/Molmo-7B-D-0924/blob/main/modeling_molmo.py#L2041 - additional_vocab_size = 128 - inter_size = config['intermediate_size'] // 2 - attn_bias = config['qkv_bias'] - rope_param = RopeParam(type='default', base=rope_theta, dim=hidden_units // attn_head_num) - return dict( - num_layer=num_layer, - norm_eps=norm_eps, - head_num=attn_head_num, - kv_head_num=kv_head_num, - hidden_units=hidden_units, - attn_bias=int(attn_bias), - inter_size=inter_size, - vocab_size=vocab_size, - # https://huggingface.co/allenai/Molmo-7B-D-0924/blob/main/modeling_molmo.py#L564 - embedding_size=vocab_size + additional_vocab_size, - rope_param=rope_param, - max_position_embeddings=max_position_embeddings, - ) diff --git a/lmdeploy/turbomind/deploy/source_model/qwen.py b/lmdeploy/turbomind/deploy/source_model/qwen.py deleted file mode 100644 index 2223151e54..0000000000 --- a/lmdeploy/turbomind/deploy/source_model/qwen.py +++ /dev/null @@ -1,499 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import json -import os.path as osp -import re - -import torch - -from ..config import RopeParam -from ..loader import create_loader -from .base import INPUT_MODELS -from .llama import LlamaModel, LlamaReader - - -class QwenReader(LlamaReader): - """QwenReader.""" - - attn_layer_patten = r'transformer\.h\.([0-9]+).' - tok_embeddings_key = 'transformer.wte.weight' - norm_weight_key = 'transformer.ln_f.weight' - output_weight_key = 'lm_head.weight' - - attn_pattern = r'attn' - ffn_pattern = r'mlp' - - def _attn(self, i: int, kind: str): - """Get q, k, v, o kind for layer i.""" - q, k, v, o = (None, ) * 4 - qkv = self.params[f'transformer.h.{i}.attn.c_attn.{kind}'] - qkv = self.transform(qkv, kind) - if qkv is not None: - q, k, v = torch.split(qkv, qkv.size(0) // 3, dim=0) - o = self.params.get(f'transformer.h.{i}.attn.c_proj.{kind}') - o = self.transform(o, kind) - if o is None: - o = torch.zeros_like(q) - return q, k, v, o - - def attn_norm(self, i: int): - """Get attn norm for layer i.""" - return self.params[f'transformer.h.{i}.ln_1.weight'] - - def _ffn(self, i: int, kind: str): - """Get ffn kind for layer i.""" - result = [] - for key in ['w2', 'c_proj', 'w1']: - tensor = self.params[f'transformer.h.{i}.mlp.{key}.{kind}'] - tensor = self.transform(tensor, kind) - result.append(tensor) - return (*result, ) - - def ffn_norm(self, i: int): - """Get ffn norm for layer i.""" - return self.params[f'transformer.h.{i}.ln_2.weight'] - - -@INPUT_MODELS.register_module(name='qwen') -class QwenModel(LlamaModel): - """Qwen model in hf format.""" - - Reader = QwenReader - - def model_info(self): - """Read model info.""" - params_path = osp.join(self.model_path, 'config.json') - with open(params_path) as f: - config = json.load(f) - hidden_units = config['hidden_size'] - num_layer = config['num_hidden_layers'] - norm_eps = config['layer_norm_epsilon'] - kv_channels = config['kv_channels'] - rope_theta = float(config.get('rotary_emb_base', 10000.0)) - if 'num_key_value_heads' in config: - kv_head_num = config['num_key_value_heads'] - else: - kv_head_num = config['num_attention_heads'] - attn_head_num = config['num_attention_heads'] - seq_length = config['seq_length'] - use_dynamic_ntk = int(config['use_dynamic_ntk']) - use_logn_attn = int(config['use_logn_attn']) - vocab_size = config['vocab_size'] - inter_size = config['intermediate_size'] - scaling_type = 'dynamic' if use_dynamic_ntk else 'default' - # need setting rope_scaling_factor in TurbomindEngineConfig if scaling_type is dynamic - rope_param = RopeParam(type=scaling_type, - base=rope_theta, - dim=kv_channels, - max_position_embeddings=seq_length, - factor=0) - - return dict(size_per_head=kv_channels, - num_layer=num_layer, - norm_eps=norm_eps, - hidden_units=hidden_units, - head_num=attn_head_num, - kv_head_num=kv_head_num, - vocab_size=vocab_size, - inter_size=inter_size, - attn_bias=1, - rope_param=rope_param, - max_position_embeddings=seq_length, - use_dynamic_ntk=int(use_dynamic_ntk), - use_logn_attn=use_logn_attn) - - -@INPUT_MODELS.register_module(name='qwen2') -class Qwen2Model(LlamaModel): - """Qwen model in hf format. - - The weight of qwen2 model is similar to Llama, except its attention bias doesn't include o_proj bias. - """ - - Reader = LlamaReader - - def model_info(self): - cfg = super().model_info() - cfg['attn_bias'] = 1 - return cfg - - -class Qwen2MoeReader(LlamaReader): - - def moe_ffn_expert(self, e=None, i=None, kind=None): - if not kind: - return self.filter(r'experts', i) - result = [] - for key in ['gate', 'down', 'up']: - name = f'{self.attn_layer_prefix}.{i}.mlp.experts.{e}.{key}_proj.{kind}' - tensor = self.params.get(name) - tensor = self.transform(tensor, kind) - result.append(tensor) - return (*result, ) - - def moe_ffn_gate(self, i, kind): - return self.transform(self.params.get(f'{self.attn_layer_prefix}.{i}.mlp.gate.{kind}'), kind) - - def _ffn(self, i: int, kind: str): - """Get ffn kind for layer i.""" - if not kind: - return self.filter(r'shared_expert\.', i) - result = [] - for key in ['gate', 'down', 'up']: - tensor = self.params[f'{self.attn_layer_prefix}.{i}.mlp.shared_expert.{key}_proj.{kind}'] - tensor = self.transform(tensor, kind) - result.append(tensor) - return (*result, ) - - def ffn(self, i: int, kind: str): - if not kind: - return self.filter(r'shared_expert\.', i) - return self._ffn(i, kind) - - def moe_ffn_shared_gate(self, i): - return self.params.get(f'{self.attn_layer_prefix}.{i}.mlp.shared_expert_gate.weight') - - -@INPUT_MODELS.register_module(name='qwen2-moe') -class Qwen2MoeModel(LlamaModel): - - Reader = Qwen2MoeReader - - def model_info(self): - cfg = self.model_config - info = super().model_info() - info['expert_num'] = cfg['num_experts'] - info['expert_inter_size'] = cfg['moe_intermediate_size'] - info['experts_per_token'] = cfg['num_experts_per_tok'] - info['inter_size'] = cfg['shared_expert_intermediate_size'] - info['moe_shared_gate'] = True - info['norm_topk_prob'] = cfg['norm_topk_prob'] - info['attn_bias'] = cfg.get('qkv_bias', 1) - return info - - -class Qwen3Reader(LlamaReader): - - def qk_norm(self, i: int): - result = [] - for x in ['q', 'k']: - name = f'{self.attn_layer_prefix}.{i}.self_attn.{x}_norm.weight' - result.append(self.transform(self.params.get(name), 'weight')) - return (*result, ) - - -@INPUT_MODELS.register_module(name='qwen3') -class Qwen3Model(LlamaModel): - Reader = Qwen3Reader - - def model_info(self): - cfg = self.model_config - info = super().model_info() - info.update(qk_norm=True, attn_bias=cfg.get('attention_bias', 0)) - return info - - -class Qwen3MoeReader(Qwen2MoeReader): - - def qk_norm(self, i: int): - result = [] - for x in ['q', 'k']: - name = f'{self.attn_layer_prefix}.{i}.self_attn.{x}_norm.weight' - result.append(self.transform(self.params.get(name), 'weight')) - return (*result, ) - - -@INPUT_MODELS.register_module(name='qwen3-moe') -class Qwen3MoeModel(LlamaModel): - Reader = Qwen3MoeReader - - def model_info(self): - cfg = self.model_config - info = super().model_info() - info.update( - qk_norm=True, - expert_num=cfg.get('num_experts', 128), - experts_per_token=cfg.get('num_experts_per_tok', 8), - expert_inter_size=cfg.get('moe_intermediate_size', 768), - attn_bias=cfg.get('attention_bias', 0), - inter_size=0, # no shared expert - norm_topk_prob=cfg.get('norm_topk_prob', False)) - return info - - -class Qwen3_5ReaderMixin: - """Mixin providing linear attention weight reading for Qwen3.5 models. - - Qwen3.5 uses a zero-centered RMSNorm: ``output = norm(x) * (1 + weight)`` - where weight is initialized to zeros. TurboMind's RMSNorm kernel computes - ``norm(x) * weight`` (standard LLaMA style), so we add 1 to every - RMSNorm weight during export. The GDN-internal norm - (``Qwen3_5MoeRMSNormGated``) uses standard weight and is NOT affected. - """ - - attn_layer_pattern = r'(?:model\.language_model\.|model\.)layers\.([0-9]+)\.' - - _LINEAR_ATTN_KEYS = ['conv1d', 'in_proj_qkv', 'in_proj_z', 'in_proj_b', 'in_proj_a', 'out_proj', 'A_log', 'dt_bias'] - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - if any(k.startswith('model.language_model.') for k in self.params.keys()): - self.attn_layer_prefix = 'model.language_model.layers' - self.tok_embeddings_key = 'model.language_model.embed_tokens.weight' - self.norm_weight_key = 'model.language_model.norm.weight' - tie_word_embeddings = self.model_cfg.get('tie_word_embeddings', False) - if tie_word_embeddings: - self.output_weight_key = self.tok_embeddings_key - - # ---- zero-centered RMSNorm: add 1 to weights during export ---- - def attn_norm(self, i: int): - w = super().attn_norm(i) - if w is not None: - w = w.float() + 1.0 - return w - - def ffn_norm(self, i: int): - w = super().ffn_norm(i) - if w is not None: - w = w.float() + 1.0 - return w - - def norm_weight(self): - w = super().norm_weight() - if w is not None: - w = w.float() + 1.0 - return w - - def qk_norm(self, i: int): - result = super().qk_norm(i) - return tuple(w.float() + 1.0 if w is not None else w for w in result) - - # ---- handle mixed QKV(fp16) + O(AWQ) attention layers ------- - - def _attn(self, i: int, kind: str): - """Override to handle mixed QKV(fp16) + O(AWQ) attention layers. - - Some AWQ-quantized Qwen3.5 models keep QKV in fp16 while quantizing only the O projection. TurboMind requires - uniform weight types per layer, so we dequantize O to fp16 at export time. - """ - prefix = f'{self.attn_layer_prefix}.{i}.self_attn' - q_is_fp16 = f'{prefix}.q_proj.weight' in self.params - o_is_awq = f'{prefix}.o_proj.qweight' in self.params - - if not (q_is_fp16 and o_is_awq): - # Not a mixed-format layer, use standard behaviour. - return super()._attn(i, kind) - - # Mixed format detected: QKV are fp16 but O is AWQ. - if kind == 'weight': - # Get fp16 QKV the normal way, then dequantize O. - q, k, v, _ = super()._attn(i, kind) - o = self._awq_dequant(f'{prefix}.o_proj') - o = self.transform(o, kind) - return (q, k, v, o) - - # For any quant kind (qweight/scales/qzeros), return all None - # so that the AWQ handler skips this layer entirely — the O - # weight is already handled via dequantization above. - return (None, None, None, None) - - def _awq_dequant(self, prefix: str): - """Dequantize an AWQ-quantized linear layer to fp16. - - AWQ stores weights in transposed form relative to PyTorch's - convention ([in, out] vs [out, in]), so we transpose here to - match the fp16 ``.weight`` layout that downstream export - expects. - """ - from lmdeploy.pytorch.backends.default.awq_modules import dequantize_gemm - qweight = self.params[f'{prefix}.qweight'] - scales = self.params[f'{prefix}.scales'] - qzeros = self.params[f'{prefix}.qzeros'] - group_size = qweight.shape[0] // scales.shape[0] - w = dequantize_gemm(qweight, qzeros, scales, 4, group_size) - return w.t() # [in, out] → [out, in] (PyTorch convention) - - @staticmethod - def _compressed_tensors_dequant(weight_packed, weight_scale): - """Dequantize a compressed-tensors (pack-quantized, symmetric int4) - weight to fp16. - - Args: - weight_packed: int32 tensor of shape (out_features, in_features//8). - weight_scale: bf16/fp16 tensor of shape (out_features, in_features//group_size). - Returns: - fp16 tensor of shape (out_features, in_features). - """ - out_features = weight_packed.shape[0] - num_groups = weight_scale.shape[1] - in_features = weight_packed.shape[1] * 8 - group_size = in_features // num_groups - - # Reinterpret the packed int32 buffer as bytes and unpack two nibbles - # per byte directly into the final fp16 tensor. This avoids creating - # eight temporary fp16 tensors before applying scales. - packed_bytes = weight_packed.contiguous().view(torch.uint8).reshape(out_features, -1) - weight = torch.empty((out_features, in_features), device=weight_packed.device, dtype=torch.float16) - weight[:, 0::2] = (packed_bytes & 0xF).to(torch.float16) - weight[:, 1::2] = (packed_bytes >> 4).to(torch.float16) - - scales = weight_scale.to(torch.float16).unsqueeze(-1) - weight = weight.view(out_features, num_groups, group_size) - weight.sub_(8.0).mul_(scales) - return weight.reshape(out_features, in_features) - - def linear_attn(self, i: int, kind: str): - if not kind: - return self.filter(r'linear_attn\.', i) - # Always return a fixed-length tuple with None placeholders to - # preserve positional alignment with the name list in module.py. - result = [] - for key in self._LINEAR_ATTN_KEYS: - prefix = f'{self.attn_layer_prefix}.{i}.linear_attn.{key}' - tensor = self.params.get(f'{prefix}.{kind}') - # A_log and dt_bias are bare nn.Parameter (no .weight suffix) - if tensor is None: - tensor = self.params.get(prefix) - # If requesting weight but only AWQ qweight exists, - # dequantize on the fly so LinearAttn gets fp16 tensors. - if tensor is None and kind == 'weight': - if f'{prefix}.qweight' in self.params: - tensor = self._awq_dequant(prefix) - elif f'{prefix}.weight_packed' in self.params: - tensor = self._compressed_tensors_dequant(self.params[f'{prefix}.weight_packed'], - self.params[f'{prefix}.weight_scale']) - if tensor is not None: - tensor = self.transform(tensor, kind) - result.append(tensor) # keep None to preserve alignment - if all(t is None for t in result): - return tuple() - return tuple(result) - - def linear_norm(self, i: int, kind: str = 'weight'): - tensor = self.params.get(f'{self.attn_layer_prefix}.{i}.linear_attn.norm.{kind}') - if tensor is not None: - return self.transform(tensor, kind) - return None - - -class Qwen3_5Reader(Qwen3_5ReaderMixin, Qwen3Reader): - pass - - -@INPUT_MODELS.register_module(name='qwen3_5') -class Qwen3_5Model(Qwen3Model): - Reader = Qwen3_5Reader - - def model_info(self): - if 'text_config' in self.model_config: - self.model_config = self.model_config['text_config'] - cfg = self.model_config - info = super().model_info() - # MoE parameters (same as Qwen2MoeModel / Qwen3MoeModel) - info['expert_num'] = cfg.get('num_experts', 0) - info['expert_inter_size'] = cfg.get('moe_intermediate_size', 0) - info['experts_per_token'] = cfg.get('num_experts_per_tok', 0) - # For MoE models, inter_size is the shared expert intermediate size; - # for dense models, keep the value from super() (intermediate_size). - shared_expert_size = cfg.get('shared_expert_intermediate_size') - if shared_expert_size is not None: - info['inter_size'] = shared_expert_size - info['moe_shared_gate'] = True - # Qwen3.5 uses sigmoid MoE routing (not softmax) - info['scoring_func'] = 'softmax' - info['norm_topk_prob'] = True - # Fix RoPE dim for partial_rotary_factor - rope_params = cfg.get('rope_parameters', {}) - partial_rotary_factor = rope_params.get('partial_rotary_factor', cfg.get('partial_rotary_factor', 1.0)) - if partial_rotary_factor < 1.0: - info['rope_param'].dim = int(info['size_per_head'] * partial_rotary_factor) - # Linear attention parameters - info['layer_types'] = cfg.get('layer_types', []) - info['linear_key_head_dim'] = cfg.get('linear_key_head_dim', 0) - info['linear_value_head_dim'] = cfg.get('linear_value_head_dim', 0) - info['linear_conv_kernel_dim'] = cfg.get('linear_conv_kernel_dim', 0) - info['linear_num_key_heads'] = cfg.get('linear_num_key_heads', 0) - info['linear_num_value_heads'] = cfg.get('linear_num_value_heads', 0) - # attn_output_gate doubles Q projection for full-attention layers - info['attn_output_gate'] = cfg.get('attn_output_gate', False) - return info - - -class Qwen3_5MoeReader(Qwen3_5ReaderMixin, Qwen3MoeReader): - - def _unpacked_moe_expert(self, e: int, i: int, kind: str): - prefix = f'{self.attn_layer_prefix}.{i}.mlp.experts' - gate_up = self.params.get(f'{prefix}.gate_up_proj.{kind}') - down = self.params.get(f'{prefix}.down_proj.{kind}') - if gate_up is None or down is None: - return None - - # Packed Qwen3.5 MoE checkpoints store all experts in the first - # dimension. Slice one expert before transform so quantized policies - # still see a 2D tensor. - gate_up = self.transform(gate_up[e], kind) - down = self.transform(down[e], kind) - gate, up = gate_up.chunk(2, dim=0) - return (gate, down, up) - - def moe_ffn_expert(self, e=None, i=None, kind=None): - if not kind: - return self.filter(r'experts', i) - unpacked = self._unpacked_moe_expert(e, i, kind) - if unpacked is not None: - return unpacked - - return super().moe_ffn_expert(e, i, kind) - - -@INPUT_MODELS.register_module(name='qwen3_5-moe') -class Qwen3_5MoeModel(Qwen3MoeModel): - Reader = Qwen3_5MoeReader - - @staticmethod - def map_packed_qwen35_experts(name: str): - """Map packed expert names to weight names, i.e., - "mlp.experts.gate_up_proj" -> "mlp.experts.gate_up_proj.weight" so that - class Weight in parameter.py can classify them.""" - s = re.sub(r'(mlp\.experts\.(?:gate_up|down)_proj)$', r'\1.weight', name) - return s - - def readers(self): - pattern = getattr(self.Reader, 'attn_layer_pattern', self.Reader.attn_layer_patten) - loader = create_loader(self.model_path, pattern, []) - - has_packed_gate_up = any('mlp.experts.gate_up_proj' in k for k in loader.index.keys()) - has_packed_down = any('mlp.experts.down_proj' in k for k in loader.index.keys()) - if has_packed_gate_up and has_packed_down: - loader.mappings = [self.map_packed_qwen35_experts] - - for i, param in loader.items(): - reader = self.Reader(param, {}, False, self.model_config, policy=self.policy, fp8_quant=self.fp8_quant) - yield i, reader - torch.cuda.empty_cache() - - def model_info(self): - if 'text_config' in self.model_config: - self.model_config = self.model_config['text_config'] - cfg = self.model_config - info = super().model_info() - # Shared expert params (missing from Qwen3MoeModel base) - info['inter_size'] = cfg.get('shared_expert_intermediate_size', 0) - info['moe_shared_gate'] = True - # Qwen3.5 uses sigmoid MoE routing (not softmax) - info['scoring_func'] = 'softmax' - info['norm_topk_prob'] = True - # Fix RoPE dim for partial_rotary_factor - rope_params = cfg.get('rope_parameters', {}) - partial_rotary_factor = rope_params.get('partial_rotary_factor', cfg.get('partial_rotary_factor', 1.0)) - if partial_rotary_factor < 1.0: - info['rope_param'].dim = int(info['size_per_head'] * partial_rotary_factor) - # Linear attention parameters - info['layer_types'] = cfg.get('layer_types', []) - info['linear_key_head_dim'] = cfg.get('linear_key_head_dim', 0) - info['linear_value_head_dim'] = cfg.get('linear_value_head_dim', 0) - info['linear_conv_kernel_dim'] = cfg.get('linear_conv_kernel_dim', 0) - info['linear_num_key_heads'] = cfg.get('linear_num_key_heads', 0) - info['linear_num_value_heads'] = cfg.get('linear_num_value_heads', 0) - # attn_output_gate doubles Q projection for full-attention layers - info['attn_output_gate'] = cfg.get('attn_output_gate', False) - return info diff --git a/lmdeploy/turbomind/deploy/source_model/xcomposer2.py b/lmdeploy/turbomind/deploy/source_model/xcomposer2.py deleted file mode 100644 index 44d0b726b8..0000000000 --- a/lmdeploy/turbomind/deploy/source_model/xcomposer2.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. - -from .base import INPUT_MODELS -from .internlm2 import InternLM2Model, InternLM2Reader - - -class Xcomposer2Reader(InternLM2Reader): - """Xcomposer2 model reader.""" - - # include only Plora and ignore other lora weights - attn_pattern = r'attention.\w+(.Plora_[AB])?.\w+$' - ffn_pattern = r'feed_forward.\w+(.Plora_[AB])?.\w+$' - - def _attn(self, i, kind): - if 'Plora_A' in kind: - qkv = self.params[f'model.layers.{i}.attention.wqkv.Plora_A.weight'] - o = self.params[f'model.layers.{i}.attention.wo.Plora_A.weight'] - return qkv, o - return super()._attn(i, kind) - - -@INPUT_MODELS.register_module(name='xcomposer2') -class Xcomposer2Model(InternLM2Model): - """Xcomposer2 model in hf format.""" - - Reader = Xcomposer2Reader - - def _lora_cfg_7b(self): - """Lora config for internlm-xcomposer2-7b.""" - return dict(lora_r=256, lora_scale=1.0, lora_policy='plora', lora_max_wo_r=256) - - def _lora_cfg_4khd_7b(self, model_info: dict): - """Lora config for internlm-xcomposer2-4khd-7b.""" - rank_pattern = ['attention.w_qkv:8', 'attention.wo:256'] - scale_pattern = ['attention.w_qkv:2.0', 'attention.wo:1.0'] - rank_pattern = ','.join(rank_pattern) - scale_pattern = ','.join(scale_pattern) - return dict(lora_r=256, - lora_scale=1.0, - lora_max_wo_r=256, - lora_policy='plora', - lora_rank_pattern=rank_pattern, - lora_scale_pattern=scale_pattern) - - def model_info(self): - out = super().model_info() - from lmdeploy.vl.model.xcomposer2 import ModelType, get_xcomposer_type - model_type, _ = get_xcomposer_type(self.model_path) - if model_type == ModelType.XCOMPOSER2_4KHD: - out.update(self._lora_cfg_4khd_7b(out)) - else: - out.update(self._lora_cfg_7b()) - return out diff --git a/lmdeploy/turbomind/deploy/target_model/__init__.py b/lmdeploy/turbomind/deploy/target_model/__init__.py deleted file mode 100644 index 505c70de30..0000000000 --- a/lmdeploy/turbomind/deploy/target_model/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .fp import TurbomindModel # noqa: F401 diff --git a/lmdeploy/turbomind/deploy/target_model/base.py b/lmdeploy/turbomind/deploy/target_model/base.py deleted file mode 100644 index 95baad7cf2..0000000000 --- a/lmdeploy/turbomind/deploy/target_model/base.py +++ /dev/null @@ -1,239 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. - -import os.path as osp -from abc import ABC -from collections.abc import Sequence - -import torch -import tqdm -import yaml -from mmengine import Registry - -from ..config import AttentionConfig, LoraConfig, ModelConfig, TurbomindModelConfig, config_from_dict, config_to_dict -from ..source_model.base import BaseInputModel - -OUTPUT_MODELS = Registry('target model', locations=['lmdeploy.turbomind.deploy.target_model.base']) - - -def tprint(*args, **kwargs): - to_file = kwargs.pop('to_file', False) - if not to_file: - return - from io import StringIO - s = StringIO() - print(*args, **kwargs, file=s, end='') - tqdm.tqdm.write(s.getvalue()) - - -def _weight_dtype_map(weight_type: str, default=None): - """Map literal data type to torch dtype.""" - - _WEIGHT_DTYPE_MAP = dict(int4=torch.float16, float16=torch.float16, float32=torch.float16, bfloat16=torch.bfloat16) - - return _WEIGHT_DTYPE_MAP.get(weight_type, default) - - -def _pad_inter_size(inter_size: int, group_size: int, tp: int): - group_size = max(1, group_size) - group_num = (inter_size + group_size - 1) // group_size - groups_per_rank = (group_num + tp - 1) // tp - inter_size_padded = groups_per_rank * group_size * tp - return inter_size_padded - - -class BaseOutputModel(ABC): - """Base output model.""" - - def __init__(self, input_model: BaseInputModel, cfg: TurbomindModelConfig, model_cls, out_dir: str = ''): - super().__init__() - self.input_model = input_model - self.model_config = cfg.model_config - self.attention_config = cfg.attention_config - self.lora_config = cfg.lora_config - self.attn_tp_size = self.model_config.attn_tp_size - self.attn_cp_size = self.model_config.attn_cp_size - self.mlp_tp_size = self.model_config.mlp_tp_size - self.out_dir = out_dir - self.to_file = True if out_dir else False - self.tm_params = dict() - - # get `model_info` at first, which will be updated to `self.model_config` and `self.attention_config` - self.input_model_info = self.input_model.model_info() - self.input_model_info = self.single_to_list(self.input_model_info, keys=['inter_size', 'expert_num']) - self.permute_qk = self.input_model_info.get('permute_qk', True) - self.update_model_config() - for i, v in enumerate(self.model_config.inter_size): - self.model_config.inter_size[i] = _pad_inter_size(v, self.model_config.group_size, self.mlp_tp_size) - if self.model_config.expert_num: - self.model_config.expert_inter_size = _pad_inter_size(self.model_config.expert_inter_size, - self.model_config.group_size, self.mlp_tp_size) - - # head_num is divisble by tp but kv_head_num is not - # and tp is divisble by kv_head_num - assert self.model_config.head_num % self.attn_tp_size == 0 - self.repeat_kv = 0 - if (self.attn_tp_size > self.model_config.kv_head_num - and self.attn_tp_size % self.model_config.kv_head_num == 0): - self.repeat_kv = (self.attn_tp_size // self.model_config.kv_head_num) - self.model_config.kv_head_num = self.attn_tp_size - - self.model_config.verify() - assert self.model_config.kv_head_num % self.attn_tp_size == 0 - - # print(self.model_config) - - self.update_attention_config() - self.update_lora_config() - # ! Dependency on `self` - self.model = model_cls(self) - - def single_to_list(self, config: dict, keys): - num_layer = int(config['num_layer']) - for k in keys: - v = config.get(k, None) - if v is not None and not isinstance(v, Sequence): - config[k] = [v] * num_layer - return config - - def update_model_config(self): - """Update `self.model_config` according to the input_model's - `model_info`""" - final_cfg = config_to_dict(self.model_config) - final_cfg.update(self.input_model_info) - if 'embedding_size' not in self.input_model_info.keys(): - final_cfg.update(embedding_size=self.input_model_info['vocab_size']) - - self.model_config = config_from_dict(ModelConfig, final_cfg) - - def update_attention_config(self): - """Update attention config according to input model's model info.""" - final_cfg = config_to_dict(self.attention_config) - final_cfg.update(self.input_model_info) - self.attention_config = config_from_dict(AttentionConfig, final_cfg) - - def update_lora_config(self): - """Update lora config according to input model's model info.""" - final_cfg = config_to_dict(self.lora_config) - final_cfg.update(self.input_model_info) - self.lora_config = config_from_dict(LoraConfig, final_cfg) - - def export_config(self) -> None: - """Export turbomind config.""" - if self.to_file: - config_path = osp.join(self.out_dir, 'config.yaml') - with open(config_path, 'w') as f: - yaml.safe_dump(self.tm_config.to_dict(), f) - - def export_weight(self, param: torch.Tensor, name: str) -> None: - """Export turbomind weight.""" - - def _tofile(tensor, path): - """To file.""" - if tensor.dtype == torch.bfloat16: - tensor = tensor.view(torch.half) - tensor.contiguous().cpu().numpy().tofile(path) - - if self.to_file: - if torch.is_floating_point(param): - torch_type = _weight_dtype_map(self.model_config.weight_type, torch.float16) - param = param.to(torch_type) - tprint(name, param.shape) - _tofile(param, osp.join(self.out_dir, name)) - elif len(self.tm_params) > 0: - tm_params = self.tm_params - weight_type = self.model_config.weight_type - data_type = self.model_config.data_type - assert weight_type in ['float16', 'bfloat16', 'int4', 'fp8'] - - # currently, the tensor type should in - # [torch.float, torch.half, torch.bfloat16, torch.int32] - torch_tensor = param if param.is_contiguous() else param.contiguous() - torch_tensor = torch_tensor.cuda() - assert torch_tensor.dtype in [torch.int32, torch.float, torch.half, torch.bfloat16, torch.uint8] - FLOAT_TYPES = [torch.float, torch.half, torch.bfloat16] - if weight_type == 'fp8': - # avoid casting float scales to half - if torch_tensor.dtype == torch.bfloat16 and data_type == 'float16': - torch_tensor = torch_tensor.half() - elif torch_tensor.dtype in FLOAT_TYPES: - if weight_type in ['float16', 'int4']: - torch_tensor = torch_tensor.half() - elif weight_type == 'bfloat16': - torch_tensor = torch_tensor.bfloat16() - else: - torch_tensor = torch_tensor.half() - if name in tm_params: - try: - import _turbomind as _tm - except ImportError: - _tm = None - for tm_tensor in tm_params[name]: - # Match TurboMind tensor dtype to avoid byte_size mismatch (e.g. f32 256b vs f16 128b) - if _tm is not None: - if tm_tensor.type == _tm.DataType.TYPE_FP32 and torch_tensor.dtype in [ - torch.float16, torch.bfloat16 - ]: - torch_tensor = torch_tensor.float() - elif tm_tensor.type == _tm.DataType.TYPE_FP16 and torch_tensor.dtype == torch.float32: - torch_tensor = torch_tensor.half() - tm_tensor.copy_from(torch_tensor) - tm_params.pop(name) - else: - tprint('skip export', name, param.shape) - - def save_split(self, tensor: torch.Tensor, name: str, split_dim=None, split_num=1, copy=False) -> None: - """Save split. - - - 2D input - shape must be (input_dims, output_dims) - - 1D input (bias) - shape must be (output_dims) - split is skipped when split_dim == 0 - """ - - if copy or (tensor.dim() == 1 and split_dim == 0): - split_dim = None - copy = True - - if split_dim is not None: - tprint(f'*** splitting {name}, shape={tensor.shape}, ' - f'split_dim={split_dim}, split_num={split_num}', - to_file=self.to_file) - if tensor.shape[split_dim] % split_num != 0: - raise RuntimeError(f'{name}: shape={list(tensor.shape)}, split_num={split_num}') - split_size = tensor.shape[split_dim] // split_num - splits = torch.split(tensor, split_size, dim=split_dim) - for i, split in enumerate(splits): - prefix, ext = osp.splitext(name) - self.export_weight(split, f'{prefix}.{i}{ext}') - elif copy: - tprint(f'### copying {name}, shape={tensor.shape}', to_file=self.to_file) - copies = [tensor] * split_num - for i, copy in enumerate(copies): - prefix, ext = osp.splitext(name) - self.export_weight(copy, f'{prefix}.{i}{ext}') - else: - self.export_weight(tensor, name) - - def export(self) -> None: - """Export to turbomind model format.""" - num_layer = self.model_config.num_layer - from tqdm import tqdm - pbar = tqdm(total=num_layer, desc='Convert to turbomind format', leave=self.to_file) - self.export_config() - for i, reader in self.input_model.readers(): - if self.model(i, reader): - pbar.update(1) - pbar.close() - - def export_iter(self): - self.export_config() - for i, reader in self.input_model.readers(): - self.model(i, reader) - yield i - - @property - def tm_config(self): - return TurbomindModelConfig(model_config=self.model_config, - attention_config=self.attention_config, - lora_config=self.lora_config) diff --git a/lmdeploy/turbomind/deploy/target_model/fp.py b/lmdeploy/turbomind/deploy/target_model/fp.py deleted file mode 100644 index 11f1f78170..0000000000 --- a/lmdeploy/turbomind/deploy/target_model/fp.py +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. - -from .base import OUTPUT_MODELS, BaseOutputModel - - -@OUTPUT_MODELS.register_module(name='tm') -class TurbomindModel(BaseOutputModel): - """Export to turbomind fp16 format.""" - pass diff --git a/lmdeploy/turbomind/linear.py b/lmdeploy/turbomind/linear.py new file mode 100644 index 0000000000..c073a9fbb1 --- /dev/null +++ b/lmdeploy/turbomind/linear.py @@ -0,0 +1,265 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""Linear weight bundle and composable dimension operations. + +Two weight types flow through the TurboMind weight loading pipeline: + +- ``Linear`` -- a bundle of tensors for a single linear layer (weight + + optional scales, zeros, bias). +- Raw ``torch.Tensor`` -- everything else (norms, embeddings, scalars). + +**concat_out_dim** joins ``Linear`` bundles along the output +dimension, handling all component tensors correctly regardless of +quantization-induced dimension scaling. +""" + +from __future__ import annotations + +import functools +import inspect +from dataclasses import dataclass, field +from typing import TYPE_CHECKING + +import torch +from torch import Tensor + +if TYPE_CHECKING: + from .weight_format import WeightFormat + + +# --------------------------------------------------------------------------- +# Linear dataclass with methods +# --------------------------------------------------------------------------- + + +@dataclass +class Linear: + """Bundle of tensors for a single linear layer. + + ``tensors`` maps a closed-set TM weight kind (e.g. ``"weight"``, + ``"scales"``, ``"zeros"``, ``"bias"``, ``"qweight"``) to the actual + tensor. + + **Layout contract**: all ``Linear`` objects are in TM layout with + axis 0 as the input dimension and axis -1 as the output dimension. + ``commit_linear`` assumes this layout and does not re-transpose. + 1-D tensors (e.g. bias) only have an output dimension (axis 0). + """ + + tensors: dict[str, Tensor] + weight_format: WeightFormat = field(compare=False, repr=False) + + +def concat_out_dim(xs: list[Linear]) -> Linear: + """Concatenate along output dim.""" + first = xs[0] + result: dict[str, Tensor] = {} + for kind in first.tensors: + t = first.tensors[kind] + result[kind] = torch.cat([x.tensors[kind] for x in xs], dim=t.dim() - 1) + wfmts = {x.weight_format for x in xs} + assert len(wfmts) == 1, ( + 'concat_out_dim requires uniform weight_format; ' + 'call dequant_mixed first if formats differ.') + return Linear(tensors=result, + weight_format=next(iter(wfmts))) + + +# --------------------------------------------------------------------------- +# Format / compatibility utilities +# --------------------------------------------------------------------------- + + +def _dequant_linear(linear: Linear, *, data_type) -> Linear: + """Dequantize a quantized Linear to trivial. + + ``TrivialFormat.dequant`` is identity, so already-trivial inputs round-trip + safely. ``AWQFormat.dequant`` and ``FP8Format.dequant`` do real work. + GPTQ / CompressedTensor / MXFP4 inherit the base-class + ``NotImplementedError`` — calling ``_dequant_linear`` on one of those is a + broken-fusion-group configuration, and the raise names it at the call site. + """ + from .weight_format import TrivialFormat + + fmt = linear.weight_format + new_tensors = fmt.dequant(linear.tensors, data_type) + trivial = TrivialFormat() + return Linear(tensors=new_tensors, weight_format=trivial) + + +def dequant_mixed(*linears: Linear | None, data_type) -> tuple[Linear | None, ...]: + """Dequantize linears to a common trivial format when formats differ. + + Trivial inputs round-trip safely through ``_dequant_linear``. + None args pass through unchanged. + """ + formats = {l.weight_format.name for l in linears if l is not None} + if len(formats) <= 1: + return linears + return tuple( + _dequant_linear(l, data_type=data_type) if l is not None else l + for l in linears + ) + + +# --------------------------------------------------------------------------- +# Linear-level transform decorators +# --------------------------------------------------------------------------- + + +def transform_output_dim(fn): + """Decorator that lifts a tensor-level transform to Linear-level. + + For output-dim operations: 1-D tensors (bias) are unsqueezed to 2-D + before calling *fn*, then squeezed back. Convention: args that are + ``Linear`` instances are treated as tensor inputs; all other args pass + through unchanged. Return type is detected at runtime: + ``Tensor`` -> single ``Linear``, ``tuple`` -> tuple of ``Linear`` objects. + """ + sig = inspect.signature(fn) + + @functools.wraps(fn) + def wrapper(*args, **kwargs): + bound = sig.bind(*args, **kwargs) + bound.apply_defaults() + + first = next( + v for v in bound.arguments.values() if isinstance(v, Linear) + ) + out_buckets = None + + for kind in first.tensors: + was_1d = False + fn_kwargs = {} + + for name, val in bound.arguments.items(): + if isinstance(val, Linear): + t = val.tensors[kind] + if t.dim() == 1: + was_1d = True + t = t.unsqueeze(0) + fn_kwargs[name] = t + else: + fn_kwargs[name] = val + + result = fn(**fn_kwargs) + if not isinstance(result, tuple): + result = (result,) + if out_buckets is None: + out_buckets = [{} for _ in result] + for i, item in enumerate(result): + out_buckets[i][kind] = item.squeeze(0) if was_1d else item + + outputs = tuple( + Linear(ts, weight_format=first.weight_format) for ts in out_buckets + ) + return outputs if len(outputs) > 1 else outputs[0] + + return wrapper + + +def transform_input_dim(fn): + """Decorator that lifts a tensor-level transform to Linear-level. + + For input-dim operations: 1-D tensors (bias) have no input dimension + and are **passed through unchanged**. The inner function only ever + sees 2-D tensors for each kind. For multi-output functions, 1-D + tensors are duplicated into every output bucket. + """ + sig = inspect.signature(fn) + + @functools.wraps(fn) + def wrapper(*args, **kwargs): + bound = sig.bind(*args, **kwargs) + bound.apply_defaults() + + first = next( + v for v in bound.arguments.values() if isinstance(v, Linear) + ) + out_buckets = None + deferred_1d: list[str] = [] + + for kind in first.tensors: + fn_kwargs = {} + is_1d = False + + for name, val in bound.arguments.items(): + if isinstance(val, Linear): + t = val.tensors[kind] + if t.dim() < 2: + is_1d = True + break + fn_kwargs[name] = t + else: + fn_kwargs[name] = val + + if is_1d: + deferred_1d.append(kind) + continue + + result = fn(**fn_kwargs) + if not isinstance(result, tuple): + result = (result,) + if out_buckets is None: + out_buckets = [{} for _ in result] + for i, item in enumerate(result): + out_buckets[i][kind] = item + + if out_buckets is None: + out_buckets = [{}] + for kind in deferred_1d: + for bucket in out_buckets: + bucket[kind] = first.tensors[kind] + + outputs = tuple( + Linear(ts, weight_format=first.weight_format) for ts in out_buckets + ) + return outputs if len(outputs) > 1 else outputs[0] + + return wrapper + + +# --------------------------------------------------------------------------- +# Group-based padding +# --------------------------------------------------------------------------- + + +@transform_output_dim +def pad_output_groups(t: torch.Tensor, *, src_groups: int, + dst_groups: int) -> torch.Tensor: + """Pad output dim by src_groups → dst_groups, viewing it as (groups, -1).""" + t = t.reshape(t.shape[:-1] + (src_groups, -1)) + pad = t.new_zeros(t.shape[:-2] + (dst_groups - src_groups, t.shape[-1])) + return torch.cat([t, pad], dim=-2).reshape(t.shape[:-2] + (-1,)) + + +@transform_input_dim +def pad_input_groups(t: torch.Tensor, *, src_groups: int, + dst_groups: int) -> torch.Tensor: + """Pad input dim by src_groups → dst_groups, viewing it as (groups, -1).""" + t = t.reshape((src_groups, -1) + t.shape[1:]) + block = t.shape[1] + pad = t.new_zeros((dst_groups - src_groups, block) + t.shape[2:]) + return torch.cat([t, pad], dim=0).reshape((dst_groups * block,) + t.shape[2:]) + + +def _round_up(src_groups: int, div: int) -> int: + """Round *src_groups* up to the nearest multiple of *div*.""" + return ((src_groups + div - 1) // div) * div + + +def round_up_output_groups(linear: Linear, groups: int, + div: int) -> Linear: + """Pad output-dim groups to ``round_up(groups, div)``.""" + dst = _round_up(groups, div) + if dst == groups: + return linear + return pad_output_groups(linear, src_groups=groups, dst_groups=dst) + + +def round_up_input_groups(linear: Linear, groups: int, + div: int) -> Linear: + """Pad input-dim groups to ``round_up(groups, div)``.""" + dst = _round_up(groups, div) + if dst == groups: + return linear + return pad_input_groups(linear, src_groups=groups, dst_groups=dst) diff --git a/lmdeploy/turbomind/deploy/loader.py b/lmdeploy/turbomind/loader.py similarity index 74% rename from lmdeploy/turbomind/deploy/loader.py rename to lmdeploy/turbomind/loader.py index 2475a8a928..2602dfe61d 100644 --- a/lmdeploy/turbomind/deploy/loader.py +++ b/lmdeploy/turbomind/loader.py @@ -23,11 +23,11 @@ class BaseLoader(ABC): - def __init__(self, model_path: str, pattern, mappings: list): + def __init__(self, model_path: str, pattern=None, mappings: list | None = None): self.model_path = model_path self.pattern = pattern self.item_count = defaultdict(int) - self.mappings = mappings + self.mappings = mappings or [] def get_index(self, index_name: str, file_pattern: str) -> tuple[dict, list]: """Get shards and weight map (if possible) for the model.""" @@ -58,10 +58,17 @@ def map_key(self, key: str): def items(self) -> Iterator[tuple[int, dict]]: pass + @abstractmethod + def all_items(self) -> dict: + """Return ALL weights in a single dict.""" + pass + class SafetensorsLoader(BaseLoader): - def __init__(self, model_path: str, pattern: str, mappings: list, index_name=None, file_pattern=None): + def __init__(self, model_path: str, pattern=None, + mappings: list | None = None, index_name=None, + file_pattern=None): super().__init__(model_path, pattern, mappings) self.shards, index = self.get_index(index_name, file_pattern) if not index: @@ -74,10 +81,11 @@ def __init__(self, model_path: str, pattern: str, mappings: list, index_name=Non # self.index maps weight names to their corresponding safetensors file name self.index = index # count layer-wise parameters - for k in index.keys(): - match = re.findall(self.pattern, k) - if match: - self.item_count[int(match[0])] += 1 + if self.pattern: + for k in index.keys(): + match = re.findall(self.pattern, k) + if match: + self.item_count[int(match[0])] += 1 def items(self): params = defaultdict(dict) @@ -91,7 +99,7 @@ def items(self): # - Exclude duplicated weights (present in multiple files) if k not in self.index or self.index[k] != filename: continue - match = re.findall(self.pattern, k) + match = re.findall(self.pattern, k) if self.pattern else [] if not match: misc.append(k) else: @@ -104,16 +112,31 @@ def items(self): yield (-1, {k: f.get_tensor(k) for k in misc}) assert not params + def all_items(self) -> dict: + """Return ALL weights in a single dict (mmap-backed, no eager load).""" + all_params = {} + for shard in self.shards: + with safe_open(shard, 'pt') as f: + filename = osp.basename(shard) + for k in f.keys(): + if k not in self.index or self.index[k] != filename: + continue + all_params[self.map_key(k)] = f.get_tensor(k) + return all_params + class PytorchLoader(BaseLoader): - def __init__(self, model_path: str, pattern: str, mappings: list, index_name=None, file_pattern=None): + def __init__(self, model_path: str, pattern=None, + mappings: list | None = None, index_name=None, + file_pattern=None): super().__init__(model_path, pattern, mappings) self.shards, index = self.get_index(index_name, file_pattern) - for k in index.keys(): - match = re.findall(self.pattern, k) - if match: - self.item_count[int(match[0])] += 1 + if self.pattern: + for k in index.keys(): + match = re.findall(self.pattern, k) + if match: + self.item_count[int(match[0])] += 1 def items(self): params = defaultdict(dict) @@ -121,7 +144,7 @@ def items(self): misc = {} tmp = torch.load(shard, map_location='cpu', weights_only=True) for k, v in tmp.items(): - match = re.findall(self.pattern, k) + match = re.findall(self.pattern, k) if self.pattern else [] if not match: misc[k] = v else: @@ -144,6 +167,16 @@ def items(self): for idx in idxs: yield (idx, params.pop(idx)) + def all_items(self) -> dict: + """Return ALL weights in a single dict.""" + all_params = {} + for shard in self.shards: + tmp = torch.load(shard, map_location='cpu', weights_only=True) + for k, v in tmp.items(): + all_params[self.map_key(k)] = v + del tmp + return all_params + class StateDictLoader: """This loader is used for `update_params`. @@ -152,7 +185,7 @@ class StateDictLoader: lm_head, norm). """ - def __init__(self, queue: Queue, pattern: str, mappings: list): + def __init__(self, queue: Queue, pattern=None, mappings: list | None = None): self.que = queue self.pattern = pattern @@ -160,9 +193,11 @@ def items(self): for data in iter(self.que.get, None): # If data is state dict of a decoder layer, any key will match the pattern. # Otherwise, none of the keys will match the pattern. - for k in data.keys(): - match = re.findall(self.pattern, k) - break + match = [] + if self.pattern: + for k in data.keys(): + match = re.findall(self.pattern, k) + break if not match: yield (-1, data) @@ -173,8 +208,12 @@ def items(self): torch.cuda.empty_cache() self.que.task_done() + def all_items(self) -> dict: + raise NotImplementedError('StateDictLoader does not support all_items()') + -def create_loader(model_path: str | Queue, pattern: str, mappings: list) -> BaseLoader: +def create_loader(model_path: str | Queue, pattern=None, + mappings: list | None = None) -> BaseLoader: args = (model_path, pattern, mappings) if isinstance(model_path, Queue): diff --git a/lmdeploy/turbomind/model_loader.py b/lmdeploy/turbomind/model_loader.py new file mode 100644 index 0000000000..0d241b6922 --- /dev/null +++ b/lmdeploy/turbomind/model_loader.py @@ -0,0 +1,63 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""ModelLoader: coordinates loading a model's weights into the TurboMind runtime.""" +import torch + +from .builders._base import Context, ParallelGroup +from .loader import create_loader + + +class ModelLoader: + """Coordinates loading a model's weights into the TurboMind runtime. + + Holds the model, model_comm handle, and model_path. Extracts GPU topology handles from model_comm and binds them + onto the model at construction time. Provides export() and export_iter() to load checkpoint weights and commit them + to the C++ runtime. + """ + + def __init__(self, model, model_comm, gpu_count, model_path, + data_type, engine_config): + self.model = model + self.model_comm = model_comm + self.gpu_count = gpu_count + self.model_path = model_path + self.data_type = data_type + self.engine_config = engine_config + self._bind_runtime() + + def _bind_runtime(self): + mc = self.model_comm + ctx = Context( + [mc.context(g) for g in range(self.gpu_count)], + data_type=self.data_type, + ) + ec = self.engine_config + + attn_tp = ParallelGroup(ec.attn_tp_size, + [mc.attn_tp_rank(g) for g in range(self.gpu_count)]) + mlp_tp = ParallelGroup(ec.mlp_tp_size, + [mc.mlp_tp_rank(g) for g in range(self.gpu_count)]) + model_tp = ParallelGroup(ec.attn_tp_size * ec.attn_cp_size, + [mc.model_tp_rank(g) for g in range(self.gpu_count)]) + + self.model.bind_runtime( + ctx=ctx, + root_handles=[mc.root(g) for g in range(self.gpu_count)], + attn_tp=attn_tp, + mlp_tp=mlp_tp, + model_tp=model_tp, + ) + + def export(self): + loader = create_loader(self.model_path, None, + getattr(self.model, '_loader_mappings', [])) + self.model.set_params(loader.all_items()) + self.model.model() + torch.cuda.empty_cache() + + def export_iter(self): + loader = create_loader(self.model_path, None, + getattr(self.model, '_loader_mappings', [])) + self.model.set_params(loader.all_items()) + self.model.model() + yield -1 + torch.cuda.empty_cache() diff --git a/lmdeploy/turbomind/models/__init__.py b/lmdeploy/turbomind/models/__init__.py new file mode 100644 index 0000000000..e3310bd183 --- /dev/null +++ b/lmdeploy/turbomind/models/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .glm4_moe_lite import Glm4MoeLiteModel # noqa: F401 +from .gpt_oss import GptOssModel # noqa: F401 +from .internlm2 import InternLM2Model # noqa: F401 +from .internvl3_5 import InternVL3_5Model # noqa: F401 +from .llama import LlamaModel # noqa: F401 +from .qwen2 import Qwen2Model # noqa: F401 +from .qwen3 import Qwen3TextModel # noqa: F401 +from .qwen3_5 import Qwen3_5Model # noqa: F401 diff --git a/lmdeploy/turbomind/models/base.py b/lmdeploy/turbomind/models/base.py new file mode 100644 index 0000000000..9b317e8397 --- /dev/null +++ b/lmdeploy/turbomind/models/base.py @@ -0,0 +1,12 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""Source-model registry. + +The INPUT_MODELS registry maps an architecture name to its TextModel +subclass. Models register themselves via ``@INPUT_MODELS.register_module(name=...)``. +""" +from __future__ import annotations + +from mmengine import Registry + +INPUT_MODELS = Registry('source model', + locations=['lmdeploy.turbomind.models.base']) diff --git a/lmdeploy/turbomind/models/glm4_moe_lite.py b/lmdeploy/turbomind/models/glm4_moe_lite.py new file mode 100644 index 0000000000..5ed969412e --- /dev/null +++ b/lmdeploy/turbomind/models/glm4_moe_lite.py @@ -0,0 +1,145 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""GLM-4 MoE Lite (GLM-4.7-Flash) TextModel for the new pipeline.""" +from __future__ import annotations + +import _turbomind as _tm +from transformers import Glm4MoeLiteConfig + +from ..builders import ( + DecoderLayerBuilder, + DecoderLayerConfig, + FfnBuilder, + MLABuilder, + ModuleListBuilder, + ModuleListConfig, + MoeBuilder, + TextModelBuilder, + _act_type_id, +) +from ..text_model import TextModel +from .base import INPUT_MODELS +from .utils import ( + layer_progress, + make_mla_config, + make_model_weight_config, + make_moe_config, +) + + +@INPUT_MODELS.register_module(name='glm4-moe-lite') +class Glm4MoeLiteModel(TextModel): + """Weight model for GLM-4 MoE Lite (e.g. GLM-4.7-Flash).""" + + cfg: Glm4MoeLiteConfig + + def __init__(self, cfg: Glm4MoeLiteConfig, *, resolver): + super().__init__(cfg, resolver=resolver) + + self._attn_cfg = make_mla_config(cfg) + + # ---- FFN template ---- + self._ffn_cfg = _tm.FfnConfig() + self._ffn_cfg.hidden_dim = self.cfg.hidden_size + self._ffn_cfg.act_type = _act_type_id('silu') + + # ---- MoE template (GLM-specific: noaux_tc + sigmoid) ---- + if cfg.n_routed_experts > 0: + self._moe_cfg = make_moe_config( + cfg, + experts_per_token=cfg.num_experts_per_tok, + topk_method='noaux_tc', + scoring_func='sigmoid', + routed_scale=cfg.routed_scaling_factor, + topk_group=cfg.topk_group, + n_group=cfg.n_group) + self._moe_cfg.expert_num = cfg.n_routed_experts + + self._tune_layer_num = 2 # GLM-MoE recommends tuning 2 layers + + # ------------------------------------------------------------------ + # model() — same as old code + # ------------------------------------------------------------------ + + def model(self): + root_cfg = make_model_weight_config(self.cfg) + root = TextModelBuilder( + root_cfg, self._ctx, + root_handles=self._root_handles, + tp=self._model_tp, + vocab_size=self.cfg.vocab_size) + root.add_token_embeds(self._get('model.embed_tokens.weight')) + root.norm = self.norm(self._get('model.norm.weight')) + root.add_lm_head(self._linear('lm_head')) # GLM: never tied + root.layers = self.layers('model.layers') + root.build() + + # ------------------------------------------------------------------ + # MLA attention (uses MLABuilder + self._attn_cfg clone) + # ------------------------------------------------------------------ + + def attn(self, pfx): + cfg = self._attn_cfg.clone() + m = MLABuilder(cfg, self._ctx, tp=self._attn_tp) + + q_b = (self._linear(f'{pfx}.q_b_proj', optional=True) or + self._linear(f'{pfx}.q_proj')) + m.add_projections( + q_a_proj=self._linear(f'{pfx}.q_a_proj'), + q_b_proj=q_b, + kv_a_proj=self._linear(f'{pfx}.kv_a_proj_with_mqa'), + kv_b_proj=self._linear(f'{pfx}.kv_b_proj'), + wo=self._linear(f'{pfx}.o_proj'), + ) + m.q_a_layernorm = self.norm(self._get(f'{pfx}.q_a_layernorm.weight')) + m.kv_a_layernorm = self.norm(self._get(f'{pfx}.kv_a_layernorm.weight')) + return m.build() + + # ------------------------------------------------------------------ + # FFN / MoE factories + # ------------------------------------------------------------------ + + def ffn(self, pfx, inter_size, is_expert=False): + w1, w3, w2 = [self._linear(f'{pfx}.{x}_proj') for x in ('gate', 'up', 'down')] + + cfg = self._ffn_cfg.clone() + cfg.inter_size = inter_size + cfg.is_expert = is_expert + + m = FfnBuilder(cfg, self._ctx, tp=self._mlp_tp) + m.add_ffn(w1, w2, w3) + return m.build() + + def moe(self, pfx): + cfg = self._moe_cfg.clone() + + m = MoeBuilder(cfg, self._ctx) + + m.add_gate('gate', self._linear(f'{pfx}.gate')) + + correction = self._get(f'{pfx}.gate.e_score_correction_bias') + m.add_param('score_correction_bias', correction) + + experts = ModuleListBuilder(ModuleListConfig(), self._ctx) + for e in range(cfg.expert_num): + experts[e] = self.ffn(f'{pfx}.experts.{e}', + self.cfg.moe_intermediate_size, is_expert=True) + m.experts = experts.build() + + shared = self.ffn(f'{pfx}.shared_experts', + self.cfg.intermediate_size * self.cfg.n_shared_experts) + + return m.build(), shared + + def layers(self, pfx): + layers = ModuleListBuilder(ModuleListConfig(), self._ctx) + for i in layer_progress(self.cfg.num_hidden_layers): + d = DecoderLayerBuilder(DecoderLayerConfig(), self._ctx) + d.attention_norm = self.norm(self._get(f'{pfx}.{i}.input_layernorm.weight')) + d.attention = self.attn(f'{pfx}.{i}.self_attn') + d.ffn_norm = self.norm(self._get(f'{pfx}.{i}.post_attention_layernorm.weight')) + if self.cfg.mlp_layer_types[i] == 'sparse': + d.moe_ffn, d.feed_forward = self.moe(f'{pfx}.{i}.mlp') + else: + d.feed_forward = self.ffn(f'{pfx}.{i}.mlp', self.cfg.intermediate_size) + layers[i] = d.build() + return layers.build() diff --git a/lmdeploy/turbomind/models/gpt_oss.py b/lmdeploy/turbomind/models/gpt_oss.py new file mode 100644 index 0000000000..096345d958 --- /dev/null +++ b/lmdeploy/turbomind/models/gpt_oss.py @@ -0,0 +1,142 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""Gpt-oss TextModel for the new pipeline.""" +from __future__ import annotations + +import re + +from transformers import GptOssConfig + +from ..builders import ( + AttentionBuilder, + DecoderLayerBuilder, + DecoderLayerConfig, + FfnBuilder, + ModuleListBuilder, + ModuleListConfig, + MoeBuilder, + TextModelBuilder, + _act_type_id, +) +from ..text_model import TextModel +from .base import INPUT_MODELS +from .utils import ( + layer_progress, + make_attention_config, + make_ffn_config, + make_model_weight_config, + make_moe_config, + read_packed_moe_expert, + reorder_rotary_emb, +) + + +def map_experts(s: str) -> str: + s = re.sub(r'(experts.*proj)$', r'\1.weight', s) + s = re.sub(r'(experts.*proj)_bias$', r'\1.bias', s) + s = re.sub(r'(experts.*proj)_blocks$', r'\1.blocks', s) + s = re.sub(r'(experts.*proj)_scales$', r'\1.scales', s) + return s + + +@INPUT_MODELS.register_module(name='gpt-oss') +class GptOssModel(TextModel): + """Weight model for gpt-oss (MoE with packed experts).""" + + cfg: GptOssConfig + + _loader_mappings = [map_experts] + + def __init__(self, cfg: GptOssConfig, *, resolver): + super().__init__(cfg, resolver=resolver) + + self._attn_cfg = make_attention_config(cfg) + + self._ffn_cfg = make_ffn_config(cfg, + act_type=_act_type_id('gpt-oss')) + self._ffn_cfg.inter_size = cfg.intermediate_size + self._ffn_cfg.is_expert = True + + # ---- MoE template ---- + self._moe_cfg = make_moe_config( + cfg, + act_type=_act_type_id('gpt-oss'), + experts_per_token=cfg.num_experts_per_tok) + self._moe_cfg.expert_num = cfg.num_local_experts + + # ------------------------------------------------------------------ + # model() — walks full hierarchy + # ------------------------------------------------------------------ + + def model(self): + embed_key = 'model.embed_tokens.weight' + root_cfg = make_model_weight_config(self.cfg) + root = TextModelBuilder( + root_cfg, self._ctx, + root_handles=self._root_handles, + tp=self._model_tp, + vocab_size=self.cfg.vocab_size) + root.add_token_embeds(self._get(embed_key)) + root.norm = self.norm(self._get('model.norm.weight')) + lm_key = embed_key if self.cfg.tie_word_embeddings else 'lm_head.weight' + root.add_lm_head(self._linear(lm_key.removesuffix('.weight'))) + root.layers = self.layers('model.layers') + root.build() + + # ------------------------------------------------------------------ + # Attention / FFN / MoE factories + # ------------------------------------------------------------------ + + def attn(self, pfx, layer): + q, k, v, o = [self._linear(f'{pfx}.{x}_proj') for x in 'qkvo'] + + cfg = self._attn_cfg.clone() + if self.cfg.layer_types[layer] == 'sliding_attention': + cfg.window_size = self.cfg.sliding_window + + def reorder(x): + return reorder_rotary_emb(x, cfg.head_dim, cfg.rope.dim, resolver=self._resolver) + + q, k = [reorder(x) for x in (q, k)] + + m = AttentionBuilder(cfg, self._ctx, tp=self._attn_tp) + m.add_qkv_proj(q, k, v) + m.add_o_proj(o) + + m.add_param('sinks', self._get(f'{pfx}.sinks')) + return m.build() + + def moe(self, pfx): + cfg = self._moe_cfg.clone() + m = MoeBuilder(cfg, self._ctx) + m.add_gate('gate', self._linear(f'{pfx}.router')) + experts = ModuleListBuilder(ModuleListConfig(), self._ctx) + for e in range(cfg.expert_num): + experts[e] = self._packed_moe_ffn(f'{pfx}.experts', e) + m.experts = experts.build() + return m.build() + + def layers(self, pfx): + layers = ModuleListBuilder(ModuleListConfig(), self._ctx) + for i in layer_progress(self.cfg.num_hidden_layers): + d = DecoderLayerBuilder(DecoderLayerConfig(), self._ctx) + d.attention = self.attn(f'{pfx}.{i}.self_attn', i) + d.moe_ffn = self.moe(f'{pfx}.{i}.mlp') + d.attention_norm = self.norm(self._get(f'{pfx}.{i}.input_layernorm.weight')) + d.ffn_norm = self.norm(self._get(f'{pfx}.{i}.post_attention_layernorm.weight')) + layers[i] = d.build() + return layers.build() + + def _packed_moe_ffn(self, pfx, idx): + w1, w2, w3 = read_packed_moe_expert( + self.params, + f'{pfx}.gate_up_proj', + f'{pfx}.down_proj', + idx, + resolver=self._resolver, + interleaved=True, + trans=True, + ) + cfg = self._ffn_cfg.clone() + m = FfnBuilder(cfg, self._ctx, tp=self._mlp_tp) + m.add_ffn(w1, w2, w3) + return m.build() diff --git a/lmdeploy/turbomind/models/internlm2.py b/lmdeploy/turbomind/models/internlm2.py new file mode 100644 index 0000000000..29c0c05ca9 --- /dev/null +++ b/lmdeploy/turbomind/models/internlm2.py @@ -0,0 +1,137 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""InternLM2 / InternLM2.5 TextModel for the new pipeline. + +Handles InternLM2 and InternLM2.5 decoder variants. The key difference from Llama is the GQA-interleaved fused wqkv +projection that must be deinterleaved into separate Q / K / V bundles before feeding to AttentionBuilder. +""" +from __future__ import annotations + +from transformers import PretrainedConfig + +from ..builders import ( + AttentionBuilder, + DecoderLayerBuilder, + DecoderLayerConfig, + FfnBuilder, + ModuleListBuilder, + ModuleListConfig, + TextModelBuilder, + _act_type_id, +) +from ..linear import transform_output_dim +from ..text_model import TextModel +from .base import INPUT_MODELS +from .utils import ( + layer_progress, + make_attention_config, + make_ffn_config, + make_model_weight_config, + reorder_rotary_emb, +) + + +@transform_output_dim +def _split_qkv_gqa(w_qkv, *, head_dim, q_heads, kv_heads): + """Deinterleave a GQA-fused QKV tensor into separate Q, K, V tensors. + + InternLM2 layout: ``[Q0 Q1 Q2 Q3 K V]`` repeated per KV group. + ``per_head_elems`` self-adapts (128 for weights, 1 for block scales). + """ + groups = kv_heads + q_per_group = q_heads // kv_heads + slots = q_per_group + 2 # Q-slots + K + V + total = groups * slots + n = w_qkv.size(-1) // total # elems per head-equivalent + + t = w_qkv.unflatten(-1, (groups, slots, n)) + q = t[..., :q_per_group, :].flatten(-3, -2) + k = t[..., q_per_group, :].flatten(-2, -1) + v = t[..., q_per_group + 1, :].flatten(-2, -1) + return q.contiguous(), k.contiguous(), v.contiguous() + + +@INPUT_MODELS.register_module(name='internlm2') +class InternLM2Model(TextModel): + """Weight model for InternLM2 / InternLM2.5 decoder-only variants.""" + + cfg: PretrainedConfig + + def __init__(self, cfg: PretrainedConfig, *, resolver): + super().__init__(cfg, resolver=resolver) + + self._attn_cfg = make_attention_config(cfg) + + self._ffn_cfg = make_ffn_config(cfg, + act_type=_act_type_id('silu')) + + # ------------------------------------------------------------------ + # model() — full topology + # ------------------------------------------------------------------ + + def model(self): + embed_key = 'model.tok_embeddings.weight' + root_cfg = make_model_weight_config(self.cfg) + root = TextModelBuilder( + root_cfg, self._ctx, + root_handles=self._root_handles, + tp=self._model_tp, + vocab_size=self.cfg.vocab_size) + root.add_token_embeds(self._get(embed_key)) + root.norm = self.norm(self._get('model.norm.weight')) + lm_key = embed_key if self.cfg.tie_word_embeddings else 'output.weight' + root.add_lm_head(self._linear(lm_key.removesuffix('.weight'))) + root.layers = self.layers('model.layers') + root.build() + + # ------------------------------------------------------------------ + # attn() — deinterleave fused wqkv then feed to AttentionBuilder + # ------------------------------------------------------------------ + + def attn(self, pfx): + wqkv = self._linear(f'{pfx}.wqkv') + cfg = self._attn_cfg.clone() + q, k, v = _split_qkv_gqa( + wqkv, head_dim=cfg.head_dim, + q_heads=cfg.head_num, kv_heads=cfg.kv_head_num) + o = self._linear(f'{pfx}.wo') + + def reorder(x): + return reorder_rotary_emb(x, cfg.head_dim, cfg.rope.dim, resolver=self._resolver) + + q, k = [reorder(x) for x in (q, k)] + + m = AttentionBuilder(cfg, self._ctx, tp=self._attn_tp) + m.add_qkv_proj(q, k, v) + m.add_o_proj(o) + + return m.build() + + # ------------------------------------------------------------------ + # ffn() — InternLM2 uses w1 / w3 / w2 naming + # ------------------------------------------------------------------ + + def ffn(self, pfx): + w1, w3, w2 = [self._linear(f'{pfx}.{x}') for x in ('w1', 'w3', 'w2')] + + cfg = self._ffn_cfg.clone() + + m = FfnBuilder(cfg, self._ctx, tp=self._mlp_tp) + m.add_ffn(w1, w2, w3) + return m.build() + + # ------------------------------------------------------------------ + # layers() — standard loop, InternLM2 norm names + # ------------------------------------------------------------------ + + def layers(self, pfx): + layers = ModuleListBuilder(ModuleListConfig(), self._ctx) + for i in layer_progress(self.cfg.num_hidden_layers): + d = DecoderLayerBuilder(DecoderLayerConfig(), self._ctx) + d.attention_norm = self.norm( + self._get(f'{pfx}.{i}.attention_norm.weight')) + d.attention = self.attn(f'{pfx}.{i}.attention') + d.ffn_norm = self.norm( + self._get(f'{pfx}.{i}.ffn_norm.weight')) + d.feed_forward = self.ffn(f'{pfx}.{i}.feed_forward') + layers[i] = d.build() + return layers.build() diff --git a/lmdeploy/turbomind/models/internvl3_5.py b/lmdeploy/turbomind/models/internvl3_5.py new file mode 100644 index 0000000000..9139714fc8 --- /dev/null +++ b/lmdeploy/turbomind/models/internvl3_5.py @@ -0,0 +1,61 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""InternVL3.5 aggregate source model for TurboMind.""" +from __future__ import annotations + +from transformers import PretrainedConfig + +from .base import INPUT_MODELS +from .qwen3 import Qwen3TextModel + + +def _cfg_get(cfg, name: str, default=None): + if isinstance(cfg, dict): + return cfg.get(name, default) + return getattr(cfg, name, default) + + +@INPUT_MODELS.register_module(name='internvl3_5') +class InternVL3_5Model: + """Aggregate source model for Qwen3-backed InternVL3.5 checkpoints.""" + + _text_pfx = 'language_model.' + _supported_inner_arch = 'Qwen3ForCausalLM' + + def __init__(self, cfg: PretrainedConfig, *, resolver): + llm_cfg = _cfg_get(cfg, 'llm_config') + if llm_cfg is None: + raise ValueError('InternVL3.5 TurboMind requires llm_config.') + + archs = _cfg_get(llm_cfg, 'architectures') + if not archs: + raise ValueError( + 'InternVL3.5 TurboMind requires llm_config.architectures.') + + inner_arch = archs[0] + if inner_arch != self._supported_inner_arch: + raise ValueError( + 'InternVL3.5 TurboMind currently supports only ' + f'{self._supported_inner_arch}, but got {inner_arch}.') + + self.text_model = Qwen3TextModel(llm_cfg, resolver=resolver) + self.vision_model = None + + def bind_runtime(self, *, ctx, root_handles, + attn_tp, mlp_tp, model_tp): + self.text_model.bind_runtime( + ctx=ctx, + root_handles=root_handles, + attn_tp=attn_tp, + mlp_tp=mlp_tp, + model_tp=model_tp, + ) + + @property + def _vocab_size(self): + return self.text_model.cfg.vocab_size + + def set_params(self, params: dict): + self.text_model.set_params(params) + + def model(self): + self.text_model.model(pfx=self._text_pfx) diff --git a/lmdeploy/turbomind/models/llama.py b/lmdeploy/turbomind/models/llama.py new file mode 100644 index 0000000000..5a565843f5 --- /dev/null +++ b/lmdeploy/turbomind/models/llama.py @@ -0,0 +1,103 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""Llama TextModel for the new pipeline.""" +from __future__ import annotations + +from transformers import LlamaConfig + +from ..builders import ( + AttentionBuilder, + DecoderLayerBuilder, + DecoderLayerConfig, + FfnBuilder, + ModuleListBuilder, + ModuleListConfig, + TextModelBuilder, + _act_type_id, +) +from ..text_model import TextModel +from .base import INPUT_MODELS +from .utils import ( + layer_progress, + make_attention_config, + make_ffn_config, + make_model_weight_config, + reorder_rotary_emb, +) + + +@INPUT_MODELS.register_module(name='llama') +class LlamaModel(TextModel): + """Weight model for Llama decoder-only variants.""" + + cfg: LlamaConfig + + def __init__(self, cfg: LlamaConfig, *, resolver): + super().__init__(cfg, resolver=resolver) + + self._attn_cfg = make_attention_config(cfg) + + self._ffn_cfg = make_ffn_config(cfg, + act_type=_act_type_id('silu')) + + # ------------------------------------------------------------------ + # model() — walks full hierarchy + # ------------------------------------------------------------------ + + def model(self): + embed_key = 'model.embed_tokens.weight' + root_cfg = make_model_weight_config(self.cfg) + root = TextModelBuilder( + root_cfg, self._ctx, + root_handles=self._root_handles, + tp=self._model_tp, + vocab_size=self.cfg.vocab_size) + root.add_token_embeds(self._get(embed_key)) + root.norm = self.norm(self._get('model.norm.weight')) + lm_key = embed_key if self.cfg.tie_word_embeddings else 'lm_head.weight' + root.add_lm_head(self._linear(lm_key.removesuffix('.weight'))) + root.layers = self.layers('model.layers') + root.build() + + # ------------------------------------------------------------------ + # Attention / FFN factories + # ------------------------------------------------------------------ + + def attn(self, pfx): + q, k, v, o = [self._linear(f'{pfx}.{x}_proj') for x in 'qkvo'] + + cfg = self._attn_cfg.clone() + + def reorder(x): + return reorder_rotary_emb(x, cfg.head_dim, cfg.rope.dim, resolver=self._resolver) + + q, k = [reorder(x) for x in (q, k)] + + # No QK-norm for Llama. + m = AttentionBuilder(cfg, self._ctx, tp=self._attn_tp) + + m.add_qkv_proj(q, k, v) + m.add_o_proj(o) + + return m.build() + + def ffn(self, pfx): + w1, w3, w2 = [self._linear(f'{pfx}.{x}_proj') for x in ('gate', 'up', 'down')] + + cfg = self._ffn_cfg.clone() + + m = FfnBuilder(cfg, self._ctx, tp=self._mlp_tp) + m.add_ffn(w1, w2, w3) + return m.build() + + def layers(self, pfx): + layers = ModuleListBuilder(ModuleListConfig(), self._ctx) + for i in layer_progress(self.cfg.num_hidden_layers): + d = DecoderLayerBuilder(DecoderLayerConfig(), self._ctx) + d.attention_norm = self.norm( + self._get(f'{pfx}.{i}.input_layernorm.weight')) + d.attention = self.attn(f'{pfx}.{i}.self_attn') + d.ffn_norm = self.norm( + self._get(f'{pfx}.{i}.post_attention_layernorm.weight')) + d.feed_forward = self.ffn(f'{pfx}.{i}.mlp') + layers[i] = d.build() + return layers.build() diff --git a/lmdeploy/turbomind/models/qwen2.py b/lmdeploy/turbomind/models/qwen2.py new file mode 100644 index 0000000000..0ad6258eb0 --- /dev/null +++ b/lmdeploy/turbomind/models/qwen2.py @@ -0,0 +1,144 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""Qwen2 TextModel for the new pipeline. + +Handles both dense Qwen2 and Qwen2-MoE variants. MoE detected via num_experts in HF config. Shared expert uses +shared_gate pattern matching Qwen3.5. No QK-norm, no sliding window. +""" +from __future__ import annotations + +from transformers import Qwen2Config, Qwen2MoeConfig + +from ..builders import ( + AttentionBuilder, + DecoderLayerBuilder, + DecoderLayerConfig, + FfnBuilder, + ModuleListBuilder, + ModuleListConfig, + MoeBuilder, + TextModelBuilder, + _act_type_id, +) +from ..text_model import TextModel +from .base import INPUT_MODELS +from .utils import ( + layer_progress, + make_attention_config, + make_ffn_config, + make_model_weight_config, + make_moe_config, + reorder_rotary_emb, +) + + +@INPUT_MODELS.register_module(name='qwen2-moe') +@INPUT_MODELS.register_module(name='qwen2') +class Qwen2Model(TextModel): + """Weight model for Qwen2 (dense) and Qwen2-MoE.""" + + cfg: Qwen2Config | Qwen2MoeConfig + + def __init__(self, cfg: Qwen2Config | Qwen2MoeConfig, *, resolver): + super().__init__(cfg, resolver=resolver) + + self._attn_cfg = make_attention_config(cfg) + + self._ffn_cfg = make_ffn_config(cfg, + act_type=_act_type_id('silu')) + + self._n_experts = getattr(cfg, 'num_experts', 0) + # ---- MoE template (only if MoE variant) ---- + if self._n_experts > 0: + self._moe_cfg = make_moe_config( + cfg, + experts_per_token=cfg.num_experts_per_tok, + norm_topk_prob=cfg.norm_topk_prob) + self._moe_cfg.expert_num = self._n_experts + + # ------------------------------------------------------------------ + # model() — walks full hierarchy + # ------------------------------------------------------------------ + + def model(self, pfx=''): + embed_key = 'model.embed_tokens.weight' + root_cfg = make_model_weight_config(self.cfg) + root = TextModelBuilder( + root_cfg, self._ctx, + root_handles=self._root_handles, + tp=self._model_tp, + vocab_size=self.cfg.vocab_size) + root.add_token_embeds(self._get(f'{pfx}{embed_key}')) + root.norm = self.norm(self._get(f'{pfx}model.norm.weight')) + lm_key = embed_key if self.cfg.tie_word_embeddings else 'lm_head.weight' + root.add_lm_head(self._linear(f'{pfx}{lm_key.removesuffix(".weight")}')) + root.layers = self.layers(f'{pfx}model.layers') + root.build() + + # ------------------------------------------------------------------ + # Attention / FFN / MoE factories + # ------------------------------------------------------------------ + + def attn(self, pfx): + q, k, v, o = [self._linear(f'{pfx}.{x}_proj') for x in 'qkvo'] + + cfg = self._attn_cfg.clone() + + def reorder(x): + return reorder_rotary_emb(x, cfg.head_dim, cfg.rope.dim, resolver=self._resolver) + + q, k = [reorder(x) for x in (q, k)] + + # No QK-norm for Qwen2. + m = AttentionBuilder(cfg, self._ctx, tp=self._attn_tp) + + m.add_qkv_proj(q, k, v) + m.add_o_proj(o) + + return m.build() + + def ffn(self, pfx, inter_size, is_expert=False): + w1, w3, w2 = [self._linear(f'{pfx}.{x}_proj') for x in ('gate', 'up', 'down')] + + cfg = self._ffn_cfg.clone() + cfg.inter_size = inter_size + cfg.is_expert = is_expert + + m = FfnBuilder(cfg, self._ctx, tp=self._mlp_tp) + m.add_ffn(w1, w2, w3) + return m.build() + + def moe(self, pfx): + cfg = self._moe_cfg.clone() + + m = MoeBuilder(cfg, self._ctx) + + m.add_gate('gate', self._linear(f'{pfx}.gate')) + + experts = ModuleListBuilder(ModuleListConfig(), self._ctx) + for e in range(self.cfg.num_experts): + experts[e] = self.ffn(f'{pfx}.experts.{e}', self.cfg.moe_intermediate_size, + is_expert=True) + m.experts = experts.build() + + m.add_gate('shared_gate', self._linear(f'{pfx}.shared_expert_gate')) + shared = self.ffn(f'{pfx}.shared_expert', self.cfg.shared_expert_intermediate_size) + + return m.build(), shared + + # ------------------------------------------------------------------ + # layers() — layer dispatch loop + # ------------------------------------------------------------------ + + def layers(self, pfx): + layers = ModuleListBuilder(ModuleListConfig(), self._ctx) + for i in layer_progress(self.cfg.num_hidden_layers): + d = DecoderLayerBuilder(DecoderLayerConfig(), self._ctx) + d.attention = self.attn(f'{pfx}.{i}.self_attn') + if self._n_experts > 0: + d.moe_ffn, d.feed_forward = self.moe(f'{pfx}.{i}.mlp') + else: + d.feed_forward = self.ffn(f'{pfx}.{i}.mlp', self.cfg.intermediate_size) + d.attention_norm = self.norm(self._get(f'{pfx}.{i}.input_layernorm.weight')) + d.ffn_norm = self.norm(self._get(f'{pfx}.{i}.post_attention_layernorm.weight')) + layers[i] = d.build() + return layers.build() diff --git a/lmdeploy/turbomind/models/qwen3.py b/lmdeploy/turbomind/models/qwen3.py new file mode 100644 index 0000000000..2463e60010 --- /dev/null +++ b/lmdeploy/turbomind/models/qwen3.py @@ -0,0 +1,144 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""Qwen3 TextModel for the new pipeline. + +Qwen3 is a standard Llama-like model with QK norm and optional MoE. No shared expert in the MoE variant, no linear +attention, no zero-centered norm. +""" +from __future__ import annotations + +from transformers import Qwen3Config, Qwen3MoeConfig + +from ..builders import ( + AttentionBuilder, + DecoderLayerBuilder, + DecoderLayerConfig, + FfnBuilder, + ModuleListBuilder, + ModuleListConfig, + MoeBuilder, + TextModelBuilder, + _act_type_id, +) +from ..text_model import TextModel +from .base import INPUT_MODELS +from .utils import ( + layer_progress, + make_attention_config, + make_ffn_config, + make_model_weight_config, + make_moe_config, + reorder_rotary_emb, +) + + +@INPUT_MODELS.register_module(name='qwen3-moe') +@INPUT_MODELS.register_module(name='qwen3') +class Qwen3TextModel(TextModel): + """Weight model for Qwen3 (dense) and Qwen3-MoE.""" + + cfg: Qwen3Config | Qwen3MoeConfig + + def __init__(self, cfg: Qwen3Config | Qwen3MoeConfig, *, resolver): + super().__init__(cfg, resolver=resolver) + + self._attn_cfg = make_attention_config(cfg) + + self._ffn_cfg = make_ffn_config(cfg, + act_type=_act_type_id('silu')) + + self._n_experts = getattr(cfg, 'num_experts', 0) + + if self._n_experts > 0: + self._moe_cfg = make_moe_config( + cfg, + experts_per_token=cfg.num_experts_per_tok, + norm_topk_prob=cfg.norm_topk_prob) + self._moe_cfg.expert_num = self._n_experts + + # ------------------------------------------------------------------ + # model() — walks full hierarchy (same as existing code) + # ------------------------------------------------------------------ + + def model(self, pfx=''): + embed_key = 'model.embed_tokens.weight' + root_cfg = make_model_weight_config(self.cfg) + root = TextModelBuilder( + root_cfg, self._ctx, + root_handles=self._root_handles, + tp=self._model_tp, + vocab_size=self.cfg.vocab_size) + root.add_token_embeds(self._get(f'{pfx}{embed_key}')) + root.norm = self.norm(self._get(f'{pfx}model.norm.weight')) + lm_key = embed_key if self.cfg.tie_word_embeddings else 'lm_head.weight' + root.add_lm_head(self._linear(f'{pfx}{lm_key.removesuffix(".weight")}')) + root.layers = self.layers(f'{pfx}model.layers') + root.build() + + # ------------------------------------------------------------------ + # Attention / FFN / MoE factories + # ------------------------------------------------------------------ + + def attn(self, pfx): + q, k, v, o = [self._linear(f'{pfx}.{x}_proj') for x in 'qkvo'] + + cfg = self._attn_cfg.clone() + + def reorder(x): + return reorder_rotary_emb(x, cfg.head_dim, cfg.rope.dim, resolver=self._resolver) + + q, k = [reorder(x) for x in (q, k)] + + # No per-layer attention fields for Qwen3 (no sliding window). + m = AttentionBuilder(cfg, self._ctx, tp=self._attn_tp) + + m.add_qkv_proj(q, k, v) + m.add_o_proj(o) + + q_norm, k_norm = [self._get(f'{pfx}.{x}_norm.weight') for x in 'qk'] + m.q_norm = self.norm(reorder(q_norm)) + m.k_norm = self.norm(reorder(k_norm)) + + return m.build() + + + def ffn(self, pfx, is_expert=False): + w1, w3, w2 = [self._linear(f'{pfx}.{x}_proj') for x in ('gate', 'up', 'down')] + + cfg = self._ffn_cfg.clone() + cfg.is_expert = is_expert + + m = FfnBuilder(cfg, self._ctx, tp=self._mlp_tp) + m.add_ffn(w1, w2, w3) + return m.build() + + + def moe(self, pfx): + cfg = self._moe_cfg.clone() + + m = MoeBuilder(cfg, self._ctx) + + m.add_gate('gate', self._linear(f'{pfx}.gate')) + + experts = ModuleListBuilder(ModuleListConfig(), self._ctx) + for e in range(self.cfg.num_experts): + experts[e] = self.ffn(f'{pfx}.experts.{e}', is_expert=True) + m.experts = experts.build() + + return m.build() + + + def layers(self, pfx): + layers = ModuleListBuilder(ModuleListConfig(), self._ctx) + + for i in layer_progress(self.cfg.num_hidden_layers): + d = DecoderLayerBuilder(DecoderLayerConfig(), self._ctx) + d.attention_norm = self.norm(self._get(f'{pfx}.{i}.input_layernorm.weight')) + d.attention = self.attn(f'{pfx}.{i}.self_attn') + d.ffn_norm = self.norm(self._get(f'{pfx}.{i}.post_attention_layernorm.weight')) + if self._n_experts: + d.moe_ffn = self.moe(f'{pfx}.{i}.mlp') + else: + d.feed_forward = self.ffn(f'{pfx}.{i}.mlp') + layers[i] = d.build() + + return layers.build() diff --git a/lmdeploy/turbomind/models/qwen3_5.py b/lmdeploy/turbomind/models/qwen3_5.py new file mode 100644 index 0000000000..adde792f6f --- /dev/null +++ b/lmdeploy/turbomind/models/qwen3_5.py @@ -0,0 +1,233 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""Qwen3.5 TextModel for the new pipeline.""" +from __future__ import annotations + +import re + +import _turbomind as _tm +from transformers.models.qwen3_5.configuration_qwen3_5 import Qwen3_5TextConfig +from transformers.models.qwen3_5_moe.configuration_qwen3_5_moe import Qwen3_5MoeTextConfig + +from ..builders import ( + AttentionBuilder, + DecoderLayerBuilder, + DecoderLayerConfig, + DeltaNetBuilder, + FfnBuilder, + ModuleListBuilder, + ModuleListConfig, + MoeBuilder, + TextModelBuilder, + _act_type_id, +) +from ..builders.attention import split_output_gate +from ..text_model import TextModel +from .base import INPUT_MODELS +from .utils import ( + layer_progress, + make_attention_config, + make_ffn_config, + make_model_weight_config, + make_moe_config, + read_packed_moe_expert, + reorder_rotary_emb, +) + + +def map_packed_qwen35_experts(name: str) -> str: + """Map packed expert names to weight names so parameter.py can classify.""" + return re.sub(r'(mlp\.experts\.(?:gate_up|down)_proj)$', r'\1.weight', name) + + +@INPUT_MODELS.register_module(name='qwen3_5-moe') +@INPUT_MODELS.register_module(name='qwen3_5') +class Qwen3_5Model(TextModel): + """Weight model for Qwen3.5 (dense + linear-attn + optional MoE).""" + + _loader_mappings = [map_packed_qwen35_experts] + cfg: Qwen3_5TextConfig | Qwen3_5MoeTextConfig + + def __init__(self, cfg: Qwen3_5TextConfig | Qwen3_5MoeTextConfig, *, resolver): + super().__init__(cfg, resolver=resolver) + + self._attn_cfg = make_attention_config(cfg) + self._attn_cfg.output_gate = True + + self._n_experts = getattr(cfg, 'num_experts', 0) + + # ---- DeltaNet template ---- + ln_key_heads = cfg.linear_num_key_heads + ln_val_heads = cfg.linear_num_value_heads + ln_key_dim = cfg.linear_key_head_dim + ln_val_dim = cfg.linear_value_head_dim + + self._dn_cfg = _tm.DeltaNetConfig() + self._dn_cfg.hidden_dim = self.cfg.hidden_size + self._dn_cfg.num_k_heads = ln_key_heads + self._dn_cfg.num_v_heads = ln_val_heads + self._dn_cfg.key_head_dim = ln_key_dim + self._dn_cfg.value_head_dim = ln_val_dim + self._dn_cfg.d_conv = cfg.linear_conv_kernel_dim or 4 + q_dim = ln_key_heads * ln_key_dim + v_dim = ln_val_heads * ln_val_dim + self._linear_qkv_split = (q_dim, q_dim, v_dim) + + # ---- MoE template ---- + if self._n_experts > 0: + self._moe_cfg = make_moe_config( + cfg, + experts_per_token=cfg.num_experts_per_tok) + self._moe_cfg.expert_num = self._n_experts + inter_size=cfg.moe_intermediate_size + else: + inter_size=cfg.intermediate_size + + # ---- FFN template ---- + self._ffn_cfg = make_ffn_config( + cfg, + act_type=_act_type_id('silu'), inter_size=inter_size) + + # ------------------------------------------------------------------ + # model() — same topology as old code + # ------------------------------------------------------------------ + + def model(self): + root_cfg = make_model_weight_config(self.cfg) + root = TextModelBuilder( + root_cfg, self._ctx, + root_handles=self._root_handles, + tp=self._model_tp, + vocab_size=self.cfg.vocab_size) + embed_key = 'model.language_model.embed_tokens.weight' + root.add_token_embeds(self._get(embed_key)) + root.norm = self.norm(1.0 + self._get('model.language_model.norm.weight')) + lm_key = embed_key if self.cfg.tie_word_embeddings else 'lm_head.weight' + root.add_lm_head(self._linear(lm_key.removesuffix('.weight'))) + root.layers = self.layers('model.language_model.layers') + root.build() + + # ------------------------------------------------------------------ + # Attention / linear-attention factories + # ------------------------------------------------------------------ + + def attn(self, pfx): + q, k, v, o = [self._linear(f'{pfx}.{x}_proj') for x in 'qkvo'] + + cfg = self._attn_cfg.clone() + q, gate = split_output_gate(q, head_num=cfg.head_num) + + def reorder(x): + return reorder_rotary_emb(x, cfg.head_dim, cfg.rope.dim, resolver=self._resolver) + + q, k = [reorder(x) for x in (q, k)] + + m = AttentionBuilder(cfg, self._ctx, tp=self._attn_tp) + + m.add_qkv_proj(q, k, v, gate=gate) + m.add_o_proj(o) + + q_norm, k_norm = [self._get(f'{pfx}.{x}_norm.weight') for x in 'qk'] + + m.q_norm = self.norm(reorder(1.0 + q_norm.float())) + m.k_norm = self.norm(reorder(1.0 + k_norm.float())) + + return m.build() + + def linear_attn(self, pfx): + cfg = self._dn_cfg.clone() + builder = DeltaNetBuilder(cfg, self._ctx, + tp=self._attn_tp) + + builder.add_input_projections( + in_proj_qkv=self._linear(f'{pfx}.in_proj_qkv'), + in_proj_z=self._linear(f'{pfx}.in_proj_z'), + in_proj_b=self._linear(f'{pfx}.in_proj_b'), + in_proj_a=self._linear(f'{pfx}.in_proj_a'), + out_proj=self._linear(f'{pfx}.out_proj'), + qkv_split=self._linear_qkv_split) + builder.add_scalar_params( + a_log=self._get(f'{pfx}.A_log'), + dt_bias=self._get(f'{pfx}.dt_bias')) + builder.add_conv1d( + self._get(f'{pfx}.conv1d.weight'), + qkv_split=self._linear_qkv_split) + builder.norm = self.norm(self._get(f'{pfx}.norm.weight')) # ! not zero-centered + return builder.build() + + # ------------------------------------------------------------------ + # FFN / MoE factories + # ------------------------------------------------------------------ + + def ffn(self, pfx, inter_size, is_expert=False): + try: + w1, w3, w2 = [self._linear(f'{pfx}.{x}_proj') + for x in ('gate', 'up', 'down')] + except KeyError: + return None + + cfg = self._ffn_cfg.clone() + cfg.inter_size = inter_size + cfg.is_expert = is_expert + + m = FfnBuilder(cfg, self._ctx, tp=self._mlp_tp) + m.add_ffn(w1, w2, w3) + return m.build() + + def moe(self, pfx): + cfg = self._moe_cfg.clone() + + m = MoeBuilder(cfg, self._ctx) + + m.add_gate('gate', self._linear(f'{pfx}.gate')) + + experts = ModuleListBuilder(ModuleListConfig(), self._ctx) + for e in range(self._n_experts): + experts[e] = self._moe_expert_ffn(f'{pfx}.experts', e, self.cfg.moe_intermediate_size) + m.experts = experts.build() + + m.add_gate('shared_gate', self._linear(f'{pfx}.shared_expert_gate')) + shared = self.ffn(f'{pfx}.shared_expert', self.cfg.shared_expert_intermediate_size) + + return m.build(), shared + + def _packed_moe_ffn(self, pfx, expert_idx, inter_size): + w1, w2, w3 = read_packed_moe_expert( + self.params, + f'{pfx}.gate_up_proj', + f'{pfx}.down_proj', + expert_idx, + resolver=self._resolver, + ) + cfg = self._ffn_cfg.clone() + cfg.inter_size = inter_size + cfg.is_expert = True + m = FfnBuilder(cfg, self._ctx, tp=self._mlp_tp) + m.add_ffn(w1, w2, w3) + return m.build() + + def _moe_expert_ffn(self, pfx, expert_idx, inter_size): + expert_pfx = f'{pfx}.{expert_idx}' + inter_size = self.cfg.moe_intermediate_size + return (self.ffn(expert_pfx, inter_size, is_expert=True) + or self._packed_moe_ffn(pfx, expert_idx, inter_size)) + + # ------------------------------------------------------------------ + # layers() — dispatch by layer type + # ------------------------------------------------------------------ + + def layers(self, pfx): + layers = ModuleListBuilder(ModuleListConfig(), self._ctx) + for i in layer_progress(self.cfg.num_hidden_layers): + d = DecoderLayerBuilder(DecoderLayerConfig(), self._ctx) + if self.cfg.layer_types[i] == 'linear_attention': + d.linear_attn = self.linear_attn(f'{pfx}.{i}.linear_attn') + else: + d.attention = self.attn(f'{pfx}.{i}.self_attn') + if self._n_experts > 0: + d.moe_ffn, d.feed_forward = self.moe(f'{pfx}.{i}.mlp') + else: + d.feed_forward = self.ffn(f'{pfx}.{i}.mlp', self.cfg.intermediate_size) + d.attention_norm = self.norm(1.0 + self._get(f'{pfx}.{i}.input_layernorm.weight').float()) + d.ffn_norm = self.norm(1.0 + self._get(f'{pfx}.{i}.post_attention_layernorm.weight').float()) + layers[i] = d.build() + return layers.build() diff --git a/lmdeploy/turbomind/models/utils.py b/lmdeploy/turbomind/models/utils.py new file mode 100644 index 0000000000..4a95a0df08 --- /dev/null +++ b/lmdeploy/turbomind/models/utils.py @@ -0,0 +1,441 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""Shared utilities for source model input classes.""" +from __future__ import annotations + +import math +from types import SimpleNamespace + +import _turbomind as _tm +import torch + +from lmdeploy.archs import get_model_arch +from lmdeploy.utils import get_logger + +from ..builders import _act_type_id +from ..linear import Linear, _dequant_linear + + +def source_model_config(model_config): + """Select the local config object consumed by a TurboMind source model. + + Unwrap text_config for text-only wrappers. Keep llm_config on the outer config so aggregate models, such as + InternVL3.5, can validate and delegate explicitly. + """ + if hasattr(model_config, 'text_config'): + return model_config.text_config + return model_config + + +def load_model_config(model_path: str): + """Load the local Transformers config object for a source text model.""" + _, model_config = get_model_arch(model_path) + return source_model_config(model_config) + + +def _optional_attr(cfg, name: str, default=None): + if isinstance(cfg, dict): + return cfg.get(name, default) + return getattr(cfg, name, default) + + +def _param_get(params, name: str, default=None): + if params is None: + return default + if isinstance(params, dict): + return params.get(name, default) + return getattr(params, name, default) + + +def _param_has(params, name: str) -> bool: + if params is None: + return False + if isinstance(params, dict): + return name in params + return hasattr(params, name) + + +_ROPE_TYPE_MAP = { + 'default': 1, + 'linear': 2, + 'dynamic': 3, + 'yarn': 4, + 'llama3': 5, + 'mrope': 6, +} + + +def rope_type_to_int(type_str: str) -> int: + return _ROPE_TYPE_MAP[type_str] + + +def _get_mscale(scale, mscale=1): + """YaRN mscale helper. Shared by parse_rope_param and MLA softmax_scale.""" + if scale <= 1: + return 1.0 + return 0.1 * mscale * math.log(scale) + 1.0 + + +def parse_rope_param(cfg, head_dim: int) -> tuple[SimpleNamespace, int]: + """Parse RoPE configuration from a model config dict or object. + + Returns: + rope_param: SimpleNamespace carrying rope fields (type, base, dim, + factor, max_position_embeddings, attention_factor, beta_fast, + beta_slow, low_freq_factor, high_freq_factor, + original_max_position_embeddings, mrope_section) + max_position_embeddings: int (0 if not present in config) + """ + rope_parameters = _optional_attr(cfg, 'rope_parameters', None) + if rope_parameters is not None: + # transformers v5.0.0 aggregates rope settings into rope_parameters + rope_scaling = rope_parameters + rope_theta = float(_param_get(rope_scaling, 'rope_theta', 10000.0)) + else: + rope_theta = float(_optional_attr(cfg, 'rope_theta', 10000.0)) + rope_scaling = _optional_attr(cfg, 'rope_scaling', None) + + max_position_embeddings = int(_optional_attr(cfg, 'max_position_embeddings', 0)) + partial_rotary_factor = _param_get(rope_parameters, 'partial_rotary_factor', None) + if partial_rotary_factor is None: + partial_rotary_factor = float(_optional_attr(cfg, 'partial_rotary_factor', 1.0)) + rope_param = SimpleNamespace( + type='default', + base=rope_theta, + dim=int(head_dim * partial_rotary_factor), + factor=1.0, + max_position_embeddings=None, + attention_factor=1.0, + beta_fast=32, + beta_slow=1, + low_freq_factor=None, + high_freq_factor=None, + original_max_position_embeddings=None, + mrope_section=None, + ) + + if rope_scaling is not None: + rope_type = _param_get(rope_scaling, 'rope_type', '') or _param_get(rope_scaling, 'type', '') + if _param_get(rope_scaling, 'mrope_section') is not None: + rope_type = 'mrope' + scaling_factor = _param_get(rope_scaling, 'factor', 0.0) + + if rope_type == 'default': + pass + elif rope_type == 'dynamic': + rope_param.type = 'dynamic' + rope_param.factor = scaling_factor + rope_param.max_position_embeddings = max_position_embeddings + elif rope_type == 'linear': + rope_param.type = 'linear' + rope_param.factor = scaling_factor + elif rope_type == 'llama3': + low_freq_factor = _param_get(rope_scaling, 'low_freq_factor', 1.0) + high_freq_factor = _param_get(rope_scaling, 'high_freq_factor', 1.0) + original_max_position_embeddings = _param_get(rope_scaling, 'original_max_position_embeddings', 0) + rope_param.type = 'llama3' + rope_param.factor = scaling_factor + rope_param.low_freq_factor = low_freq_factor + rope_param.high_freq_factor = high_freq_factor + rope_param.original_max_position_embeddings = original_max_position_embeddings + elif rope_type == 'yarn': + attention_factor = _param_get(rope_scaling, 'attention_factor', None) + if attention_factor is None: + mscale = _param_get(rope_scaling, 'mscale') + mscale_all_dim = _param_get(rope_scaling, 'mscale_all_dim') + if mscale is not None and mscale_all_dim is not None: + attention_factor = float( + _get_mscale(scaling_factor, mscale) / + _get_mscale(scaling_factor, mscale_all_dim)) + else: + attention_factor = _get_mscale(scaling_factor) + beta_fast = _param_get(rope_scaling, 'beta_fast', 32.0) + beta_slow = _param_get(rope_scaling, 'beta_slow', 1.0) + rope_param.type = 'yarn' + if _param_has(rope_scaling, 'original_max_position_embeddings'): + original_max_position_embeddings = _param_get(rope_scaling, 'original_max_position_embeddings') + scaling_factor = max_position_embeddings / original_max_position_embeddings + else: + original_max_position_embeddings = max_position_embeddings + rope_param.factor = scaling_factor + rope_param.max_position_embeddings = original_max_position_embeddings + rope_param.attention_factor = attention_factor + rope_param.beta_fast = beta_fast + rope_param.beta_slow = beta_slow + elif rope_type == 'mrope': + mrope_section = _param_get(rope_scaling, 'mrope_section') + rope_param.type = 'mrope' + rope_param.mrope_section = mrope_section + else: + raise RuntimeError(f'Unsupported rope type: {rope_type}') + + return rope_param, max_position_embeddings + + +def copy_rope_config(rope_cfg, rope_param, max_position_embeddings: int): + """Copy parsed RoPE fields into a TurboMind C++ rope config object.""" + rope_cfg.type = rope_type_to_int(rope_param.type) + rope_cfg.base = rope_param.base + rope_cfg.dim = rope_param.dim + rope_cfg.factor = rope_param.factor + rope_cfg.max_position_embeddings = max_position_embeddings + if rope_param.type == 'yarn': + rope_cfg.yarn_attention_factor = rope_param.attention_factor + rope_cfg.yarn_beta_fast = rope_param.beta_fast + rope_cfg.yarn_beta_slow = rope_param.beta_slow + elif rope_param.type == 'llama3': + rope_cfg.llama3_low_freq_factor = rope_param.low_freq_factor + rope_cfg.llama3_high_freq_factor = rope_param.high_freq_factor + rope_cfg.llama3_original_max_position_embeddings = rope_param.original_max_position_embeddings + elif rope_param.type == 'mrope': + rope_cfg.mrope_section = rope_param.mrope_section + + +def make_model_weight_config(cfg): + """Build the root ModelWeightConfig from root-module fields.""" + model_cfg = _tm.ModelWeightConfig() + model_cfg.hidden_units = cfg.hidden_size + return model_cfg + + +def make_attention_config(cfg, *, head_dim=None): + """Build common AttentionConfig fields from attention-module geometry.""" + hidden_dim = cfg.hidden_size + head_num = cfg.num_attention_heads + head_dim = head_dim if head_dim is not None else getattr(cfg, 'head_dim', hidden_dim // head_num) + kv_head_num = cfg.num_key_value_heads + rope, max_position_embeddings = parse_rope_param(cfg, head_dim) + attn_cfg = _tm.AttentionConfig() + attn_cfg.hidden_dim = hidden_dim + attn_cfg.head_dim = head_dim + attn_cfg.head_num = head_num + attn_cfg.kv_head_num = kv_head_num + attn_cfg.window_size = 0 + attn_cfg.softmax_scale = 0.0 + copy_rope_config(attn_cfg.rope, rope, max_position_embeddings) + return attn_cfg + + +def make_ffn_config(cfg, *, act_type, inter_size=None): + """Build common FfnConfig fields from FFN-module shape.""" + ffn_cfg = _tm.FfnConfig() + ffn_cfg.hidden_dim = cfg.hidden_size + ffn_cfg.act_type = act_type + ffn_cfg.inter_size = inter_size if inter_size is not None else cfg.intermediate_size + return ffn_cfg + + +def make_moe_config(cfg, *, + experts_per_token, + act_type=None, + norm_topk_prob=True, + topk_method='greedy', + scoring_func='softmax', + routed_scale=1.0, + topk_group=1, + n_group=1, + router_n_groups=0): + """Build a MoeConfig populated from HF config and per-model overrides.""" + if act_type is None: + act_type = _act_type_id('silu') + + moe_cfg = _tm.MoeConfig() + moe_cfg.experts_per_token = experts_per_token + moe_cfg.norm_topk_prob = norm_topk_prob + moe_cfg.routed_scale = routed_scale + moe_cfg.topk_group = topk_group + moe_cfg.topk_method = topk_method + moe_cfg.n_group = n_group + moe_cfg.scoring_func = scoring_func + moe_cfg.router_n_groups = router_n_groups + moe_cfg.act_type = act_type + moe_cfg.fuse_silu = True + return moe_cfg + + +def make_mla_config(cfg): + """Build an AttentionConfig for MLA models. + + Computes MLA geometry, softmax scale (including YaRN mscale_all_dim), + and populates all MLA-specific AttentionConfig fields. + + Returns: + _tm.AttentionConfig populated with MLA fields. + """ + qk_nope_dim = cfg.qk_nope_head_dim + qk_rope_dim = cfg.qk_rope_head_dim + kv_lora_rank = cfg.kv_lora_rank + q_head_dim = qk_nope_dim + qk_rope_dim + + size_per_head = q_head_dim + v_head_dim = cfg.v_head_dim + softmax_scale = 0.0 + if kv_lora_rank and kv_lora_rank != qk_nope_dim: + size_per_head = kv_lora_rank + qk_rope_dim + v_head_dim = kv_lora_rank + softmax_scale = q_head_dim ** (-0.5) + + rope, max_position_embeddings = parse_rope_param(cfg, qk_rope_dim) + + # MLA-specific YaRN mscale_all_dim softmax_scale adjustment + rope_params = (getattr(cfg, 'rope_parameters', None) + or getattr(cfg, 'rope_scaling', None)) + if rope_params: + rope_type = (_param_get(rope_params, 'rope_type', '') + or _param_get(rope_params, 'type', '')) + if rope_type == 'yarn': + mscale_all_dim = _param_get(rope_params, 'mscale_all_dim') + if mscale_all_dim: + scaling_factor = float(_param_get(rope_params, 'factor', 0.0)) + mscale = _get_mscale(scaling_factor, mscale_all_dim) + softmax_scale = q_head_dim ** (-0.5) * mscale * mscale + + attn_cfg = _tm.AttentionConfig() + attn_cfg.hidden_dim = cfg.hidden_size + attn_cfg.head_dim = size_per_head + attn_cfg.head_num = cfg.num_attention_heads + attn_cfg.kv_head_num = 1 + attn_cfg.kv_lora_rank = kv_lora_rank + attn_cfg.q_lora_rank = cfg.q_lora_rank or 0 + attn_cfg.qk_rope_dim = qk_rope_dim + attn_cfg.qk_nope_dim = qk_nope_dim + attn_cfg.v_head_dim = v_head_dim + copy_rope_config(attn_cfg.rope, rope, max_position_embeddings) + attn_cfg.softmax_scale = softmax_scale + + return attn_cfg + + +def _reorder_rotary_emb(x: torch.Tensor, head_dim: int, rope_dim: int): + """Reorder rotary embedding layout for TurboMind's RoPE kernel.""" + if rope_dim < head_dim: + output_dims = x.size(-1) + head_num = output_dims // head_dim + orig_shape = x.shape + if x.dim() == 1: + x = x.unsqueeze(0) + x = x.view(x.size(0), head_num, head_dim) + rotary = x[:, :, :rope_dim] + passthrough = x[:, :, rope_dim:] + rotary = rotary.view(x.size(0), head_num, 2, rope_dim // 2).transpose(2, 3).contiguous() + rotary = rotary.view(x.size(0), head_num, rope_dim) + x = torch.cat([rotary, passthrough], dim=-1) + return x.reshape(orig_shape) + else: + output_dims = x.size(-1) + head_num = output_dims // head_dim + return x.view(-1, head_num, 2, head_dim // 2).transpose(2, 3).reshape(x.shape) + + +def reorder_rotary_emb(x, head_dim: int, rope_dim: int, *, resolver=None): + """Apply RoPE layout permutation. + + Accepts either a ``Linear`` or a raw ``torch.Tensor``. + + For ``Linear`` inputs the permutation is applied to every tensor in the + bundle with quantization awareness (block-alignment check, dequant + fallback, block-level shuffling for scales/zeros). ``resolver`` is + required and must not be ``None`` — it supplies the compute dtype + threaded into ``_dequant_linear``. + + For ``torch.Tensor`` inputs the element-level interleave-transpose is + applied directly. ``resolver`` is ignored. + """ + if isinstance(x, Linear): + if resolver is None: + raise TypeError( + 'resolver is required when passing a Linear to reorder_rotary_emb' + ) + data_type = resolver.data_type + wfmt = x.weight_format + block_out = wfmt.block_out or 0 + + # If blocks don't align with heads, dequant first + if block_out and block_out % head_dim != 0: + x = _dequant_linear(x, data_type=data_type) + block_out = 0 + + new_tensors = {} + for kind, tensor in x.tensors.items(): + if kind in ('scales', 'zeros') and block_out > 0: + # Block-level shuffle: reinterpret each block as a "head" + # so _reorder_rotary_emb shuffles at block granularity. + blocks_per_head = block_out // head_dim + if blocks_per_head <= 1: + new_tensors[kind] = tensor + else: + rope_dim_blocks = rope_dim * blocks_per_head // head_dim + new_tensors[kind] = _reorder_rotary_emb(tensor, blocks_per_head, rope_dim_blocks) + elif tensor.size(-1) % head_dim == 0: + new_tensors[kind] = _reorder_rotary_emb(tensor, head_dim, rope_dim) + else: + new_tensors[kind] = tensor + + return Linear(tensors=new_tensors, weight_format=x.weight_format) + + return _reorder_rotary_emb(x, head_dim, rope_dim) + + +def layer_progress(num_layers: int): + """Tqdm iterable for model.layers() per-layer conversion loops. + + Yields the layer indices 0..num_layers-1, displaying a single-line + progress bar on stderr. ``leave=False`` clears the bar when the loop + completes. Lazy-imports tqdm so importing utils.py stays cheap. + """ + from tqdm import tqdm + return tqdm(range(num_layers), desc='Loading', leave=False) + + +def read_packed_moe_expert( + params: dict, + gate_up_pfx: str, + down_pfx: str, + expert_idx: int, + *, + resolver, + interleaved: bool = False, + trans: bool = False, +) -> tuple[Linear, Linear, Linear]: + """Read one packed MoE expert's fused gate_up + down and split into (w1, + w2, w3) Linears in TM layout. + + ``gate_up_pfx`` and ``down_pfx`` are the full prefixes to the two + packed tensors (e.g. ``'model.layers.5.mlp.experts.gate_up_proj'``). + The caller composes these strings; this helper concatenates nothing. + + Parameters + ---------- + interleaved : bool + Split scheme for the fused gate_up output dim. + ``False`` -> contiguous ``[..., :half]`` / ``[..., half:]`` (qwen3.5). + ``True`` -> stride-2 interleaved ``[..., ::2]`` / ``[..., 1::2]`` (gpt-oss). + trans : bool + For trivial-format checkpoints that store the packed tensor in + ``[n_experts, in, out]`` layout (gpt-oss), transposes the 2D + ``weight`` tensor to undo the HF-to-TM transpose applied by + ``TrivialFormat.normalize``. Only affects the ``weight`` kind on + trivial-format linears; quantized formats use their own normalizers. + """ + gate_up = resolver.resolve(params, gate_up_pfx, index=expert_idx) + down = resolver.resolve(params, down_pfx, index=expert_idx) + + if trans: + for lin in (gate_up, down): + if lin.weight_format.name == 'trivial': + w = lin.tensors.get('weight') + if w is not None and w.dim() == 2: + lin.tensors['weight'] = w.t().contiguous() + + w1_t: dict[str, torch.Tensor] = {} + w3_t: dict[str, torch.Tensor] = {} + for kind, t in gate_up.tensors.items(): + if interleaved: + w1_t[kind] = t[..., ::2].contiguous() + w3_t[kind] = t[..., 1::2].contiguous() + else: + half = t.shape[-1] // 2 + w1_t[kind] = t[..., :half].contiguous() + w3_t[kind] = t[..., half:].contiguous() + w1 = Linear(tensors=w1_t, weight_format=gate_up.weight_format) + w3 = Linear(tensors=w3_t, weight_format=gate_up.weight_format) + return w1, down, w3 diff --git a/lmdeploy/turbomind/supported_models.py b/lmdeploy/turbomind/supported_models.py index 732b38c84d..1a08b9cbe1 100644 --- a/lmdeploy/turbomind/supported_models.py +++ b/lmdeploy/turbomind/supported_models.py @@ -5,62 +5,25 @@ logger = get_logger('lmdeploy') SUPPORTED_ARCHS = dict( - # baichuan-7b - BaiChuanForCausalLM='baichuan', - # baichuan2-7b, baichuan-13b, baichuan2-13b - BaichuanForCausalLM='baichuan2', - # gpt-oss - GptOssForCausalLM='gpt-oss', - # internlm - InternLMForCausalLM='llama', - # internlm2 - InternLM2ForCausalLM='internlm2', - # internlm3 - InternLM3ForCausalLM='llama', - # llama, llama2, alpaca, vicuna, codellama, ultracm, yi, - # deepseek-coder, deepseek-llm - LlamaForCausalLM='llama', - # Qwen 7B-72B, Qwen-VL-7B - QWenLMHeadModel='qwen', - # Qwen2 + # Qwen2 / Qwen2-MoE Qwen2ForCausalLM='qwen2', Qwen2MoeForCausalLM='qwen2-moe', - # Qwen2-VL - Qwen2VLForConditionalGeneration='qwen2', - # Qwen2.5-VL - Qwen2_5_VLForConditionalGeneration='qwen2', # Qwen3 Qwen3ForCausalLM='qwen3', Qwen3MoeForCausalLM='qwen3-moe', # Qwen 3.5 Qwen3_5ForConditionalGeneration='qwen3_5', Qwen3_5MoeForConditionalGeneration='qwen3_5-moe', - # mistral - MistralForCausalLM='llama', - # llava - LlavaLlamaForCausalLM='llama', - LlavaMistralForCausalLM='llama', - LlavaForConditionalGeneration='llava', - # xcomposer2 - InternLMXComposer2ForCausalLM='xcomposer2', - # internvl - InternVLChatModel='internvl', - # internvl3 - InternVLForConditionalGeneration='internvl', - InternS1ForConditionalGeneration='internvl', - # deepseek-vl - MultiModalityCausalLM='deepseekvl', - DeepseekV2ForCausalLM='deepseek2', - # MiniCPMV - MiniCPMV='minicpmv', - # chatglm2/3, glm4 - ChatGLMModel='glm4', - ChatGLMForConditionalGeneration='glm4', + # InternVL3.5 + InternVLChatModel='internvl3_5', + # Llama (2, 3, 3.1, 3.2) + InternLM3 + LlamaForCausalLM='llama', + InternLM2ForCausalLM='internlm2', + InternLM3ForCausalLM='llama', # glm4-moe-lite (e.g. GLM-4.7-Flash) Glm4MoeLiteForCausalLM='glm4-moe-lite', - # mixtral - MixtralForCausalLM='mixtral', - MolmoForCausalLM='molmo', + # gpt-oss + GptOssForCausalLM='gpt-oss', ) @@ -86,53 +49,19 @@ def is_supported(model_path: str): """ # noqa: E501 import os - def _is_head_dim_supported(cfg): - head_dim = cfg.head_dim if hasattr(cfg, 'head_dim') else cfg.hidden_size // cfg.num_attention_heads - return head_dim in [128, 64] - support_by_turbomind = False triton_model_path = os.path.join(model_path, 'triton_models') if os.path.exists(triton_model_path): support_by_turbomind = True else: - arch, cfg = get_model_arch(model_path) quant_method = search_nested_config(cfg.to_dict(), 'quant_method') if quant_method and quant_method in ['smooth_quant']: - # tm hasn't support quantized models by applying smoothquant return False if arch in SUPPORTED_ARCHS.keys(): support_by_turbomind = True - # special cases - if arch == 'BaichuanForCausalLM': - num_attn_head = cfg.num_attention_heads - if num_attn_head == 40: - # baichuan-13B, baichuan2-13B not supported by turbomind - support_by_turbomind = False - elif arch in ['Qwen2ForCausalLM', 'LlamaForCausalLM']: - support_by_turbomind = _is_head_dim_supported(cfg) - elif arch in ('ChatGLMModel', 'ChatGLMForConditionalGeneration'): - # chatglm1/2/3 is not working yet - support_by_turbomind = cfg.num_layers == 40 - if getattr(cfg, 'vision_config', None) is not None: - # glm-4v-9b not supported - support_by_turbomind = False - elif arch == 'InternVLChatModel': - llm_arch = cfg.llm_config.architectures[0] - support_by_turbomind = (llm_arch in SUPPORTED_ARCHS and _is_head_dim_supported(cfg.llm_config)) - elif arch in ['LlavaForConditionalGeneration', 'InternVLForConditionalGeneration']: - llm_arch = cfg.text_config.architectures[0] - if llm_arch in ['Qwen2ForCausalLM', 'LlamaForCausalLM']: - support_by_turbomind = _is_head_dim_supported(cfg.text_config) - elif arch == 'MolmoForCausalLM': - kv_heads = cfg.num_key_value_heads - # TM hasn't supported allenai/Molmo-7B-O-0924 yet - support_by_turbomind = kv_heads is not None - elif arch == 'DeepseekV2ForCausalLM': - if getattr(cfg, 'vision_config', None) is not None: - support_by_turbomind = False - elif arch == 'Glm4MoeLiteForCausalLM': + if arch == 'Glm4MoeLiteForCausalLM': if getattr(cfg, 'vision_config', None) is not None: support_by_turbomind = False diff --git a/lmdeploy/turbomind/text_model.py b/lmdeploy/turbomind/text_model.py new file mode 100644 index 0000000000..e87415565e --- /dev/null +++ b/lmdeploy/turbomind/text_model.py @@ -0,0 +1,95 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""TextModel — per-architecture model owning HF parsing and C++ configs.""" +from __future__ import annotations + +from abc import ABC +from typing import TYPE_CHECKING + +import torch + +from .builders import NormBuilder, make_norm_config + +if TYPE_CHECKING: + from transformers import PretrainedConfig + + + +class TextModel(ABC): + """Text model: HF config -> C++ configs + weight commits. + + Subclass contract: + - __init__ takes (cfg, *, resolver), calls super().__init__, then + builds per-module C++ config templates as self._attn_cfg / + self._ffn_cfg / self._moe_cfg / self._dn_cfg. + - Factory method NAMES (attn/ffn/moe/linear_attn/mla/norm/...) + are a convention for readability, NOT a protocol. Signatures + may differ across subclasses. The base class provides no + factory stubs; every subclass implements its own model() + that calls root.add_token_embeds / root.add_lm_head on a + TextModelBuilder for the root-level commits. + """ + + _loader_mappings: list = [] + + + # ------------------------------------------------------------------ + # Construction / parsing + # ------------------------------------------------------------------ + + def __init__(self, cfg: PretrainedConfig, *, resolver): + """Store local config and shared runtime helpers. + + Source-model subclasses own architecture-specific field reads. Shared + utilities in ``models.utils`` build common C++ module configs. + """ + self.cfg: PretrainedConfig = cfg + self._resolver = resolver + + @property + def _vocab_size(self) -> int: + return self.cfg.vocab_size + + + # ------------------------------------------------------------------ + # Runtime binding (called by ModelLoader after model_comm exists) + # ------------------------------------------------------------------ + + def bind_runtime(self, *, ctx, root_handles, + attn_tp, mlp_tp, model_tp): + self._ctx = ctx + self._root_handles = root_handles + self._attn_tp = attn_tp + self._mlp_tp = mlp_tp + self._model_tp = model_tp + + def set_params(self, params: dict): + self.params = params + + # ------------------------------------------------------------------ + # Checkpoint access helpers + # ------------------------------------------------------------------ + + def _get(self, key: str) -> torch.Tensor | None: + return self.params.get(key) + + def _linear(self, pfx: str, *, optional: bool = False): + return self._resolver.resolve(self.params, pfx, optional=optional) + + + + # ------------------------------------------------------------------ + # Norm factories (shared across all models) + # ------------------------------------------------------------------ + + def norm(self, weight, *, dim=None): + """Build a NormBuilder for *weight* under this model's contexts. + + ``dim`` defaults to ``weight.shape[-1]``. + """ + cfg = make_norm_config( + dim=dim if dim is not None else weight.shape[-1], + norm_eps=self.cfg.rms_norm_eps, + ) + m = NormBuilder(cfg, self._ctx) + m.set_weight(weight) + return m.build() diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index 805bb3653e..13690f22c2 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -9,7 +9,6 @@ import sys from collections.abc import Sequence from concurrent.futures import ThreadPoolExecutor -from dataclasses import asdict from functools import partial from multiprocessing.reduction import ForkingPickler from queue import Queue @@ -17,7 +16,6 @@ import pybase64 import torch -import yaml import lmdeploy from lmdeploy.messages import EngineOutput, GenerationConfig, ResponseType, ScheduleMetrics, TurbomindEngineConfig @@ -25,7 +23,6 @@ from lmdeploy.tokenizer import Tokenizer from lmdeploy.utils import get_logger, get_max_batch_size, get_model -from .deploy.config import TurbomindModelConfig from .supported_models import is_supported # TODO: find another way import _turbomind @@ -172,23 +169,12 @@ def __init__(self, self._process_weights() self._create_engine() - self.session_len = self.config.session_len - - def _check_unloaded_tm_params(self): - tm_params = self._tm_model.tm_params - if len(tm_params) > 0: - uninitialized = list(tm_params.keys()) - logger.warning('the model may not be loaded successfully ' - f'with {len(tm_params)} uninitialized params:\n{uninitialized}') + self.session_len = _engine_config.session_len def _load_weights(self): """Load weights.""" - self._get_model_params() - with torch.cuda.device(self.devices[0]): - self._tm_model.export() - - self._check_unloaded_tm_params() + self._model_loader.export() def _process_weights(self): """Process weight.""" @@ -204,11 +190,18 @@ def _create_engine(self): self._engine_created = True def _create_weight(self, model_comm): - """Allocate weight buffer, load params if from_workspace.""" + """Create per-GPU Context + empty ModelRoot sentinel. + + Runs both C++ init steps sequentially per device, inside a + ThreadPoolExecutor so all ranks enter ``create_context`` + concurrently and hit its ``h_global->Sync()`` barriers together. + ``create_root`` itself has no collectives, so it can follow + synchronously on each thread. + """ - # create weight def _create_weight_func(device_id): - model_comm.create_weights(device_id) + model_comm.create_context(device_id) + model_comm.create_root(device_id) with ThreadPoolExecutor(max_workers=self.gpu_count) as executor: futures = [] @@ -217,72 +210,67 @@ def _create_weight_func(device_id): for future in futures: future.result() - def _get_model_params(self): - """Get turbomind model params when loading from hf.""" - - model_comm = self.model_comm - tm_params = self._tm_model.tm_params - tm_params.clear() - - def _get_params(device_id, que): - out = model_comm.get_weights(device_id) - que.put(out) - - que = Queue() - with ThreadPoolExecutor(max_workers=self.gpu_count) as executor: - futures = [] - for device_id in range(self.gpu_count): - futures.append(executor.submit(_get_params, device_id, que)) - for future in futures: - future.result() - - for _ in range(self.gpu_count): - tensor_map = que.get() - for k, v in tensor_map.items(): - if k not in tm_params: - tm_params[k] = [v] - else: - tm_params[k].append(v) - logger.warning(f'get {len(tm_params)} model params') - - def _postprocess_config(self, tm_config: TurbomindModelConfig, engine_config: TurbomindEngineConfig): - """Postprocess turbomind config by.""" - import copy - self.config = copy.deepcopy(tm_config) - # Update the attribute values in `self.config` with the valid values - # from the corresponding attributes in `engine_config`, such as - # `session_len`, `quant_policy`, `rope_scaling_factor`, etc. - self.config.update_from_engine_config(engine_config) - - # update some attributes of `engine_config` which depends on - # `session_len` - self.engine_config = engine_config - - # pack `self.config` and `self.engine_config` into a dict - self.config_dict = self.config.to_dict() - self.config_dict.update(dict(engine_config=asdict(self.engine_config))) - logger.info(f'turbomind model config:\n\n' - f'{json.dumps(self.config_dict, indent=2)}') - def _from_hf(self, model_path: str, engine_config: TurbomindEngineConfig): """Load model which is in hf format.""" - assert is_supported(model_path), (f'turbomind does not support {model_path}. ' - 'Plz try pytorch engine instead.') + assert is_supported(model_path), ( + f'turbomind does not support {model_path}. ' + 'Plz try pytorch engine instead.') - # convert transformers model into turbomind model - from .deploy.converter import get_tm_model - tm_model = get_tm_model(model_path, self.model_name, self.chat_template_name, engine_config) + from .converter import get_tm_config + from .model_loader import ModelLoader - self._postprocess_config(tm_model.tm_config, engine_config) + text_model, model_path, data_type = get_tm_config(model_path, engine_config) - model_comm = _tm.TurboMind.create(model_dir='', - config=yaml.safe_dump(self.config_dict), - weight_type=self.config.model_config.weight_type) + self._vocab_size = text_model._vocab_size + self.engine_config = engine_config - # create empty weight + dtype_map = { + 'bfloat16': _tm.DataType.TYPE_BF16, + 'float16': _tm.DataType.TYPE_FP16, + } + ec = _tm.EngineConfig() + ec.data_type = dtype_map[engine_config.dtype] + ec.cache_block_seq_len = engine_config.cache_block_seq_len + ec.quant_policy = engine_config.quant_policy + ec.max_batch_size = engine_config.max_batch_size + ec.max_prefill_token_num = engine_config.max_prefill_token_num + ec.session_len = engine_config.session_len + ec.cache_max_block_count = engine_config.cache_max_entry_count + ec.cache_chunk_size = engine_config.cache_chunk_size + ec.enable_prefix_caching = engine_config.enable_prefix_caching + ec.enable_metrics = engine_config.enable_metrics + ec.num_tokens_per_iter = engine_config.num_tokens_per_iter + ec.max_prefill_iters = engine_config.max_prefill_iters + ec.async_ = engine_config.async_ + ec.outer_dp_size = engine_config.outer_dp_size + ec.attn_dp_size = engine_config.attn_dp_size + ec.attn_tp_size = engine_config.attn_tp_size + ec.attn_cp_size = engine_config.attn_cp_size + ec.mlp_tp_size = engine_config.mlp_tp_size + ec.devices = engine_config.devices + ec.nnodes = engine_config.nnodes + ec.node_rank = engine_config.node_rank + ec.communicator = engine_config.communicator + + logger.info(f'turbomind engine config:\n\n' + f'dtype={engine_config.dtype}, session_len={engine_config.session_len}, ' + f'max_batch_size={engine_config.max_batch_size}, ' + f'devices={engine_config.devices}, ' + f'tp={engine_config.attn_tp_size}, ' + f'dp={engine_config.attn_dp_size}, ' + f'cp={engine_config.attn_cp_size}') + + model_comm = _tm.TurboMind.create(model_dir='', engine_config=ec) self._create_weight(model_comm) - # output model - self._tm_model = tm_model + + self._model_loader = ModelLoader( + model=text_model, + model_comm=model_comm, + gpu_count=self.gpu_count, + model_path=model_path, + data_type=data_type, + engine_config=engine_config, + ) return model_comm async def sleep(self, level: int = 1): @@ -319,12 +307,11 @@ def _construct(item): return func(*args).clone() if not hasattr(self, '_export_iter'): - self._get_model_params() que = Queue() - tm_model = self._tm_model - tm_model.input_model.model_path = que + ml = self._model_loader + ml.model_path = que self._update_params_que = que - self._export_iter = tm_model.export_iter() + self._export_iter = ml.export_iter() with torch.cuda.device(self.devices[0]): if isinstance(request.serialized_named_tensors, str): @@ -336,7 +323,6 @@ def _construct(item): next(self._export_iter) if request.finished: - self._check_unloaded_tm_params() self._process_weights() if self._engine_created is False: self._create_engine() @@ -374,9 +360,6 @@ def from_pretrained(cls, **kwargs) def close(self): - if hasattr(self, '_tm_model'): - # close immediately after init engine with empty_init=True - self._tm_model.tm_params.clear() if hasattr(self, '_export_iter'): del self._export_iter if self.model_comm is not None: @@ -393,7 +376,7 @@ def create_instance(self, cuda_stream_id=0): Returns: TurboMindInstance: an instance of turbomind """ - return TurboMindInstance(self, self.config, cuda_stream_id) + return TurboMindInstance(self, cuda_stream_id) def get_schedule_metrics(self): # TODO: support dp @@ -525,15 +508,14 @@ class TurboMindInstance: cuda_stream_id(int): identity of a cuda stream """ - def __init__(self, tm_model: TurboMind, config: TurbomindModelConfig, cuda_stream_id: int = 0): + def __init__(self, tm_model: 'TurboMind', cuda_stream_id: int = 0): self.tm_model = tm_model self.cuda_stream_id = cuda_stream_id # create model instances - lazy_init = self.tm_model.config_dict['engine_config'].get('empty_init', False) + lazy_init = self.tm_model.engine_config.empty_init self._model_inst = None if lazy_init else self._create_model_instance() - self.config = config self.lock = None # error code map from csrc (refer to `struct Request` in src/turbomind/engine/request.h) # to lmdeploy.messages.ResponseType @@ -593,7 +575,7 @@ def prepare_embeddings(self, input_embeddings=None, input_embedding_ranges=None) length = sum([x.shape[0] for x in input_embeddings]) _MAP = dict(bfloat16=torch.bfloat16, float16=torch.float16) - dtype = _MAP[self.tm_model.config.model_config.data_type] + dtype = _MAP[self.tm_model.engine_config.dtype] values = torch.empty((length, input_embeddings[0].shape[-1]), dtype=dtype, device='cpu') ranges = torch.tensor(input_embedding_ranges, dtype=torch.int32, device='cpu') @@ -695,7 +677,7 @@ async def async_stream_infer(self, if gen_config.response_format is not None: tokenizer = self.tm_model.tokenizer - vocab_size = self.tm_model.config.model_config.vocab_size + vocab_size = self.tm_model._vocab_size try: tokenizer_info = TokenizerInfo.from_huggingface(tokenizer.model.model, vocab_size=vocab_size) diff --git a/lmdeploy/turbomind/weight_format.py b/lmdeploy/turbomind/weight_format.py new file mode 100644 index 0000000000..499cd216ad --- /dev/null +++ b/lmdeploy/turbomind/weight_format.py @@ -0,0 +1,493 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""Weight format resolution for TurboMind checkpoint loading. + +Exports: + +- ``WeightFormat`` (ABC) and six concrete subclasses: ``TrivialFormat``, + ``AWQFormat``, ``GPTQFormat``, ``CompressedTensorFormat``, ``FP8Format``, + ``MXFP4Format``. Each subclass declares its ``name``, ``suffix_map``, + ``weight_dtype`` (``_tm.DataType`` or ``None``), ``has_zero_point`` flag, + and overrides ``accepts`` + ``normalize``. Optional overrides: ``pack`` + (identity default), ``synthesize_zeros`` (raises by default), ``dequant`` + (raises by default; ``TrivialFormat.dequant`` is identity). + +- ``WeightFormatResolver``: holds the model compute dtype plus an ordered + list of candidate formats. ``resolve(params, prefix, *, index=None, + optional=False)`` returns a ``Linear`` bundle in TM layout or raises + (``KeyError`` on missing tensors without ``optional``, ``ValueError`` when + tensors exist but no candidate matches). + +- ``pack_u4_row``: uint8 → int32 row packer used by quantized ``pack`` + overrides and by downstream callers that pack packed-expert weights + after slicing. +""" +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import ClassVar, NamedTuple + +import _turbomind as _tm +import torch +from torch import Tensor + +from .linear import Linear + + +class PackedTensor(NamedTuple): + tensor: torch.Tensor + alloc_shape: list[int] | None # None = inherit from packed tensor + alloc_dtype: _tm.DataType | None # None = inherit from packed tensor + + +# --------------------------------------------------------------------------- +# Low-level u4 packing / unpacking helpers (reused across normalize / pack) +# --------------------------------------------------------------------------- + + +def _get_u4_slices(x: Tensor, dtype: torch.dtype) -> list[Tensor]: + MAP = {torch.int32: 8, torch.uint8: 2} + xs = [] + for _ in range(MAP[x.dtype]): + xs.append((x & 15).to(dtype)) + x = x >> 4 + return xs + + +def _unpack_awq_gemm(x: Tensor) -> Tensor: + xs = _get_u4_slices(x, torch.uint8) + order = [0, 4, 1, 5, 2, 6, 3, 7] + ys = [xs[i] for i in order] + return torch.stack(ys, dim=-1).view(*x.shape[:-1], -1) + + +def pack_u4_row(x: torch.Tensor) -> torch.Tensor: + """Pack uint8 4-bit values into int32 rows along the last dim. + + Used by every int4 format's ``pack`` override and by callers that + re-pack tensors after slicing (e.g. packed-MoE expert split). + """ + assert x.dtype == torch.uint8, f'x.dtype: {x.dtype}' + xs = x.view(*x.shape[:-1], -1, 8).split(1, dim=-1) + a = torch.zeros(xs[0].shape, dtype=torch.int32, device=x.device) + for t in reversed(xs): + a = (a << 4) | t + return a.squeeze(dim=-1) + + +def _zeros_int4_symmetric(scales: Tensor) -> Tensor: + """Synthesize symmetric int4 zero-points (value = 8) matching *scales* + shape.""" + return torch.full(scales.shape, 8, dtype=torch.uint8, device=scales.device) + + +# --------------------------------------------------------------------------- +# WeightFormat ABC +# --------------------------------------------------------------------------- + + +class WeightFormat(ABC): + """Abstract per-format policy object. + + Class attributes (override in subclasses): + + - ``name``: canonical format name used for string comparisons. + - ``suffix_map``: ``{checkpoint_suffix: tm_kind}``. Drives which + checkpoint tensors each format ingests at a given prefix. + - ``weight_dtype``: ``_tm.DataType`` for the weight storage dtype; + ``None`` for trivial (weight dtype equals compute dtype). + - ``has_zero_point``: ``True`` when the format uses a zero-point + tensor; gates the resolver's ``synthesize_zeros`` call. + + Instance attributes (set by subclass ``__init__``): + + - ``block_in``, ``block_out``: quantization block sizes. ``None`` for + dimensions without blocking. + + Methods: + + - ``accepts`` (abstract): classify a checkpoint suffix dict. + - ``normalize`` (abstract): raw-checkpoint tensor → TM layout. + - ``pack``: optional commit-time packer. Identity default. + - ``synthesize_zeros``: fabricate a zeros tensor when the checkpoint + omits it. Raises ``NotImplementedError`` by default. + - ``dequant``: produce a trivial ``{weight, bias?}`` dict from TM + tensors for mixed-format fusion. Raises ``NotImplementedError`` by + default. ``TrivialFormat.dequant`` is identity. + - ``make_data_format``: build the ``_tm.DataFormat`` descriptor. + + Equality / hashing: two WeightFormats are equal iff they share class + and block sizes. This matters for the set-based uniformity checks in + ``concat_out_dim``. + """ + + name: ClassVar[str] + suffix_map: ClassVar[dict[str, str]] + weight_dtype: ClassVar[_tm.DataType | None] + has_zero_point: ClassVar[bool] + + block_in: int | None + block_out: int | None + + def __init__(self, *, block_in: int | None = None, + block_out: int | None = None): + self.block_in = block_in + self.block_out = block_out + + @abstractmethod + def accepts(self, available: dict[str, Tensor]) -> bool: ... + + @abstractmethod + def normalize(self, tensor: Tensor, kind: str) -> Tensor: ... + + def pack(self, tensor: Tensor, kind: str) -> PackedTensor: + return PackedTensor(tensor, None, None) + + def synthesize_zeros(self, scales: Tensor) -> Tensor: + raise NotImplementedError( + f'{type(self).__name__}.synthesize_zeros not implemented') + + def dequant(self, tensors: dict[str, Tensor], + data_type) -> dict[str, Tensor]: + raise NotImplementedError( + f'{type(self).__name__}.dequant not implemented') + + def make_data_format(self, data_type) -> _tm.DataFormat: + if self.weight_dtype is None: + return _tm.ResolveLinearWeightFormat(data_type, data_type, 1, 1) + return _tm.ResolveLinearWeightFormat( + data_type, self.weight_dtype, + self.block_in or 1, self.block_out or 1) + + def __eq__(self, other) -> bool: + if not isinstance(other, WeightFormat): + return NotImplemented + return (type(self) is type(other) + and self.block_in == other.block_in + and self.block_out == other.block_out) + + def __hash__(self) -> int: + return hash((type(self), self.block_in, self.block_out)) + + +# --------------------------------------------------------------------------- +# Concrete subclasses +# --------------------------------------------------------------------------- + + +class TrivialFormat(WeightFormat): + name = 'trivial' + suffix_map = {'.weight': 'weight', '.bias': 'bias'} + weight_dtype = None + has_zero_point = False + + def accepts(self, available: dict[str, Tensor]) -> bool: + if not (available.keys() <= {'.weight', '.bias'}): + return False + w = available.get('.weight') + return w is None or w.dtype.is_floating_point + + def normalize(self, x: Tensor, kind: str) -> Tensor: + x = x.cuda() + if x.dim() >= 2: + x = x.t() + return x + + def dequant(self, tensors, data_type): + # Already trivial — nothing to undo. Identity override for mixed + # fusion groups. + return tensors + + +class AWQFormat(WeightFormat): + name = 'awq' + suffix_map = {'.qweight': 'weight', '.scales': 'scales', + '.qzeros': 'zeros', '.bias': 'bias'} + weight_dtype = _tm.DataType.TYPE_UINT4 + has_zero_point = True + + def __init__(self, *, block_in: int): + super().__init__(block_in=block_in, block_out=None) + + def accepts(self, available: dict[str, Tensor]) -> bool: + qw = available.get('.qweight') + if qw is None or qw.dtype != torch.int32: + return False + scales = available.get('.scales') + if scales is not None and qw.ndim >= 2 and scales.ndim >= 2: + return qw.shape[-1] * 8 == scales.shape[-1] + return True + + def normalize(self, x: Tensor, kind: str) -> Tensor: + # AWQ checkpoints store weights in TM-native layout: + # qweight: [K, N//8] int32 → unpack → [K, N] (TM, no .t()) + # scales: [K//g, N] float16 → already TM + # zeros: [K//g, N//8] int32 → unpack → [K//g, N] + x = x.cuda() + if x.dtype == torch.int32: + x = _unpack_awq_gemm(x) + if kind == 'zeros': + x = x.to(torch.float16) + return x + + def pack(self, tensor: Tensor, kind: str) -> PackedTensor: + if kind == 'weight' and tensor.dtype == torch.uint8: + return PackedTensor(pack_u4_row(tensor), + list(tensor.shape), self.weight_dtype) + return PackedTensor(tensor, None, None) + + def dequant(self, tensors, data_type): + from lmdeploy.pytorch.backends.default.awq_modules import dequantize_gemm + + qweight = tensors['weight'] + scales = tensors['scales'] + qzeros = tensors['zeros'] + group_size = qweight.shape[0] // scales.shape[0] + w = dequantize_gemm(qweight, qzeros, scales, 4, group_size) + result: dict[str, Tensor] = {'weight': w} + if 'bias' in tensors: + result['bias'] = tensors['bias'] + return result + + +class GPTQFormat(WeightFormat): + name = 'gptq' + suffix_map = {'.qweight': 'weight', '.scales': 'scales', + '.qzeros': 'zeros', '.bias': 'bias'} + weight_dtype = _tm.DataType.TYPE_UINT4 + has_zero_point = True + + def __init__(self, *, block_in: int): + super().__init__(block_in=block_in, block_out=None) + + def accepts(self, available: dict[str, Tensor]) -> bool: + qw = available.get('.qweight') + if qw is None or qw.dtype != torch.int32: + return False + scales = available.get('.scales') + if scales is not None and qw.ndim >= 2 and scales.ndim >= 2: + return qw.shape[-1] == scales.shape[-1] + return True + + def normalize(self, x: Tensor, kind: str) -> Tensor: + # GPTQ checkpoint stores weights in TM-native layout: + # qweight: [K//8, N] int32 → unpack → [K, N] + # scales: [K//g, N] float16 → already TM + # zeros: [K//g, N//8] int32 → unpack → [K//g, N] (+1 offset) + x = x.cuda() + if x.dtype == torch.int32: + xs = _get_u4_slices(x, torch.uint8) + if kind == 'weight': + x = torch.stack(xs, dim=1).view(-1, x.size(-1)) + else: + x = torch.stack(xs, dim=-1).view(x.size(0), -1) + 1 + if kind == 'zeros': + x = x.to(torch.float16) + return x + + def pack(self, tensor: Tensor, kind: str) -> PackedTensor: + if kind == 'weight' and tensor.dtype == torch.uint8: + return PackedTensor(pack_u4_row(tensor), + list(tensor.shape), self.weight_dtype) + return PackedTensor(tensor, None, None) + + def synthesize_zeros(self, scales: Tensor) -> Tensor: + return _zeros_int4_symmetric(scales) + + +class CompressedTensorFormat(WeightFormat): + name = 'compressed-tensors' + suffix_map = {'.weight_packed': 'weight', + '.weight_scale': 'scales', + '.weight_zero_point': 'zeros', + '.bias': 'bias'} + weight_dtype = _tm.DataType.TYPE_UINT4 + has_zero_point = True + + def __init__(self, *, block_in: int): + super().__init__(block_in=block_in, block_out=None) + + def accepts(self, available: dict[str, Tensor]) -> bool: + wp = available.get('.weight_packed') + return wp is not None and wp.dtype == torch.int32 + + def normalize(self, x: Tensor, kind: str) -> Tensor: + x = x.cuda() + if x.dtype == torch.int32: + xs = _get_u4_slices(x, torch.uint8) + if kind == 'weight': + x = torch.stack(xs, dim=-1).view(*x.shape[:-1], -1) + elif kind == 'zeros': + x = torch.stack(xs, dim=1).view(-1, x.size(-1)) + if kind == 'zeros': + x = x.to(torch.float16) + if x.dim() >= 2: + x = x.t() + return x + + def pack(self, tensor: Tensor, kind: str) -> PackedTensor: + if kind == 'weight' and tensor.dtype == torch.uint8: + return PackedTensor(pack_u4_row(tensor), + list(tensor.shape), self.weight_dtype) + return PackedTensor(tensor, None, None) + + def synthesize_zeros(self, scales: Tensor) -> Tensor: + return _zeros_int4_symmetric(scales) + + +class FP8Format(WeightFormat): + name = 'fp8' + suffix_map = {'.weight': 'weight', + '.weight_scale_inv': 'scales', + '.bias': 'bias'} + weight_dtype = _tm.DataType.TYPE_FP8_E4M3 + has_zero_point = False + + def __init__(self): + super().__init__(block_in=128, block_out=128) + + def accepts(self, available: dict[str, Tensor]) -> bool: + if '.weight_scale_inv' not in available: + return False + w = available.get('.weight') + return w is None or w.dtype in (torch.float8_e4m3fn, torch.uint8) + + def normalize(self, x: Tensor, kind: str) -> Tensor: + x = x.cuda() + if x.dtype == torch.float8_e4m3fn: + x = x.view(dtype=torch.uint8) + if x.dim() >= 2: + x = x.t() + return x + + def dequant(self, tensors, data_type): + from .builders._base import _CPP_TO_TORCH + + weight = tensors['weight'] + scales = tensors['scales'] + block_size = 128 + fp8_weight = weight.view(torch.float8_e4m3fn).float() + scale = scales.float() + scale = scale.repeat_interleave(block_size, dim=0) + scale = scale.repeat_interleave(block_size, dim=1) + scale = scale[: fp8_weight.shape[0], : fp8_weight.shape[1]] + target_dtype = _CPP_TO_TORCH[data_type] + result: dict[str, Tensor] = {'weight': (fp8_weight * scale).to(target_dtype)} + if 'bias' in tensors: + result['bias'] = tensors['bias'] + return result + + def pack(self, tensor: Tensor, kind: str) -> PackedTensor: + if kind == 'weight': + return PackedTensor(tensor, list(tensor.shape), self.weight_dtype) + return PackedTensor(tensor, None, None) + + +class MXFP4Format(WeightFormat): + name = 'mxfp4' + suffix_map = {'.blocks': 'weight', '.scales': 'scales', '.bias': 'bias'} + weight_dtype = _tm.DataType.TYPE_FP4_E2M1 + has_zero_point = False + + def __init__(self): + super().__init__(block_in=32, block_out=None) + + def accepts(self, available: dict[str, Tensor]) -> bool: + if '.scales' not in available: + return False + w = available.get('.blocks') + return w is None or w.dtype == torch.uint8 + + def normalize(self, x: Tensor, kind: str) -> Tensor: + x = x.cuda() + if kind == 'weight': + xs = _get_u4_slices(torch.flatten(x, start_dim=-2), torch.uint8) + x = torch.flatten(torch.stack(xs, dim=-1), start_dim=-2) + if x.dim() >= 2: + x = x.t() + return x + + def pack(self, tensor: Tensor, kind: str) -> PackedTensor: + if kind == 'weight' and tensor.dtype == torch.uint8: + return PackedTensor(pack_u4_row(tensor), + list(tensor.shape), self.weight_dtype) + return PackedTensor(tensor, None, None) + + +# --------------------------------------------------------------------------- +# Resolver +# --------------------------------------------------------------------------- + + +class WeightFormatResolver: + """Resolve a checkpoint prefix to a ``Linear`` bundle in TM layout. + + Holds the model compute dtype and an ordered list of candidate + formats. ``resolve(params, prefix)`` probes the checkpoint at the + given prefix, dispatches to the first candidate whose ``accepts`` + returns True, and constructs a ``Linear`` with the format's + ``make_data_format`` descriptor. + + The suffix probe is scoped to the union of candidate ``suffix_map`` + keys only — not a global "every format ever" list — so adding a new + format elsewhere does not widen the probe. + + Priority is encoded by list order. The converter puts quantized + candidates first and ``TrivialFormat()`` last: a prefix that only + matches trivial (router, norm-like linears in a quantized model) + deterministically falls through. + + Failure modes are loud and distinct: + + - ``optional=False`` (default) + no tensors at prefix → ``KeyError`` + with candidate suffix list. + - Tensors present but no candidate accepts → ``ValueError`` with + available keys and candidate names. + - Only "no tensors AND optional=True" returns ``None``. + """ + + def __init__(self, *, data_type: _tm.DataType, + formats: list[WeightFormat]): + self._data_type = data_type + self._formats = formats + self._suffixes = frozenset( + s for f in formats for s in f.suffix_map) + + @property + def data_type(self) -> _tm.DataType: + return self._data_type + + def resolve(self, params: dict[str, Tensor], prefix: str, *, + index: int | None = None, + optional: bool = False) -> Linear | None: + available = {s: params[prefix + s] + for s in self._suffixes if (prefix + s) in params} + if index is not None: + available = {s: t[index] for s, t in available.items()} + + if not available: + if optional: + return None + raise KeyError( + f'no checkpoint tensors found at prefix {prefix!r} ' + f'(candidate suffixes: {sorted(self._suffixes)})') + + for fmt in self._formats: + if fmt.accepts(available): + return self._build_linear(fmt, available) + + raise ValueError( + f'no weight format accepts tensors at {prefix!r}: ' + f'got {sorted(available)}, ' + f'tried {[f.name for f in self._formats]}') + + def _build_linear(self, fmt: WeightFormat, + available: dict[str, Tensor]) -> Linear: + tensors = { + kind: fmt.normalize(available[s], kind) + for s, kind in fmt.suffix_map.items() + if s in available + } + if fmt.has_zero_point and 'zeros' not in tensors: + tensors['zeros'] = fmt.synthesize_zeros(tensors['scales']) + return Linear(tensors=tensors, + weight_format=fmt) diff --git a/scripts/test_turbomind_model.py b/scripts/test_turbomind_model.py new file mode 100644 index 0000000000..e35f7774cb --- /dev/null +++ b/scripts/test_turbomind_model.py @@ -0,0 +1,172 @@ +#!/usr/bin/env python3 +"""Smoke-test one TurboMind model for agent/subagent harnesses. + +Code flow: parse argv → configure HF/GPU and run one inference → print sections. + +Stdout is plain text in short sections, for example: + + --- setup --- + model: + tp: + gpus: + TM_DEBUG_LEVEL: DEBUG (only if --debug was passed) + + --- timing --- + pipeline load: s + inference: s + + --- tokens --- + input: + generated: + + --- response begin --- + + --- response end --- + +Exit code: 0 only if no uncaught exception (pipeline load + inference complete). +On failure the full traceback is printed to stderr. +Output quality is not validated. + +Usage (from repo root): + + python scripts/test_turbomind_model.py \\ + [--debug] + +Optional --debug sets TM_DEBUG_LEVEL=DEBUG before loading TurboMind so asynchronous +CUDA errors surface after kernel launch (see TurboMind CUDA helpers). + +Example gpus: "0" for tp=1, "0,1" for tp=2. +""" +from __future__ import annotations + +import os +import sys +import time +import traceback +from typing import NamedTuple + +import huggingface_hub.constants as hf_constants + + +class SmokeResult(NamedTuple): + create_s: float + infer_s: float + text: str + input_token_len: int + generate_token_len: int + + +def _set_hf_cache(path: str) -> None: + hf_constants.HF_HUB_CACHE = path + hf_constants.HF_HUB_OFFLINE = 1 + + +def parse_args(argv: list[str]) -> tuple[str, str, int, str, bool]: + prog = os.path.basename(argv[0]) if argv else 'test_turbomind_model.py' + rest = [a for a in argv[1:] if a != '--debug'] + debug = len(rest) != len(argv) - 1 + + if len(rest) != 4: + print( + f'usage: {prog} [--debug] ', + file=sys.stderr, + ) + sys.exit(2) + + model_path, cache_dir, tp_s, gpus = rest + try: + tp = int(tp_s) + except ValueError: + print(f'invalid tp: {tp_s!r}', file=sys.stderr) + sys.exit(2) + return model_path, cache_dir, tp, gpus, debug + + +def run_smoke_infer( + model_path: str, + cache_dir: str, + tp: int, + gpus: str, + *, + debug: bool = False, +) -> SmokeResult: + _set_hf_cache(cache_dir) + os.environ['CUDA_VISIBLE_DEVICES'] = gpus + if debug: + os.environ['TM_DEBUG_LEVEL'] = 'DEBUG' + + from lmdeploy import GenerationConfig, TurbomindEngineConfig, pipeline + + engine_config = TurbomindEngineConfig( + async_=1, + max_batch_size=4, + session_len=4096, + cache_max_entry_count=0.5, + max_prefill_token_num=1024, + tp=tp, + dp=1, + enable_metrics=False, + communicator='nccl', + ) + gen_config = GenerationConfig(max_new_tokens=128, do_sample=False) + prompt = 'Write a short paragraph about the importance of reading books.' + + t0 = time.perf_counter() + with pipeline(model_path, backend_config=engine_config, log_level='WARNING') as pipe: + create_s = time.perf_counter() - t0 + t1 = time.perf_counter() + out = pipe([prompt], gen_config=gen_config, do_preprocess=True) + infer_s = time.perf_counter() - t1 + + res = out[0] + text = res.text if hasattr(res, 'text') else str(res) + input_token_len = getattr(res, 'input_token_len', -1) + generate_token_len = getattr(res, 'generate_token_len', -1) + return SmokeResult(create_s, infer_s, text, input_token_len, generate_token_len) + + +def print_report( + model_path: str, + tp: int, + gpus: str, + result: SmokeResult, + *, + debug: bool = False, +) -> None: + if not result.text.strip(): + print('warning: empty response text', file=sys.stderr) + + print('--- setup ---') + print(f'model: {model_path}') + print(f'tp: {tp}') + print(f'gpus: {gpus}') + if debug: + print('TM_DEBUG_LEVEL: DEBUG') + print() + print('--- timing ---') + print(f'pipeline load: {result.create_s:.2f} s') + print(f'inference: {result.infer_s:.2f} s') + print() + print('--- tokens ---') + print(f'input: {result.input_token_len}') + print(f'generated: {result.generate_token_len}') + print() + print('--- response begin ---') + print(result.text, end='') + if result.text and not result.text.endswith('\n'): + print() + print('--- response end ---') + + +def main() -> None: + model_path, cache_dir, tp, gpus, debug = parse_args(sys.argv) + result = run_smoke_infer(model_path, cache_dir, tp, gpus, debug=debug) + print_report(model_path, tp, gpus, result, debug=debug) + + +if __name__ == '__main__': + try: + main() + except Exception: + traceback.print_exc() + sys.exit(1) diff --git a/scripts/test_vlm.py b/scripts/test_vlm.py new file mode 100644 index 0000000000..693daa2067 --- /dev/null +++ b/scripts/test_vlm.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python3 +"""Smoke-test InternVL3.5 VLM with an image.""" +import os +import sys +import time + +import huggingface_hub.constants as hf_constants + + +def main(): + model_path = sys.argv[1] if len(sys.argv) > 1 else 'OpenGVLab/InternVL3_5-8B' + cache_dir = sys.argv[2] if len(sys.argv) > 2 else '/nvme2/huggingface_hub/hub' + image_path = sys.argv[3] if len(sys.argv) > 3 else '/data/lmdeploy-modeling/resources/batch_memory.png' + gpus = sys.argv[4] if len(sys.argv) > 4 else '0' + + hf_constants.HF_HUB_CACHE = cache_dir + hf_constants.HF_HUB_OFFLINE = 1 + os.environ['CUDA_VISIBLE_DEVICES'] = gpus + + from lmdeploy import GenerationConfig, TurbomindEngineConfig, pipeline + from lmdeploy.vl import load_image + + engine_config = TurbomindEngineConfig( + async_=1, + max_batch_size=4, + session_len=8192, + cache_max_entry_count=0.5, + max_prefill_token_num=1024, + tp=1, + dp=1, + enable_metrics=False, + communicator='nccl', + ) + gen_config = GenerationConfig(max_new_tokens=256, do_sample=False) + + image = load_image(image_path) + prompt = 'Describe this image in detail. What do you see?' + + print('--- setup ---') + print(f'model: {model_path}') + print(f'image: {image_path}') + print(f'gpus: {gpus}') + print() + + t0 = time.perf_counter() + with pipeline(model_path, backend_config=engine_config, log_level='WARNING') as pipe: + load_s = time.perf_counter() - t0 + print('--- timing ---') + print(f'pipeline load: {load_s:.2f} s') + + t1 = time.perf_counter() + out = pipe([(prompt, image)], gen_config=gen_config, do_preprocess=True) + infer_s = time.perf_counter() - t1 + print(f'inference: {infer_s:.2f} s') + print() + + res = out[0] + text = res.text if hasattr(res, 'text') else str(res) + input_tokens = getattr(res, 'input_token_len', -1) + gen_tokens = getattr(res, 'generate_token_len', -1) + + print('--- tokens ---') + print(f'input: {input_tokens}') + print(f'generated: {gen_tokens}') + print() + print('--- response begin ---') + print(text) + print('--- response end ---') + + +if __name__ == '__main__': + main() diff --git a/src/turbomind/CMakeLists.txt b/src/turbomind/CMakeLists.txt index 11f7d0ed22..2fec4ba1c6 100644 --- a/src/turbomind/CMakeLists.txt +++ b/src/turbomind/CMakeLists.txt @@ -36,4 +36,4 @@ target_link_libraries(turbomind PUBLIC nvtx_utils CUDA::cublasLt CUDA::cudart - yaml-cpp::yaml-cpp) + ) diff --git a/src/turbomind/core/CMakeLists.txt b/src/turbomind/core/CMakeLists.txt index 6e8e6bc49d..c32f8f1609 100644 --- a/src/turbomind/core/CMakeLists.txt +++ b/src/turbomind/core/CMakeLists.txt @@ -22,9 +22,11 @@ add_library(core STATIC context.cc buffer.cc layout.cc + data_format.cc tensor.cc tensor.cu module.cc + registry.cc copy.cc logger.cc) @@ -41,4 +43,7 @@ if (BUILD_TEST) add_executable(test_logger test_logger.cc) target_link_libraries(test_logger PRIVATE core Catch2::Catch2WithMain) + + add_executable(test_data_format test_data_format.cc) + target_link_libraries(test_data_format PRIVATE core Catch2::Catch2WithMain) endif () diff --git a/src/turbomind/core/data_format.cc b/src/turbomind/core/data_format.cc new file mode 100644 index 0000000000..4255073ca5 --- /dev/null +++ b/src/turbomind/core/data_format.cc @@ -0,0 +1,64 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "src/turbomind/core/data_format.h" +#include "src/turbomind/core/check.h" + +namespace turbomind { + +bool DataFormat::is_quantized() const noexcept +{ + if (scales.present() || zeros.present()) { + return true; + } + for (int bs : block_sizes) { + if (bs > 1) { + return true; + } + } + return false; +} + +DataFormat ResolveLinearWeightFormat(DataType data_type, DataType weight_dtype, int block_in, int block_out) +{ + DataFormat fmt; + fmt.dtype = weight_dtype; + + if (IsTrivialFloatType(weight_dtype)) { + TM_CHECK(block_in == 1 && block_out == 1) + << "Trivial float weight requires block_in==1 and block_out==1, got " << block_in << ", " << block_out; + fmt.block_sizes = {1, 1}; + return fmt; + } + + if (weight_dtype == kFloat8_e4m3) { + TM_CHECK(block_in == 128 && block_out == 128) + << "FP8 weight format requires block_in==128 and block_out==128, got " << block_in << ", " << block_out; + fmt.block_sizes = {128, 128}; + fmt.scales.dtype = kFloat; + return fmt; + } + + if (weight_dtype == kFloat4_e2m1) { + TM_CHECK(block_in > 0 && block_out == 1) + << "FP4 weight format requires block_in>0 and block_out==1, got " << block_in << ", " << block_out; + fmt.block_sizes = {block_in, 1}; + fmt.scales.dtype = kUint8; + return fmt; + } + + const bool is_qweight = weight_dtype == kUint4 || weight_dtype == kUint8; + if (is_qweight) { + TM_CHECK(block_in > 0 && block_in <= 256 && block_out == 1) + << "Quantized integer weight requires 0 < block_in <= 256 and block_out==1, got " << block_in << ", " + << block_out; + fmt.block_sizes = {block_in, 1}; + fmt.scales.dtype = data_type; + fmt.zeros.dtype = data_type; + return fmt; + } + + TM_CHECK(0) << "Unsupported weight format: " << to_string(weight_dtype); + return fmt; +} + +} // namespace turbomind diff --git a/src/turbomind/core/data_format.h b/src/turbomind/core/data_format.h new file mode 100644 index 0000000000..70605bd68e --- /dev/null +++ b/src/turbomind/core/data_format.h @@ -0,0 +1,50 @@ +// Copyright (c) OpenMMLab. All rights reserved. +#pragma once + +#include "src/turbomind/core/data_type.h" +#include + +namespace turbomind { + +/// True for trivial (non-quantized) float dtypes: FP32, FP16, BF16. +inline bool IsTrivialFloatType(DataType t) noexcept +{ + return t == kFloat || t == kHalf || t == kBfloat16; +} + +/// Descriptor for a single quantization parameter (scales or zeros). +struct QuantParamDesc { + DataType dtype{}; // kNull means "not present" + bool transposed{}; // stored transposed w.r.t. data tensor + + bool present() const noexcept + { + return dtype != kNull; + } +}; + +/// Universal descriptor for the storage format of a (possibly quantized) tensor. +struct DataFormat { + DataType dtype{}; // element type of the data tensor + std::vector block_sizes; // per-dimension block sizes (1 = no quantization) + QuantParamDesc scales{}; + QuantParamDesc zeros{}; + + /// True if any quantization parameter is present or any block_size > 1. + bool is_quantized() const noexcept; + + /// Number of dimensions described by this format. + int rank() const noexcept + { + return static_cast(block_sizes.size()); + } +}; + +/// Construct the DataFormat for a linear weight tensor in TM [in, out] layout. +/// block_sizes stored in tensor-shape order: {block_in, block_out}, so +/// block_sizes[0] is the K-axis group size and block_sizes[1] is the N-axis. +/// Scales / zeros dtypes are derived from (data_type, weight_dtype) per the +/// format's GEMM convention. Validates that the combination is supported. +DataFormat ResolveLinearWeightFormat(DataType data_type, DataType weight_dtype, int block_in, int block_out); + +} // namespace turbomind diff --git a/src/turbomind/core/module.cc b/src/turbomind/core/module.cc index bfdbd8a1e4..0cbf111b2e 100644 --- a/src/turbomind/core/module.cc +++ b/src/turbomind/core/module.cc @@ -1,78 +1,194 @@ +// Copyright (c) OpenMMLab. All rights reserved. #include "src/turbomind/core/module.h" + #include "src/turbomind/core/check.h" -#include +#include "src/turbomind/core/registry.h" + +#include namespace turbomind::core { -Module::Module(): parent_{} {} +// ====================================================================== +// Module +// ====================================================================== + +Module::Module() = default; + +Module::~Module() = default; -Module::~Module() +// ----- Type info ----- + +const char* Module::type() const { - if (parent_) { - parent_->remove_module(*this); - parent_ = {}; - } + return "Module"; } -void Module::register_module(std::string name, Module& module, std::optional index) +// ----- Hierarchy (default implementations) ----- + +Module* Module::add_child(std::string /*name*/, std::unique_ptr /*child*/) { - module.parent_ = this; - if (index) { - name += "."; - name += std::to_string(*index); + return nullptr; +} + +Module* Module::child(const std::string& /*name*/) const +{ + return nullptr; +} + +void Module::for_each_child(std::function /*visitor*/) const +{ + // default: no-op +} + +// ----- Parameters (default implementations) ----- + +Param Module::param(const std::string& /*name*/) +{ + return {}; +} + +void Module::for_each_param(std::function /*visitor*/) +{ + // default: no-op +} + +// ----- Lifecycle ----- + +void Module::prepare() +{ + for_each_child([](const char* /*name*/, Module* child) { + if (child) + child->prepare(); + }); +} + +// ----- Registry-driven child creation ----- + +std::unique_ptr Module::create(const ModuleConfig& config) +{ + return ModuleRegistry::instance().create(std::string(config.module_type), config); +} + +Module* Module::create_child(const std::string& name, const ModuleConfig& config) +{ + auto mod = create(config); + if (!mod) { + return nullptr; } - // std::cout << "register Module " << name << " " << &module << ", parent " << this << "\n"; - modules_.emplace_back(std::move(name), &module); + return add_child(name, std::move(mod)); } -void Module::register_parameter(std::string name, Tensor& param) +// ----- Lookup ----- + +Module* Module::get(const std::string& segment) { - // std::cout << "register Parameter " << name << " " << ¶m << " " << param.layout() << "\n"; - params_.emplace_back(std::move(name), ¶m); + auto* c = child(segment); + TM_CHECK(c != nullptr) << "child '" << segment << "' not found in " << type(); + return c; } -void Module::remove_module(Module& module) +// ----- Verification ----- + +bool Module::verify(std::vector& missing) { - for (auto it = modules_.begin(); it != modules_.end(); ++it) { - if (it->second == &module) { - // std::cout << "erase " << it->first << " " << &module << " from " << this << "\n"; - modules_.erase(it); - return; + // Recurse into children + for_each_child([&](const char* /*name*/, Module* child) { + if (child) + child->verify(missing); + }); + + // Check parameters are initialized + for_each_param([&](const char* name, Tensor& tensor) { + if (!tensor) { + missing.push_back(full_path() + "." + name); } + }); + + return missing.empty(); +} + +// ----- Utilities ----- + +std::string Module::full_path() const +{ + if (!parent_) { + return name_; + } + std::string pp = parent_->full_path(); + if (pp.empty()) { + return name_; } - TM_CHECK(0) << "module " << &module << " not found"; + return pp + "." + name_; } -void Module::remove_parameter(Tensor& param) +// ====================================================================== +// ModuleList +// ====================================================================== + +Module* ModuleList::add_child(std::string name, std::unique_ptr child) { - for (auto it = params_.begin(); it != params_.end(); ++it) { - if (it->second == ¶m) { - params_.erase(it); - return; + TM_CHECK(child != nullptr); + TM_CHECK(child->parent_ == nullptr) << "module already has a parent"; + + // Parse index before moving name. + int index = -1; + { + std::istringstream iss(name); + iss >> index; + if (!iss.eof()) { + index = -1; + } + } + + child->parent_ = this; + child->name_ = name; + + Module* raw = child.get(); + items_.emplace_back(std::move(name), std::move(child)); + + if (index >= 0) { + if (index >= static_cast(indexed_.size())) { + indexed_.resize(index + 1, nullptr); } + indexed_[index] = raw; } - TM_CHECK(0) << "param " << ¶m << " not found"; + + return raw; } -std::unordered_map Module::get_parameters() const +Module* ModuleList::child(const std::string& name) const { - std::unordered_map m; - get_parameters_impl({}, m); - return m; + for (auto& [n, c] : items_) { + if (n == name) { + return c.get(); + } + } + return nullptr; } -void Module::get_parameters_impl(std::string prefix, std::unordered_map& m) const +void ModuleList::for_each_child(std::function visitor) const { - if (!prefix.empty()) { - prefix += "."; + for (auto& [name, c] : items_) { + visitor(name.c_str(), c.get()); } - for (const auto& [k, v] : params_) { - m.emplace(prefix + k, v); - } - for (const auto& [k, v] : modules_) { - v->get_parameters_impl(prefix + k, m); +} + +int ModuleList::size() const +{ + int n = 0; + for (auto* p : indexed_) { + if (p) { + ++n; + } } + return n; } +// ====================================================================== +// ModuleList registry +// ====================================================================== + +TM_MODULE_REGISTER(ModuleList, ModuleListConfig); + } // namespace turbomind::core diff --git a/src/turbomind/core/module.h b/src/turbomind/core/module.h index 147a3d6593..e33c0344ea 100644 --- a/src/turbomind/core/module.h +++ b/src/turbomind/core/module.h @@ -2,11 +2,205 @@ #ifndef TURBOMIND_CORE_MODULE_H #define TURBOMIND_CORE_MODULE_H +#include +#include +#include +#include +#include +#include +#include + +#include "src/turbomind/core/allocator.h" +#include "src/turbomind/core/check.h" +#include "src/turbomind/core/data_type.h" +#include "src/turbomind/core/registry.h" #include "src/turbomind/core/tensor.h" namespace turbomind::core { +// ====================================================================== +// X-macro config field infrastructure +// ====================================================================== + +#define TM_MEMBER(Type, name, ...) Type name{__VA_ARGS__}; +#define TM_PTR(Type, name, ...) visitor(#name, &Config::name); +#define TM_FOR_EACH(ClassName, field_list) \ + template \ + static void for_each(Visitor&& visitor) \ + { \ + using Config = ClassName; \ + field_list(TM_PTR) \ + } + +// ====================================================================== +// ModuleConfig — plain base for typed config structs +// ====================================================================== + +struct ModuleConfig { + std::string_view module_type; +}; + +struct ModuleListConfig: ModuleConfig { + ModuleListConfig(): ModuleConfig{"ModuleList"} {} + template + static void for_each(Visitor&&) + { + } +}; + +// ====================================================================== +// X-macro expansion macros for Module-derived classes +// +// Usage in a derived class header: +// +// #define MY_CHILDREN(X) \ +// X(LinearWeight, w1) \ +// X(NormWeight, norm) +// +// #define MY_PARAMS(X) \ +// X(weight) \ +// X(bias) +// +// class MyWeight: public Module { +// public: +// MY_CHILDREN(TM_CHILD_MEMBER) +// MY_PARAMS(TM_PARAM_MEMBER) +// +// // Optional: override virtuals using the CASE macros +// Module* add_child(std::string name, std::unique_ptr child) override; +// Module* child(const std::string& name) const override; +// Param param(const std::string& name) override; +// void for_each_child(std::function visitor) const override; +// void for_each_param(std::function visitor) override; +// }; +// +// // In the .cc file: +// Module* MyWeight::add_child(std::string name, std::unique_ptr child) { +// MY_CHILDREN(TM_ADD_CHILD_CASE) +// return nullptr; +// } +// // ... etc. +// ====================================================================== + +/// Declares a unique_ptr member named `name`. +#define TM_CHILD_MEMBER(Type, name) std::unique_ptr name; + +/// Declares a Tensor member named `name`. +#define TM_PARAM_MEMBER(name) core::Tensor name{}; + +/// Fragment for add_child() override body: matches name and stores child. +/// Assumes member `std::unique_ptr name` and local `std::string name_str`. +#define TM_ADD_CHILD_CASE(Type, name) \ + if (name_str == #name) { \ + TM_CHECK_EQ(child->type(), Type().type()); \ + name.reset(static_cast(child.release())); \ + attach_child_(name.get(), this, std::move(name_str)); \ + return name.get(); \ + } + +/// Fragment for child() override body: matches name and returns pointer. +#define TM_CHILD_CASE(Type, name) \ + if (name_str == #name) { \ + return name.get(); \ + } + +/// Fragment for param() override body: matches name and returns Param handle. +#define TM_PARAM_CASE(name) \ + if (name_str == #name) { \ + return core::Param{&name}; \ + } + +/// Fragment for for_each_child() override body: visits child. +#define TM_VISIT_CHILD(Type, name) visitor(#name, name.get()); + +/// Fragment for for_each_param() override body: visits param. +#define TM_VISIT_PARAM(name) visitor(#name, name); + +/// Declares data members (children + params) and virtual method overrides. +/// Used in the public section of a derived class. +#define TM_MODULE_DECLARE(Class, ChildrenX, ParamsX) \ + ChildrenX(TM_CHILD_MEMBER) ParamsX(TM_PARAM_MEMBER) core::Module* add_child( \ + std::string name, std::unique_ptr child) override; \ + core::Module* child(const std::string& name) const override; \ + core::Param param(const std::string& name) override; \ + void for_each_child(std::function visitor) const override; \ + void for_each_param(std::function visitor) override; + +/// Defines all X-macro generated method bodies for a derived module class. +/// Used in the .cc file. ChildrenX/ParamsX may be empty macros. +#define TM_MODULE_METHODS(Class, ChildrenX, ParamsX) \ + core::Module* Class::add_child(std::string name, std::unique_ptr child) \ + { \ + std::string name_str = std::move(name); \ + ChildrenX(TM_ADD_CHILD_CASE) return nullptr; \ + } \ + core::Module* Class::child(const std::string& name_str) const \ + { \ + ChildrenX(TM_CHILD_CASE) return nullptr; \ + } \ + core::Param Class::param(const std::string& name_str) \ + { \ + ParamsX(TM_PARAM_CASE) return {}; \ + } \ + void Class::for_each_child(std::function visitor) const \ + { \ + ChildrenX(TM_VISIT_CHILD) \ + } \ + void Class::for_each_param(std::function visitor) \ + { \ + ParamsX(TM_VISIT_PARAM) \ + } + +// ====================================================================== +// Param — lightweight handle to a Module parameter slot +// ====================================================================== + +/// Lightweight handle to a Tensor slot within a Module. +/// Returned by Module::param(name). Used for per-param allocation. +class Param { + Tensor* slot_; + +public: + Param(Tensor* slot = nullptr): slot_(slot) {} + + /// Allocate the tensor with explicit shape/dtype. Returns the tensor for data copy. + Tensor alloc(const std::vector& shape, DataType dtype) + { + TM_CHECK(slot_ != nullptr); + auto layout = Layout{std::vector(shape.begin(), shape.end())}; + *slot_ = Tensor{std::move(layout), dtype, kDEVICE}; + return *slot_; + } + + /// Get current tensor (empty if not yet allocated). + Tensor get() const + { + return slot_ ? *slot_ : Tensor{}; + } + + explicit operator bool() const + { + return slot_ && static_cast(*slot_); + } +}; + +// ====================================================================== +// Module — type-erased hierarchical module with virtual lifecycle +// ====================================================================== + +/// Type-erased hierarchical module with virtual lifecycle. +/// +/// The module tree is built explicitly via ``create_child()`` from the Python +/// loading pipeline. Children are looked up by name; no lazy creation. +/// - ``prepare()`` runs post-load processing (format conversion, fusion). +/// - ``verify()`` walks the tree and collects uninitialized params/modules. +/// +/// Derived classes use X-macro hooks (TM_CHILD_MEMBER, TM_PARAM_MEMBER, etc.) +/// to declare children and parameters as direct members, overriding the +/// virtual lookup methods to match by name. class Module { + friend class ModuleList; + public: virtual ~Module(); @@ -14,26 +208,133 @@ class Module { Module(const Module&) = delete; Module& operator=(const Module&) = delete; + Module(Module&&) = delete; + Module& operator=(Module&&) = delete; - Module(Module&&) noexcept = delete; - Module& operator=(Module&&) noexcept = delete; + // ----- Type info ----- - void register_module(std::string name, Module& module, std::optional index = {}); - void register_parameter(std::string name, Tensor& param); + /// Returns a static string identifying the module type (e.g., "LinearWeight", "NormWeight"). + virtual const char* type() const; - void remove_module(Module& module); - void remove_parameter(Tensor& param); + // ----- Hierarchy (virtual, overridden by derived classes) ----- - std::unordered_map get_parameters() const; + /// Owns child; registers it under the given local name. + /// Returns raw pointer to the added child, or nullptr if name not recognized. + /// Default: returns nullptr. + virtual Module* add_child(std::string name, std::unique_ptr child); -private: - void get_parameters_impl(std::string prefix, std::unordered_map& m) const; + /// Find a direct child by name. Default: returns nullptr. + virtual Module* child(const std::string& name) const; + + /// Iterate over all children. Default: no-op. + virtual void for_each_child(std::function visitor) const; + + // ----- Parameters (virtual, overridden by derived classes) ----- + + /// Find a parameter by name within this module. Default: returns empty Param. + virtual Param param(const std::string& name); + + /// Iterate over all parameters. Default: no-op. + virtual void for_each_param(std::function visitor); + + // ----- Lifecycle (virtual, default = recurse / no-op) ----- + + /// Post-load processing: weight format conversion, fusion. + /// Default recurses into children via for_each_child. + virtual void prepare(); + + // ----- Registry-driven child creation ----- + + /// Create a standalone module using the type registry (no parent binding). + static std::unique_ptr create(const ModuleConfig& config); + + /// Create a child module using the type registry and attach it. + /// Uses config.module_type to look up the factory. + /// Returns pointer to the created child, or nullptr on failure. + Module* create_child(const std::string& name, const ModuleConfig& config = {}); + + /// Typed child accessor. Aborts if child not found. + template + T* get(const std::string& name) const + { + auto* c = child(name); + TM_CHECK(c != nullptr) << "child '" << name << "' not found in " << type(); + return static_cast(c); + } + + /// Find a child by single segment name. Aborts on null. + Module* get(const std::string& segment); + + // ----- Verification ----- + + /// Walk subtree, collect paths of uninitialized params/modules into ``missing``. + /// Composite modules override to also check required children exist. + /// Returns true if everything is OK. + virtual bool verify(std::vector& missing); + + // ----- Utilities ----- + + /// Build the fully-qualified path by walking up the parent chain. + std::string full_path() const; + + /// Access the parent module (nullptr for root). + Module* parent() const noexcept + { + return parent_; + } + + /// Access the local name of this module within its parent. + const std::string& name() const noexcept + { + return name_; + } protected: - Module* parent_; + Module* parent_ = nullptr; + std::string name_; + + /// Helper for add_child() overrides: sets parent and name on a child module. + /// This is needed because derived classes cannot access protected members + /// of other Module instances through the C++ protected access rules. + static void attach_child_(Module* child, Module* parent, std::string name) + { + child->parent_ = parent; + child->name_ = std::move(name); + } +}; + +// ====================================================================== +// ModuleList — indexed container for layer/expert sequences +// ====================================================================== + +/// A systematic container for indexed module sequences (layers, experts). +/// Children are added explicitly via ``add_child`` or ``create_child``. +class ModuleList: public Module { +public: + const char* type() const override + { + return "ModuleList"; + } - std::vector> modules_; - std::vector> params_; + ModuleList() = default; + + explicit ModuleList(const core::ModuleListConfig&) {} // empty config, no-op + + /// Override to also track the child in the indexed_ vector. + Module* add_child(std::string name, std::unique_ptr child) override; + + /// Find child by name. + Module* child(const std::string& name) const override; + + /// Iterate over children. + void for_each_child(std::function visitor) const override; + + /// Number of children created so far. + int size() const; + +private: + std::vector>> items_; + std::vector indexed_; }; } // namespace turbomind::core diff --git a/src/turbomind/core/registry.cc b/src/turbomind/core/registry.cc new file mode 100644 index 0000000000..5bae97e953 --- /dev/null +++ b/src/turbomind/core/registry.cc @@ -0,0 +1,35 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "src/turbomind/core/registry.h" + +#include "src/turbomind/core/check.h" +#include "src/turbomind/core/module.h" + +namespace turbomind::core { + +ModuleRegistry& ModuleRegistry::instance() +{ + static ModuleRegistry reg; + return reg; +} + +void ModuleRegistry::register_type(const std::string& name, Factory factory) +{ + factories_[name] = std::move(factory); +} + +std::unique_ptr ModuleRegistry::create(const std::string& type, const ModuleConfig& config) const +{ + auto it = factories_.find(type); + if (it == factories_.end()) { + return nullptr; + } + return it->second(config); +} + +bool ModuleRegistry::has_type(const std::string& name) const +{ + return factories_.count(name) > 0; +} + +} // namespace turbomind::core diff --git a/src/turbomind/core/registry.h b/src/turbomind/core/registry.h new file mode 100644 index 0000000000..8401dbcb81 --- /dev/null +++ b/src/turbomind/core/registry.h @@ -0,0 +1,57 @@ +// Copyright (c) OpenMMLab. All rights reserved. +#pragma once + +#include +#include +#include +#include + +namespace turbomind::core { + +// Forward declarations — full definitions in module.h. +struct ModuleConfig; +class Module; + +/// Module type registry. Maps type name strings to factory functions. +class ModuleRegistry { +public: + using Factory = std::function(const ModuleConfig&)>; + + static ModuleRegistry& instance(); + + /// Register a factory under the given type name. + /// Duplicate names overwrite silently. + void register_type(const std::string& name, Factory factory); + + /// Convenience overload: derive the factory lambda from the concrete types. + /// `CfgT` defaults to `ModuleConfig` so callers that accept the base config + /// need not specify it explicitly. + template + void register_type(const std::string& name) + { + register_type(name, [](const ModuleConfig& cfg) -> std::unique_ptr { + return std::make_unique(static_cast(cfg)); + }); + } + + /// Create a module instance by type name and typed config. + /// Returns nullptr if type name is not registered. + std::unique_ptr create(const std::string& type, const ModuleConfig& config) const; + + /// Check if a type name is registered. + bool has_type(const std::string& name) const; + +private: + ModuleRegistry() = default; + std::map factories_; +}; + +} // namespace turbomind::core + +#define TM_MODULE_REGISTER(ModuleClass, ConfigType) \ + namespace { \ + static const bool _tm_module_registered_ = [] { \ + ::turbomind::core::ModuleRegistry::instance().register_type(#ModuleClass); \ + return true; \ + }(); \ + } diff --git a/src/turbomind/core/test_data_format.cc b/src/turbomind/core/test_data_format.cc new file mode 100644 index 0000000000..fcb4997927 --- /dev/null +++ b/src/turbomind/core/test_data_format.cc @@ -0,0 +1,76 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "src/turbomind/core/data_format.h" + +#include "catch2/catch_test_macros.hpp" + +using namespace turbomind; + +TEST_CASE("DataFormat default is not quantized", "[data_format]") +{ + DataFormat fmt; + REQUIRE(!fmt.is_quantized()); + REQUIRE(fmt.rank() == 0); + REQUIRE(!fmt.scales.present()); + REQUIRE(!fmt.zeros.present()); +} + +TEST_CASE("DataFormat trivial is not quantized", "[data_format]") +{ + DataFormat fmt = ResolveLinearWeightFormat(kHalf, kHalf, 1, 1); + REQUIRE(!fmt.is_quantized()); + REQUIRE(fmt.rank() == 2); + REQUIRE(fmt.block_sizes == std::vector{1, 1}); + REQUIRE(!fmt.scales.present()); + REQUIRE(!fmt.zeros.present()); +} + +TEST_CASE("DataFormat FP8 blocked", "[data_format]") +{ + DataFormat fmt = ResolveLinearWeightFormat(kHalf, kFloat8_e4m3, 128, 128); + REQUIRE(fmt.is_quantized()); + REQUIRE(fmt.dtype == kFloat8_e4m3); + REQUIRE(fmt.block_sizes == std::vector{128, 128}); + REQUIRE(fmt.scales.present()); + REQUIRE(fmt.scales.dtype == kFloat); + REQUIRE(!fmt.zeros.present()); +} + +TEST_CASE("DataFormat FP4", "[data_format]") +{ + DataFormat fmt = ResolveLinearWeightFormat(kHalf, kFloat4_e2m1, 128, 1); + REQUIRE(fmt.is_quantized()); + REQUIRE(fmt.dtype == kFloat4_e2m1); + REQUIRE(fmt.block_sizes == std::vector{128, 1}); + REQUIRE(fmt.scales.present()); + REQUIRE(fmt.scales.dtype == kUint8); + REQUIRE(!fmt.zeros.present()); +} + +TEST_CASE("DataFormat AWQ uint4", "[data_format]") +{ + DataFormat fmt = ResolveLinearWeightFormat(kHalf, kUint4, 128, 1); + REQUIRE(fmt.is_quantized()); + REQUIRE(fmt.dtype == kUint4); + REQUIRE(fmt.block_sizes == std::vector{128, 1}); + REQUIRE(fmt.scales.present()); + REQUIRE(fmt.scales.dtype == kHalf); + REQUIRE(fmt.zeros.present()); + REQUIRE(fmt.zeros.dtype == kHalf); +} + +TEST_CASE("DataFormat uint8 quantized", "[data_format]") +{ + DataFormat fmt = ResolveLinearWeightFormat(kBfloat16, kUint8, 64, 1); + REQUIRE(fmt.is_quantized()); + REQUIRE(fmt.block_sizes == std::vector{64, 1}); + REQUIRE(fmt.scales.dtype == kBfloat16); + REQUIRE(fmt.zeros.dtype == kBfloat16); +} + +TEST_CASE("DataFormat trivial BF16", "[data_format]") +{ + DataFormat fmt = ResolveLinearWeightFormat(kBfloat16, kBfloat16, 1, 1); + REQUIRE(!fmt.is_quantized()); + REQUIRE(fmt.dtype == kBfloat16); +} diff --git a/src/turbomind/engine/engine.cc b/src/turbomind/engine/engine.cc index 391a034dae..02bd5a9bc8 100644 --- a/src/turbomind/engine/engine.cc +++ b/src/turbomind/engine/engine.cc @@ -18,9 +18,12 @@ #include "src/turbomind/core/copy.h" #include "src/turbomind/core/logger.h" +#include "src/turbomind/models/decoder_layer_weight.h" +#include "src/turbomind/models/delta_net_weight.h" #include "src/turbomind/models/language_model.h" #include "src/turbomind/models/llama/SequenceManager.h" #include "src/turbomind/models/llama/llama_params.h" +#include "src/turbomind/models/model_weight.h" #include "src/turbomind/utils/metrics.h" // #include "dbg.h" @@ -53,14 +56,14 @@ struct Engine::Impl { using Requests = vector>; using Signal = std::function; - Impl(DataType dtype, - EngineParam param, - LanguageModel model, - Context& ctx, - Gateway& gateway, - int device_id, - int queue_id, - int phases); + Impl(EngineParam param, + LanguageModel model, + const ModelWeight& weights, + Context& ctx, + Gateway& gateway, + int device_id, + int queue_id, + int phases); void CreateSequenceManager(); @@ -102,7 +105,6 @@ struct Engine::Impl { ~Impl(); - const DataType dtype_; const EngineParam param_; Gateway& gateway_; @@ -126,8 +128,9 @@ struct Engine::Impl { Queue> inbound_; Queue> outbound_; - LanguageModel model_; - ModelExecutor executor_; + LanguageModel model_; + const ModelWeight& weights_; + ModelExecutor executor_; std::thread internal_thread_; @@ -172,15 +175,14 @@ Engine::Impl::~Impl() executor_ = {}; } -Engine::Impl::Impl(DataType dtype, - EngineParam param, - LanguageModel model, - Context& ctx, - Gateway& gateway, - int device_id, - int queue_id, - int phases): - dtype_{dtype}, +Engine::Impl::Impl(EngineParam param, + LanguageModel model, + const ModelWeight& weights, + Context& ctx, + Gateway& gateway, + int device_id, + int queue_id, + int phases): param_{param}, gateway_{gateway}, tp_group_{ctx.comm.h_tp_group}, @@ -192,7 +194,8 @@ Engine::Impl::Impl(DataType dtype, queue_id_{queue_id}, async_{phases > 1}, is_warm_up_{*ctx.is_warm_up}, - model_{std::move(model)} + model_{std::move(model)}, + weights_{weights} { states_.emplace_back(); @@ -204,26 +207,53 @@ Engine::Impl::Impl(DataType dtype, CreateSequenceManager(); // initializes `session_len_trunc_` - const ssize_t max_batch_block_num = - param.max_batch_size * cdiv(session_len_trunc_, model_.attn_param().cache_block_seq_len); - block_ptrs_buf_ = {max_batch_block_num, kCPUpinned}; - block_ptrs_offsets_buf_ = {param.max_batch_size + 1, kCPUpinned}; + const ssize_t max_batch_block_num = param.max_batch_size * cdiv(session_len_trunc_, param_.cache_block_seq_len); + block_ptrs_buf_ = {max_batch_block_num, kCPUpinned}; + block_ptrs_offsets_buf_ = {param.max_batch_size + 1, kCPUpinned}; } void Engine::Impl::CreateSequenceManager() { - const auto cache_block_seq_len = model_.attn_param().cache_block_seq_len; + const auto cache_block_seq_len = param_.cache_block_seq_len; + + // Derive DeltaNet fields if linear attention exists + bool has_linear_attention = false; + int linear_key_head_dim = 0, linear_value_head_dim = 0; + int linear_conv_kernel_dim = 0, linear_num_key_heads = 0, linear_num_value_heads = 0; + for (int i = 0; i < weights_.num_layer; ++i) { + if (auto* dn = weights_.layer(i)->linear_attn.get()) { + has_linear_attention = true; + linear_key_head_dim = dn->key_head_dim; + linear_value_head_dim = dn->value_head_dim; + linear_conv_kernel_dim = dn->d_conv; + linear_num_key_heads = dn->num_k_heads * param_.attn_tp_size; + linear_num_value_heads = dn->num_v_heads * param_.attn_tp_size; + break; + } + } - const auto& model_param = model_.model_param(); + if (has_linear_attention && param_.enable_prefix_caching) { + TM_CHECK(0) << "Prefix caching is unsupported when linear attention is present"; + } - const auto get_free_size = [&] { // + const auto get_free_size = [&] { size_t free{}, total{}; check_cuda_error(cudaMemGetInfo(&free, &total)); return AllReduce(tp_group_, free, comm::RedOp::kMin); }; - seq_mgr_ = std::make_unique(model_param, - dtype_, + seq_mgr_ = std::make_unique(weights_.head_dim, + weights_.kv_head_num / param_.attn_tp_size, + weights_.num_layer, + weights_.layer_types, + param_.quant_policy, + weights_.data_type, + weights_.data_type, // runtime_dtype = data_type + linear_key_head_dim, + linear_value_head_dim, + linear_conv_kernel_dim, + linear_num_key_heads, + linear_num_value_heads, cache_block_seq_len, param_.attn_tp_size, param_.max_batch_size, @@ -248,7 +278,13 @@ void Engine::Impl::Validate(Requests& infer_reqs, Requests& kill_reqs) std::pmr::monotonic_buffer_resource mbr; std::pmr::unordered_map occur(&mbr); - const bool has_linear_attention = HasLinearAttention(model_.model_param()); + bool has_linear_attention = false; + for (auto t : weights_.layer_types) { + if (t == 1) { + has_linear_attention = true; + break; + } + } auto count = [&occur](const auto& reqs) { for (const auto& r : reqs) { @@ -874,15 +910,15 @@ Engine::Engine() = default; Engine::Engine(Engine&&) noexcept = default; Engine& Engine::operator=(Engine&&) noexcept = default; -Engine::Engine(DataType dtype, - EngineParam param, - LanguageModel model, - Context& ctx, - Gateway& gateway, - int device_id, - int dp_rank, - int phases): - impl_{std::make_unique(dtype, param, std::move(model), ctx, gateway, device_id, dp_rank, phases)} +Engine::Engine(EngineParam param, + LanguageModel model, + const ModelWeight& weights, + Context& ctx, + Gateway& gateway, + int device_id, + int dp_rank, + int phases): + impl_{std::make_unique(param, std::move(model), weights, ctx, gateway, device_id, dp_rank, phases)} { } diff --git a/src/turbomind/engine/engine.h b/src/turbomind/engine/engine.h index ea26d196a7..b7e75d268c 100644 --- a/src/turbomind/engine/engine.h +++ b/src/turbomind/engine/engine.h @@ -26,14 +26,14 @@ class Engine { return static_cast(impl_); } - Engine(DataType dtype, - EngineParam param, - LanguageModel model, - Context& ctx, - Gateway& gateway, - int device_id, - int queue_id, - int phases); + Engine(EngineParam param, + LanguageModel model, + const ModelWeight& weights, + Context& ctx, + Gateway& gateway, + int device_id, + int queue_id, + int phases); void Start(); diff --git a/src/turbomind/engine/engine_config.h b/src/turbomind/engine/engine_config.h new file mode 100644 index 0000000000..2c0381e9c4 --- /dev/null +++ b/src/turbomind/engine/engine_config.h @@ -0,0 +1,46 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#pragma once + +#include +#include + +#include "src/turbomind/core/data_type.h" +#include "src/turbomind/core/module.h" + +namespace turbomind { + +struct EngineConfig { +#define ENGINE_FIELDS(X) \ + X(DataType, data_type) \ + X(int, cache_block_seq_len, 0) \ + X(int, quant_policy, 0) \ + X(int, tune_layer_num, 1) \ + X(int, max_batch_size, 0) \ + X(int, max_prefill_token_num, 0) \ + X(int, max_context_token_num, 0) \ + X(int, session_len, 0) \ + X(float, cache_max_block_count, 0) \ + X(int, cache_chunk_size, 0) \ + X(bool, enable_prefix_caching, false) \ + X(bool, enable_metrics, false) \ + X(int, num_tokens_per_iter, 0) \ + X(int, max_prefill_iters, 1) \ + X(int, async_, 0) \ + X(int, outer_dp_size) \ + X(int, attn_dp_size) \ + X(int, attn_tp_size) \ + X(int, attn_cp_size) \ + X(int, mlp_tp_size) \ + X(std::vector, devices) \ + X(int, nnodes) \ + X(int, node_rank) \ + X(std::string, communicator) + + ENGINE_FIELDS(TM_MEMBER) + TM_FOR_EACH(EngineConfig, ENGINE_FIELDS) + +#undef ENGINE_FIELDS +}; + +} // namespace turbomind diff --git a/src/turbomind/kernels/gemm/CMakeLists.txt b/src/turbomind/kernels/gemm/CMakeLists.txt index 0cc5ba8d37..fe666fa2ac 100644 --- a/src/turbomind/kernels/gemm/CMakeLists.txt +++ b/src/turbomind/kernels/gemm/CMakeLists.txt @@ -51,15 +51,14 @@ set_property(TARGET gemm2 PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) if (BUILD_TEST) - add_executable(test_gemm_v2 - test/test_gemm_v2.cc - ../../models/llama/LlamaLinear.cu - ../../models/llama/LlamaDenseWeight.cc - test/reference.cu) - target_link_libraries(test_gemm_v2 PRIVATE gemm2 core cublas quantization_kernels gpt_kernels) + # add_executable(test_gemm_v2 + # test/test_gemm_v2.cc + # ../../models/llama/LlamaLinear.cu + # test/reference.cu) + # target_link_libraries(test_gemm_v2 PRIVATE gemm2 core cublas quantization_kernels gpt_kernels) - add_executable(test_moe_utils test/test_moe_utils.cu test/test_utils.cu) - target_link_libraries(test_moe_utils PRIVATE gemm2 core cublas) + # add_executable(test_moe_utils test/test_moe_utils.cu test/test_utils.cu) + # target_link_libraries(test_moe_utils PRIVATE gemm2 core cublas) # if (NOT MSVC) # FetchContent_Declare( diff --git a/src/turbomind/kernels/gemm/convert_v3.cu b/src/turbomind/kernels/gemm/convert_v3.cu index 39fef1a858..36dabc301e 100644 --- a/src/turbomind/kernels/gemm/convert_v3.cu +++ b/src/turbomind/kernels/gemm/convert_v3.cu @@ -112,7 +112,7 @@ std::array GetConverters(DataType data_type, // clang-format on } else { - return {}; // trivial case: dense floating point + return {}; // trivial case: no quantization } } diff --git a/src/turbomind/kernels/gemm/kernel_impl_sm90.h b/src/turbomind/kernels/gemm/kernel_impl_sm90.h index 14ebd5d78b..e787d2d701 100644 --- a/src/turbomind/kernels/gemm/kernel_impl_sm90.h +++ b/src/turbomind/kernels/gemm/kernel_impl_sm90.h @@ -173,6 +173,8 @@ class KernelImplSm90: public Kernel { [[maybe_unused]] const int n = Ddesc.cols; [[maybe_unused]] const int k = Adesc.cols; + TM_CHECK_GE(cdiv(k, TILE_K), 2) << "The kernel requires at least 2 k-tiles to work"; + // std::cout << "M: " << m << ", N: " << n << ", K: " << k << "\n"; auto transpose = [](MatrixLayout x) { diff --git a/src/turbomind/kernels/gemm/test/testbed_v3.h b/src/turbomind/kernels/gemm/test/testbed_v3.h index f1df7456d5..677bd7ce60 100644 --- a/src/turbomind/kernels/gemm/test/testbed_v3.h +++ b/src/turbomind/kernels/gemm/test/testbed_v3.h @@ -11,7 +11,7 @@ #include "src/turbomind/kernels/gemm/types.h" #include "src/turbomind/kernels/quantization.h" -#include "src/turbomind/models/llama/LlamaDenseWeight.h" +#include "src/turbomind/models/linear_weight.h" #include "src/turbomind/models/llama/LlamaLinear.h" #include "src/turbomind/kernels/gpt_kernels.h" @@ -21,8 +21,7 @@ namespace turbomind { using std::vector; using std::unique_ptr; -using DenseWeight = LlamaDenseWeight; -using Linear = LlamaLinear; +using Linear = LlamaLinear; using namespace gemm; @@ -77,6 +76,55 @@ static Tensor CopyTransposed(const Tensor& src, Tensor out = {}) return out; } +/// Link individual expert weights into a batched block view for fused MoE. +static void LinkExperts(std::function experts, int n, LinearWeight& d) +{ + const auto& e0 = *experts(0); + + e0.copy_metadata_to(d); + + d.k_desc.num = d.q_desc.num = n; + + if (e0.bias()) { + d.bias() = Tensor{{n, e0.output_dim}, e0.bias().dtype(), kDEVICE}; + } + + std::vector> weights; + std::vector> scales; + + for (int i = 0; i < n; ++i) { + auto& e = *experts(i); + weights.emplace_back(e.weight().raw_data(), e.k_desc.ld); + if (e.scales()) { + scales.emplace_back(e.scales().raw_data(), e.q_desc.ld); + } + if (e.bias()) { + Copy(e.bias(), d.bias().slice(i, 1).squeeze(0)); + } + } + + auto stream = core::Context::stream().handle(); + + if (d.weight_format.dtype == kFloat8_e4m3 && d.input_dtype() == kFloat8_e4m3) { + auto make_blocked_ptr = [&](const auto& ptrs) { + return std::shared_ptr{gemm::MakeBlockedPtrs(ptrs, stream), [](auto p) { cudaFree(p); }}; + }; + d.weight() = Tensor{make_blocked_ptr(weights), {n}, e0.weight().dtype(), kDEVICE}; + d.scales() = Tensor{make_blocked_ptr(scales), {n}, e0.scales().dtype(), kDEVICE}; + d.k_desc.offsets = d.q_desc.offsets = (int*)1; + } + else { + auto make_strided_ptr = [&](const auto& ptrs) { + return std::shared_ptr{gemm::MakeStridedPtrs(ptrs, stream), [](auto p) { cudaFree(p); }}; + }; + d.weight() = Tensor{make_strided_ptr(weights), {n}, d.weight_format.dtype, kDEVICE}; + if (e0.scales()) { + d.scales() = Tensor{make_strided_ptr(scales), {n}, e0.scales().dtype(), kDEVICE}; + } + d.k_desc.ld = d.q_desc.ld = 0; + } +} + struct Testbed_v3: Parameter { Testbed_v3(const Parameter& param): Parameter{param}, stream_{core::Context::stream().handle()}, linear_{} @@ -100,14 +148,14 @@ struct Testbed_v3: Parameter { cudaGetDeviceProperties(&prop_, 0); - w_original_ = std::make_unique(); - w_quant_ = std::make_unique(); - w_dequant_ = std::make_unique(); + w_original_ = std::make_unique(); + w_quant_ = std::make_unique(); + w_dequant_ = std::make_unique(); for (int i = 0; i < expert_num; ++i) { - e_original_.push_back(std::make_unique()); - e_quant_.push_back(std::make_unique()); - e_dequant_.push_back(std::make_unique()); + e_original_.push_back(std::make_unique()); + e_quant_.push_back(std::make_unique()); + e_dequant_.push_back(std::make_unique()); } GenerateWeight(); @@ -237,44 +285,57 @@ struct Testbed_v3: Parameter { // - quantize weight // - dequantize weight - void GenerateWeight(DenseWeight& original, DenseWeight& quant, DenseWeight& dequant) + void GenerateWeight(LinearWeight& original, LinearWeight& quant, LinearWeight& dequant) { - original.emplace(input_dim, output_dim, data_type, false, data_type, group_size); - rng_.NormalFloat(original.weight, 1., .1); - - quant.emplace(input_dim, output_dim, data_type, false, weight_type, group_size); - dequant.emplace(input_dim, output_dim, data_type, false, data_type, group_size); + auto make_cfg = [&](DataType wt) -> core::LinearConfig { + core::LinearConfig cfg; + cfg.input_dim = input_dim; + cfg.output_dim = output_dim; + cfg.data_type = data_type; + cfg.format = ResolveLinearWeightFormat(data_type, wt, group_size, 1); + cfg.has_bias = false; + return cfg; + }; + + new (&original) LinearWeight(make_cfg(data_type)); + original.param("weight").alloc({(size_t)input_dim, (size_t)output_dim}, data_type); + rng_.NormalFloat(original.weight(), 1., .1); + + new (&quant) LinearWeight(make_cfg(weight_type)); + quant.param("weight").alloc({(size_t)input_dim, (size_t)output_dim}, weight_type); + new (&dequant) LinearWeight(make_cfg(data_type)); + dequant.param("weight").alloc({(size_t)input_dim, (size_t)output_dim}, data_type); Buffer_ rbits; - // rbits = {original.weight.size(), kDEVICE}; + // rbits = {original.weight().size(), kDEVICE}; // rng_.RandomBytes(Tensor{rbits}); /// Weights are allocated in MN-major, but some quantization requires K-major tensor if (weight_type == data_type) { - Copy(original.weight, quant.weight); - Copy(original.weight, dequant.weight); + Copy(original.weight(), quant.weight()); + Copy(original.weight(), dequant.weight()); } else if (weight_type == kFloat8_e4m3) { - QuantizeSymmBlock(quant.weight, quant.scales, original.weight, stream_); - DequantizeSymmBlock(dequant.weight, quant.weight, quant.scales, stream_); + QuantizeSymmBlock(quant.weight(), quant.scales(), original.weight(), stream_); + DequantizeSymmBlock(dequant.weight(), quant.weight(), quant.scales(), stream_); } else if (weight_type == kUint4) { /// Weights are allocated in (M,N), quantization needs K-major tensor - QuantizeGroupwise(quant.weight.t(), - quant.scales.t(), - quant.zeros.t(), - dequant.weight.t(), - original.weight.t(), + QuantizeGroupwise(quant.weight().t(), + quant.scales().t(), + quant.zeros().t(), + dequant.weight().t(), + original.weight().t(), {}, group_size); } else if (weight_type == kFloat4_e2m1) { - QuantizeGroupwise(quant.weight.t(), // - quant.scales.t(), + QuantizeGroupwise(quant.weight().t(), // + quant.scales().t(), {}, - dequant.weight.t(), - original.weight.t(), + dequant.weight().t(), + original.weight().t(), rbits, group_size); } @@ -282,9 +343,9 @@ struct Testbed_v3: Parameter { TM_CHECK(0); } - original.prepare(0); - quant.prepare(expert_num > 0); - dequant.prepare(0); + original.prepare(); + quant.prepare(); + dequant.prepare(); } void GetReference() @@ -299,7 +360,7 @@ struct Testbed_v3: Parameter { } } - void GetReference(const Tensor& x, const unique_ptr& dense, Ref d_) + void GetReference(const Tensor& x, const unique_ptr& dense, Ref d_) { auto& d = d_.get(); if (!d) { @@ -311,7 +372,7 @@ struct Testbed_v3: Parameter { ref_.gemm(x.raw_data(), desc_A, dense->weight.raw_data(), dense->k_desc, d.raw_data(), desc_D); } - void GetReference(const Tensor& x, const vector>& experts, Ref d_) + void GetReference(const Tensor& x, const vector>& experts, Ref d_) { Tensor xe{{x.shape(0) * experts_per_token, input_dim}, data_type, kDEVICE}; Tensor de{{x.shape(0) * experts_per_token, output_dim}, data_type, kDEVICE}; @@ -376,7 +437,7 @@ struct Testbed_v3: Parameter { } } - void Run(const Tensor& x, const vector>& experts) {} + void Run(const Tensor& x, const vector>& experts) {} void Compare() { @@ -421,9 +482,9 @@ struct Testbed_v3: Parameter { Linear linear_; // ! weights are non-movable - unique_ptr w_original_; - unique_ptr w_quant_; - unique_ptr w_dequant_; + unique_ptr w_original_; + unique_ptr w_quant_; + unique_ptr w_dequant_; Tensor x_original_; Tensor x_quant_, x_scale_; @@ -433,9 +494,9 @@ struct Testbed_v3: Parameter { Tensor d_quant_; // x_original * w_quant, quant for X done by `Linear` Tensor d_dequant_; // x_dequant * w_dequant - vector> e_original_; - vector> e_quant_; - vector> e_dequant_; + vector> e_original_; + vector> e_quant_; + vector> e_dequant_; Buffer_ f2n_; Buffer_ en2f_; diff --git a/src/turbomind/kernels/quantization.cu b/src/turbomind/kernels/quantization.cu index 7899226f33..8dc07b85ed 100644 --- a/src/turbomind/kernels/quantization.cu +++ b/src/turbomind/kernels/quantization.cu @@ -66,9 +66,9 @@ void QuantizeSymm(Tensor& out, Tensor& scale, const Tensor& src, cudaStream_t st TM_CHECK_EQ(src.ndim(), 2); TM_CHECK_EQ(src.stride(1), 1); // row-major - const auto [num, dim] = src.shapes(0, 1); + const auto num = src.shape(0); + const auto dim = src.shape(1); - using T = bfloat16_t; using Tout = fp8_e4m3_t; using Tscale = float; @@ -99,15 +99,20 @@ void QuantizeSymm(Tensor& out, Tensor& scale, const Tensor& src, cudaStream_t st constexpr int block_dim = 512; - quant_symm_row<<>>(out.data(), // - out.stride(0), - scale.data(), - scale.stride(0), - src.data(), - src.stride(0), - num, - dim, - 448.f); + auto invoke = [&](auto t) { + using T = decltype(t); + quant_symm_row<<>>(out.data(), // + out.stride(0), + scale.data(), + scale.stride(0), + src.data(), + src.stride(0), + num, + dim, + 448.f); + }; + + TM_DISPATCH_PRIMARY_DTYPES(src.dtype(), invoke); } template diff --git a/src/turbomind/models/CMakeLists.txt b/src/turbomind/models/CMakeLists.txt index 10b8fe9d00..58a5700f98 100644 --- a/src/turbomind/models/CMakeLists.txt +++ b/src/turbomind/models/CMakeLists.txt @@ -4,13 +4,19 @@ add_library(models STATIC language_model.cc input_processor.cc output_processor.cc + linear_weight.cc + norm_weight.cc + attention_weight.cc + ffn_weight.cc + moe_weight.cc + delta_net_weight.cc + decoder_layer_weight.cc + model_weight.cc + model_root.cc llama/LlamaLinear.cu llama/BlockManager.cc llama/BlockTrie.cc llama/SequenceManager.cc - llama/LlamaWeight.cc - llama/LlamaDenseWeight.cc - llama/LlamaDecoderLayerWeight.cc llama/LlamaFfnLayer.cc llama/moe_ffn_layer.cc llama/unified_decoder.cc @@ -18,7 +24,6 @@ add_library(models STATIC llama/llama_kernels.cu llama/llama_utils.cu llama/mla_utils.cu - llama/GatedDeltaNetWeight.cc llama/GatedDeltaNetLayer.cc llama/gated_delta_net_kernels.cu) set_property(TARGET models PROPERTY POSITION_INDEPENDENT_CODE ON) diff --git a/src/turbomind/models/attention_weight.cc b/src/turbomind/models/attention_weight.cc new file mode 100644 index 0000000000..f3cb16ec99 --- /dev/null +++ b/src/turbomind/models/attention_weight.cc @@ -0,0 +1,94 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "src/turbomind/models/attention_weight.h" + +#include "src/turbomind/core/registry.h" +#include "src/turbomind/kernels/core/math.h" +#include "src/turbomind/models/llama/llama_rope.h" + +namespace turbomind { + +AttentionWeight::AttentionWeight(const core::AttentionConfig& cfg): + hidden_dim(cfg.hidden_dim), + head_dim(cfg.head_dim), + head_num(cfg.head_num), + kv_head_num(cfg.kv_head_num), + kv_lora_rank(cfg.kv_lora_rank), + q_lora_rank(cfg.q_lora_rank), + qk_rope_dim(cfg.qk_rope_dim), + v_head_dim(cfg.v_head_dim), + tp_size(cfg.tp_size), + tp_rank(cfg.tp_rank), + data_type(cfg.data_type), + window_size(cfg.window_size), + output_gate(cfg.output_gate), + softmax_scale(cfg.softmax_scale), + use_logn_attn(cfg.use_logn_attn), + rope(cfg.rope) +{ +} + +void AttentionWeight::prepare() +{ + Module::prepare(); +} + +void init_rope_kernel_param(const core::RopeConfig& rope, RopeKernelParam& rope_kernel) +{ + auto rope_type = static_cast(rope.type); + + rope_kernel.type = rope_type; + rope_kernel.dim = rope.dim; + rope_kernel.scale_factor = -std::log2(rope.base) / rope.dim; + if (rope_type == RopeType::kDynamic) { + rope_kernel.inv_factor = 1.f; + } + else { + rope_kernel.inv_factor = (rope.factor != 0.f) ? 1.0 / rope.factor : 1.f; + } + + if (rope_type == RopeType::kYarn) { + auto& dst = rope_kernel.yarn; + const double PI = 3.14159265358979323846; + + auto find_correction_dim = [&](float num_rotations) { + return (rope.dim * std::log(rope.max_position_embeddings / (num_rotations * 2 * PI))) + / (2 * std::log(rope.base)); + }; + + auto find_correction_range = [&](float low_rot, float high_rot, float& low, float& high) { + low = std::floor(find_correction_dim(low_rot)); + high = std::ceil(find_correction_dim(high_rot)); + low = std::max(low, 0.f); + high = std::min(high, rope.dim - 1.f); + }; + + float low, high; + find_correction_range(rope.yarn_beta_fast, rope.yarn_beta_slow, low, high); + if (low == high) { + high += 0.001f; + } + dst.ramp_inv_factor_div_2 = 1.0 / (high - low) / 2.0; + dst.ramp_inv_factor_mul_min = 1.0 / (high - low) * low; + dst.attention_factor = rope.yarn_attention_factor; + } + else if (rope_type == RopeType::kLlama3) { + auto& dst = rope_kernel.llama3; + + float inv_diff_freq_factor = 1.0 / (rope.llama3_high_freq_factor - rope.llama3_low_freq_factor); + dst.alpha = rope.llama3_original_max_position_embeddings / (2 * 3.14159265358979323846) * inv_diff_freq_factor; + dst.beta = rope.llama3_low_freq_factor * inv_diff_freq_factor; + } + else if (rope_type == RopeType::kMrope) { + auto& dst = rope_kernel.mrope; + dst.section.x = rope.mrope_section[0] * 2; + dst.section.y = rope.mrope_section[1] * 2 + dst.section.x; + dst.section.z = rope.mrope_section[2] * 2 + dst.section.y; + } +} + +TM_MODULE_REGISTER(AttentionWeight, core::AttentionConfig); + +TM_MODULE_METHODS(AttentionWeight, ATTENTION_WEIGHT_CHILDREN, ATTENTION_WEIGHT_PARAMS) + +} // namespace turbomind diff --git a/src/turbomind/models/attention_weight.h b/src/turbomind/models/attention_weight.h new file mode 100644 index 0000000000..878c528dba --- /dev/null +++ b/src/turbomind/models/attention_weight.h @@ -0,0 +1,126 @@ +// Copyright (c) OpenMMLab. All rights reserved. +#pragma once + +#include + +#include "src/turbomind/core/core.h" +#include "src/turbomind/core/module.h" +#include "src/turbomind/models/linear_weight.h" +#include "src/turbomind/models/norm_weight.h" + +namespace turbomind::core { + +using MropeSection = std::array; + +struct RopeConfig { +#define ROPE_FIELDS(X) \ + X(int, type, 0) \ + X(float, base, 10000.f) \ + X(int, dim, 0) \ + X(float, factor, 1.f) \ + X(int, max_position_embeddings, 0) \ + X(float, yarn_attention_factor, 1.f) \ + X(float, yarn_beta_fast, 32.f) \ + X(float, yarn_beta_slow, 1.f) \ + X(float, llama3_low_freq_factor, 1.f) \ + X(float, llama3_high_freq_factor, 4.f) \ + X(int, llama3_original_max_position_embeddings, 0) \ + X(MropeSection, mrope_section, {}) + + ROPE_FIELDS(TM_MEMBER) + TM_FOR_EACH(RopeConfig, ROPE_FIELDS) + +#undef ROPE_FIELDS +}; + +struct AttentionConfig: ModuleConfig { + AttentionConfig(): ModuleConfig{"AttentionWeight"} {} + +#define ATTENTION_FIELDS(X) \ + X(int, hidden_dim) \ + X(int, head_dim) \ + X(int, head_num) \ + X(int, kv_head_num) \ + X(int, kv_lora_rank) \ + X(int, q_lora_rank) \ + X(int, qk_rope_dim) \ + X(int, v_head_dim) \ + X(int, tp_size) \ + X(int, tp_rank) \ + X(DataType, data_type) \ + X(int, window_size, 0) \ + X(bool, output_gate, 0) \ + X(RopeConfig, rope, {}) \ + X(int, qk_nope_dim) \ + X(float, softmax_scale, 0.f) \ + X(bool, use_logn_attn, false) + + ATTENTION_FIELDS(TM_MEMBER) + TM_FOR_EACH(AttentionConfig, ATTENTION_FIELDS) + +#undef ATTENTION_FIELDS +}; + +} // namespace turbomind::core + +namespace turbomind { + +struct RopeKernelParam; +void init_rope_kernel_param(const core::RopeConfig& rope, RopeKernelParam& rope_kernel); + +class AttentionWeight: public core::Module { +public: + const char* type() const override + { + return "AttentionWeight"; + } + + AttentionWeight() = default; + + AttentionWeight(const core::AttentionConfig& cfg); + + void prepare() override; + + // --- X-macro field lists --- +#define ATTENTION_WEIGHT_CHILDREN(X) \ + X(LinearWeight, w_qkv) \ + X(LinearWeight, wo) \ + X(LinearWeight, q_proj) \ + X(LinearWeight, q_a_proj) \ + X(LinearWeight, q_b_proj) \ + X(LinearWeight, kv_a_proj) \ + X(NormWeight, q_norm) \ + X(NormWeight, k_norm) \ + X(NormWeight, q_a_layernorm) \ + X(NormWeight, kv_a_layernorm) + +#define ATTENTION_WEIGHT_PARAMS(X) X(sinks) + + TM_MODULE_DECLARE(AttentionWeight, ATTENTION_WEIGHT_CHILDREN, ATTENTION_WEIGHT_PARAMS) + + bool is_mla() const + { + return kv_lora_rank > 0; + } + + // --- Config fields (public for runtime access) --- + int hidden_dim{}; + int head_dim{}; + int head_num{}; + int kv_head_num{}; + int kv_lora_rank{}; + int q_lora_rank{}; + int qk_rope_dim{}; + int v_head_dim{}; + int tp_size{}; + int tp_rank{}; + DataType data_type{}; + int window_size{}; + bool output_gate{}; + float softmax_scale{}; + bool use_logn_attn{}; + + core::RopeConfig rope{}; +}; + +} // namespace turbomind diff --git a/src/turbomind/models/decoder_layer_weight.cc b/src/turbomind/models/decoder_layer_weight.cc new file mode 100644 index 0000000000..35c5b14ba9 --- /dev/null +++ b/src/turbomind/models/decoder_layer_weight.cc @@ -0,0 +1,40 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "src/turbomind/models/decoder_layer_weight.h" +#include "src/turbomind/models/attention_weight.h" +#include "src/turbomind/models/delta_net_weight.h" +#include "src/turbomind/models/ffn_weight.h" +#include "src/turbomind/models/moe_weight.h" +#include "src/turbomind/models/norm_weight.h" + +#include "src/turbomind/core/registry.h" + +namespace turbomind { + +DecoderLayerWeight::DecoderLayerWeight(const core::ModuleConfig&) {} + +DecoderLayerWeight::~DecoderLayerWeight() = default; + +bool DecoderLayerWeight::verify(std::vector& missing) +{ + Module::verify(missing); + // At least one of attention or linear_attn must exist + if (!attention && !linear_attn) { + missing.push_back(full_path() + ": missing attention or linear_attn"); + } + // At least one of feed_forward or moe_ffn must exist + if (!feed_forward && !moe_ffn) { + missing.push_back(full_path() + ": missing feed_forward or moe_ffn"); + } + // attention_norm must exist + if (!attention_norm) { + missing.push_back(full_path() + ": missing attention_norm"); + } + return missing.empty(); +} + +TM_MODULE_REGISTER(DecoderLayerWeight, core::ModuleConfig); + +TM_MODULE_METHODS(DecoderLayerWeight, DECODER_LAYER_WEIGHT_CHILDREN, DECODER_LAYER_WEIGHT_PARAMS) + +} // namespace turbomind diff --git a/src/turbomind/models/decoder_layer_weight.h b/src/turbomind/models/decoder_layer_weight.h new file mode 100644 index 0000000000..1e7b089863 --- /dev/null +++ b/src/turbomind/models/decoder_layer_weight.h @@ -0,0 +1,55 @@ +// Copyright (c) OpenMMLab. All rights reserved. +#pragma once + +#include "src/turbomind/core/module.h" + +namespace turbomind::core { + +struct DecoderLayerConfig: ModuleConfig { + DecoderLayerConfig(): ModuleConfig{"DecoderLayerWeight"} {} + template + static void for_each(Visitor&&) + { + } +}; + +} // namespace turbomind::core + +namespace turbomind { + +class AttentionWeight; +class DeltaNetWeight; +class FfnWeight; +class MoeWeight; +class NormWeight; + +/// Architecture-independent decoder layer weight composite. +class DecoderLayerWeight: public core::Module { +public: + const char* type() const override + { + return "DecoderLayerWeight"; + } + + DecoderLayerWeight() = default; + DecoderLayerWeight(const core::ModuleConfig&); + + ~DecoderLayerWeight() override; // defined in .cc where child types are complete + + bool verify(std::vector& missing) override; + + // --- X-macro field lists --- +#define DECODER_LAYER_WEIGHT_CHILDREN(X) \ + X(AttentionWeight, attention) \ + X(DeltaNetWeight, linear_attn) \ + X(FfnWeight, feed_forward) \ + X(MoeWeight, moe_ffn) \ + X(NormWeight, attention_norm) \ + X(NormWeight, ffn_norm) + +#define DECODER_LAYER_WEIGHT_PARAMS(X) + + TM_MODULE_DECLARE(DecoderLayerWeight, DECODER_LAYER_WEIGHT_CHILDREN, DECODER_LAYER_WEIGHT_PARAMS) +}; + +} // namespace turbomind diff --git a/src/turbomind/models/delta_net_weight.cc b/src/turbomind/models/delta_net_weight.cc new file mode 100644 index 0000000000..d365fd2a7d --- /dev/null +++ b/src/turbomind/models/delta_net_weight.cc @@ -0,0 +1,36 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "src/turbomind/models/delta_net_weight.h" + +#include "src/turbomind/core/registry.h" +#include "src/turbomind/utils/memory_utils.h" + +namespace turbomind { + +DeltaNetWeight::DeltaNetWeight(const core::DeltaNetConfig& cfg): + hidden_dim(cfg.hidden_dim), + num_k_heads(cfg.num_k_heads), + num_v_heads(cfg.num_v_heads), + key_head_dim(cfg.key_head_dim), + value_head_dim(cfg.value_head_dim), + d_conv(cfg.d_conv), + data_type(cfg.data_type), + tp_size(cfg.tp_size), + tp_rank(cfg.tp_rank) +{ +} + +void DeltaNetWeight::prepare() +{ + Module::prepare(); + + EnsureFloatDtype(A_log, data_type); + EnsureFloatDtype(dt_bias, data_type); + EnsureFloatDtype(conv1d, data_type); +} + +TM_MODULE_REGISTER(DeltaNetWeight, core::DeltaNetConfig); + +TM_MODULE_METHODS(DeltaNetWeight, DELTA_NET_WEIGHT_CHILDREN, DELTA_NET_WEIGHT_PARAMS) + +} // namespace turbomind diff --git a/src/turbomind/models/delta_net_weight.h b/src/turbomind/models/delta_net_weight.h new file mode 100644 index 0000000000..92fa9bbe72 --- /dev/null +++ b/src/turbomind/models/delta_net_weight.h @@ -0,0 +1,74 @@ +// Copyright (c) OpenMMLab. All rights reserved. +#pragma once + +#include "src/turbomind/core/core.h" +#include "src/turbomind/core/module.h" +#include "src/turbomind/models/linear_weight.h" +#include "src/turbomind/models/norm_weight.h" + +namespace turbomind::core { + +struct DeltaNetConfig: ModuleConfig { + DeltaNetConfig(): ModuleConfig{"DeltaNetWeight"} {} + +#define DELTANET_FIELDS(X) \ + X(int, hidden_dim) \ + X(int, num_k_heads) \ + X(int, num_v_heads) \ + X(int, key_head_dim) \ + X(int, value_head_dim) \ + X(int, d_conv, 4) \ + X(DataType, data_type) \ + X(int, tp_size) \ + X(int, tp_rank) + + DELTANET_FIELDS(TM_MEMBER) + TM_FOR_EACH(DeltaNetConfig, DELTANET_FIELDS) + +#undef DELTANET_FIELDS +}; + +} // namespace turbomind::core + +namespace turbomind { + +/// Weight module for Gated DeltaNet (linear attention) layers. +class DeltaNetWeight: public core::Module { +public: + const char* type() const override + { + return "DeltaNetWeight"; + } + + DeltaNetWeight() = default; + + DeltaNetWeight(const core::DeltaNetConfig& cfg); + + void prepare() override; + + // --- X-macro field lists --- +#define DELTA_NET_WEIGHT_CHILDREN(X) \ + X(LinearWeight, in_proj_all) \ + X(LinearWeight, out_proj) \ + X(NormWeight, norm) + +#define DELTA_NET_WEIGHT_PARAMS(X) \ + X(conv1d) \ + X(A_log) \ + X(dt_bias) + + TM_MODULE_DECLARE(DeltaNetWeight, DELTA_NET_WEIGHT_CHILDREN, DELTA_NET_WEIGHT_PARAMS) + + // --- Config fields (public for runtime access) --- + int hidden_dim{}; + int num_k_heads{}; + int num_v_heads{}; + int key_head_dim{}; + int value_head_dim{}; + int d_conv{}; + DataType data_type{}; + int tp_size{}; + int tp_rank{}; +}; + +} // namespace turbomind diff --git a/src/turbomind/models/ffn_weight.cc b/src/turbomind/models/ffn_weight.cc new file mode 100644 index 0000000000..9ecb66923d --- /dev/null +++ b/src/turbomind/models/ffn_weight.cc @@ -0,0 +1,48 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "src/turbomind/models/ffn_weight.h" + +#include "src/turbomind/core/registry.h" +#include "src/turbomind/kernels/gemm/types.h" + +namespace turbomind { + +FfnWeight::FfnWeight(const core::FfnConfig& cfg): + hidden_dim{cfg.hidden_dim}, + inter_size{cfg.inter_size / cfg.tp_size}, + act_type{static_cast(cfg.act_type)}, + is_fused_silu{cfg.fuse_silu && act_type == ActivationType::kSilu}, + is_expert_{cfg.is_expert}, + data_type_{cfg.data_type}, + tp_size{cfg.tp_size}, + tp_rank{cfg.tp_rank} +{ +} + +void FfnWeight::prepare() +{ + // Set epilogue on existing w1w3 child if fused silu is active. + if (w1w3) { + auto* fused = static_cast(w1w3.get()); + if (is_fused_silu) { + fused->epilogue = gemm::Epilogue::kGatedSilu; + } + } + + // Propagate grouped-GEMM flag for MoE expert weights + if (is_expert_) { + for_each_child([](const char*, Module* m) { + if (auto* linear = dynamic_cast(m)) { + linear->set_grouped(true); + } + }); + } + + Module::prepare(); // recurse into children +} + +TM_MODULE_REGISTER(FfnWeight, core::FfnConfig); + +TM_MODULE_METHODS(FfnWeight, FFN_WEIGHT_CHILDREN, FFN_WEIGHT_PARAMS) + +} // namespace turbomind diff --git a/src/turbomind/models/ffn_weight.h b/src/turbomind/models/ffn_weight.h new file mode 100644 index 0000000000..604d91bd35 --- /dev/null +++ b/src/turbomind/models/ffn_weight.h @@ -0,0 +1,70 @@ +// Copyright (c) OpenMMLab. All rights reserved. +#pragma once + +#include "src/turbomind/core/core.h" +#include "src/turbomind/core/module.h" +#include "src/turbomind/kernels/activation.h" +#include "src/turbomind/models/linear_weight.h" + +namespace turbomind::core { + +struct FfnConfig: ModuleConfig { + FfnConfig(): ModuleConfig{"FfnWeight"} {} + +#define FFN_FIELDS(X) \ + X(int, hidden_dim) \ + X(int, inter_size) \ + X(int, act_type) \ + X(bool, fuse_silu) \ + X(bool, is_expert) \ + X(DataType, data_type) \ + X(int, tp_size) \ + X(int, tp_rank) + + FFN_FIELDS(TM_MEMBER) + TM_FOR_EACH(FfnConfig, FFN_FIELDS) + +#undef FFN_FIELDS +}; + +} // namespace turbomind::core + +namespace turbomind { + +class FfnWeight: public core::Module { +public: + const char* type() const override + { + return "FfnWeight"; + } + + FfnWeight() = default; + + FfnWeight(const core::FfnConfig& cfg); + + void prepare() override; + + // --- X-macro child members --- +#define FFN_WEIGHT_CHILDREN(X) \ + X(LinearWeight, w1) \ + X(LinearWeight, w3) \ + X(LinearWeight, w2) \ + X(LinearWeight, w1w3) + +#define FFN_WEIGHT_PARAMS(X) + + TM_MODULE_DECLARE(FfnWeight, FFN_WEIGHT_CHILDREN, FFN_WEIGHT_PARAMS) + + int hidden_dim{}; + int inter_size{}; + ActivationType act_type{}; + bool is_fused_silu{}; + int tp_size{}; + int tp_rank{}; + +private: + bool is_expert_{}; + DataType data_type_{}; +}; + +} // namespace turbomind diff --git a/src/turbomind/models/input_processor.cc b/src/turbomind/models/input_processor.cc index 43f70578e7..a1148a6c2e 100644 --- a/src/turbomind/models/input_processor.cc +++ b/src/turbomind/models/input_processor.cc @@ -14,7 +14,7 @@ using std::vector; struct InputProcessor::Impl { public: - Impl(const EngineParam& engine, const ModelParam& model, int phases): + Impl(const EngineParam& engine, int hidden_units, DataType data_type, int phases): max_batch_size_{engine.max_batch_size}, max_forward_token_num_{engine.max_forward_token_num} { input_ids_buf_ = {max_forward_token_num_, kCPUpinned}; @@ -31,7 +31,7 @@ struct InputProcessor::Impl { d.autoreg_ids_pos = {max_batch_size_, kCPU}; // ! CPU buffer /// TODO: initialize only when required - d.input_embeds_buf = {{max_forward_token_num_, (int)model.hidden_units}, model.data_type, kCPUpinned}; + d.input_embeds_buf = {{max_forward_token_num_, hidden_units}, data_type, kCPUpinned}; } } @@ -241,8 +241,8 @@ struct InputProcessor::Impl { InputProcessor::~InputProcessor() = default; -InputProcessor::InputProcessor(const EngineParam& engine, const ModelParam& model, int phases): - impl_{std::make_unique(engine, model, phases)} +InputProcessor::InputProcessor(const EngineParam& engine, int hidden_units, DataType data_type, int phases): + impl_{std::make_unique(engine, hidden_units, data_type, phases)} { } diff --git a/src/turbomind/models/input_processor.h b/src/turbomind/models/input_processor.h index be7502a9ef..52050543ac 100644 --- a/src/turbomind/models/input_processor.h +++ b/src/turbomind/models/input_processor.h @@ -9,7 +9,7 @@ class InputProcessor { public: ~InputProcessor(); - InputProcessor(const EngineParam& engine, const ModelParam& model, int phases); + InputProcessor(const EngineParam& engine, int hidden_units, DataType data_type, int phases); void Run(BatchOp op, int phase, TensorMap& env); diff --git a/src/turbomind/models/language_model.cc b/src/turbomind/models/language_model.cc index dd2ff5756c..1b5a966c34 100644 --- a/src/turbomind/models/language_model.cc +++ b/src/turbomind/models/language_model.cc @@ -15,11 +15,11 @@ #include "src/turbomind/generation/generation.h" #include "src/turbomind/kernels/gpt_kernels.h" #include "src/turbomind/models/input_processor.h" -#include "src/turbomind/models/llama/LlamaWeight.h" #include "src/turbomind/models/llama/llama_kernels.h" #include "src/turbomind/models/llama/llama_params.h" #include "src/turbomind/models/llama/llama_utils.h" #include "src/turbomind/models/llama/unified_decoder.h" +#include "src/turbomind/models/model_weight.h" #include "src/turbomind/models/output_processor.h" #include "src/turbomind/utils/anomaly_handler.h" #include "src/turbomind/utils/cuda_utils.h" @@ -33,11 +33,8 @@ using std::unique_ptr; using std::shared_ptr; struct LanguageModel::Impl { - const DataType dtype_; - const ModelParam param_; - const AttentionParam attn_param_; const Communicators& comm_; - const LlamaWeight& weights_; + const ModelWeight& weights_; LlamaLinear& linear_; const int tp_size_; @@ -102,14 +99,7 @@ struct LanguageModel::Impl { } } - Impl(DataType dtype, - const ModelParam& model, - const EngineParam& engine, - const AttentionParam& attn, - const MoeParam& moe, - const Context& ctx, - const LlamaWeight& weights, - int phases); + Impl(const EngineParam& engine, const Context& ctx, const ModelWeight& weights, int phases); Tensor LookupEmbedding(const Buffer_& input_ids, Buffer symm_buf); Tensor PostEmbedding(const Tensor& features, Buffer symm_buf); @@ -121,17 +111,7 @@ struct LanguageModel::Impl { void Fetch(int phase, TensorMap& env); }; -LanguageModel::Impl::Impl(DataType dtype, - const ModelParam& model, - const EngineParam& engine, - const AttentionParam& attn, - const MoeParam& moe, - const Context& ctx, - const LlamaWeight& weights, - int phases): - dtype_{dtype}, - param_{model}, - attn_param_{attn}, +LanguageModel::Impl::Impl(const EngineParam& engine, const Context& ctx, const ModelWeight& weights, int phases): comm_{ctx.comm}, weights_{weights}, linear_{*ctx.linear}, @@ -161,19 +141,15 @@ LanguageModel::Impl::Impl(DataType dtype, d.generating = {engine.max_batch_size, kCPU}; } - input_processor_.emplace(engine, param_, phases); + input_processor_.emplace(engine, weights_.hidden_units, weights_.data_type, phases); - unified_decoder_ = std::make_unique(model, engine, attn, moe, ctx, phases); + unified_decoder_ = std::make_unique(engine, ctx, phases, weights_); - generation_ = std::make_unique(kFloat32, - engine.max_batch_size, - engine.session_len, - model.vocab_size, - weights.post_decoder_embedding.output_dim * tp_size_, - comm_.h_tp_group, - phases); + const int vocab_size = weights_.output->output_dim * tp_size_; + + generation_ = std::make_unique( + kFloat32, engine.max_batch_size, engine.session_len, weights_.vocab_size, vocab_size, comm_.h_tp_group, phases); - const int vocab_size = weights_.post_decoder_embedding.output_dim * tp_size_; const ssize_t max_fwd_tokens = engine.max_forward_token_num; if (ctx.comm.d_comm) { @@ -182,18 +158,19 @@ LanguageModel::Impl::Impl(DataType dtype, TM_CHECK(engine.max_forward_token_num % tp_size_ == 0); ssize_t bytes{}; - bytes = std::max(bytes, byte_size(dtype_, max_fwd_tokens * engine.attn_dp_size * model.hidden_units)); - bytes = std::max(bytes, byte_size(dtype_, engine.max_batch_size * vocab_size)); + bytes = std::max(bytes, + byte_size(weights_.data_type, max_fwd_tokens * engine.attn_dp_size * weights_.hidden_units)); + bytes = std::max(bytes, byte_size(weights_.data_type, engine.max_batch_size * vocab_size)); symm_buf_ = {bytes, symm_alloc}; // Compute max logits length based on symm buffer size - max_logits_len_ = symm_buf_.view(dtype_).size() / vocab_size; + max_logits_len_ = symm_buf_.view(weights_.data_type).size() / vocab_size; } else { - max_logits_len_ = std::max(max_fwd_tokens * model.hidden_units / vocab_size, engine.max_batch_size); + max_logits_len_ = std::max(max_fwd_tokens * weights_.hidden_units / vocab_size, engine.max_batch_size); } - output_processor_.emplace(param_, max_logits_len_, tp_rank_, phases, [this](const Tensor& hstate) { + output_processor_.emplace(weights_.vocab_size, max_logits_len_, tp_rank_, phases, [this](const Tensor& hstate) { return PostEmbedding(hstate, symm_buf_); }); } @@ -202,14 +179,14 @@ Tensor LanguageModel::Impl::LookupEmbedding(const Buffer_& input_ids, Buffe { const auto st = core::Context::stream().handle(); - const int hidden_units = param_.hidden_units; + const int hidden_units = weights_.hidden_units; - const auto& embedding_table = weights_.pre_decoder_embedding.weight; + const auto& embedding_table = weights_.tok_embeddings; TM_CHECK_EQ(embedding_table.shape(1) * tp_size_, hidden_units); const int token_num = input_ids.size(); - Tensor input_embeds{{token_num, hidden_units}, dtype_, kDEVICE}; + Tensor input_embeds{{token_num, hidden_units}, weights_.data_type, kDEVICE}; if (token_num == 0) { return input_embeds; @@ -222,7 +199,7 @@ Tensor LanguageModel::Impl::LookupEmbedding(const Buffer_& input_ids, Buffe else if (use_ag2d_) { const auto local_hidden_units = embedding_table.shape(1); - Tensor temp{symm_buf.view(dtype_), {token_num, tp_size_, local_hidden_units}}; + Tensor temp{symm_buf.view(weights_.data_type), {token_num, tp_size_, local_hidden_units}}; Tensor local{temp.slice({0, tp_rank_, 0}, {-1, 1, -1}).squeeze(1)}; invokeEmbeddingLookup(local, input_ids, embedding_table, st); @@ -245,13 +222,14 @@ Tensor LanguageModel::Impl::LookupEmbedding(const Buffer_& input_ids, Buffe else { const auto local_hidden_units = embedding_table.shape(1); - Tensor temp{symm_buf.view(dtype_), {tp_size_, token_num, local_hidden_units}}; + Tensor temp{symm_buf.view(weights_.data_type), {tp_size_, token_num, local_hidden_units}}; Tensor local{temp.slice(tp_rank_).squeeze(0)}; invokeEmbeddingLookup(local, input_ids, embedding_table, st); sync_check_cuda_error(); - comm_.d_comm->AllGather(local.raw_data(), temp.raw_data(), local.size(), dtype_, comm_.d_tp_group, st); + comm_.d_comm->AllGather( + local.raw_data(), temp.raw_data(), local.size(), weights_.data_type, comm_.d_tp_group, st); sync_check_cuda_error(); invokeInPlaceTranspose102((uint16_t*)input_embeds.raw_data(), @@ -274,24 +252,24 @@ Tensor LanguageModel::Impl::PostEmbedding(const Tensor& features, Buffer symm_bu const auto st = core::Context::stream().handle(); const int bsz = features.shape(0); - const int local_vocab_size = weights_.post_decoder_embedding.output_dim; + const int local_vocab_size = weights_.output->output_dim; const int vocab_size = local_vocab_size * tp_size_; if (bsz == 0) { - return Tensor{{0, vocab_size}, dtype_, kDEVICE}; + return Tensor{{0, vocab_size}, weights_.data_type, kDEVICE}; } if (tp_size_ == 1) { - Tensor logits{{bsz, vocab_size}, dtype_, kDEVICE}; - linear_.Forward(features, weights_.post_decoder_embedding, logits); + Tensor logits{{bsz, vocab_size}, weights_.data_type, kDEVICE}; + linear_.Forward(features, *weights_.output, logits); sync_check_cuda_error(); TM_DEBUG_TENSOR(logits, "logits", 1); return logits; } else if (use_ag2d_) { - Tensor logits{symm_buf.view(dtype_), {bsz, tp_size_, local_vocab_size}}; + Tensor logits{symm_buf.view(weights_.data_type), {bsz, tp_size_, local_vocab_size}}; Tensor local = logits.slice({0, tp_rank_, 0}, {-1, 1, -1}); - linear_.Forward(features, weights_.post_decoder_embedding, local.squeeze(1)); + linear_.Forward(features, *weights_.output, local.squeeze(1)); sync_check_cuda_error(); comm_.d_comm->AllGather2D(local.raw_data(), logits.raw_data(), @@ -307,9 +285,9 @@ Tensor LanguageModel::Impl::PostEmbedding(const Tensor& features, Buffer symm_bu return logits.view({bsz, -1}); } else { - Tensor logits{symm_buf.view(dtype_), {tp_size_, bsz, local_vocab_size}}; + Tensor logits{symm_buf.view(weights_.data_type), {tp_size_, bsz, local_vocab_size}}; Tensor local = logits.slice({tp_rank_, 0, 0}, {1, -1, -1}); - linear_.Forward(features, weights_.post_decoder_embedding, local.squeeze(0)); + linear_.Forward(features, *weights_.output, local.squeeze(0)); sync_check_cuda_error(); comm_.d_comm->AllGather(local.raw_data(), logits.raw_data(), local.size(), local.dtype(), comm_.d_tp_group, st); sync_check_cuda_error(); @@ -439,9 +417,9 @@ void LanguageModel::Impl::Forward(int phase, TensorMap& env) env.produce("symm_buf", symm_buf_); } - env.produce("output_norm_weight", weights_.output_norm_weight); + env.produce("output_norm_weight", weights_.norm->weight); - unified_decoder_->Forward(phase, env, weights_.decoder_layer_weights); + unified_decoder_->Forward(phase, env, weights_.layers_list()); // env.at("batch").data()[0]->Notify(); @@ -491,16 +469,9 @@ LanguageModel::~LanguageModel() = default; LanguageModel::LanguageModel(LanguageModel&&) noexcept = default; -LanguageModel::LanguageModel(DataType dtype, - const ModelParam& model, - const EngineParam& engine, - const AttentionParam& attn, - const MoeParam& moe, - const Context& ctx, - const LlamaWeight& weights, - int phases) +LanguageModel::LanguageModel(const EngineParam& engine, const Context& ctx, const ModelWeight& weights, int phases) { - impl_ = std::make_unique(dtype, model, engine, attn, moe, ctx, weights, phases); + impl_ = std::make_unique(engine, ctx, weights, phases); } void LanguageModel::Run(BatchOp op, int phase, TensorMap& env) @@ -508,14 +479,4 @@ void LanguageModel::Run(BatchOp op, int phase, TensorMap& env) return TM_CHECK_NOTNULL(impl_)->Run(op, phase, env); } -const ModelParam& LanguageModel::model_param() const noexcept -{ - return TM_CHECK_NOTNULL(impl_)->param_; -} - -const AttentionParam& LanguageModel::attn_param() const noexcept -{ - return TM_CHECK_NOTNULL(impl_)->attn_param_; -} - } // namespace turbomind diff --git a/src/turbomind/models/language_model.h b/src/turbomind/models/language_model.h index a883f56d5c..5699a9e37f 100644 --- a/src/turbomind/models/language_model.h +++ b/src/turbomind/models/language_model.h @@ -9,7 +9,7 @@ namespace turbomind { -class LlamaWeight; +class ModelWeight; class LanguageModel { public: @@ -24,20 +24,10 @@ class LanguageModel { return static_cast(impl_); } - LanguageModel(DataType dtype, - const ModelParam& model, - const EngineParam& engine, - const AttentionParam& attn, - const MoeParam& moe, - const Context& ctx, - const LlamaWeight& weights, - int phases); + LanguageModel(const EngineParam& engine, const Context& ctx, const ModelWeight& weights, int phases); void Run(BatchOp op, int phase, TensorMap& env); - const ModelParam& model_param() const noexcept; - const AttentionParam& attn_param() const noexcept; - private: struct Impl; std::unique_ptr impl_; diff --git a/src/turbomind/models/linear_weight.cc b/src/turbomind/models/linear_weight.cc new file mode 100644 index 0000000000..be9ffbd3fe --- /dev/null +++ b/src/turbomind/models/linear_weight.cc @@ -0,0 +1,280 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "src/turbomind/models/linear_weight.h" + +#include "src/turbomind/core/allocator.h" +#include "src/turbomind/core/data_type.h" +#include "src/turbomind/core/registry.h" +#include "src/turbomind/kernels/gemm/cast.h" +#include "src/turbomind/kernels/gemm/convert.h" +#include "src/turbomind/kernels/gemm/types.h" +#include "src/turbomind/kernels/gemm/utils.h" +#include "src/turbomind/kernels/gpt_kernels.h" +#include "src/turbomind/utils/cuda_utils.h" +#include "src/turbomind/utils/memory_utils.h" + +namespace turbomind { + +LinearWeight::LinearWeight(const core::LinearConfig& cfg): + input_dim(cfg.input_dim), + output_dim(cfg.output_dim), + data_type(cfg.data_type), + weight_format(cfg.format), + has_bias_(cfg.has_bias) +{ + std::tie(input_format, output_format) = DeriveActivationFormats(weight_format, data_type, getSMVersion()); +} + +std::pair DeriveActivationFormats(const DataFormat& weight_format, DataType data_type, int sm) +{ + DataFormat in_fmt; + DataFormat out_fmt; + in_fmt.dtype = data_type; + in_fmt.block_sizes = {1, 1}; + out_fmt.dtype = data_type; + out_fmt.block_sizes = {1, 1}; + + // Empty weight_format (from LinearBuilder.set_weight path for embeddings / + // lm_head): treat as trivial. No quantization on I/O. + if (weight_format.dtype == DataType{}) { + return {in_fmt, out_fmt}; + } + + if (!weight_format.is_quantized()) { + return {in_fmt, out_fmt}; + } + + if (weight_format.dtype == kFloat8_e4m3) { + if (sm == 90) { + int gs = weight_format.block_sizes[0]; // K-axis, tensor-shape order + in_fmt.dtype = kFloat8_e4m3; + in_fmt.block_sizes = {gs, 1}; + in_fmt.scales.dtype = kFloat; + } + return {in_fmt, out_fmt}; + } + + // FP4 / U4 / U8: input stays in model activation dtype — the GEMM + // upcasts / dequants on the fly. output_format is also activation dtype. + return {in_fmt, out_fmt}; +} + +gemm::QuantDesc MakeQuantDesc(const DataFormat& fmt) +{ + if (!fmt.is_quantized()) { + return {gemm::QuantType::kNone, 0}; + } + int gs = (fmt.block_sizes.size() > 0) ? fmt.block_sizes[0] : 1; + + if (fmt.dtype == kFloat8_e4m3) { + // Weight format has bidirectional blocking {128, 128} → B-type. + // Activation format has K-axis-only blocking {gs, 1} → K-type. + if (fmt.block_sizes.size() > 1 && fmt.block_sizes[1] > 1) { + return {gemm::QuantType::kB, gs}; + } + return {gemm::QuantType::kK, gs}; + } + // FP4 / U4 / U8: K-grouped quantization + return {gemm::QuantType::kK, gs}; +} + +void LinearWeight::copy_metadata_to(LinearWeight& dst) const +{ + dst.input_dim = input_dim; + dst.output_dim = output_dim; + dst.data_type = data_type; + dst.weight_format = weight_format; + dst.input_format = input_format; + dst.output_format = output_format; + dst.epilogue = epilogue; + dst.has_bias_ = has_bias_; + dst.is_grouped_ = is_grouped_; + dst.k_desc = k_desc; + dst.q_desc = q_desc; +} + +// ====================================================================== +// prepare (weight format conversion) +// ====================================================================== + +void LinearWeight::prepare() +{ + if (!weight) { + return; + } + + // Set up GEMM descriptor (was previously in do_allocate) + k_desc.type = weight.dtype(); + k_desc.order = gemm::kRowMajor; + k_desc.rows = input_dim; + k_desc.cols = output_dim; + k_desc.ld = output_dim; + + // No format conversion needed if weight_spec was never set (trivial weights + // loaded via commit_tensor, e.g. tok_embeddings, output head). + if (weight_format.dtype == DataType{}) { + EnsureFloatDtype(weight, data_type); + if (weight.dtype() == data_type) { + k_desc.type = data_type; + } + return; + } + + auto stream = core::Context::stream().handle(); + + if (weight_format.dtype == kFloat8_e4m3 && input_dtype() == kFloat8_e4m3) { + // FP8 native path: transpose weight and scales for native kernels. + auto process = [&](Tensor& x, MatrixLayout& d, auto dtype) { + using T = decltype(dtype); + Tensor trans{{x.shape(1), x.shape(0)}, x.dtype(), kDEVICE}; + invokeTransposeAxis01((T*)trans.raw_data(), (T*)x.raw_data(), x.shape(0), x.shape(1), 1, stream); + x = std::move(trans); + d = MatrixLayout{x.dtype(), gemm::kColMajor, (int)x.shape(1), (int)x.shape(0), (int)x.stride(0)}; + }; + + TM_CHECK_EQ(weight.dtype(), kFloat8_e4m3); + process(weight, k_desc, uint8_t{}); + + // FP8 native path requires f32 scales; cast if loaded as bf16/fp16. + EnsureFloatDtype(scales, kFloat); + + TM_CHECK_EQ(scales.dtype(), kFloat); + process(scales, q_desc, float{}); + } + else if (weight_format.dtype == kFloat8_e4m3) { + // FP8 non-native path (non-SM90) + } + else { + // General quantization format conversion path. + using namespace gemm; + + auto [conv_w, conv_s] = + GetConverters(data_type, weight_format.dtype, input_dtype(), is_grouped_, getSMVersion()); + + if (conv_w) { + const auto order_w = conv_w->order; + const bool is_A = get_operand_tag(conv_w->pack) == OPERAND_A; + const bool is_B = !is_A; + + const int bits = byte_size(weight_format.dtype, 8); + + Tensor_ tmp{{input_dim, output_dim}, kDEVICE}; + + if (bits == 4) { + extend_to_u16(tmp.data(), (const uint4_t*)weight.raw_data(), tmp.size(), stream); + sync_check_cuda_error(); + } + else if (bits == 8) { + extend_to_u16(tmp.data(), (const uint8_t*)weight.raw_data(), tmp.size(), stream); + sync_check_cuda_error(); + } + else if (bits == 16) { + check_cuda_error( + cudaMemcpyAsync(tmp.raw_data(), weight.raw_data(), weight.byte_size(), cudaMemcpyDefault, stream)); + } + + if (order_w == kRowMajor) { + Tensor_ trans{{output_dim, input_dim}, kDEVICE}; + invokeTransposeAxis01(trans.data(), tmp.data(), input_dim, output_dim, 1, stream); + tmp = trans; + } + + MatrixLayout w_desc{ + data_type, + order_w, + (int)output_dim, + (int)input_dim, + order_w == kRowMajor ? (int)input_dim : (int)output_dim, + }; + + if (is_B) { + std::swap(w_desc.rows, w_desc.cols); + w_desc.order = ~w_desc.order; + } + + MatrixLayout kd = w_desc; + kd.type = weight_format.dtype; + if (bits == 4) { + kd.type = data_type_v; + } + else if (bits == 8) { + kd.type = data_type_v; + } + kd.pack = conv_w->pack; + + check_cuda_error(cudaMemsetAsync(weight.raw_data(), 0, weight.byte_size(), stream)); + TM_CHECK(conv_w->Convert(tmp.data(), w_desc, weight.raw_data(), kd, stream) == 0); + sync_check_cuda_error(); + + kd.type = weight_format.dtype; + if (is_A) { + kd = transpose(kd); + } + k_desc = kd; + } + + if (conv_s) { + const auto order_s = conv_s->order; + const auto pack_s = conv_s->pack; + const bool is_A = get_operand_tag(conv_s->pack) == OPERAND_U; + + Tensor tmp_q; + DataType scale_type; + + if (zeros) { + tmp_q = {{scales.size(), 2}, kHalf, kDEVICE}; + fuse_scales_and_zeros( + tmp_q.data(), scales.data(), zeros.data(), scales.size(), stream); + scale_type = kUint32; + zeros = {}; + scales = empty_like(tmp_q); + } + else if (weight_format.dtype == kFloat8_e4m3) { + tmp_q = empty_like(scales); + Copy(scales, tmp_q); + scale_type = kUint16; + } + else { + tmp_q = empty_like(scales); + Copy(scales, tmp_q); + scale_type = kUint8; + } + + if (data_type == kHalf && weight_format.dtype == kFloat4_e2m1) { + AdjustUe8m0ScaleForHalf(tmp_q.data(), tmp_q.size(), stream); + sync_check_cuda_error(); + } + + int gs = weight_format.block_sizes[0]; // K-axis, tensor-shape order + MatrixLayout s_desc{ + scale_type, + order_s, + (int)output_dim, + (int)input_dim / gs, + (int)output_dim, + }; + + if (!is_A) { + std::swap(s_desc.rows, s_desc.cols); + s_desc.order = ~s_desc.order; + } + + MatrixLayout qd = s_desc; + qd.pack = pack_s; + + TM_CHECK(conv_s->Convert(tmp_q.raw_data(), s_desc, scales.raw_data(), qd, stream) == 0); + sync_check_cuda_error(); + + if (is_A) { + qd = transpose(qd); + } + q_desc = qd; + } + } +} + +TM_MODULE_REGISTER(LinearWeight, core::LinearConfig); + +TM_MODULE_METHODS(LinearWeight, LINEAR_WEIGHT_CHILDREN, LINEAR_WEIGHT_PARAMS) + +} // namespace turbomind diff --git a/src/turbomind/models/linear_weight.h b/src/turbomind/models/linear_weight.h new file mode 100644 index 0000000000..dd4680d061 --- /dev/null +++ b/src/turbomind/models/linear_weight.h @@ -0,0 +1,105 @@ +// Copyright (c) OpenMMLab. All rights reserved. +#pragma once + +#include "src/turbomind/core/core.h" +#include "src/turbomind/core/data_format.h" +#include "src/turbomind/core/module.h" +#include "src/turbomind/kernels/gemm/types.h" + +namespace turbomind::core { + +struct LinearConfig: ModuleConfig { + LinearConfig(): ModuleConfig{"LinearWeight"} {} + +#define LINEAR_FIELDS(X) \ + X(int, input_dim) \ + X(int, output_dim) \ + X(DataType, data_type) \ + X(DataFormat, format) \ + X(bool, has_bias) + + LINEAR_FIELDS(TM_MEMBER) + TM_FOR_EACH(LinearConfig, LINEAR_FIELDS) + +#undef LINEAR_FIELDS +}; + +} // namespace turbomind::core + +namespace turbomind { + +using gemm::Epilogue; +using gemm::MatrixLayout; + +/// Derive (input_format, output_format) for a GEMM whose weight uses +/// `weight_format`, given the model's activation dtype and hardware SM. +std::pair DeriveActivationFormats(const DataFormat& weight_format, DataType data_type, int sm); + +/// Derive GEMM QuantDesc for an operand described by DataFormat. +/// For unquantized formats, returns {QuantType::kNone, 0}. +gemm::QuantDesc MakeQuantDesc(const DataFormat& fmt); + +class LinearWeight: public core::Module { +public: + const char* type() const override + { + return "LinearWeight"; + } + + LinearWeight() = default; + LinearWeight(const core::LinearConfig& cfg); + + void prepare() override; + void copy_metadata_to(LinearWeight& dst) const; + + /// Set grouped-GEMM mode (for MoE expert weights that need row-major layout). + void set_grouped(bool grouped) + { + is_grouped_ = grouped; + } + + explicit operator bool() const noexcept + { + return static_cast(weight); + } + + // --- three DataFormats fully describe the GEMM --- + DataFormat weight_format{}; // from cfg.format + DataFormat input_format{}; // derived in ctor + DataFormat output_format{}; // derived in ctor + + DataType input_dtype() const + { + return input_format.dtype; + } + DataType output_dtype() const + { + return output_format.dtype; + } + + // --- dimensions + model activation dtype --- + int input_dim = 0; + int output_dim = 0; + DataType data_type{}; // model activation dtype, copied from cfg.data_type + + // --- GEMM knobs --- + Epilogue epilogue{}; + MatrixLayout k_desc{}; + MatrixLayout q_desc{}; + +#define LINEAR_WEIGHT_CHILDREN(X) + +#define LINEAR_WEIGHT_PARAMS(X) \ + X(weight) \ + X(bias) \ + X(scales) \ + X(zeros) + + TM_MODULE_DECLARE(LinearWeight, LINEAR_WEIGHT_CHILDREN, LINEAR_WEIGHT_PARAMS) + +private: + bool has_bias_ = false; + bool is_grouped_ = false; +}; + +} // namespace turbomind diff --git a/src/turbomind/models/llama/CMakeLists.txt b/src/turbomind/models/llama/CMakeLists.txt index 121b80f09a..361c01ca82 100644 --- a/src/turbomind/models/llama/CMakeLists.txt +++ b/src/turbomind/models/llama/CMakeLists.txt @@ -13,7 +13,6 @@ add_library(Llama STATIC BlockTrie.cc SequenceManager.cc LlamaWeight.cc - LlamaDenseWeight.cc LlamaDecoderLayerWeight.cc LlamaFfnLayer.cc moe_ffn_layer.cc diff --git a/src/turbomind/models/llama/GatedDeltaNetLayer.cc b/src/turbomind/models/llama/GatedDeltaNetLayer.cc index e5a7459143..accba2833a 100644 --- a/src/turbomind/models/llama/GatedDeltaNetLayer.cc +++ b/src/turbomind/models/llama/GatedDeltaNetLayer.cc @@ -9,43 +9,19 @@ namespace turbomind { -GatedDeltaNetLayer::GatedDeltaNetLayer(const ModelParam& model, - const AttentionParam& attn, - const EngineParam& engine, - int tp_size, - const Context& ctx, - int phases): - hidden_units_(model.hidden_units), - num_k_heads_(model.linear_num_key_heads / tp_size), - num_v_heads_(model.linear_num_value_heads / tp_size), - key_head_dim_(model.linear_key_head_dim > 0 ? model.linear_key_head_dim : model.head_dim), - value_head_dim_(model.linear_value_head_dim > 0 ? model.linear_value_head_dim : model.head_dim), - d_conv_(model.linear_conv_kernel_dim > 0 ? model.linear_conv_kernel_dim : 4), - key_dim_(num_k_heads_ * key_head_dim_), - value_dim_(num_v_heads_ * value_head_dim_), - conv_dim_(key_dim_ * 2 + value_dim_), - norm_eps_(model.norm_eps), - dtype_(model.data_type), - state_dtype_(model.linear_state_dtype), - linear_(*ctx.linear) +GatedDeltaNetLayer::GatedDeltaNetLayer(DataType state_dtype, + const std::vector& layer_types, + const EngineParam& engine, + const Context& ctx, + int phases): + tp_size_(engine.attn_tp_size), num_linear_layers_(0), state_dtype_(state_dtype), linear_(*ctx.linear) { - layer_types_ = model.layer_types; - num_linear_layers_ = 0; + layer_types_ = layer_types; for (auto t : layer_types_) { if (t == 1) ++num_linear_layers_; } - TM_LOG_INFO("GatedDeltaNetLayer: num_k={} num_v={} k_dim={} v_dim={} " - "conv_dim={} d_conv={} num_linear_layers={}", - num_k_heads_, - num_v_heads_, - key_dim_, - value_dim_, - conv_dim_, - d_conv_, - num_linear_layers_); - if (num_linear_layers_ > 0) { conv_state_ptrs_buf_ = {engine.max_batch_size, kCPUpinned}; recurrent_state_ptrs_buf_ = {engine.max_batch_size, kCPUpinned}; @@ -79,8 +55,7 @@ GatedDeltaNetLayer::~GatedDeltaNetLayer() void GatedDeltaNetLayer::Run(BatchOp op, int phase, TensorMap& env) { if (op == BatchOp::kAdd) { - Buffer_ rc = env.at("requests").buffer(); - const auto dtype = dtype_; + Buffer_ rc = env.at("requests").buffer(); for (int i = 0; i < rc.size(); ++i) {} } else if (op == BatchOp::kSetup) { @@ -161,21 +136,31 @@ void GatedDeltaNetLayer::Forward(ForwardParam p) auto dispatch = [&](auto t) { using T = decltype(t); + const auto& w = *p.weights; + const int num_k_heads = w.num_k_heads / tp_size_; + const int num_v_heads = w.num_v_heads / tp_size_; + const int key_head_dim = w.key_head_dim; + const int value_head_dim = w.value_head_dim; + const int d_conv = w.d_conv; + const int key_dim = num_k_heads * key_head_dim; + const int value_dim = num_v_heads * value_head_dim; + const int conv_dim = key_dim * 2 + value_dim; + // ================================================================= // 1. Single fused input projection: reads p.input once from HBM. // Output columns are ordered: [qkv | z | b | a] - // where the split dims are: conv_dim_, value_dim_, v_heads_tp_, v_heads_tp_ + // where the split dims are: conv_dim, value_dim, v_heads_tp, v_heads_tp // ================================================================= - const int v_heads_tp = num_v_heads_; // already TP-sharded - Tensor all_proj = linear_.Forward(p.input, weights.in_proj_all); + const int v_heads_tp = num_v_heads; // already TP-sharded + Tensor all_proj = linear_.Forward(p.input, *weights.in_proj_all); sync_check_cuda_error(); // Column offsets per token (all_proj is token-major, row-major): - // [0, conv_dim_) -> mixed_qkv - // [conv_dim_, +value_dim_) -> z - // [conv_dim_+value_dim_, +v_heads_tp) -> b (beta logit) - // [conv_dim_+value_dim_+v_heads_tp, +v_heads_tp) -> a (alpha/dt) - const int all_col = conv_dim_ + value_dim_ + v_heads_tp * 2; + // [0, conv_dim) -> mixed_qkv + // [conv_dim, +value_dim) -> z + // [conv_dim+value_dim, +v_heads_tp) -> b (beta logit) + // [conv_dim+value_dim+v_heads_tp, +v_heads_tp) -> a (alpha/dt) + const int all_col = conv_dim + value_dim + v_heads_tp * 2; // const T* sub-pointers are derived per-request below; stride = all_col. // ================================================================= @@ -183,13 +168,13 @@ void GatedDeltaNetLayer::Forward(ForwardParam p) // b_raw and a_raw are sliced from the fused projection output. // Stride between tokens is all_col elements. // ================================================================= - const int bg_total = token_num * num_v_heads_; + const int bg_total = token_num * num_v_heads; - const int b_offset = conv_dim_ + value_dim_; // column offset to b logits - const int a_offset = b_offset + v_heads_tp; // column offset to a logits + const int b_offset = conv_dim + value_dim; // column offset to b logits + const int a_offset = b_offset + v_heads_tp; // column offset to a logits - Tensor beta{{token_num, num_v_heads_}, dtype, device}; - Tensor g{{token_num, num_v_heads_}, dtype, device}; + Tensor beta{{token_num, num_v_heads}, dtype, device}; + Tensor g{{token_num, num_v_heads}, dtype, device}; auto b = all_proj.slice({0, b_offset}, {-1, v_heads_tp}); auto a = all_proj.slice({0, a_offset}, {-1, v_heads_tp}); @@ -199,12 +184,12 @@ void GatedDeltaNetLayer::Forward(ForwardParam p) // ================================================================= // 3. Process all requests at once via batched kernel launches // ================================================================= - Tensor attn_out{{token_num, value_dim_}, dtype, device}; - Tensor conv_out{{token_num, conv_dim_}, dtype, device}; + Tensor attn_out{{token_num, value_dim}, dtype, device}; + Tensor conv_out{{token_num, conv_dim}, dtype, device}; const int state_layer_idx = linear_layer_index(p.layer_id, layer_types_); - const int conv_state_layer_offset = state_layer_idx * (conv_dim_ * d_conv_); - const int recurrent_state_layer_offset = state_layer_idx * (num_v_heads_ * key_head_dim_ * value_head_dim_); + const int conv_state_layer_offset = state_layer_idx * (conv_dim * d_conv); + const int recurrent_state_layer_offset = state_layer_idx * (num_v_heads * key_head_dim * value_head_dim); // ----- 3a. Fused Causal Conv1d + SiLU (all requests) ----- // all_proj carries the non-contiguous qkv slice (stride = all_col); @@ -252,7 +237,7 @@ void GatedDeltaNetLayer::Forward(ForwardParam p) dc_state, dc_q, decode_count, - num_k_heads_, + num_k_heads, recurrent_state_layer_offset, state_dtype_, sm_count_, @@ -269,7 +254,7 @@ void GatedDeltaNetLayer::Forward(ForwardParam p) pf_state, pf_q, prefill_count, - num_k_heads_, + num_k_heads, recurrent_state_layer_offset, state_dtype_, sm_count_, @@ -290,7 +275,7 @@ void GatedDeltaNetLayer::Forward(ForwardParam p) state_slice, q_slice, decode_count, - num_k_heads_, + num_k_heads, recurrent_state_layer_offset, state_dtype_, sm_count_, @@ -307,7 +292,7 @@ void GatedDeltaNetLayer::Forward(ForwardParam p) state_slice, q_slice, prefill_count, - num_k_heads_, + num_k_heads, recurrent_state_layer_offset, state_dtype_, sm_count_, @@ -319,16 +304,16 @@ void GatedDeltaNetLayer::Forward(ForwardParam p) sync_check_cuda_error(); // ----- 3c. RMSNormGated (all tokens at once) ----- - // Gate (z) lives at column conv_dim_ of all_proj with row-stride all_col. - Tensor gate = all_proj.slice({0, conv_dim_}, {-1, value_dim_}); - Tensor hidden_view = attn_out.view({token_num * num_v_heads_, value_head_dim_}); - invokeRMSNormGated(hidden_view, gate, weights.norm, norm_eps_, stream); + // Gate (z) lives at column conv_dim of all_proj with row-stride all_col. + Tensor gate = all_proj.slice({0, conv_dim}, {-1, value_dim}); + Tensor hidden_view = attn_out.view({token_num * num_v_heads, value_head_dim}); + invokeRMSNormGated(hidden_view, gate, weights.norm->weight, weights.norm->norm_eps_, stream); sync_check_cuda_error(); // ================================================================= // 4. Output projection (all tokens at once) // ================================================================= - (void)linear_.Forward(attn_out, weights.out_proj, p.output); + (void)linear_.Forward(attn_out, *weights.out_proj, p.output); sync_check_cuda_error(); }; diff --git a/src/turbomind/models/llama/GatedDeltaNetLayer.h b/src/turbomind/models/llama/GatedDeltaNetLayer.h index 67e240c891..bb6b1e2c9a 100644 --- a/src/turbomind/models/llama/GatedDeltaNetLayer.h +++ b/src/turbomind/models/llama/GatedDeltaNetLayer.h @@ -2,7 +2,7 @@ #include "src/turbomind/core/tensor.h" #include "src/turbomind/engine/batch.h" -#include "src/turbomind/models/llama/GatedDeltaNetWeight.h" +#include "src/turbomind/models/delta_net_weight.h" #include "src/turbomind/models/llama/LlamaLinear.h" #include "src/turbomind/models/llama/context.h" #include "src/turbomind/models/llama/llama_params.h" @@ -12,19 +12,18 @@ namespace turbomind { class GatedDeltaNetLayer { public: struct ForwardParam { - int phase; - Tensor input; - Tensor output; - const GatedDeltaNetWeight* weights; - int layer_id; + int phase; + Tensor input; + Tensor output; + const DeltaNetWeight* weights; + int layer_id; }; - GatedDeltaNetLayer(const ModelParam& model, - const AttentionParam& attn, - const EngineParam& engine, - int tp_size, - const Context& ctx, - int phases); + GatedDeltaNetLayer(DataType state_dtype, + const std::vector& layer_types, + const EngineParam& engine, + const Context& ctx, + int phases); ~GatedDeltaNetLayer(); @@ -35,32 +34,21 @@ class GatedDeltaNetLayer { private: void Setup(int phase, TensorMap& env); - // Model dimensions - int hidden_units_; - int num_k_heads_; - int num_v_heads_; - int key_head_dim_; - int value_head_dim_; - int d_conv_; - int key_dim_; // num_k_heads * key_head_dim - int value_dim_; // num_v_heads * value_head_dim - int conv_dim_; // key_dim * 2 + value_dim - int num_linear_layers_; // count of linear attention layers for state sizing - std::vector layer_types_; // model layer types for index mapping - - float norm_eps_; - DataType dtype_; - DataType state_dtype_; // recurrent state dtype (may differ from dtype_ for float32 state) + // Config passed at construction + int tp_size_; + int num_linear_layers_; + std::vector layer_types_; + DataType state_dtype_; LlamaLinear& linear_; // Per-phase batch data (mirrors UnifiedAttentionLayer pattern) struct Data { - std::vector rc; // borrowed batch RequestCache pointers - std::vector input_lens; // snapshot of input_len per request (captured at Setup time) + std::vector rc; + std::vector input_lens; int batch_size = 0; - Buffer_ q_offsets; // cumulative input-token offsets, device buffer - Buffer_ k_offsets; // cumulative key (history+input) offsets, device buffer + Buffer_ q_offsets; + Buffer_ k_offsets; std::vector conv_states; std::vector recurrent_states; Buffer_ conv_state_ptrs; @@ -72,14 +60,12 @@ class GatedDeltaNetLayer { Buffer_ conv_state_ptrs_buf_; Buffer_ recurrent_state_ptrs_buf_; - // Queried once at construction; passed to all three kernel launchers. int sm_count_{1}; - Buffer_ work_counter_; // 1-element device int for v3 atomic claiming + Buffer_ work_counter_; - // Dual-stream dispatch: prefill on high-priority aux stream, decode on main cudaStream_t aux_stream_{}; - cudaEvent_t ev_before_{}; // main→aux: prior work done - cudaEvent_t ev_after_{}; // aux→main: prefill done + cudaEvent_t ev_before_{}; + cudaEvent_t ev_after_{}; }; } // namespace turbomind diff --git a/src/turbomind/models/llama/GatedDeltaNetWeight.cc b/src/turbomind/models/llama/GatedDeltaNetWeight.cc deleted file mode 100644 index c31ab7c0f2..0000000000 --- a/src/turbomind/models/llama/GatedDeltaNetWeight.cc +++ /dev/null @@ -1,176 +0,0 @@ -#include "src/turbomind/models/llama/GatedDeltaNetWeight.h" -#include "src/turbomind/kernels/gpt_kernels.h" -#include "src/turbomind/utils/cuda_utils.h" - -namespace turbomind { - -GatedDeltaNetWeight::GatedDeltaNetWeight(int hidden_dim, - int num_k_heads, - int num_v_heads, - int key_head_dim, - int value_head_dim, - int d_conv, - bool bias, - int tp_size, - int tp_rank, - DataType data_type, - DataType weight_type, - int group_size): - tp_rank_(tp_rank), tp_size_(tp_size) -{ - const int key_dim = num_k_heads * key_head_dim / tp_size; - const int value_dim = num_v_heads * value_head_dim / tp_size; - const int v_heads_tp = num_v_heads / tp_size; - const int conv_dim = key_dim * 2 + value_dim; - - // GatedDeltaNet projections are stored as plain dense weights in the checkpoint - // (dense_wtype = data_type avoids quantization path for these projections). - const DataType dense_wtype = data_type; - const int dense_gsz = 0; - - // Individual projections registered for checkpoint loading - in_proj_qkv.emplace(hidden_dim, conv_dim, data_type, bias, dense_wtype, dense_gsz); - in_proj_z.emplace(hidden_dim, value_dim, data_type, bias, dense_wtype, dense_gsz); - in_proj_b.emplace(hidden_dim, v_heads_tp, data_type, bias, dense_wtype, dense_gsz); - in_proj_a.emplace(hidden_dim, v_heads_tp, data_type, bias, dense_wtype, dense_gsz); - out_proj.emplace(value_dim, hidden_dim, data_type, bias, dense_wtype, dense_gsz); - - register_module("in_proj_qkv", in_proj_qkv, tp_rank_); - register_module("in_proj_z", in_proj_z, tp_rank_); - register_module("in_proj_b", in_proj_b, tp_rank_); - register_module("in_proj_a", in_proj_a, tp_rank_); - register_module("out_proj", out_proj, tp_rank_); - - // conv1d: depthwise weights, shape (conv_dim, d_conv) - conv1d = Tensor{{conv_dim, d_conv}, data_type, kDEVICE}; - register_parameter("conv1d." + std::to_string(tp_rank_) + ".weight", conv1d); - - // A_log: log-space decay per head, shape (num_v_heads/tp,) - A_log = Tensor{{v_heads_tp}, data_type, kDEVICE}; - register_parameter("A_log." + std::to_string(tp_rank_) + ".weight", A_log); - - // dt_bias: per head, shape (num_v_heads/tp,) - dt_bias = Tensor{{v_heads_tp}, data_type, kDEVICE}; - register_parameter("dt_bias." + std::to_string(tp_rank_) + ".weight", dt_bias); - - // norm: RMSNormGated weight, shape (value_head_dim,) - norm = Tensor{{value_head_dim}, data_type, kDEVICE}; - register_parameter("norm.weight", norm); -} - -// --------------------------------------------------------------------------- -// Row-wise concatenation of 4 weight matrices into a single pre-allocated -// destination tensor. -// -// Each source weight has shape (input_dim, out_dim_i) in row-major storage. -// The destination has shape (input_dim, sum_i out_dim_i) and rows are filled -// by concatenating the corresponding source rows in order. -// -// Implemented with cudaMemcpy2DAsync so that no extra temporary is needed: -// each source "column block" is scattered into the correct column range of -// the destination in one pass per source. -// --------------------------------------------------------------------------- -static void -concat_weights_4(const Tensor& a, const Tensor& b, const Tensor& c, const Tensor& d, Tensor& dst, cudaStream_t st) -{ - // Tensors are (K=input_dim, M=output_dim) in row-major order. - // Each row of `dst` is [a_row | b_row | c_row | d_row]. - const int K = dst.shape(0); - const int M_a = a.shape(1); - const int M_b = b.shape(1); - const int M_c = c.shape(1); - const int M_d = d.shape(1); - const int M_dst = dst.shape(1); // M_a + M_b + M_c + M_d - const int elem_sz = byte_size(dst.dtype(), 1); - - // Pitch of the destination row in bytes - const size_t dst_pitch = (size_t)M_dst * elem_sz; - const size_t src_pitch_a = (size_t)M_a * elem_sz; - const size_t src_pitch_b = (size_t)M_b * elem_sz; - const size_t src_pitch_c = (size_t)M_c * elem_sz; - const size_t src_pitch_d = (size_t)M_d * elem_sz; - - char* dst_ptr = reinterpret_cast(dst.raw_data()); - - // Columns [0, M_a) - check_cuda_error( - cudaMemcpy2DAsync(dst_ptr, dst_pitch, a.raw_data(), src_pitch_a, src_pitch_a, K, cudaMemcpyDefault, st)); - - // Columns [M_a, M_a+M_b) - check_cuda_error(cudaMemcpy2DAsync( - dst_ptr + src_pitch_a, dst_pitch, b.raw_data(), src_pitch_b, src_pitch_b, K, cudaMemcpyDefault, st)); - - // Columns [M_a+M_b, M_a+M_b+M_c) - check_cuda_error(cudaMemcpy2DAsync(dst_ptr + src_pitch_a + src_pitch_b, - dst_pitch, - c.raw_data(), - src_pitch_c, - src_pitch_c, - K, - cudaMemcpyDefault, - st)); - - // Columns [M_a+M_b+M_c, M_dst) - check_cuda_error(cudaMemcpy2DAsync(dst_ptr + src_pitch_a + src_pitch_b + src_pitch_c, - dst_pitch, - d.raw_data(), - src_pitch_d, - src_pitch_d, - K, - cudaMemcpyDefault, - st)); - sync_check_cuda_error(); -} - -void GatedDeltaNetWeight::prepare() -{ - auto stream = core::Context::stream().handle(); - - // Preprocess individual weights (converts blockscale FP8, etc.) - in_proj_qkv.preprocess(); - in_proj_z.preprocess(); - in_proj_b.preprocess(); - in_proj_a.preprocess(); - out_proj.preprocess(); - out_proj.prepare(); - - // Build the fused input projection weight: - // shape (hidden_dim, conv_dim + value_dim + 2*v_heads_tp) - // = [in_proj_qkv | in_proj_z | in_proj_b | in_proj_a] (column-wise) - const int out_all = in_proj_qkv.output_dim // - + in_proj_z.output_dim // - + in_proj_b.output_dim // - + in_proj_a.output_dim; - - in_proj_all.emplace(in_proj_qkv.input_dim, - out_all, - in_proj_qkv.data_type, - /*bias=*/false, - in_proj_qkv.weight_type, - in_proj_qkv.group_size); - - concat_weights_4( - in_proj_qkv.weight, in_proj_z.weight, in_proj_b.weight, in_proj_a.weight, in_proj_all.weight, stream); - - // Prepare (convert/repack) the fused weight for GEMM - in_proj_all.prepare(); - - // Release the now-redundant individual weight tensors to free HBM - in_proj_qkv = {}; - in_proj_z = {}; - in_proj_b = {}; - in_proj_a = {}; - - // Transpose conv1d from checkpoint layout [conv_dim, d_conv] to kernel layout [d_conv, conv_dim] - { - const int rows = conv1d.shape(0); // conv_dim - const int cols = conv1d.shape(1); // d_conv - - Tensor conv1d_t{{cols, rows}, conv1d.dtype(), kDEVICE}; - invokeTransposeAxis01((uint16_t*)conv1d_t.raw_data(), (uint16_t*)conv1d.raw_data(), rows, cols, 1, stream); - sync_check_cuda_error(); - conv1d = std::move(conv1d_t); - } -} - -} // namespace turbomind diff --git a/src/turbomind/models/llama/GatedDeltaNetWeight.h b/src/turbomind/models/llama/GatedDeltaNetWeight.h deleted file mode 100644 index 6683584cfd..0000000000 --- a/src/turbomind/models/llama/GatedDeltaNetWeight.h +++ /dev/null @@ -1,52 +0,0 @@ -#pragma once - -#include "src/turbomind/core/core.h" -#include "src/turbomind/core/module.h" -#include "src/turbomind/models/llama/LlamaDenseWeight.h" - -namespace turbomind { - -struct GatedDeltaNetWeight: public core::Module { - - GatedDeltaNetWeight() = default; - - GatedDeltaNetWeight(int hidden_dim, - int num_k_heads, - int num_v_heads, - int key_head_dim, - int value_head_dim, - int d_conv, - bool bias, - int tp_size, - int tp_rank, - DataType data_type, - DataType weight_type, - int group_size); - - void prepare(); - - // Individual projections – populated at load time from the checkpoint. - // After prepare() completes they are released (null-ed) to free HBM. - LlamaDenseWeight in_proj_qkv; // hidden -> key_dim*2 + value_dim - LlamaDenseWeight in_proj_z; // hidden -> value_dim (output gate) - LlamaDenseWeight in_proj_b; // hidden -> num_v_heads (beta, per-head scalar) - LlamaDenseWeight in_proj_a; // hidden -> num_v_heads (alpha/dt, per-head scalar) - - // Fused projection: hidden -> (conv_dim + value_dim + 2*v_heads_tp). - // Built from the four above in prepare(); used for all inference GEMMs. - // Reduces p.input HBM reads from 4× to 1× per forward pass. - LlamaDenseWeight in_proj_all; - - LlamaDenseWeight out_proj; // value_dim -> hidden - - // Non-dense parameters - Tensor conv1d; // depthwise conv weights: (d_conv, conv_dim) - Tensor A_log; // log-space decay: (num_v_heads,) - Tensor dt_bias; // dt bias: (num_v_heads,) - Tensor norm; // RMSNormGated weight: (value_head_dim,) - - int tp_rank_; - int tp_size_; -}; - -} // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc deleted file mode 100644 index ca7fc25e97..0000000000 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc +++ /dev/null @@ -1,179 +0,0 @@ -/* - * Copyright (c) OpenMMLab. All rights reserved. - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Modified from -// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptDecoderLayerWeight.cc - -#include - -#include -#include - -#include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h" - -#include "src/turbomind/core/data_type.h" -#include "src/turbomind/core/logger.h" -#include "src/turbomind/models/llama/LlamaDenseWeight.h" -#include "src/turbomind/models/llama/llama_params.h" -#include "src/turbomind/utils/cuda_utils.h" - -namespace turbomind { - -static bool is_fuse_silu_act() -{ - static const bool value = [] { - const auto str = std::getenv("TM_FUSE_SILU_ACT"); - if (str) { - try { - auto v = std::stoi(str) != 0; - TM_LOG_INFO("TM_FUSE_SILU_ACT={}", (int)v); - return v; - } - catch (...) { - } - } - // TM_LOG_INFO("TM_FUSE_SILU_ACT=1"); - return true; - }(); - return value; -} - -LlamaDecoderLayerWeight::LlamaDecoderLayerWeight( - DataType data_type, int layer_id, const ModelParam& model, const EngineParam& engine, const MoeParam& moe_param): - head_num_(model.head_num), - kv_head_num_(model.kv_head_num), - size_per_head_(model.head_dim), - hidden_units_(model.hidden_units), - inter_size_(model.inter_size.at(layer_id)), - data_type_{data_type}, - weight_type_(model.weight_type), - expert_weight_type_(model.expert_weight_type), - attn_bias_(model.attn_bias), - attn_tp_size_(engine.attn_tp_size), - attn_tp_rank_(engine.attn_tp_rank), - mlp_tp_size_(engine.mlp_tp_size), - mlp_tp_rank_(engine.mlp_tp_rank) -{ - bool is_linear_attention = false; - if (layer_id < (int)model.layer_types.size() && model.layer_types[layer_id] == 1) { - is_linear_attention = true; - } - - if (is_linear_attention) { - linear_attn_weights.reset( - new GatedDeltaNetWeight{hidden_units_, - model.linear_num_key_heads, - model.linear_num_value_heads, - model.linear_key_head_dim, - model.linear_value_head_dim, - model.linear_conv_kernel_dim > 0 ? model.linear_conv_kernel_dim : 4, - attn_bias_, - attn_tp_size_, - attn_tp_rank_, - data_type_, - weight_type_, - model.group_size}); - register_module("linear_attn", *linear_attn_weights); - } - else { - // Attention uses weight_type (fp16 in mixed quant scenarios) - self_attn_weights.reset(new LlamaAttentionWeight{hidden_units_, - size_per_head_, - head_num_, - kv_head_num_, - model.mla, - attn_bias_, - model.qk_norm, - attn_tp_size_, - attn_tp_rank_, - data_type_, - weight_type_, - model.group_size, - model.window_size.empty() ? 0 : model.window_size.at(layer_id), - model.attn_sink, - model.attn_output_gate}); - register_module("attention", *self_attn_weights); - } - - // FFN uses ffn_weight_type, except for layers fully excluded from - // quantization (e.g. 'model.layers.0.' in modules_to_not_convert) - // where all weights—including FFN—are in data_type (fp16). - if (inter_size_) { - const DataType ffn_wtype = model.unquantized_expert_layers.count(layer_id) ? data_type_ : model.ffn_weight_type; - const bool is_cublas_gemm = byte_size(ffn_wtype, 8) == 16; - ffn_weights.reset(new LlamaFfnWeight{ - hidden_units_, - inter_size_, - model.mlp_bias, - mlp_tp_size_, - mlp_tp_rank_, - data_type_, - ffn_wtype, - model.group_size, - model.act_type, - is_fuse_silu_act() && !is_cublas_gemm, - }); - register_module("feed_forward", *ffn_weights); - } - - // MoE routed experts use expert_weight_type (int4 for AWQ, e2m1 for mxfp4) - // unless the layer is in unquantized_expert_layers (e.g. layer 0 excluded - // from quantization via modules_to_not_convert). - if (layer_id < moe_param.expert_num.size() && moe_param.expert_num[layer_id]) { - const DataType moe_wtype = model.unquantized_expert_layers.count(layer_id) ? data_type_ : expert_weight_type_; - moe_weights.reset(new MoeFfnWeight{layer_id, - moe_param, - hidden_units_, - model.mlp_bias, - data_type_, - moe_wtype, - model.group_size, - mlp_tp_size_, - mlp_tp_rank_, - model.act_type, - is_fuse_silu_act()}); - register_module("moe_ffn", *moe_weights); - } - - self_attn_norm = Tensor{{hidden_units_}, data_type_, kDEVICE}; - ffn_norm = Tensor{{hidden_units_}, data_type_, kDEVICE}; - register_parameter("attention_norm.weight", self_attn_norm); - register_parameter("ffn_norm.weight", ffn_norm); -} - -LlamaDecoderLayerWeight::~LlamaDecoderLayerWeight() = default; - -void LlamaDecoderLayerWeight::prepare(const cudaDeviceProp& prop, cudaStream_t st) -{ - if (self_attn_weights) { - self_attn_weights->prepare(); - } - - if (linear_attn_weights) { - linear_attn_weights->prepare(); - } - - if (ffn_weights) { - ffn_weights->prepare(false); - } - - if (moe_weights) { - moe_weights->prepare(); - } -} - -} // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.h b/src/turbomind/models/llama/LlamaDecoderLayerWeight.h deleted file mode 100644 index 6ac387ab12..0000000000 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.h +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (c) OpenMMLab. All rights reserved. - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Modified from -// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptDecoderLayerWeight.h - -#pragma once - -#include "src/turbomind/core/core.h" - -#include "src/turbomind/models/llama/GatedDeltaNetWeight.h" -#include "src/turbomind/models/llama/LlamaDenseWeight.h" -#include "src/turbomind/models/llama/llama_params.h" - -namespace turbomind { - -struct LlamaDecoderLayerWeight: core::Module { -public: - LlamaDecoderLayerWeight() = delete; - - LlamaDecoderLayerWeight(DataType data_type, - int layer_id, - const ModelParam& model, - const EngineParam& engine, - const MoeParam& moe_param); - - ~LlamaDecoderLayerWeight(); - LlamaDecoderLayerWeight(const LlamaDecoderLayerWeight&) = delete; - LlamaDecoderLayerWeight& operator=(const LlamaDecoderLayerWeight&) = delete; - - void prepare(const cudaDeviceProp& prop, cudaStream_t st); - - Tensor self_attn_norm; - Tensor ffn_norm; - - std::unique_ptr self_attn_weights; - std::unique_ptr linear_attn_weights; - - std::unique_ptr ffn_weights; - std::unique_ptr moe_weights; - -private: - int head_num_; - int kv_head_num_; - int size_per_head_; - int hidden_units_; - int inter_size_; - - DataType data_type_; - DataType weight_type_; - DataType expert_weight_type_; - - int bit_size_; - bool attn_bias_; - int attn_tp_size_; - int attn_tp_rank_; - int mlp_tp_size_; - int mlp_tp_rank_; -}; - -} // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaDenseWeight.cc b/src/turbomind/models/llama/LlamaDenseWeight.cc deleted file mode 100644 index 1764d3622a..0000000000 --- a/src/turbomind/models/llama/LlamaDenseWeight.cc +++ /dev/null @@ -1,690 +0,0 @@ -// Copyright (c) OpenMMLab. All rights reserved. - -#include - -#include "src/turbomind/models/llama/LlamaDenseWeight.h" - -#include "src/turbomind/core/allocator.h" -#include "src/turbomind/core/data_type.h" - -#include "src/turbomind/kernels/activation.h" -#include "src/turbomind/kernels/gemm/cast.h" -#include "src/turbomind/kernels/gemm/convert.h" -#include "src/turbomind/kernels/gemm/gemm.h" -#include "src/turbomind/kernels/gemm/types.h" -#include "src/turbomind/kernels/gemm/utils.h" -#include "src/turbomind/kernels/gpt_kernels.h" -#include "src/turbomind/utils/cuda_utils.h" - -namespace turbomind { - -void LlamaDenseWeight::emplace( - int input_dim, int output_dim, DataType data_type, bool bias, DataType weight_type, int group_size) -{ - this->data_type = data_type; - this->input_type = data_type; - this->weight_type = weight_type; - this->input_dim = input_dim; - this->output_dim = output_dim; - this->group_size = group_size; - - const bool is_qweight = weight_type == kUint4 || weight_type == kUint8; - - weight = Tensor({input_dim, output_dim}, weight_type, kDEVICE); - register_parameter(is_qweight ? "qweight" : "weight", weight); - - if (bias) { - this->bias = Tensor{{output_dim}, data_type, kDEVICE}; - register_parameter("bias", this->bias); - } - - if (weight_type == kFloat8_e4m3) { - TM_CHECK_EQ(group_size, 128); - scales = Tensor{{cdiv(input_dim, group_size), cdiv(output_dim, group_size)}, kFloat, kDEVICE}; - weight_quant = QuantDesc{gemm::QuantType::kB, group_size}; - if (getSMVersion() == 90) { - input_type = kFloat8_e4m3; - input_quant = QuantDesc{gemm::QuantType::kK, group_size}; - } - register_parameter("scales", scales); - } - else if (weight_type == kFloat4_e2m1) { - scales = Tensor{{cdiv(input_dim, group_size), output_dim}, kUint8, kDEVICE}; - input_type = data_type; - weight_quant = QuantDesc{gemm::QuantType::kK, group_size}; - register_parameter("scales", scales); - } - else if (is_qweight) { - TM_CHECK(input_dim % group_size == 0) << input_dim << " " << group_size; - scales = Tensor{{input_dim / group_size, output_dim}, data_type, kDEVICE}; - zeros = Tensor{{input_dim / group_size, output_dim}, data_type, kDEVICE}; - weight_quant = QuantDesc{gemm::QuantType::kK, group_size}; - register_parameter("scales", scales); - register_parameter("zeros", zeros); - } - - k_desc = {}; - q_desc = {}; - - // default case: floating point, N-major - k_desc.type = weight.dtype(); - k_desc.order = gemm::kRowMajor; - k_desc.rows = input_dim; - k_desc.cols = output_dim; - k_desc.ld = output_dim; -} - -void LlamaDenseWeight::preprocess() -{ - if (!weight) { - return; - } - if (weight_quant.type == gemm::QuantType::kB && input_quant.type == gemm::QuantType::kNone) { - // Convert blockwise scales to groupwise scales - weight_quant.type = gemm::QuantType::kK; - scales = BlockscaleToGroupscale(scales, data_type, weight_quant.group_size); - } -} - -static void Convert(LlamaDenseWeight& dense, bool is_grouped, cudaStream_t st) -{ - using namespace gemm; - - auto [conv_w, conv_s] = - GetConverters(dense.data_type, dense.weight_type, dense.input_type, is_grouped, getSMVersion()); - - if (conv_w) { - const auto order_w = conv_w->order; - const bool is_A = get_operand_tag(conv_w->pack) == OPERAND_A; - const bool is_B = !is_A; - - const int bits = byte_size(dense.weight_type, 8); - - Tensor_ tmp{{dense.input_dim, dense.output_dim}, kDEVICE}; - - if (bits == 4) { // u4 -> u16 - extend_to_u16(tmp.data(), (const uint4_t*)dense.weight.raw_data(), tmp.size(), st); - sync_check_cuda_error(); - } - else if (bits == 8) { // u8 -> u16 - extend_to_u16(tmp.data(), (const uint8_t*)dense.weight.raw_data(), tmp.size(), st); - sync_check_cuda_error(); - } - else if (bits == 16) { - check_cuda_error( - cudaMemcpyAsync(tmp.raw_data(), dense.weight.raw_data(), tmp.byte_size(), cudaMemcpyDefault, st)); - } - - if (order_w == kRowMajor) { // (k,m) -> (m,k) - Tensor_ trans{{dense.output_dim, dense.input_dim}, kDEVICE}; - invokeTransposeAxis01(trans.data(), tmp.data(), dense.input_dim, dense.output_dim, 1, st); - tmp = trans; - } - - MatrixLayout w_desc{ - dense.data_type, - order_w, - (int)dense.output_dim, // M - (int)dense.input_dim, // K - order_w == kRowMajor ? (int)dense.input_dim : (int)dense.output_dim, - }; - - if (is_B) { - std::swap(w_desc.rows, w_desc.cols); - w_desc.order = ~w_desc.order; - } - - MatrixLayout k_desc = w_desc; - k_desc.type = dense.weight_type; - // Converter does not recognize e2m1 / e4m3 - if (bits == 4) { - k_desc.type = data_type_v; - } - else if (bits == 8) { - k_desc.type = data_type_v; - } - k_desc.pack = conv_w->pack; - - check_cuda_error(cudaMemsetAsync(dense.weight.raw_data(), 0, dense.weight.byte_size(), st)); - - TM_CHECK(conv_w->Convert(tmp.data(), w_desc, dense.weight.raw_data(), k_desc, st) == 0); - - sync_check_cuda_error(); - - k_desc.type = dense.weight_type; - if (is_A) { - k_desc = transpose(k_desc); - } - dense.k_desc = k_desc; - } - - if (conv_s) { - const auto order_s = conv_s->order; - const auto pack_s = conv_s->pack; - const bool is_A = get_operand_tag(conv_s->pack) == OPERAND_U; - const bool is_B = !is_A; - - Tensor tmp_q; - DataType scale_type; - - if (dense.zeros) { // AWQ/GPTQ fuse scales and zeros - tmp_q = {{dense.scales.size(), 2}, kHalf, kDEVICE}; - fuse_scales_and_zeros( - tmp_q.data(), dense.scales.data(), dense.zeros.data(), dense.scales.size(), st); - scale_type = kUint32; // half2 - dense.zeros = {}; - dense.scales = empty_like(tmp_q); - } - else if (dense.weight_type == kFloat8_e4m3) { // e4m3 - tmp_q = empty_like(dense.scales); - Copy(dense.scales, tmp_q); - scale_type = kUint16; // bf16 - } - else { // mxfp4 - tmp_q = empty_like(dense.scales); - Copy(dense.scales, tmp_q); - scale_type = kUint8; // ue8m0 - } - - if (dense.data_type == kHalf && dense.weight_type == kFloat4_e2m1) { // mxfp4 - AdjustUe8m0ScaleForHalf(tmp_q.data(), tmp_q.size(), st); - sync_check_cuda_error(); - } - - MatrixLayout s_desc{ - scale_type, - order_s, - (int)dense.output_dim, // M - (int)dense.input_dim / dense.group_size, // K - (int)dense.output_dim, // always MN-major - }; - - if (is_B) { - std::swap(s_desc.rows, s_desc.cols); - s_desc.order = ~s_desc.order; - } - - MatrixLayout q_desc = s_desc; - q_desc.pack = pack_s; - - TM_CHECK(conv_s->Convert(tmp_q.raw_data(), s_desc, dense.scales.raw_data(), q_desc, st) == 0); - sync_check_cuda_error(); - - // weight is placed at B in `Linear` - if (is_A) { - q_desc = transpose(q_desc); - } - dense.q_desc = q_desc; - } -} - -static void ConvertBlockscaleFP8Native(LlamaDenseWeight& dense, cudaStream_t stream) -{ - using namespace gemm; - - TM_CHECK_GE(getSMVersion(), 90); - TM_CHECK_EQ(dense.data_type, data_type_v); - - auto process = [&](Tensor& x, MatrixLayout& d, auto dtype) { - using T = decltype(dtype); - Tensor trans{{x.shape(1), x.shape(0)}, x.dtype(), kDEVICE}; - invokeTransposeAxis01((T*)trans.raw_data(), (T*)x.raw_data(), x.shape(0), x.shape(1), 1, stream); - x = std::move(trans); - d = MatrixLayout{x.dtype(), // - kColMajor, - (int)x.shape(1), - (int)x.shape(0), - (int)x.stride(0)}; - }; - - TM_CHECK_EQ(dense.weight.dtype(), kFloat8_e4m3); - process(dense.weight, dense.k_desc, uint8_t{}); - - TM_CHECK_EQ(dense.scales.dtype(), kFloat); - process(dense.scales, dense.q_desc, float{}); -} - -void LlamaDenseWeight::prepare(bool fused_moe) -{ - if (!weight) { - return; - } - - auto stream = core::Context::stream().handle(); - - if (weight_type == kFloat8_e4m3 && input_type == kFloat8_e4m3) { - ConvertBlockscaleFP8Native(*this, stream); - } - else { - Convert(*this, fused_moe, stream); - } -} - -LlamaAttentionWeight::LlamaAttentionWeight(int hidden_dim, - int head_dim, - int head_num, - int kv_head_num, - MLAParam mla, - bool bias, - bool qk_norm, - int tp_size, - int tp_rank, - DataType data_type, - DataType weight_type, - int group_size, - int window_size, - bool sink, - bool attn_output_gate) -{ - this->window_size = window_size; - - // attn_output_gate doubles Q dimension (extra gate projection fused into Q) - const int q_factor = attn_output_gate ? 2 : 1; - - if (mla.kv_lora_rank == 0) { - qkv.emplace(hidden_dim, - (head_num * q_factor + 2 * kv_head_num) * head_dim / tp_size, - data_type, - bias, - weight_type, - group_size); - register_module("w_qkv", qkv, tp_rank); - if (qk_norm) { - q_a_layernorm = Tensor{{head_dim}, data_type, kDEVICE}; - kv_a_layernorm = Tensor{{head_dim}, data_type, kDEVICE}; - register_parameter("q_norm", q_a_layernorm); - register_parameter("k_norm", kv_a_layernorm); - } - } - else { - const int qk_nope_dim = head_dim - mla.qk_rope_dim; - if (mla.q_lora_rank) { - q_a_proj.emplace(hidden_dim, mla.q_lora_rank, data_type, false, weight_type, group_size); - q_b_proj.emplace(mla.q_lora_rank, head_num * head_dim / tp_size, data_type, false, weight_type, group_size); - q_a_layernorm = Tensor{{q_b_proj.input_dim}, data_type, kDEVICE}; - register_module("q_a_proj", q_a_proj); - register_module("q_b_proj", q_b_proj, tp_rank); - register_parameter("q_a_layernorm", q_a_layernorm); - } - else { - q_proj.emplace(hidden_dim, head_num * head_dim / tp_size, data_type, false, weight_type, group_size); - register_module("q_proj", q_proj, tp_rank); - } - kv_a_proj.emplace(hidden_dim, mla.kv_lora_rank + mla.qk_rope_dim, data_type, false, weight_type, group_size); - // kv_b_proj.emplace(mla.kv_lora_rank, - // head_num * (qk_nope_dim + mla.v_head_dim) / tp_size, - // data_type, - // false, - // weight_type, - // group_size); - - kv_a_layernorm = Tensor{{mla.kv_lora_rank}, data_type, kDEVICE}; - register_module("kv_a_proj", kv_a_proj); - // register_module("kv_b_proj", kv_b_proj, tp_rank); - register_parameter("kv_a_layernorm", kv_a_layernorm); - } - output.emplace((head_num * head_dim) / tp_size, hidden_dim, data_type, bias, weight_type, group_size); - register_module("wo", output, tp_rank); - - if (sink) { - sinks = Tensor{{head_num / tp_size}, data_type, kDEVICE}; - register_parameter(std::to_string(tp_rank) + ".sinks", sinks); - } -} - -void LlamaAttentionWeight::prepare() -{ - std::vector weights{ - &qkv, &output, &q_a_proj, &q_a_proj, &q_b_proj, &kv_a_proj // &kv_b_proj, - }; - for (auto& w : weights) { - w->preprocess(); - w->prepare(); - } -} - -LlamaFfnWeight::LlamaFfnWeight(int hidden_dim, - int inter_size, - bool bias, - int tp_size, - int tp_rank, - DataType data_type, - DataType weight_type, - int group_size, - ActivationType act_type, - bool fuse_silu_act) -{ - TM_CHECK(inter_size % tp_size == 0) << inter_size << " " << tp_size; - - inter_size /= tp_size; - - this->inter_size = inter_size; - this->tp_rank = tp_rank; - this->act_type = act_type; - this->is_fused_silu = fuse_silu_act && this->act_type == ActivationType::kSilu; - - gating.emplace(hidden_dim, inter_size, data_type, bias, weight_type, group_size); - - intermediate.emplace(hidden_dim, inter_size, data_type, bias, weight_type, group_size); - - output.emplace(inter_size, hidden_dim, data_type, bias, weight_type, group_size); - - if (gating.input_type == kFloat8_e4m3) { // SM90 FP8*FP8 GEMM, can't fuse - this->is_fused_silu = false; - } - - register_module("w1", gating, tp_rank); - register_module("w3", intermediate, tp_rank); - register_module("w2", output, tp_rank); -} - -static void Interleave(const Tensor& a, const Tensor& b, Tensor& c, cudaStream_t st) -{ - TM_CHECK(a.layout() == b.layout()); - int M, K; - if (a.ndim() == 2) { - std::tie(K, M) = a.shapes(0, 1); - } - else { - M = a.shape(0); - K = 1; - } - auto a_ = a.raw_data(); - auto b_ = b.raw_data(); - auto c_ = c.raw_data(); - - const int bits = byte_size(a.dtype(), 8); - if (bits == 4) { - Buffer_ ta{a.size(), kDEVICE}; - Buffer_ tb{b.size(), kDEVICE}; - Buffer_ tc{c.size(), kDEVICE}; - extend_to_u8(ta.data(), (uint4_t*)a_, a.size(), st); - extend_to_u8(tb.data(), (uint4_t*)b_, b.size(), st); - interleave_output_dims(tc.data(), ta.data(), tb.data(), M, K, st); - compact_to_u4((uint4_t*)c_, tc.data(), c.size(), st); - } - else if (bits == 8) { - interleave_output_dims((uint8_t*)c_, (uint8_t*)a_, (uint8_t*)b_, M, K, st); - } - else if (bits == 16) { - interleave_output_dims((uint16_t*)c_, (uint16_t*)a_, (uint16_t*)b_, M, K, st); - } - else if (bits == 32) { - interleave_output_dims((uint32_t*)c_, (uint32_t*)a_, (uint32_t*)b_, M, K, st); - } - else { - TM_CHECK(0); - } -} - -void interleave(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight& b, DataType data_type, cudaStream_t st) -{ - TM_CHECK_EQ(c.input_dim, a.input_dim); - TM_CHECK_EQ(c.input_dim, b.input_dim); - TM_CHECK_EQ(c.output_dim, a.output_dim * 2); - TM_CHECK_EQ(c.output_dim, b.output_dim * 2); - TM_CHECK_EQ(c.group_size, a.group_size); - TM_CHECK_EQ(c.group_size, b.group_size); - - Interleave(a.weight, b.weight, c.weight, st); - sync_check_cuda_error(); - - if (a.scales) { - Interleave(a.scales, b.scales, c.scales, st); - sync_check_cuda_error(); - } - if (a.zeros) { - Interleave(a.zeros, b.zeros, c.zeros, st); - sync_check_cuda_error(); - } - if (a.bias) { - Interleave(a.bias, b.bias, c.bias, st); - sync_check_cuda_error(); - } -} - -static void Chunk(const Tensor& a, const Tensor& b, Tensor& c, cudaStream_t st) -{ - TM_CHECK(a.layout() == b.layout()); - int M, K, spitch, dpitch; - if (a.ndim() == 2) { - std::tie(K, M) = a.shapes(0, 1); - spitch = byte_size(a.dtype(), a.stride(0)); - dpitch = byte_size(c.dtype(), c.stride(0)); - } - else { - M = a.shape(0); - K = 1; - spitch = byte_size(a.dtype(), M); - dpitch = byte_size(c.dtype(), c.shape(0)); - } - int height = K; - int width = byte_size(a.dtype(), M); - check_cuda_error(cudaMemcpy2DAsync((char*)c.raw_data(), // - dpitch, - (const char*)a.raw_data(), - spitch, - width, - height, - cudaMemcpyDefault, - st)); - check_cuda_error(cudaMemcpy2DAsync((char*)c.raw_data() + width, // - dpitch, - (const char*)b.raw_data(), - spitch, - width, - height, - cudaMemcpyDefault, - st)); -} - -void chunk(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight& b, DataType data_type, cudaStream_t st) -{ - TM_CHECK_EQ(c.input_dim, a.input_dim); - TM_CHECK_EQ(c.input_dim, b.input_dim); - TM_CHECK_EQ(c.output_dim, a.output_dim * 2); - TM_CHECK_EQ(c.output_dim, b.output_dim * 2); - TM_CHECK_EQ(c.group_size, a.group_size); - TM_CHECK_EQ(c.group_size, b.group_size); - - Chunk(a.weight, b.weight, c.weight, st); - sync_check_cuda_error(); - - if (a.scales) { - Chunk(a.scales, b.scales, c.scales, st); - sync_check_cuda_error(); - } - if (a.zeros) { - Chunk(a.zeros, b.zeros, c.zeros, st); - sync_check_cuda_error(); - } - if (a.bias) { - Chunk(a.bias, b.bias, c.bias, st); - sync_check_cuda_error(); - } -} - -void LlamaFfnWeight::prepare(bool fused_moe) -{ - const auto data_type = gating.data_type; - - auto stream = core::Context().stream().handle(); - - gating.preprocess(); - intermediate.preprocess(); - - if (fuse_up_and_gate) { - auto& gate_and_up = fused_gating_intermediate; - - gate_and_up.emplace(gating.input_dim, // - gating.output_dim * 2, - gating.data_type, - (bool)gating.bias, - gating.weight_type, - gating.group_size); - gate_and_up.preprocess(); - register_module("w1w3", gate_and_up, this->tp_rank); - - if (is_fused_silu) { - interleave(gate_and_up, gating, intermediate, data_type, stream); - gate_and_up.epilogue = gemm::Epilogue::kGatedSilu; - } - else { - chunk(gate_and_up, gating, intermediate, data_type, stream); - } - - fused_gating_intermediate.prepare(fused_moe); - - gating = {}; - intermediate = {}; - } - else { - gating.prepare(fused_moe); - intermediate.prepare(fused_moe); - } - - output.preprocess(); - output.prepare(fused_moe); -} - -MoeFfnWeight::MoeFfnWeight(int layer_id, - const MoeParam& param, - int hidden_dim, - bool mlp_bias, - DataType data_type, - DataType weight_type, - int group_size, - int tp_size, - int tp_rank, - ActivationType act_type, - bool fuse_silu_act) -{ - if ((int)param.expert_num.size() <= layer_id) { - return; - } - - const int expert_num = param.expert_num[layer_id]; - - if (expert_num == 0) { - return; - } - - gate.emplace(hidden_dim, expert_num, data_type, param.router_bias, data_type, 1); - register_module("gate", gate); - - if (param.topk_method == "noaux_tc") { - score_correction_bias = Tensor{{expert_num}, kFloat, kDEVICE}; - register_parameter("gate.score_correction_bias", score_correction_bias); - } - - method = param.method; - - const bool is_cublas_gemm = method == MoeParam::kNaive && byte_size(weight_type, 8) == 16; - if (is_cublas_gemm || mlp_bias) { - fuse_silu_act = false; - } - - experts.reserve(expert_num); - for (int i = 0; i < expert_num; ++i) { - experts.emplace_back(new LlamaFfnWeight{hidden_dim, - param.inter_size, - mlp_bias, - tp_size, - tp_rank, - data_type, - weight_type, - group_size, - act_type, - fuse_silu_act}); - register_module("experts", *experts.back(), i); - } - - if (param.shared_gate) { - shared_gate.emplace(hidden_dim, 1, data_type, false, data_type, 1); - register_module("shared_gate", shared_gate); - } -} - -void MoeFfnWeight::prepare() -{ - const auto fused_moe = method == MoeParam::kFused; - - gate.prepare(); - shared_gate.prepare(); - - for (auto& e : experts) { - e->prepare(fused_moe); - } - - const int n = experts.size(); - LinkExperts([&](int i) { return &experts[i]->fused_gating_intermediate; }, n, block.fused_gating_intermediate); - LinkExperts([&](int i) { return &experts[i]->output; }, n, block.output); - - auto& e = *experts.at(0); - // Copy MLP properties - block.inter_size = e.inter_size; - block.is_fused_silu = e.is_fused_silu; - block.act_type = e.act_type; -} - -void LinkExperts(std::function experts, int n, LlamaDenseWeight& d) -{ - const auto& e = *experts(0); - - d.input_dim = e.input_dim; - d.output_dim = e.output_dim; - d.group_size = e.group_size; - d.data_type = e.data_type; - d.input_type = e.input_type; - d.weight_type = e.weight_type; - d.input_quant = e.input_quant; - d.weight_quant = e.weight_quant; - d.k_desc = e.k_desc; - d.q_desc = e.q_desc; - d.epilogue = e.epilogue; - - d.k_desc.num = d.q_desc.num = n; - - if (e.bias) { - d.bias = Tensor{{n, e.output_dim}, e.bias.dtype(), kDEVICE}; - } - - std::vector> weights; - std::vector> scales; - - for (int i = 0; i < n; ++i) { - auto& e = *experts(i); - weights.emplace_back(e.weight.raw_data(), e.k_desc.ld); - if (e.scales) { - scales.emplace_back(e.scales.raw_data(), e.q_desc.ld); - } - if (e.bias) { - Copy(e.bias, d.bias.slice(i, 1).squeeze(0)); - } - } - - auto stream = core::Context::stream().handle(); - - if (d.weight_type == kFloat8_e4m3 && d.input_type == kFloat8_e4m3) { - auto make_blocked_ptr = [&](const auto& ptrs) { - return std::shared_ptr{gemm::MakeBlockedPtrs(ptrs, stream), [](auto p) { cudaFree(p); }}; - }; - d.weight = Tensor{make_blocked_ptr(weights), {n}, e.weight.dtype(), kDEVICE}; - d.scales = Tensor{make_blocked_ptr(scales), {n}, e.scales.dtype(), kDEVICE}; - // This is needed to be recognized as blocked striding mode - d.k_desc.offsets = d.q_desc.offsets = (int*)1; - } - else { - auto make_strided_ptr = [&](const auto& ptrs) { - return std::shared_ptr{gemm::MakeStridedPtrs(ptrs, stream), [](auto p) { cudaFree(p); }}; - }; - d.weight = Tensor{make_strided_ptr(weights), {n}, d.weight_type, kDEVICE}; - if (e.scales) { - d.scales = Tensor{make_strided_ptr(scales), {n}, e.scales.dtype(), kDEVICE}; - } - // pre-sm90 grouped GEMM need `ld == 0 to resolve strided_ptr - d.k_desc.ld = d.q_desc.ld = 0; - } -} - -} // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaDenseWeight.h b/src/turbomind/models/llama/LlamaDenseWeight.h deleted file mode 100644 index 7aa8673586..0000000000 --- a/src/turbomind/models/llama/LlamaDenseWeight.h +++ /dev/null @@ -1,190 +0,0 @@ -/* - * Copyright (c) OpenMMLab. All rights reserved. - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/DenseWeight.h - -#pragma once - -#include "src/turbomind/core/core.h" -#include "src/turbomind/core/module.h" - -#include "src/turbomind/kernels/activation.h" -#include "src/turbomind/kernels/gemm/types.h" - -#include "src/turbomind/models/llama/llama_params.h" - -namespace turbomind { - -using gemm::QuantDesc; -using gemm::MatrixLayout; -using gemm::Epilogue; - -struct LlamaDenseWeight: public core::Module { - - LlamaDenseWeight(): - data_type{}, weight_type{}, input_type{}, weight_quant{}, input_quant{}, epilogue{}, k_desc{}, q_desc{} - { - } - - void emplace(int input_dim, int output_dim, DataType data_type, bool bias, DataType weight_type, int group_size); - - void preprocess(); - - void prepare(bool fused_moe = 0); - - LlamaDenseWeight& operator=(std::nullptr_t) - { - this->~LlamaDenseWeight(); - new (this) LlamaDenseWeight{}; - return *this; - } - - operator bool() const noexcept - { - return static_cast(weight); - } - - int input_dim = 0; - int output_dim = 0; - int group_size = 1; - - Tensor weight; - Tensor bias; - - Tensor scales; - Tensor zeros; - - DataType data_type; - - DataType weight_type; - DataType input_type; - - QuantDesc weight_quant; - QuantDesc input_quant; - - Epilogue epilogue; - - MatrixLayout k_desc; - MatrixLayout q_desc; -}; - -struct LlamaAttentionWeight: public core::Module { - - LlamaAttentionWeight() = default; - - LlamaAttentionWeight(int hidden_dim, - int head_dim, - int head_num, - int kv_head_num, - MLAParam mla, - bool bias, - bool qk_norm, - int tp_size, - int tp_rank, - DataType data_type, - DataType weight_type, - int group_size, - int window_size, - bool sink, - bool attn_output_gate = false); - - void prepare(); - - LlamaDenseWeight qkv; - LlamaDenseWeight output; - - Tensor sinks; - - LlamaDenseWeight q_proj; - LlamaDenseWeight q_a_proj; - LlamaDenseWeight q_b_proj; - LlamaDenseWeight kv_a_proj; - // LlamaDenseWeight kv_b_proj; - - Tensor q_a_layernorm; - Tensor kv_a_layernorm; - - int window_size{}; -}; - -struct LlamaFfnWeight: core::Module { - - LlamaFfnWeight() = default; - - LlamaFfnWeight(int hidden_dim, - int inter_size, - bool bias, - int tp_size, - int tp_rank, - DataType data_type, - DataType weight_type, - int group_size, - ActivationType act_type, - bool fuse_silu_act); - - static constexpr bool fuse_up_and_gate = true; - - void prepare(bool fused_moe); - - LlamaDenseWeight gating; - LlamaDenseWeight intermediate; - LlamaDenseWeight output; - LlamaDenseWeight fused_gating_intermediate; - - ActivationType act_type; - - int inter_size{}; - bool is_fused_silu{}; - - int tp_rank{}; -}; - -struct MoeFfnWeight: core::Module { - - MoeFfnWeight() = default; - - MoeFfnWeight(int layer_id, - const MoeParam& param, - int hidden_dim, - bool mlp_bias, - DataType data_type, - DataType weight_type, - int group_size, - int tp_size, - int tp_rank, - ActivationType act_type, - bool fuse_silu_act); - - void prepare(); - - LlamaDenseWeight gate; - LlamaDenseWeight shared_gate; - - /// Per-expert score correction bias for noaux_tc routing (optional; used when topk_method == "noaux_tc") - Tensor score_correction_bias; - - std::vector> experts; - - // reference into `experts` - LlamaFfnWeight block; - - MoeParam::Method method{}; -}; - -void LinkExperts(std::function experts, int n, LlamaDenseWeight& d); - -} // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaFfnLayer.cc b/src/turbomind/models/llama/LlamaFfnLayer.cc index d9b91bf929..a989987007 100644 --- a/src/turbomind/models/llama/LlamaFfnLayer.cc +++ b/src/turbomind/models/llama/LlamaFfnLayer.cc @@ -39,8 +39,11 @@ void LlamaFfnLayer::forward(ForwardParam param) Tensor gating; Tensor inter; - if (mlp.fused_gating_intermediate.weight) { - auto mix = linear_.Forward(param.input, mlp.fused_gating_intermediate); + auto* fused = mlp.w1w3.get(); + bool use_fused = fused && fused->weight; + + if (use_fused) { + auto mix = linear_.Forward(param.input, *fused); sync_check_cuda_error(); gating = mix.slice({0, 0}, {(int)token_num, inter_size}); @@ -49,16 +52,18 @@ void LlamaFfnLayer::forward(ForwardParam param) } } else { - gating = linear_.Forward(param.input, mlp.gating); + gating = linear_.Forward(param.input, *mlp.w1); sync_check_cuda_error(); TM_DEBUG_TENSOR(gating, Concat("w1", layer_id), 3); - inter = linear_.Forward(param.input, mlp.intermediate); + inter = linear_.Forward(param.input, *mlp.w3); sync_check_cuda_error(); TM_DEBUG_TENSOR(inter, Concat("w3", layer_id), 3); } - if (!mlp.is_fused_silu) { + // When using the fused kernel (w1w3 + fused silu), activation is already applied. + // Otherwise (separate w1/w3 or non-fused), apply activation explicitly. + if (!use_fused || !mlp.is_fused_silu) { // gate' = silu(gate) * up Activation(gating, inter, mlp.act_type, stream); sync_check_cuda_error(); @@ -67,7 +72,7 @@ void LlamaFfnLayer::forward(ForwardParam param) { // w2(x) NvtxScope scope("w2"); - linear_.Forward(gating, mlp.output, param.output); + linear_.Forward(gating, *mlp.w2, param.output); sync_check_cuda_error(); } } diff --git a/src/turbomind/models/llama/LlamaFfnLayer.h b/src/turbomind/models/llama/LlamaFfnLayer.h index ea5bee7987..059ab6b010 100644 --- a/src/turbomind/models/llama/LlamaFfnLayer.h +++ b/src/turbomind/models/llama/LlamaFfnLayer.h @@ -20,30 +20,26 @@ #pragma once #include "src/turbomind/core/core.h" -#include "src/turbomind/models/llama/LlamaDenseWeight.h" +#include "src/turbomind/models/ffn_weight.h" #include "src/turbomind/models/llama/LlamaLinear.h" #include "src/turbomind/models/llama/context.h" -#include "src/turbomind/models/llama/llama_params.h" namespace turbomind { class LlamaFfnLayer { public: - LlamaFfnLayer(const ModelParam& model, const Context& ctx): hidden_units_(model.hidden_units), linear_(*ctx.linear) - { - } + LlamaFfnLayer(const Context& ctx): linear_(*ctx.linear) {} struct ForwardParam { - Tensor input; - Tensor output; - const LlamaFfnWeight* weights; - int layer_id; + Tensor input; + Tensor output; + const FfnWeight* weights; + int layer_id; }; void forward(ForwardParam param); private: - const size_t hidden_units_; LlamaLinear& linear_; }; diff --git a/src/turbomind/models/llama/LlamaLinear.cu b/src/turbomind/models/llama/LlamaLinear.cu index 8578f0144e..b2a0386b60 100644 --- a/src/turbomind/models/llama/LlamaLinear.cu +++ b/src/turbomind/models/llama/LlamaLinear.cu @@ -12,7 +12,7 @@ #include "src/turbomind/kernels/quantization.h" -#include "src/turbomind/models/llama/LlamaDenseWeight.h" +#include "src/turbomind/models/linear_weight.h" #include "src/turbomind/models/llama/LlamaLinear.h" #include "src/turbomind/utils/cuda_utils.h" @@ -53,17 +53,17 @@ struct LlamaLinear::Impl { workspace_ = {}; } - std::tuple GetOperandB(const LlamaDenseWeight& dense) + std::tuple GetOperandB(const LinearWeight& weight) { - const Tensor& B = dense.weight; - const Tensor& V = dense.scales; - MatrixLayout desc_B = dense.k_desc; - MatrixLayout desc_V = dense.q_desc; + const Tensor& B = weight.weight; + const Tensor& V = weight.scales; + MatrixLayout desc_B = weight.k_desc; + MatrixLayout desc_V = weight.q_desc; return {B, desc_B, V, desc_V}; } std::tuple - GetOperandA(const LlamaDenseWeight& dense, const Tensor& input, Buffer_ indices, const Buffer_& offsets) + GetOperandA(const LinearWeight& weight, const Tensor& input, Buffer_ indices, const Buffer_& offsets) { auto st = core::Context::stream().handle(); @@ -73,7 +73,7 @@ struct LlamaLinear::Impl { const int m = indices ? indices.size() : input.shape(0); // Currently, FP8 only; INT8 may be added later - if (input.dtype() != dense.input_type) { + if (input.dtype() != weight.input_dtype()) { QuantizeSymm(A, U, input, st); sync_check_cuda_error(); } @@ -101,7 +101,7 @@ struct LlamaLinear::Impl { desc_U = {U.dtype(), kColMajor, (int)U.shape(1), (int)U.shape(0), (int)U.stride(0)}; } if (offsets) { - desc_A.num = desc_U.num = dense.k_desc.num; + desc_A.num = desc_U.num = weight.k_desc.num; desc_A.offsets = desc_U.offsets = const_cast(offsets.data()); } if (indices) { @@ -111,28 +111,28 @@ struct LlamaLinear::Impl { return {A, desc_A, U, desc_U}; } - void Forward(Tensor& output, - const Tensor& input, // - const LlamaDenseWeight& dense, - const Buffer_& indices, - const Buffer_& offsets) + void Forward(Tensor& output, + const Tensor& input, // + const LinearWeight& weight, + const Buffer_& indices, + const Buffer_& offsets) { using namespace gemm; Operation op{}; op.dispatch = dispatch_policy_; - op.epilogue = dense.epilogue; - op.quant_a = dense.input_quant; - op.quant_b = dense.weight_quant; + op.epilogue = weight.epilogue; + op.quant_a = MakeQuantDesc(weight.input_format); + op.quant_b = MakeQuantDesc(weight.weight_format); op.batch_dim = 0; - auto&& [A, desc_A, U, desc_U] = GetOperandA(dense, input, indices, offsets); - auto&& [B, desc_B, V, desc_V] = GetOperandB(dense); + auto&& [A, desc_A, U, desc_U] = GetOperandA(weight, input, indices, offsets); + auto&& [B, desc_B, V, desc_V] = GetOperandB(weight); Tensor& D = output; if (!D) { - int dim = dense.epilogue == Epilogue::kGatedSilu ? dense.output_dim / 2 : dense.output_dim; - D = Tensor{{desc_A.rows, dim}, dense.data_type, kDEVICE}; + int dim = weight.epilogue == Epilogue::kGatedSilu ? weight.output_dim / 2 : weight.output_dim; + D = Tensor{{desc_A.rows, dim}, weight.output_dtype(), kDEVICE}; } // std::cout << "D: " << D << " " << desc_B.num << "\n"; @@ -141,7 +141,7 @@ struct LlamaLinear::Impl { output.dtype(), kRowMajor, (int)output.shape(0), - dense.output_dim, + weight.output_dim, (int)output.stride(0), }; @@ -181,18 +181,18 @@ struct LlamaLinear::Impl { LlamaLinear::LlamaLinear(): impl_{std::make_shared()} {} -Tensor LlamaLinear::Forward(const Tensor& input, // - const LlamaDenseWeight& weight, - std::optional output) +Tensor LlamaLinear::Forward(const Tensor& input, // + const LinearWeight& weight, + std::optional output) { return Forward(input, weight, {}, {}, output); } -Tensor LlamaLinear::Forward(const Tensor& input, // - const LlamaDenseWeight& weight, - const Buffer_& indices, - const Buffer_& offsets, - std::optional output) +Tensor LlamaLinear::Forward(const Tensor& input, // + const LinearWeight& weight, + const Buffer_& indices, + const Buffer_& offsets, + std::optional output) { Tensor in = input.view({-1, input.shape(-1)}); Tensor out; diff --git a/src/turbomind/models/llama/LlamaLinear.h b/src/turbomind/models/llama/LlamaLinear.h index 8c4037b48e..d2c9354204 100644 --- a/src/turbomind/models/llama/LlamaLinear.h +++ b/src/turbomind/models/llama/LlamaLinear.h @@ -6,7 +6,7 @@ #include #include "src/turbomind/core/core.h" -#include "src/turbomind/models/llama/LlamaDenseWeight.h" +#include "src/turbomind/models/linear_weight.h" namespace turbomind { @@ -14,15 +14,15 @@ class LlamaLinear { public: explicit LlamaLinear(); - Tensor Forward(const Tensor& input, // - const LlamaDenseWeight& weight, - std::optional output = {}); + Tensor Forward(const Tensor& input, // + const LinearWeight& weight, + std::optional output = {}); - Tensor Forward(const Tensor& input, - const LlamaDenseWeight& weight, - const Buffer_& indices, - const Buffer_& offsets, - std::optional output = {}); + Tensor Forward(const Tensor& input, + const LinearWeight& weight, + const Buffer_& indices, + const Buffer_& offsets, + std::optional output = {}); void set_measure(bool measure); diff --git a/src/turbomind/models/llama/LlamaWeight.cc b/src/turbomind/models/llama/LlamaWeight.cc deleted file mode 100644 index 26ba9ca198..0000000000 --- a/src/turbomind/models/llama/LlamaWeight.cc +++ /dev/null @@ -1,187 +0,0 @@ -/* - * Copyright (c) OpenMMLab. All rights reserved. - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Modified from -// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptWeight.cc - -#include - -#include "src/turbomind/core/allocator.h" -#include "src/turbomind/core/context.h" -#include "src/turbomind/models/llama/LlamaDenseWeight.h" -#include "src/turbomind/models/llama/LlamaWeight.h" -#include "src/turbomind/models/llama/llama_params.h" -#include "src/turbomind/utils/cuda_utils.h" - -namespace turbomind { - -LlamaWeight::LlamaWeight(DataType data_type, - const ModelParam& model, - const EngineParam& engine_param, - const MoeParam& moe_param): - model_param_{model}, - engine_param_{engine_param}, - moe_param_{moe_param}, - hidden_units_(model.hidden_units), - inter_size_(model.inter_size), - vocab_size_(model.vocab_size), - vocab_size_padded_(model.vocab_size), - embedding_size_(model.embedding_size), - num_layer_(model.layer_num), - data_type_{data_type}, - weight_type_{model.weight_type}, - tp_size_(engine_param.attn_tp_size * engine_param.attn_cp_size), - tp_rank_(engine_param.attn_tp_rank * engine_param.attn_cp_size + engine_param.attn_cp_rank) -{ - if (vocab_size_padded_ % tp_size_ != 0) { - vocab_size_padded_ = (vocab_size_ + tp_size_ - 1) / tp_size_ * tp_size_; - TM_LOG_WARN("pad vocab size from {} to {}", vocab_size_, vocab_size_padded_); - } - if (embedding_size_ % tp_size_ != 0) { - embedding_size_ = (embedding_size_ + tp_size_ - 1) / tp_size_ * tp_size_; - TM_LOG_WARN("pad embed size from {} to {}", embedding_size_, embedding_size_); - } - FT_CHECK(hidden_units_ % tp_size_ == 0); - TM_CHECK_EQ(vocab_size_padded_ % tp_size_, 0); - TM_CHECK_EQ(hidden_units_ % tp_size_, 0); - - stream_ = core::Stream::create(); - alloca_ = core::Allocator{stream_, false}; - - initialize(); -} - -LlamaWeight::~LlamaWeight() -{ - release(); -} - -bool LlamaWeight::is_initialized() const -{ - return initialized_; -} - -void LlamaWeight::initialize() -{ - core::ContextGuard guard = context(); - - pre_decoder_embedding.emplace(embedding_size_, hidden_units_ / tp_size_, data_type_, false, data_type_, 1); - post_decoder_embedding.emplace(hidden_units_, vocab_size_padded_ / tp_size_, data_type_, false, data_type_, 1); - register_module("tok_embeddings", pre_decoder_embedding, tp_rank_); - register_module("output", post_decoder_embedding, tp_rank_); - - /// Lower VRAM pressure on consumer grade GPUs - /// TODO: Support token embeds on pinned host memory - pre_decoder_embedding.weight = empty_like(pre_decoder_embedding.weight, kCPU); - post_decoder_embedding.weight = empty_like(post_decoder_embedding.weight, kCPU); - - decoder_layer_weights.reserve(num_layer_); - for (int i = 0; i < num_layer_; ++i) { - decoder_layer_weights.emplace_back( - new LlamaDecoderLayerWeight(data_type_, i, model_param_, engine_param_, moe_param_)); - register_module("layers", *decoder_layer_weights.back(), i); - } - - output_norm_weight = Tensor{{hidden_units_}, data_type_, kDEVICE}; - register_parameter("norm.weight", output_norm_weight); - initialized_ = true; -} - -void LlamaWeight::release() -{ - core::ContextGuard guard = context(); - - pre_decoder_embedding = {}; - post_decoder_embedding = {}; - output_norm_weight = {}; - - for (auto& p : decoder_layer_weights) { - delete p; - } - - decoder_layer_weights.clear(); - pinned_weights_.clear(); - - // Wait for deallocations - core::Context::stream().Sync(); - - // release memory back to os - core::Context::device_alloc()->trim(0); - initialized_ = false; -} - -void LlamaWeight::to_device(const core::Device& device) -{ - TM_CHECK(device.type == kCPU || device.type == kDEVICE); - core::ContextGuard guard{stream_, alloca_, Allocator{kCPUpinned}}; - - auto tensor_ptr_map = get_parameters(); - for (auto& [name, tensor_ptr] : tensor_ptr_map) { - if (device.type == kCPU) { - if (pinned_weights_.find(name) == pinned_weights_.end()) { - pinned_weights_[name] = empty_like(*tensor_ptr, kCPUpinned); - Copy(*tensor_ptr, pinned_weights_[name]); - } - *tensor_ptr = {}; - } - else { - TM_CHECK(pinned_weights_.find(name) != pinned_weights_.end()); - *tensor_ptr = empty_like(pinned_weights_[name], kDEVICE); - Copy(pinned_weights_[name], *tensor_ptr); - } - } - core::Context::stream().Sync(); - if (device.type == kCPU) { - core::Context::device_alloc()->trim(0); - } -} - -core::ContextGuard LlamaWeight::context() const -{ - return core::ContextGuard{stream_, alloca_}; -} - -void LlamaWeight::prepare(const cudaDeviceProp& prop) -{ - core::ContextGuard guard = context(); - - // Wait for the weights to be filled externally - check_cuda_error(cudaDeviceSynchronize()); - - auto stream = core::Context::stream().handle(); - - for (auto& layer : decoder_layer_weights) { - layer->prepare(prop, stream); - } - - auto to_device = [](Tensor& x) { - auto tmp = std::exchange(x, empty_like(x, kDEVICE)); - Copy(tmp, x); - return tmp; - }; - - // Keep the host tensor until stream synchronization - auto tmp_token_embeds = to_device(pre_decoder_embedding.weight); - auto tmp_lm_head = to_device(post_decoder_embedding.weight); - - post_decoder_embedding.prepare(); - - // Block until processing is done - check_cuda_error(cudaStreamSynchronize(stream)); -} - -} // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaWeight.h b/src/turbomind/models/llama/LlamaWeight.h deleted file mode 100644 index 5b018ab4ab..0000000000 --- a/src/turbomind/models/llama/LlamaWeight.h +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Copyright (c) OpenMMLab. All rights reserved. - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Modified from -// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptWeight.h - -#pragma once - -#include - -#include "src/turbomind/core/context.h" -#include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h" -#include "src/turbomind/models/llama/LlamaDenseWeight.h" -#include "src/turbomind/models/llama/llama_params.h" - -namespace turbomind { - -struct LlamaWeight: core::Module { - LlamaWeight() = default; - - LlamaWeight(DataType data_type, - const ModelParam& model_param, - const EngineParam& engine_param, - const MoeParam& moe_param); - - ~LlamaWeight(); - - LlamaWeight(const LlamaWeight&) = delete; - LlamaWeight& operator=(const LlamaWeight&) = delete; - - void prepare(const cudaDeviceProp& prop); - - bool is_initialized() const; - - void initialize(); - - void release(); - - void to_device(const core::Device& device); - - core::ContextGuard context() const; - - std::vector decoder_layer_weights; - - LlamaDenseWeight pre_decoder_embedding; - LlamaDenseWeight post_decoder_embedding; - - Tensor output_norm_weight; - -private: - const ModelParam model_param_; - const EngineParam engine_param_; - const MoeParam moe_param_; - - int hidden_units_; - int vocab_size_; - int vocab_size_padded_; - int embedding_size_; - int num_layer_; - - DataType data_type_; - DataType weight_type_; - - std::unordered_map pinned_weights_; - - int tp_size_; // this will follow attn tp param - int tp_rank_; - - std::vector inter_size_; - - core::Stream stream_; - core::Allocator alloca_; - bool initialized_{false}; -}; - -} // namespace turbomind diff --git a/src/turbomind/models/llama/SequenceManager.cc b/src/turbomind/models/llama/SequenceManager.cc index ea2817727d..09d516df49 100644 --- a/src/turbomind/models/llama/SequenceManager.cc +++ b/src/turbomind/models/llama/SequenceManager.cc @@ -31,26 +31,36 @@ std::string vector2string(const std::vector& data) return ss.str(); } -SequenceManager::SequenceManager(const ModelParam& model_param, - DataType runtime_dtype, - int cache_block_seq_len, - int attn_tp_size, - int max_batch_size, - double block_count, - int chunk_size, - bool enable_prefix_caching, - int rank, - int attn_cp_size, - core::Allocator allocator, - GetFreeMemSize get_free_size): +SequenceManager::SequenceManager(int head_dim, + int kv_head_num, + int num_layer, + const std::vector& layer_types, + int quant_policy, + DataType data_type, + DataType runtime_dtype, + int linear_key_head_dim, + int linear_value_head_dim, + int linear_conv_kernel_dim, + int linear_num_key_heads, + int linear_num_value_heads, + int cache_block_seq_len, + int attn_tp_size, + int max_batch_size, + double block_count, + int chunk_size, + bool enable_prefix_caching, + int rank, + int attn_cp_size, + core::Allocator allocator, + GetFreeMemSize get_free_size): block_seq_len_(cache_block_seq_len), rank_(rank), attn_cp_size_(attn_cp_size) { TM_CHECK_GT(attn_tp_size, 0); TM_CHECK_GT(cache_block_seq_len, 0); - int cache_layer_num = model_param.layer_num; + int cache_layer_num = num_layer; int num_linear_layers = 0; - for (const auto& type : model_param.layer_types) { + for (const auto& type : layer_types) { if (type == 1) { --cache_layer_num; ++num_linear_layers; @@ -61,22 +71,19 @@ SequenceManager::SequenceManager(const ModelParam& model_param, if (num_linear_layers > 0) { - const int key_head_dim = - model_param.linear_key_head_dim > 0 ? model_param.linear_key_head_dim : model_param.head_dim; - const int value_head_dim = - model_param.linear_value_head_dim > 0 ? model_param.linear_value_head_dim : model_param.head_dim; - const int d_conv = model_param.linear_conv_kernel_dim > 0 ? model_param.linear_conv_kernel_dim : 4; - const int num_k_heads = model_param.linear_num_key_heads / attn_tp_size; - const int num_v_heads = model_param.linear_num_value_heads / attn_tp_size; - const int key_dim = num_k_heads * key_head_dim; - const int value_dim = num_v_heads * value_head_dim; - const int conv_dim = key_dim * 2 + value_dim; + const int key_head_dim = linear_key_head_dim > 0 ? linear_key_head_dim : head_dim; + const int value_head_dim = linear_value_head_dim > 0 ? linear_value_head_dim : head_dim; + const int d_conv = linear_conv_kernel_dim > 0 ? linear_conv_kernel_dim : 4; + const int num_k_heads = linear_num_key_heads / attn_tp_size; + const int num_v_heads = linear_num_value_heads / attn_tp_size; + const int key_dim = num_k_heads * key_head_dim; + const int value_dim = num_v_heads * value_head_dim; + const int conv_dim = key_dim * 2 + value_dim; TM_CHECK_GT(max_batch_size, 0); - pooled_conv_states_ = {{max_batch_size, num_linear_layers, d_conv, conv_dim}, model_param.data_type, kDEVICE}; - pooled_recurrent_states_ = {{max_batch_size, num_linear_layers, num_v_heads, key_head_dim, value_head_dim}, - model_param.linear_state_dtype, - kDEVICE}; + pooled_conv_states_ = {{max_batch_size, num_linear_layers, d_conv, conv_dim}, data_type, kDEVICE}; + pooled_recurrent_states_ = { + {max_batch_size, num_linear_layers, num_v_heads, key_head_dim, value_head_dim}, data_type, kDEVICE}; free_linear_state_slots_.reserve(max_batch_size); for (int slot = max_batch_size - 1; slot >= 0; --slot) { @@ -94,17 +101,16 @@ SequenceManager::SequenceManager(const ModelParam& model_param, (pooled_conv_states_.byte_size() + pooled_recurrent_states_.byte_size()) * mb); } - const int dbits = byte_size(runtime_dtype, 8); - const auto quant_policy = model_param.quant_policy; - const int elem_bits = quant_policy ? quant_policy : dbits; + const int dbits = byte_size(runtime_dtype, 8); + const int elem_bits = quant_policy ? quant_policy : dbits; BlockConfig block_config{ - (int)model_param.head_dim, - (int)model_param.kv_head_num / attn_tp_size, + head_dim, + kv_head_num, cache_block_seq_len, elem_bits == dbits ? 0 : dbits, elem_bits, - model_param.head_dim == 576, // share kv + head_dim == 576, // share kv }; block::Layout layout{block_config}; diff --git a/src/turbomind/models/llama/SequenceManager.h b/src/turbomind/models/llama/SequenceManager.h index fff2706379..1b728cd825 100644 --- a/src/turbomind/models/llama/SequenceManager.h +++ b/src/turbomind/models/llama/SequenceManager.h @@ -11,7 +11,6 @@ #include "src/turbomind/models/llama/BlockManager.h" #include "src/turbomind/models/llama/BlockTrie.h" -#include "src/turbomind/models/llama/llama_params.h" namespace turbomind { @@ -90,18 +89,28 @@ class SequenceManager { }; // clang-format on - explicit SequenceManager(const ModelParam& model_param, - DataType runtime_dtype, - int cache_block_seq_len, - int attn_tp_size, - int max_batch_size, - double block_count, - int chunk_size, - bool enable_prefix_caching, - int rank, - int attn_cp_size, - core::Allocator allocator, - GetFreeMemSize get_free_size); + explicit SequenceManager(int head_dim, + int kv_head_num, + int num_layer, + const std::vector& layer_types, + int quant_policy, + DataType data_type, + DataType runtime_dtype, + int linear_key_head_dim, + int linear_value_head_dim, + int linear_conv_kernel_dim, + int linear_num_key_heads, + int linear_num_value_heads, + int cache_block_seq_len, + int attn_tp_size, + int max_batch_size, + double block_count, + int chunk_size, + bool enable_prefix_caching, + int rank, + int attn_cp_size, + core::Allocator allocator, + GetFreeMemSize get_free_size); SequenceManager(const SequenceManager&) = delete; SequenceManager(SequenceManager&&) noexcept = default; diff --git a/src/turbomind/models/llama/llama_params.h b/src/turbomind/models/llama/llama_params.h index b61e5b1fe8..4ce0a586fa 100644 --- a/src/turbomind/models/llama/llama_params.h +++ b/src/turbomind/models/llama/llama_params.h @@ -3,161 +3,22 @@ #pragma once #include -#include -#include -#include -#include -#include "src/turbomind/core/data_type.h" -#include "src/turbomind/kernels/activation.h" -#include "src/turbomind/models/llama/llama_rope.h" +#include "src/turbomind/engine/engine_config.h" namespace turbomind { -struct MLAParam { - int q_lora_rank; - int kv_lora_rank; - int qk_rope_dim; - int v_head_dim; -}; - -struct ModelParam { - size_t head_num; - size_t head_dim; - size_t kv_head_num; - size_t hidden_units; - size_t layer_num; - size_t vocab_size; - size_t embedding_size; - float norm_eps; - int quant_policy; - bool attn_bias; - bool attn_sink; - bool mlp_bias; - DataType data_type; - - // Weight types for mixed quantization support. - // Models like mixed AWQ (e.g. QuantTrio GLM-4.7-Flash) quantize FFN/expert - // weights to int4 but keep attention weights as fp16. GptOss mxfp4 quantizes - // only MoE experts to e2m1 while keeping attention and shared experts as fp16. - // - // weight_type ffn_weight_type expert_weight_type - // Pure fp16 float16 float16 float16 - // Full AWQ int4 int4 int4 - // Mixed AWQ float16 int4 int4 - // GptOss mxfp4 bfloat16 bfloat16 e2m1 - DataType weight_type; // attention weights - DataType expert_weight_type; // MoE routed expert weights - DataType ffn_weight_type; // dense FFN / shared expert weights - - int group_size; - MLAParam mla; - bool qk_norm; - int tune_layer_num; - - ActivationType act_type; - - std::vector window_size; - std::vector inter_size; - std::vector layer_types; - - // Qwen3.5 Gated DeltaNet linear attention params - int linear_key_head_dim = 0; - int linear_value_head_dim = 0; - int linear_conv_kernel_dim = 0; - int linear_num_key_heads = 0; - int linear_num_value_heads = 0; - - DataType linear_state_dtype = {}; - - bool attn_output_gate = false; // Qwen3.5: doubles Q projection in full-attention layers - - // Layer indices whose MoE experts use data_type (fp16) instead of - // expert_weight_type (e.g. int4). Populated from modules_to_not_convert - // patterns like 'model.layers.0.'. - std::set unquantized_expert_layers; -}; - -inline bool HasLinearAttention(const ModelParam& model_param) -{ - for (int type : model_param.layer_types) { - if (type == 1) { - return true; - } - } - return false; -} - -/// TODO: rename all `gate` in the context of MoE router to `router` -struct MoeParam { - enum Method - { - kNaive, - kFused - } method; - - int experts_per_token; - int inter_size; - bool norm_topk_prob; - bool shared_gate; - float routed_scale; - - bool router_bias; - - int topk_group; - std::string topk_method; - int n_group; - std::string scoring_func; - int router_n_groups; - - std::vector expert_num; -}; - -struct AttentionParam { - float softmax_scale; - int cache_block_seq_len; - // logn attention - bool use_logn_attn; - int max_position_embeddings; - // rotary embedding - RopeParam rope; -}; - -struct EngineParam { - // batch params - int max_batch_size; - int session_len; - int step_length; - - // cache params - float cache_max_block_count; - int cache_chunk_size; - bool enable_prefix_caching; - bool enable_metrics; - - // chunking params - int max_forward_token_num; - int max_context_token_num; - int num_tokens_per_iter; - int max_prefill_iters; - - // parallel params - int outer_dp_size; - int outer_dp_rank; - int attn_dp_size; - int attn_dp_rank; - int attn_tp_size; - int attn_tp_rank; - int attn_cp_size; - int attn_cp_rank; - int mlp_tp_size; - int mlp_tp_rank; - - // multi-node - int nnodes; - int node_rank; - - std::vector devices; +struct EngineParam: EngineConfig { + // Runtime-derived fields (set in CreateContext) + int outer_dp_rank = 0; + int attn_dp_rank = 0; + int attn_tp_rank = 0; + int attn_cp_rank = 0; + int mlp_tp_rank = 0; + int model_tp_rank = 0; // rank(d_tp_group), in [0, attn_tp_size × attn_cp_size) + + // Derived field (set in Impl ctor) + int max_forward_token_num = 0; }; } // namespace turbomind diff --git a/src/turbomind/models/llama/llama_rope.h b/src/turbomind/models/llama/llama_rope.h index 83ffeeff5c..83732cb1d3 100644 --- a/src/turbomind/models/llama/llama_rope.h +++ b/src/turbomind/models/llama/llama_rope.h @@ -2,10 +2,6 @@ #pragma once -#include -#include -#include - #include namespace turbomind { @@ -21,48 +17,6 @@ enum class RopeType kMrope, }; -inline RopeType GetRoPEType(const std::string& type) -{ - std::map lookup = {{"default", RopeType::kDefault}, - {"linear", RopeType::kLinear}, - {"dynamic", RopeType::kDynamic}, - {"yarn", RopeType::kYarn}, - {"llama3", RopeType::kLlama3}, - {"mrope", RopeType::kMrope}}; - return lookup.at(type); -} - -struct YarnRopeParam { - float attention_factor; - float beta_fast; - float beta_slow; -}; - -struct Llama3RopeParam { - float low_freq_factor; - float high_freq_factor; - int original_max_position_embeddings; -}; - -struct MropeRopeParam { - int3 section; -}; - -struct RopeParam { - RopeType type; - // common - float base; - int dim; - float factor; - int max_position_embeddings; - // unique - union { - YarnRopeParam yarn; - Llama3RopeParam llama3; - MropeRopeParam mrope; - }; -}; - struct YarnRopeKernelParam { float scale_factor; float attention_factor; @@ -98,62 +52,4 @@ struct RopeKernelParam { MropeRopeKernelParam mrope; }; -inline void init_rope_kernel_param(const RopeParam& rope, RopeKernelParam& rope_kernel) -{ - rope_kernel.type = rope.type; - rope_kernel.dim = rope.dim; - rope_kernel.scale_factor = -std::log2(rope.base) / rope.dim; - if (rope.type == RopeType::kDynamic) { - rope_kernel.inv_factor = 1.f; - } - else { - rope_kernel.inv_factor = (rope.factor != 0.f) ? 1.0 / rope.factor : 1.f; - } - - if (rope.type == RopeType::kYarn) { - auto& src = rope.yarn; - auto& dst = rope_kernel.yarn; - const double PI = 3.14159265358979323846; - - auto find_correction_dim = [&](float num_rotations) { - return (rope.dim * std::log(rope.max_position_embeddings / (num_rotations * 2 * PI))) - / (2 * std::log(rope.base)); - }; - - auto find_correction_range = [&](float low_rot, float high_rot, float& low, float& high) { - low = std::floor(find_correction_dim(low_rot)); - high = std::ceil(find_correction_dim(high_rot)); - low = std::max(low, 0.f); - high = std::min(high, rope.dim - 1.f); - }; - - float low, high; - find_correction_range(src.beta_fast, src.beta_slow, low, high); - // https://github.com/huggingface/transformers/blob/6c3f168b36882f0beebaa9121eafa1928ba29633/src/transformers/modeling_rope_utils.py#L216 - if (low == high) { - high += 0.001f; - } - dst.ramp_inv_factor_div_2 = 1.0 / (high - low) / 2.0; - dst.ramp_inv_factor_mul_min = 1.0 / (high - low) * low; - dst.attention_factor = src.attention_factor; - } - else if (rope.type == RopeType::kLlama3) { - auto& src = rope.llama3; - auto& dst = rope_kernel.llama3; - - const double PI = 3.14159265358979323846; - float inv_diff_freq_factor = 1.0 / (src.high_freq_factor - src.low_freq_factor); - dst.alpha = src.original_max_position_embeddings / (2 * PI) * inv_diff_freq_factor; - dst.beta = src.low_freq_factor * inv_diff_freq_factor; - } - - else if (rope.type == RopeType::kMrope) { - auto& src = rope.mrope; - auto& dst = rope_kernel.mrope; - dst.section.x = src.section.x * 2; - dst.section.y = src.section.y * 2 + dst.section.x; - dst.section.z = src.section.z * 2 + dst.section.y; - } -} - } // namespace turbomind diff --git a/src/turbomind/models/llama/moe_ffn_layer.cc b/src/turbomind/models/llama/moe_ffn_layer.cc index f1a16f5a68..d44c1c4932 100644 --- a/src/turbomind/models/llama/moe_ffn_layer.cc +++ b/src/turbomind/models/llama/moe_ffn_layer.cc @@ -6,11 +6,10 @@ #include "src/turbomind/kernels/activation.h" #include "src/turbomind/kernels/norm/rms_norm.h" -#include "src/turbomind/models/llama/LlamaDenseWeight.h" #include "src/turbomind/models/llama/LlamaLinear.h" -#include "src/turbomind/models/llama/llama_params.h" #include "src/turbomind/models/llama/llama_utils.h" #include "src/turbomind/models/llama/moe_ffn_layer.h" +#include "src/turbomind/models/moe_weight.h" #include "src/turbomind/utils/anomaly_handler.h" #include "src/turbomind/utils/cuda_utils.h" @@ -19,54 +18,40 @@ namespace turbomind { -MoeFfnLayer::MoeFfnLayer(const ModelParam& model, const MoeParam& param, const EngineParam& engine, const Context& ctx): - inter_size_(param.inter_size / engine.mlp_tp_size), - hidden_dim_(model.hidden_units), +MoeFfnLayer::MoeFfnLayer(const EngineParam& engine, const Context& ctx): tp_size_(engine.mlp_tp_size), - param_(param), - is_warm_up_{*ctx.is_warm_up}, - linear_(*ctx.linear) + max_token_num_(engine.max_forward_token_num * engine.attn_dp_size), + is_warm_up_(*ctx.is_warm_up), + linear_(*ctx.linear), + expert_ffn_(std::make_unique(ctx)) { - TM_CHECK(!param.expert_num.empty()); +} - const int max_expert_num = *std::max_element(param.expert_num.begin(), param.expert_num.end()); +void MoeFfnLayer::Init(ForwardParam& p) +{ + const int expert_num = p.weights->num_experts(); + const int experts_per_token = p.weights->experts_per_token; - if (param_.method == MoeParam::kFused) { - // pass - } - else { - expert_ffn_ = std::make_unique(model, ctx); - } + h_offsets_ = {expert_num + 1, kCPU}; + + const int pad_token_num = (max_token_num_ + kMoeGateVecSize - 1) / kMoeGateVecSize * kMoeGateVecSize; - h_offsets_ = {max_expert_num + 1, kCPUpinned}; - - const int max_token_num = engine.max_forward_token_num * engine.attn_dp_size; - const int pad_token_num = (max_token_num + kMoeGateVecSize - 1) / kMoeGateVecSize * kMoeGateVecSize; - - // dbg(inter_size_, - // hidden_dim_, - // tp_size_, - // param_.method, - // param.expert_num, - // max_expert_num, - // max_token_num, - // pad_token_num, - // param_.experts_per_token); - - masks_ = {max_expert_num * pad_token_num, kDEVICE}; - f2n_ = {param_.experts_per_token * max_token_num, kDEVICE}; - f2E_ = {param_.experts_per_token * max_token_num, kDEVICE}; - en2f_ = {param_.experts_per_token * max_token_num, kDEVICE}; - scales_ = {param_.experts_per_token * max_token_num, kDEVICE}; - offsets_ = {max_expert_num + 1, kDEVICE}; - accum_ = {max_expert_num * kMoeGateMaxTiles, kDEVICE}; + masks_ = {expert_num * pad_token_num, kDEVICE}; + f2n_ = {experts_per_token * max_token_num_, kDEVICE}; + f2E_ = {experts_per_token * max_token_num_, kDEVICE}; + en2f_ = {experts_per_token * max_token_num_, kDEVICE}; + scales_ = {experts_per_token * max_token_num_, kDEVICE}; + offsets_ = {expert_num + 1, kDEVICE}; + accum_ = {expert_num * kMoeGateMaxTiles, kDEVICE}; + + initialized_ = true; } -Tensor_ MoeFfnLayer::Gate(const Tensor& input, const LlamaDenseWeight& gate) +Tensor_ MoeFfnLayer::Gate(const Tensor& input, const LinearWeight& gate) { - auto& weight = gate.weight; - TM_CHECK_EQ(input.shape(1), weight.shape(0)); - Tensor_ logits{{input.shape(0), weight.shape(1)}, kDEVICE}; + auto& w = gate.weight; + TM_CHECK_EQ(input.shape(1), w.shape(0)); + Tensor_ logits{{input.shape(0), w.shape(1)}, kDEVICE}; linear_.Forward(input, gate, logits); sync_check_cuda_error(); ApplyBias(logits, gate.bias, core::Context::stream().handle()); @@ -76,28 +61,37 @@ Tensor_ MoeFfnLayer::Gate(const Tensor& input, const LlamaDenseWeight& ga void MoeFfnLayer::Forward(ForwardParam& p) { + if (!initialized_) { + Init(p); + } + const int tokens = p.input.shape(0); const auto& moe = *p.weights; + const auto& block = *TM_CHECK_NOTNULL(moe.block()); + + const int hidden_dim = block.hidden_dim; + const int inter_size = block.inter_size; + const size_t padded = (tokens + kMoeGateVecSize - 1) / kMoeGateVecSize * kMoeGateVecSize; - const int expert_num = moe.experts.size(); + const int expert_num = moe.num_experts(); FT_CHECK(expert_num); - auto logits = Gate(p.input, moe.gate); + auto logits = Gate(p.input, *moe.gate.get()); TM_DEBUG_TENSOR(logits, "logits", 2); const auto st = core::Context::stream().handle(); - // dump_logits(tokens, layer_id); - - if (param_.topk_method == "noaux_tc") { + if (p.weights->topk_method == "noaux_tc") { // invokeMoeGate_NoAuxTC clears accum and masks internally - TM_CHECK_EQ(param_.n_group, 1); - TM_CHECK_EQ(param_.topk_group, 1); - const float* correction_bias = - (moe.score_correction_bias.size() > 0) ? moe.score_correction_bias.data() : nullptr; + TM_CHECK_EQ(p.weights->n_group, 1); + TM_CHECK_EQ(p.weights->topk_group, 1); + const float* correction_bias = nullptr; + if (moe.score_correction_bias) { + correction_bias = moe.score_correction_bias.size() > 0 ? moe.score_correction_bias.data() : nullptr; + } invokeMoeGate_NoAuxTC(f2n_.data(), f2E_.data(), en2f_.data(), @@ -110,10 +104,10 @@ void MoeFfnLayer::Forward(ForwardParam& p) tokens, padded, expert_num, - param_.experts_per_token, - param_.norm_topk_prob, - param_.routed_scale, - param_.scoring_func == "sigmoid", + p.weights->experts_per_token, + p.weights->norm_topk_prob, + p.weights->routed_scale, + p.weights->scoring_func == "sigmoid", st); } else { @@ -121,9 +115,9 @@ void MoeFfnLayer::Forward(ForwardParam& p) check_cuda_error(cudaMemsetAsync(accum_.data(), 0, sizeof(int) * expert_num * kMoeGateMaxTiles, st)); bool softmax = true; - if (param_.topk_method == "group_limited_greedy") { + if (p.weights->topk_method == "group_limited_greedy") { invokeMoeSoftmaxMaskTopKGroups( - logits.data(), tokens, expert_num, expert_num / param_.n_group, param_.topk_group, st); + logits.data(), tokens, expert_num, expert_num / p.weights->n_group, p.weights->topk_group, st); sync_check_cuda_error(); softmax = false; } @@ -140,17 +134,17 @@ void MoeFfnLayer::Forward(ForwardParam& p) tokens, padded, expert_num, - param_.experts_per_token, + p.weights->experts_per_token, softmax, - param_.norm_topk_prob, - param_.routed_scale, + p.weights->norm_topk_prob, + p.weights->routed_scale, st); } sync_check_cuda_error(); if (is_warm_up_) { std::mt19937 g; - const auto expert_ids = SampleUniform(tokens, expert_num, param_.experts_per_token, g); + const auto expert_ids = SampleUniform(tokens, expert_num, p.weights->experts_per_token, g); std::vector cnt(expert_num); for (const auto& x : expert_ids) { ++cnt[x]; @@ -163,48 +157,41 @@ void MoeFfnLayer::Forward(ForwardParam& p) cudaMemcpyAsync(offsets_.data(), h_offsets_.data(), sizeof(int) * (expert_num + 1), cudaMemcpyDefault, st)); } - temp_ = Tensor{{param_.experts_per_token * tokens, hidden_dim_}, p.input.dtype(), p.input.device()}; + temp_ = Tensor{{p.weights->experts_per_token * tokens, hidden_dim}, p.input.dtype(), p.input.device()}; - if (param_.method == MoeParam::kNaive) { + auto indices = f2n_.slice(0, tokens * p.weights->experts_per_token); + auto offsets = offsets_.slice(0, expert_num + 1); - invokeMoeDispatch(temp_, p.input, f2n_.data(), param_.experts_per_token, st); + if (block.w1w3) { + // Fused w1w3 path + Tensor inter = linear_.Forward(p.input, *block.w1w3, indices, offsets_); sync_check_cuda_error(); - check_cuda_error( - cudaMemcpyAsync(h_offsets_.data(), offsets_.data(), sizeof(int) * (expert_num + 1), cudaMemcpyDefault, st)); - - check_cuda_error(cudaStreamSynchronize(st)); - - TM_CHECK_EQ(h_offsets_[expert_num], tokens * param_.experts_per_token); - - for (int i = 0; i < expert_num; ++i) { - if (int count = h_offsets_[i + 1] - h_offsets_[i]) { - auto io = temp_.slice({h_offsets_[i], 0}, {count, -1}); - expert_ffn_->forward({io, io, moe.experts.at(i).get(), p.layer_id}); - } + if (!block.is_fused_silu) { + Activation(inter, block.w1w3->bias, f2E_, block.act_type, st); + sync_check_cuda_error(); } + + linear_.Forward(inter.slice({0, 0}, {-1, inter_size}), *block.w2, {}, offsets, temp_); + sync_check_cuda_error(); } else { + // Separate w1/w3 path + Tensor gating = linear_.Forward(p.input, *block.w1, indices, offsets_); + sync_check_cuda_error(); - auto& block = moe.block; - - auto indices = f2n_.slice(0, tokens * param_.experts_per_token); - auto offsets = offsets_.slice(0, expert_num + 1); - - Tensor inter = linear_.Forward(p.input, block.fused_gating_intermediate, indices, offsets_); + Tensor up = linear_.Forward(p.input, *block.w3, indices, offsets_); sync_check_cuda_error(); - if (!block.is_fused_silu) { - Activation(inter, block.fused_gating_intermediate.bias, f2E_, moe.block.act_type, st); - sync_check_cuda_error(); - } + Activation(gating, up, block.act_type, st); + sync_check_cuda_error(); - linear_.Forward(inter.slice({0, 0}, {-1, inter_size_}), block.output, {}, offsets, temp_); + linear_.Forward(gating, *block.w2, {}, offsets, temp_); sync_check_cuda_error(); } - if (moe.shared_gate.weight) { - shared_scales_ = Gate(p.input, moe.shared_gate); + if (moe.shared_gate) { + shared_scales_ = Gate(p.input, *moe.shared_gate); } } @@ -214,12 +201,12 @@ void MoeFfnLayer::Combine(ForwardParam& p) invokeMoeCombine(p.output, temp_, - p.weights->block.output.bias, + moe.block()->w2->bias, scales_.data(), en2f_.data(), f2E_.data(), shared_scales_.data_or((float*)nullptr), - param_.experts_per_token, + p.weights->experts_per_token, 1.f / tp_size_, p.scale, core::Context::stream().handle()); diff --git a/src/turbomind/models/llama/moe_ffn_layer.h b/src/turbomind/models/llama/moe_ffn_layer.h index 939cd9c60e..d50bc4869b 100644 --- a/src/turbomind/models/llama/moe_ffn_layer.h +++ b/src/turbomind/models/llama/moe_ffn_layer.h @@ -4,22 +4,22 @@ #include "src/turbomind/kernels/gemm/context.h" #include "src/turbomind/kernels/gemm/moe_utils_v2.h" -#include "src/turbomind/models/llama/LlamaDenseWeight.h" #include "src/turbomind/models/llama/LlamaFfnLayer.h" #include "src/turbomind/models/llama/llama_params.h" +#include "src/turbomind/models/moe_weight.h" namespace turbomind { class MoeFfnLayer { public: - MoeFfnLayer(const ModelParam& model, const MoeParam& param, const EngineParam& engine, const Context& ctx); + MoeFfnLayer(const EngineParam& engine, const Context& ctx); struct ForwardParam { - Tensor input; - Tensor output; - const MoeFfnWeight* weights; - float scale; - int layer_id; + Tensor input; + Tensor output; + const MoeWeight* weights; + float scale; + int layer_id; }; void Forward(ForwardParam& p); @@ -27,22 +27,22 @@ class MoeFfnLayer { void Combine(ForwardParam& p); private: - Tensor_ Gate(const Tensor& input, const LlamaDenseWeight& gate); + void Init(ForwardParam& p); + + Tensor_ Gate(const Tensor& input, const LinearWeight& gate); void dump_logits(int token_num, int layer_id, int expert_num); - const int inter_size_; - const int hidden_dim_; const int tp_size_; - - const MoeParam param_; - - int& is_warm_up_; + const int max_token_num_; + int& is_warm_up_; LlamaLinear& linear_; std::unique_ptr expert_ffn_; + bool initialized_ = false; + /////////////////////////////////////////////////////// /// runtime states Buffer_ h_offsets_; diff --git a/src/turbomind/models/llama/unified_attention_layer.cc b/src/turbomind/models/llama/unified_attention_layer.cc index 390ea72926..e1db9f4710 100644 --- a/src/turbomind/models/llama/unified_attention_layer.cc +++ b/src/turbomind/models/llama/unified_attention_layer.cc @@ -91,67 +91,44 @@ UnifiedAttentionLayer::~UnifiedAttentionLayer() aux_stream_ = {}; } -UnifiedAttentionLayer::UnifiedAttentionLayer(const ModelParam& model, - const AttentionParam& attn, - const EngineParam& engine, - int tp_size, - const Context& ctx, - int phases, - bool init): - head_num_(model.head_num), - kv_head_num_(model.kv_head_num), - size_per_head_(model.head_dim), - hidden_units_(model.hidden_units), - local_head_num_(head_num_ / tp_size), - local_kv_head_num_(model.kv_head_num / tp_size), - param_(attn), - model_param_(model), - engine_param_(engine), - cp_fn_ctx_(ctx.comm.d_comm, ctx.comm.d_cp_group), +UnifiedAttentionLayer::UnifiedAttentionLayer(int quant_policy, + const std::vector& layer_types, + int layer_num, + std::vector attn_weights, + const EngineParam& engine, + const Context& ctx, + int phases, + bool init): + quant_policy_{quant_policy}, + rope_{attn_weights[0]->rope}, + engine_param_{engine}, + cp_fn_ctx_{ctx.comm.d_comm, ctx.comm.d_cp_group}, is_warm_up_{*ctx.is_warm_up}, - context_(ctx), + context_{ctx}, + init_{init}, linear_(*ctx.linear), - arch_(getSMVersion()) + arch_{getSMVersion()} { - TM_CHECK_EQ(head_num_ % tp_size, 0) << head_num_ << " " << tp_size; - TM_CHECK_EQ(head_num_ % kv_head_num_, 0) << head_num_ << " " << kv_head_num_; + TM_CHECK(!attn_weights.empty()) << "attn_weights must not be empty"; + TM_CHECK(attn_weights[0]) << "attn_weights[0] must not be null"; check_cuda_error(cudaStreamCreateWithFlags(&aux_stream_, cudaStreamNonBlocking)); check_cuda_error(cudaEventCreateWithFlags(&qkv_event_, cudaEventDisableTiming)); check_cuda_error(cudaEventCreateWithFlags(&aux_event_, cudaEventDisableTiming)); - init_rope_kernel_param(param_.rope, rope_param_); + init_rope_kernel_param(rope_, rope_param_); // Skip other attention layer types - std::vector layer_types = model_param_.layer_types; - layer_types.resize(model_param_.layer_num); - cache_layer_ids_.resize(layer_types.size(), -1); + std::vector types = layer_types; + types.resize(layer_num); + cache_layer_ids_.resize(types.size(), -1); int next_cache_id = 0; - for (size_t i = 0; i < layer_types.size(); ++i) { - if (layer_types[i] == 0) { + for (size_t i = 0; i < types.size(); ++i) { + if (types[i] == 0) { cache_layer_ids_[i] = next_cache_id++; } } - Allocator alloc = core::Context::device_alloc(); - ssize_t workspace_tokens = kMaxWorkspaceTokens; - if (engine_param_.attn_cp_size > 1) { - alloc = GetSymmAllocator(ctx.comm.d_comm); - workspace_tokens += engine_param_.max_forward_token_num; - } - // partial_O layout: - // w/ cp, decode(q, h, k, 2) + prefill(q, h, 1, 2) - // w/o cp, decode(q, h, k, 2) - partial_O_ = Tensor_({workspace_tokens, local_head_num_, size_per_head_}, kDEVICE); - partial_ML_ = Tensor_({engine_param_.attn_cp_size, workspace_tokens, local_head_num_, 2}, alloc); - split_cnt_ = Tensor_({workspace_tokens}, kDEVICE); - if (init) { - const int dim = (int)local_head_num_ * (int)size_per_head_; - tmp_attn_ = Tensor{{engine_param_.max_forward_token_num, dim}, model.data_type, kDEVICE}; - } - - Clear(split_cnt_.buffer()); - const int bsz = engine.max_batch_size; if (rope_param_.type == RopeType::kDynamic) { @@ -162,7 +139,7 @@ UnifiedAttentionLayer::UnifiedAttentionLayer(const ModelParam& model, mrope_position_delta_buf_ = {bsz, kCPUpinned}; mrope_length_buf_ = {bsz, kCPUpinned}; } - const int max_blocks = bsz * cdiv(engine.session_len, param_.cache_block_seq_len); + const int max_blocks = bsz * cdiv(engine.session_len, engine_param_.cache_block_seq_len); for (int i = 0; i < phases; ++i) { auto& d = data_.emplace_back(std::make_shared()); d->block_ptrs = {max_blocks + 16, kDEVICE}; @@ -178,9 +155,37 @@ UnifiedAttentionLayer::UnifiedAttentionLayer(const ModelParam& model, rope_param_.mrope.stride = d->mrope_position_ids.stride(0); } } + + // Eagerly initialize workspace buffers (was previously lazy in Init()) + { + const auto& w = *attn_weights[0]; + const int tp_size = w.tp_size; + const int local_head_num = w.head_num / tp_size; + const int size_per_head = w.head_dim; + + TM_CHECK_EQ(w.head_num % tp_size, 0) << w.head_num << " " << tp_size; + TM_CHECK_EQ(w.head_num % w.kv_head_num, 0) << w.head_num << " " << w.kv_head_num; + + ssize_t workspace_tokens = kMaxWorkspaceTokens; + Allocator alloc = core::Context::device_alloc(); + if (engine_param_.attn_cp_size > 1) { + alloc = GetSymmAllocator(context_.comm.d_comm); + workspace_tokens += engine_param_.max_forward_token_num; + } + + partial_O_ = Tensor_({workspace_tokens, local_head_num, size_per_head}, kDEVICE); + partial_ML_ = Tensor_({engine_param_.attn_cp_size, workspace_tokens, local_head_num, 2}, alloc); + split_cnt_ = Tensor_({workspace_tokens}, kDEVICE); + if (init_) { + const int dim = local_head_num * size_per_head; + tmp_attn_ = Tensor{{engine_param_.max_forward_token_num, dim}, w.data_type, kDEVICE}; + } + + Clear(split_cnt_.buffer()); + } } -static void init_dynamic_ntk(RequestCache& cache, const RopeParam& rope) +static void init_dynamic_ntk(RequestCache& cache, const core::RopeConfig& rope) { cache.rope_base = rope.base; if (auto scaling_factor = rope.factor; scaling_factor > 1.f) { @@ -203,7 +208,7 @@ void UnifiedAttentionLayer::Run(BatchOp op, int phase, TensorMap& env) Buffer_ rc = env.at("requests").buffer(); if (rope_param_.type == RopeType::kDynamic) { for (int i = 0; i < rc.size(); ++i) { - init_dynamic_ntk(*rc[i], param_.rope); + init_dynamic_ntk(*rc[i], rope_); } } } @@ -313,6 +318,8 @@ void UnifiedAttentionLayer::Forward(ForwardParam p) const auto& weights = *p.weights; + TM_LOG_DEBUG("layer=%d, token_num=%d", layer_id, token_num); + Tensor qkv; auto& d = *data_.at(p.phase); @@ -321,14 +328,12 @@ void UnifiedAttentionLayer::Forward(ForwardParam p) // DebugTensor(p.input.slice(d.dbg_offset, d.dbg_size), Concat("attn_in", p.layer_id), 0); // } - if (weights.qkv.output_dim) { + if (weights.w_qkv && weights.w_qkv->output_dim) { // [token_num, hidden_dim] -> [token_num, local_q_kv_head_num, head_dim] - qkv = linear_.Forward(p.input, weights.qkv); + qkv = linear_.Forward(p.input, *weights.w_qkv); sync_check_cuda_error(); - if (model_param_.qk_norm) { - qk_norm(qkv, weights); - } + qk_norm(qkv, weights); } else { qkv = forward_mla(p.input, weights); @@ -345,12 +350,16 @@ void UnifiedAttentionLayer::Forward(ForwardParam p) // Apply sigmoid gating: attn *= sigmoid(gate) // Gate is stored at the end of each token's QKV: [Q|K|V|Gate] - if (model_param_.attn_output_gate) { - const int q_count = qkv.shape(0); - const int attn_dim = local_head_num_ * size_per_head_; - const int gate_offset = (local_head_num_ + 2 * local_kv_head_num_) * size_per_head_; - const int qkv_stride = (2 * local_head_num_ + 2 * local_kv_head_num_) * size_per_head_; - const auto stream = core::Context::stream().handle(); + if (weights.output_gate) { + const int tp_size = weights.tp_size; + const int local_head_num = weights.head_num / tp_size; + const int local_kv_head_num = weights.kv_head_num / tp_size; + const int size_per_head = weights.head_dim; + const int q_count = qkv.shape(0); + const int attn_dim = local_head_num * size_per_head; + const int gate_offset = (local_head_num + 2 * local_kv_head_num) * size_per_head; + const int qkv_stride = (2 * local_head_num + 2 * local_kv_head_num) * size_per_head; + const auto stream = core::Context::stream().handle(); invokeSigmoidGateMultiply(attn.raw_data(), (const char*)qkv.raw_data() + gate_offset * byte_size(qkv.dtype(), 1), attn_dim, @@ -369,13 +378,18 @@ void UnifiedAttentionLayer::Forward(ForwardParam p) ////////////////////////////////////////////// /// output gemm -> - (void)linear_.Forward(attn, weights.output, p.output); + (void)linear_.Forward(attn, *weights.wo, p.output); sync_check_cuda_error(); } template Tensor UnifiedAttentionLayer::core_attention(Tensor& qkv, const ForwardParam& p, const WeightType& weights) { + const int tp_size = weights.tp_size; + const int local_head_num = weights.head_num / tp_size; + const int local_kv_head_num = weights.kv_head_num / tp_size; + const int size_per_head = weights.head_dim; + const auto device = qkv.device(); const auto dtype = qkv.dtype(); @@ -386,20 +400,19 @@ Tensor UnifiedAttentionLayer::core_attention(Tensor& qkv, const ForwardParam& p, TM_CHECK_EQ(d.prefill.q_sum + d.decode.n, q_count); - const int local_q_kv_head_num = local_head_num_ + 2 * local_kv_head_num_; + const int local_q_kv_head_num = local_head_num + 2 * local_kv_head_num; Tensor attn; if (tmp_attn_) { attn = tmp_attn_.slice(0, q_count); } else { - attn = {{q_count, (int)local_head_num_ * (int)size_per_head_}, dtype, device}; + attn = {{q_count, local_head_num * size_per_head}, dtype, device}; } - const bool is_mla = model_param_.mla.kv_lora_rank > 0; + const bool is_mla = weights.is_mla(); - Tensor tmp_kv{ - {(int)local_kv_head_num_, is_mla ? 1 : 2, d.prefill.k_sum + MAX_CTA_S, (int)size_per_head_}, dtype, device}; + Tensor tmp_kv{{local_kv_head_num, is_mla ? 1 : 2, d.prefill.k_sum + MAX_CTA_S, size_per_head}, dtype, device}; const int cache_layer_id = cache_layer_ids_[p.layer_id]; @@ -410,27 +423,27 @@ Tensor UnifiedAttentionLayer::core_attention(Tensor& qkv, const ForwardParam& p, params.out = (T*)attn.raw_data(); params.q = (T*)qkv.raw_data(); - params.k = params.q + local_head_num_ * size_per_head_; + params.k = params.q + local_head_num * size_per_head; if (is_mla) { params.v = params.k; - params.stride = (local_head_num_ + 1 * local_kv_head_num_) * size_per_head_; + params.stride = (local_head_num + 1 * local_kv_head_num) * size_per_head; } else { - params.v = params.k + local_kv_head_num_ * size_per_head_; + params.v = params.k + local_kv_head_num * size_per_head; // When attn_output_gate, QKV layout is [Q|K|V|Gate] per token // stride must account for the extra gate portion at the end - if (model_param_.attn_output_gate) { - params.stride = (2 * local_head_num_ + 2 * local_kv_head_num_) * size_per_head_; + if (weights.output_gate) { + params.stride = (2 * local_head_num + 2 * local_kv_head_num) * size_per_head; } else { - params.stride = (local_head_num_ + 2 * local_kv_head_num_) * size_per_head_; + params.stride = (local_head_num + 2 * local_kv_head_num) * size_per_head; } } - if (weights.qkv.bias) { - params.q_bias = (T*)weights.qkv.bias.data_or(nullptr); - params.k_bias = params.q_bias + local_head_num_ * size_per_head_; - params.v_bias = params.k_bias + local_kv_head_num_ * size_per_head_; + if (!is_mla && weights.w_qkv && weights.w_qkv->bias) { + params.q_bias = (T*)weights.w_qkv->bias.data_or(nullptr); + params.k_bias = params.q_bias + local_head_num * size_per_head; + params.v_bias = params.k_bias + local_kv_head_num * size_per_head; } params.batch_size = stat.n; @@ -443,21 +456,21 @@ Tensor UnifiedAttentionLayer::core_attention(Tensor& qkv, const ForwardParam& p, params.block_iter_params = BlockIteratorParams{(char**)d.block_ptrs.data(), // d.block_ptrs_offsets.data() + offset, cache_layer_id, - (int)param_.cache_block_seq_len}; + engine_param_.cache_block_seq_len}; // prefill only if (is_mla) { params.linear_iter_params = LinearIteratorParams{ - tmp_kv.raw_data(), // flattened KV - stat.k_sum * size_per_head_, // stride to next head - 0 // stride from K to V + tmp_kv.raw_data(), // flattened KV + stat.k_sum * size_per_head, // stride to next head + 0 // stride from K to V }; } else { params.linear_iter_params = LinearIteratorParams{ - tmp_kv.raw_data(), // flattened KV - stat.k_sum * size_per_head_ * 2, // stride to next head - stat.k_sum * size_per_head_ // stride from K to V + tmp_kv.raw_data(), // flattened KV + stat.k_sum * size_per_head * 2, // stride to next head + stat.k_sum * size_per_head // stride from K to V }; } @@ -465,21 +478,21 @@ Tensor UnifiedAttentionLayer::core_attention(Tensor& qkv, const ForwardParam& p, params.cu_q_len = d.q_offsets.data() + offset; params.cu_k_len = d.k_offsets.data() + offset; - params.num_heads = local_head_num_; - params.num_kv_heads = local_kv_head_num_; - params.size_per_head = size_per_head_; + params.num_heads = local_head_num; + params.num_kv_heads = local_kv_head_num; + params.size_per_head = size_per_head; params.layer_id = cache_layer_id; double scaling = 1.; - if (param_.softmax_scale) { // model predefined softmax scale - scaling *= param_.softmax_scale; + if (weights.softmax_scale) { // model predefined softmax scale + scaling *= weights.softmax_scale; } else { // default value scaling /= std::sqrt((float)params.size_per_head); } params.inv_sqrt_dh = scaling * std::log2(std::exp(1.)); - params.sinks = weights.sinks.data_or((T*)nullptr); + params.sinks = weights.sinks ? weights.sinks.data_or((T*)nullptr) : (T*)nullptr; params.scale_sinks = scaling; params.window_size = weights.window_size; @@ -498,8 +511,8 @@ Tensor UnifiedAttentionLayer::core_attention(Tensor& qkv, const ForwardParam& p, } // logn attn - params.use_logn_attn = param_.use_logn_attn; - params.max_position_embeddings = param_.max_position_embeddings; + params.use_logn_attn = weights.use_logn_attn; + params.max_position_embeddings = weights.rope.max_position_embeddings; // Decoding use only for now params.split_cnt = split_cnt_.data(); @@ -515,9 +528,9 @@ Tensor UnifiedAttentionLayer::core_attention(Tensor& qkv, const ForwardParam& p, // update ML,O offset if both prefill and decode present const int offset_ML_stage = - engine_param_.attn_cp_size * (offset ? kMaxWorkspaceTokens * local_head_num_ * 2 : 0); - const int offset_ML_rank = params.cp_rank * params.token_num * local_head_num_ * params.max_split_k * 2; - const int offset_O = offset ? kMaxWorkspaceTokens * local_head_num_ * size_per_head_ : 0; + engine_param_.attn_cp_size * (offset ? kMaxWorkspaceTokens * local_head_num * 2 : 0); + const int offset_ML_rank = params.cp_rank * params.token_num * local_head_num * params.max_split_k * 2; + const int offset_O = offset ? kMaxWorkspaceTokens * local_head_num * size_per_head : 0; params.partial_ML = partial_ML_.data() + offset_ML_stage + offset_ML_rank; params.partial_O = partial_O_.data() + offset_O; @@ -527,7 +540,7 @@ Tensor UnifiedAttentionLayer::core_attention(Tensor& qkv, const ForwardParam& p, params.cp_fn = CpPost; params.cp_fn_ctx = (void*)&cp_fn_ctx_; cp_fn_ctx_.cp_rank = params.cp_rank; - cp_fn_ctx_.count = params.token_num * local_head_num_ * params.max_split_k * 2; + cp_fn_ctx_.count = params.token_num * local_head_num * params.max_split_k * 2; cp_fn_ctx_.partial_ML = partial_ML_.data() + offset_ML_stage; cp_fn_ctx_.stream = stream; } @@ -535,7 +548,7 @@ Tensor UnifiedAttentionLayer::core_attention(Tensor& qkv, const ForwardParam& p, params.arch = arch_; params.stream = stream; - params.quant_policy = model_param_.quant_policy; + params.quant_policy = quant_policy_; return params; }; @@ -592,48 +605,53 @@ Tensor UnifiedAttentionLayer::core_attention(Tensor& qkv, const ForwardParam& p, Tensor UnifiedAttentionLayer::forward_mla(const Tensor& hidden_state, const WeightType& w) { + const int tp_size = w.tp_size; + const int local_head_num = w.head_num / tp_size; + const int local_kv_head_num = w.kv_head_num / tp_size; + const int size_per_head = w.head_dim; + const auto token_num = hidden_state.shape(0); const auto dtype = hidden_state.dtype(); - const int q_lora_rank = w.q_a_proj.output_dim; - const int kv_lora_rank = w.kv_a_layernorm.size(); - const int qk_rope_dim = w.kv_a_proj.output_dim - kv_lora_rank; + const int q_lora_rank = w.q_a_proj->output_dim; + const int kv_lora_rank = w.kv_a_layernorm->weight.size(); + const int qk_rope_dim = w.kv_a_proj->output_dim - kv_lora_rank; Tensor q; const auto stream = core::Context::stream().handle(); - if (w.q_proj.weight) { - q = linear_.Forward(hidden_state, w.q_proj); + if (w.q_proj && w.q_proj->weight) { + q = linear_.Forward(hidden_state, *w.q_proj); sync_check_cuda_error(); } else { - Tensor q_a = linear_.Forward(hidden_state, w.q_a_proj); + Tensor q_a = linear_.Forward(hidden_state, *w.q_a_proj); sync_check_cuda_error(); - invokeRMSNorm(q_a, q_a, w.q_a_layernorm, model_param_.norm_eps, stream); + invokeRMSNorm(q_a, q_a, w.q_a_layernorm->weight, w.q_a_layernorm->norm_eps_, stream); sync_check_cuda_error(); - q = linear_.Forward(q_a, w.q_b_proj); + q = linear_.Forward(q_a, *w.q_b_proj); sync_check_cuda_error(); } - Tensor kv_a_k_pe = linear_.Forward(hidden_state, w.kv_a_proj); + Tensor kv_a_k_pe = linear_.Forward(hidden_state, *w.kv_a_proj); sync_check_cuda_error(); auto kv_a = kv_a_k_pe.slice({0, 0}, {-1, kv_lora_rank}); - invokeRMSNorm(kv_a, kv_a, w.kv_a_layernorm, model_param_.norm_eps, stream); + invokeRMSNorm(kv_a, kv_a, w.kv_a_layernorm->weight, w.kv_a_layernorm->norm_eps_, stream); sync_check_cuda_error(); - const int local_q_kv_head_num = local_head_num_ + 1 * local_kv_head_num_; + const int local_q_kv_head_num = local_head_num + 1 * local_kv_head_num; - Tensor qkv{{token_num, local_q_kv_head_num, size_per_head_}, dtype, hidden_state.device()}; + Tensor qkv{{token_num, local_q_kv_head_num, size_per_head}, dtype, hidden_state.device()}; MLACopyQKV(dtype, qkv.raw_data(), q.raw_data(), kv_a_k_pe.raw_data(), token_num, - local_head_num_, + local_head_num, kv_lora_rank, qk_rope_dim, stream); @@ -644,23 +662,32 @@ Tensor UnifiedAttentionLayer::forward_mla(const Tensor& hidden_state, const Weig void UnifiedAttentionLayer::qk_norm(Tensor& qkv, const WeightType& weights) { + if (!(weights.q_norm || weights.k_norm)) { + return; + } + + TM_CHECK(weights.q_norm && weights.k_norm); + + const int tp_size = weights.tp_size; + const int local_head_num = weights.head_num / tp_size; + const int local_kv_head_num = weights.kv_head_num / tp_size; + const int size_per_head = weights.head_dim; + const auto stream = core::Context::stream().handle(); check_cuda_error(cudaEventRecord(qkv_event_, stream)); check_cuda_error(cudaStreamWaitEvent(aux_stream_, qkv_event_)); - TM_CHECK(model_param_.attn_bias == false) << "not implemented"; - const auto token_num = qkv.shape(0); - auto qkv3 = qkv.view({token_num, -1, (int)size_per_head_}); + auto qkv3 = qkv.view({token_num, -1, size_per_head}); - auto q = qkv3.slice({0, 0, 0}, {-1, (int)local_head_num_, -1}); - invokeRMSNormQK(q, weights.q_a_layernorm, model_param_.norm_eps, stream); + auto q = qkv3.slice({0, 0, 0}, {-1, local_head_num, -1}); + invokeRMSNormQK(q, weights.q_norm->weight, weights.q_norm->norm_eps_, stream); sync_check_cuda_error(); - auto k = qkv3.slice({0, (int)local_head_num_, 0}, {-1, (int)local_kv_head_num_, -1}); - invokeRMSNormQK(k, weights.kv_a_layernorm, model_param_.norm_eps, aux_stream_); + auto k = qkv3.slice({0, local_head_num, 0}, {-1, local_kv_head_num, -1}); + invokeRMSNormQK(k, weights.k_norm->weight, weights.k_norm->norm_eps_, aux_stream_); sync_check_cuda_error(); check_cuda_error(cudaEventRecord(aux_event_, aux_stream_)); diff --git a/src/turbomind/models/llama/unified_attention_layer.h b/src/turbomind/models/llama/unified_attention_layer.h index 457029ca5c..79c20d3115 100644 --- a/src/turbomind/models/llama/unified_attention_layer.h +++ b/src/turbomind/models/llama/unified_attention_layer.h @@ -22,15 +22,17 @@ #pragma once #include +#include #include "src/turbomind/core/core.h" #include "src/turbomind/engine/batch.h" #include "src/turbomind/kernels/attention/cp_utils.h" #include "src/turbomind/kernels/gemm/test/test_utils.h" -#include "src/turbomind/models/llama/LlamaDenseWeight.h" +#include "src/turbomind/models/attention_weight.h" #include "src/turbomind/models/llama/LlamaLinear.h" #include "src/turbomind/models/llama/context.h" #include "src/turbomind/models/llama/llama_params.h" +#include "src/turbomind/models/llama/llama_rope.h" namespace turbomind { @@ -38,7 +40,7 @@ struct AttentionData; class UnifiedAttentionLayer { public: - using WeightType = LlamaAttentionWeight; + using WeightType = AttentionWeight; static constexpr int kMaxKVSplits = 128; static constexpr int kMaxWorkspaceTokens = 4096; @@ -53,13 +55,14 @@ class UnifiedAttentionLayer { ~UnifiedAttentionLayer(); - UnifiedAttentionLayer(const ModelParam& model, - const AttentionParam& attn, - const EngineParam& engine, - int tp_size, - const Context& context, - int phases, - bool init); + UnifiedAttentionLayer(int quant_policy, + const std::vector& layer_types, + int layer_num, + std::vector attn_weights, + const EngineParam& engine, + const Context& context, + int phases, + bool init); void Run(BatchOp op, int phase, TensorMap& env); @@ -77,19 +80,12 @@ class UnifiedAttentionLayer { void qk_norm(Tensor& qkv, const WeightType& weights); private: - const int head_num_; - const int kv_head_num_; - const int size_per_head_; - const int hidden_units_; - const int local_head_num_; - const int local_kv_head_num_; - - const AttentionParam param_; - const EngineParam engine_param_; - const ModelParam model_param_; - const Context& context_; - - int& is_warm_up_; + const int quant_policy_; + const core::RopeConfig rope_; + const EngineParam engine_param_; + const Context& context_; + int& is_warm_up_; + const bool init_; LlamaLinear& linear_; const int arch_{}; @@ -107,7 +103,7 @@ class UnifiedAttentionLayer { std::vector cache_layer_ids_; /////////////////////////////////////////////////////// - /// temp runtime buffers + /// temp runtime buffers (allocated in constructor) Tensor_ partial_O_; Tensor_ partial_ML_; Tensor_ split_cnt_; diff --git a/src/turbomind/models/llama/unified_decoder.cc b/src/turbomind/models/llama/unified_decoder.cc index 0a8d7508cd..20112b03c0 100644 --- a/src/turbomind/models/llama/unified_decoder.cc +++ b/src/turbomind/models/llama/unified_decoder.cc @@ -8,11 +8,13 @@ #include "src/turbomind/core/allocator.h" #include "src/turbomind/kernels/core/math.h" #include "src/turbomind/kernels/norm/rms_norm.h" +#include "src/turbomind/models/decoder_layer_weight.h" #include "src/turbomind/models/llama/llama_kernels.h" #include "src/turbomind/models/llama/llama_utils.h" #include "src/turbomind/models/llama/moe_ffn_layer.h" #include "src/turbomind/models/llama/unified_attention_layer.h" #include "src/turbomind/models/llama/unified_decoder.h" +#include "src/turbomind/models/model_weight.h" #include "src/turbomind/utils/anomaly_handler.h" #include "src/turbomind/utils/cuda_utils.h" @@ -30,37 +32,70 @@ void UnifiedDecoder::Run(BatchOp op, int phase, TensorMap& env) } } -UnifiedDecoder::UnifiedDecoder(const ModelParam& model, - const EngineParam& engine, - const AttentionParam& attn, - const MoeParam& moe, - const Context& ctx, - int phases): - layer_num_(model.layer_num), - hidden_units_(model.hidden_units), +UnifiedDecoder::UnifiedDecoder(const EngineParam& engine, + const Context& ctx, + int phases, + const ModelWeight& model_weight): + layer_num_(model_weight.num_layer), + hidden_units_(model_weight.hidden_units), attn_tp_size_(engine.attn_tp_size), attn_dp_size_(engine.attn_dp_size), attn_dp_rank_(engine.attn_dp_rank), mlp_tp_size_(engine.mlp_tp_size), attn_tp_group_(ctx.comm.d_tp_group), - rmsnorm_eps_(model.norm_eps), d_comm_(ctx.comm.d_comm), - tune_layer_num_(model.tune_layer_num), + tune_layer_num_(engine.tune_layer_num), is_warm_up_{*ctx.is_warm_up} { - if (std::accumulate(moe.expert_num.begin(), moe.expert_num.end(), 0LL)) { - moe_ffn_layer_ = std::make_unique(model, moe, engine, ctx); + bool has_moe = false; + for (int i = 0; i < model_weight.num_layer; ++i) { + if (model_weight.layer(i)->moe_ffn) { + has_moe = true; + break; + } + } + if (has_moe) { + moe_ffn_layer_ = std::make_unique(engine, ctx); } - attn_layer_ = - std::make_unique(model, attn, engine, attn_tp_size_, ctx, phases, (bool)moe_ffn_layer_); + std::vector attn_weights; + attn_weights.reserve(model_weight.num_layer); + for (int i = 0; i < model_weight.num_layer; ++i) { + if (auto* attn = model_weight.layer(i)->attention.get()) { + attn_weights.push_back(attn); + } + } - if (std::find(model.layer_types.begin(), model.layer_types.end(), 1) != model.layer_types.end()) { - linear_attn_layer_ = std::make_unique(model, attn, engine, attn_tp_size_, ctx, phases); + attn_layer_ = std::make_unique(engine.quant_policy, + model_weight.layer_types, + model_weight.num_layer, + attn_weights, + engine, + ctx, + phases, + (bool)moe_ffn_layer_); + + bool has_linear_attn = false; + for (auto t : model_weight.layer_types) { + if (t == 1) { + has_linear_attn = true; + break; + } + } + if (has_linear_attn) { + linear_attn_layer_ = + std::make_unique(model_weight.data_type, model_weight.layer_types, engine, ctx, phases); } - if (std::accumulate(model.inter_size.begin(), model.inter_size.end(), 0LL)) { - ffn_layer_ = std::make_unique(model, ctx); + bool has_ffn = false; + for (int i = 0; i < model_weight.num_layer; ++i) { + if (model_weight.layer(i)->feed_forward) { + has_ffn = true; + break; + } + } + if (has_ffn) { + ffn_layer_ = std::make_unique(ctx); } } @@ -68,6 +103,7 @@ void UnifiedDecoder::AllreduceResidualRMSnorm(Tensor& hidden_states, Tensor& residual, const Tensor& bias, const Tensor& weight, + float eps, int token_num, int group0, int group1, @@ -83,7 +119,7 @@ void UnifiedDecoder::AllreduceResidualRMSnorm(Tensor& hidden_states, residual.data_or((void*)nullptr), bias.data_or((void*)nullptr), weight.raw_data(), - rmsnorm_eps_, + eps, hidden_units_, dtype, group0, @@ -97,7 +133,7 @@ void UnifiedDecoder::AllreduceResidualRMSnorm(Tensor& hidden_states, residual.data_or((void*)nullptr), bias.data_or((void*)nullptr), weight.raw_data(), - rmsnorm_eps_, + eps, hidden_units_, token_num, dtype, @@ -113,7 +149,7 @@ void UnifiedDecoder::AllreduceResidualRMSnorm(Tensor& hidden_states, dtype, hidden_units_, token_num, - rmsnorm_eps_, + eps, stream); sync_check_cuda_error(); } @@ -174,12 +210,18 @@ void UnifiedDecoder::Forward(int phase, TensorMap& args, const std::vectorself_attn_norm, "norm_weight", 2); const auto stream = core::Context::stream().handle(); - invokeRMSNorm(local_hidden_states, local_residual, weights.at(0)->self_attn_norm, rmsnorm_eps_, stream); + invokeRMSNorm(local_hidden_states, + local_residual, + weights.at(0)->attention_norm->weight, + weights.at(0)->attention_norm->norm_eps_, + stream); + sync_check_cuda_error(); TM_DEBUG_TENSOR(local_hidden_states, Concat("norm0", 0), 2); @@ -201,13 +243,13 @@ void UnifiedDecoder::Forward(int phase, TensorMap& args, const std::vectorlinear_attn_weights) { + if (weights.at(layer)->linear_attn) { linear_attn_layer_->Forward( - {phase, local_hidden_states, local_hidden_states, weights.at(layer)->linear_attn_weights.get(), layer}); + {phase, local_hidden_states, local_hidden_states, weights.at(layer)->linear_attn.get(), layer}); } else { - attn_layer_->Forward( - {phase, local_hidden_states, local_hidden_states, weights.at(layer)->self_attn_weights.get(), layer}); + auto* attn = weights.at(layer)->attention.get(); + attn_layer_->Forward({phase, local_hidden_states, local_hidden_states, attn, layer}); } TM_DEBUG_TENSOR(local_hidden_states, Concat("attn_block", layer), 2); @@ -215,17 +257,18 @@ void UnifiedDecoder::Forward(int phase, TensorMap& args, const std::vectorlinear_attn_weights) { - out_bias = weights.at(layer)->linear_attn_weights->out_proj.bias; + if (weights.at(layer)->linear_attn) { + out_bias = weights.at(layer)->linear_attn->out_proj->bias; } else { - out_bias = weights.at(layer)->self_attn_weights->output.bias; + out_bias = weights.at(layer)->attention->wo->bias; } AllreduceResidualRMSnorm(global_hidden_states, local_residual, out_bias, - weights.at(layer)->ffn_norm, + weights.at(layer)->ffn_norm->weight, + weights.at(layer)->ffn_norm->norm_eps_, local_token_num, attn_tp_group_, 0, @@ -239,18 +282,18 @@ void UnifiedDecoder::Forward(int phase, TensorMap& args, const std::vector moe_fwd_param; - if (weights.at(layer)->moe_weights) { + if (weights.at(layer)->moe_ffn) { moe_fwd_param = MoeFfnLayer::ForwardParam{global_hidden_states, global_hidden_states, - weights.at(layer)->moe_weights.get(), + weights.at(layer)->moe_ffn.get(), ffn_layer_ ? 1.f : 0.f, layer}; moe_ffn_layer_->Forward(*moe_fwd_param); } - if (weights.at(layer)->ffn_weights) { + if (ffn_layer_ && weights.at(layer)->feed_forward) { ffn_layer_->forward( - {global_hidden_states, global_hidden_states, weights.at(layer)->ffn_weights.get(), (int)layer}); + {global_hidden_states, global_hidden_states, weights.at(layer)->feed_forward.get(), (int)layer}); } if (moe_fwd_param) { @@ -261,12 +304,13 @@ void UnifiedDecoder::Forward(int phase, TensorMap& args, const std::vectorself_attn_norm : args.at("output_norm_weight"); + auto& scale_weight = !last ? weights.at(layer + 1)->attention_norm->weight : args.at("output_norm_weight"); AllreduceResidualRMSnorm(global_hidden_states, local_residual, {}, scale_weight, + weights.at(layer)->ffn_norm->norm_eps_, local_token_num, 0, attn_tp_group_, diff --git a/src/turbomind/models/llama/unified_decoder.h b/src/turbomind/models/llama/unified_decoder.h index 05e9ea73a4..7c1f36af65 100644 --- a/src/turbomind/models/llama/unified_decoder.h +++ b/src/turbomind/models/llama/unified_decoder.h @@ -2,7 +2,6 @@ #include "src/turbomind/comm/device_comm.h" #include "src/turbomind/models/llama/GatedDeltaNetLayer.h" -#include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h" #include "src/turbomind/models/llama/LlamaFfnLayer.h" #include "src/turbomind/models/llama/context.h" #include "src/turbomind/models/llama/llama_params.h" @@ -11,16 +10,14 @@ namespace turbomind { +class ModelWeight; +class DecoderLayerWeight; + class UnifiedDecoder { public: - using WeightType = LlamaDecoderLayerWeight; + using WeightType = DecoderLayerWeight; - UnifiedDecoder(const ModelParam& model, - const EngineParam& engine, - const AttentionParam& attn, - const MoeParam& moe, - const Context& ctx, - int phases); + UnifiedDecoder(const EngineParam& engine, const Context& ctx, int phases, const ModelWeight& model_weight); void Run(BatchOp op, int phase, TensorMap& env); @@ -37,8 +34,6 @@ class UnifiedDecoder { const int attn_tp_group_; - const float rmsnorm_eps_; - comm::DeviceCommImpl* const d_comm_; const int tune_layer_num_; @@ -54,6 +49,7 @@ class UnifiedDecoder { Tensor& residual, const Tensor& bias, const Tensor& weight, + float eps, int token_num, int t0, int t1, diff --git a/src/turbomind/models/model_root.cc b/src/turbomind/models/model_root.cc new file mode 100644 index 0000000000..5b2092a7ec --- /dev/null +++ b/src/turbomind/models/model_root.cc @@ -0,0 +1,26 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "src/turbomind/models/model_root.h" +#include "src/turbomind/core/check.h" + +namespace turbomind { + +ModelRoot::ModelRoot() +{ + // CUDA device is already set by CudaDeviceGuard in TurboMind::CreateRoot. + stream_ = core::Stream::create(); + alloca_ = core::Allocator{stream_, /*use_default_pool=*/true}; +} + +ModelRoot::~ModelRoot() = default; + +void ModelRoot::prepare() +{ + TM_CHECK(text_model) << "ModelRoot::prepare: text_model not attached; did the spec " + "forget root.build()?"; + Module::prepare(); +} + +TM_MODULE_METHODS(ModelRoot, MODEL_ROOT_CHILDREN, MODEL_ROOT_PARAMS) + +} // namespace turbomind diff --git a/src/turbomind/models/model_root.h b/src/turbomind/models/model_root.h new file mode 100644 index 0000000000..0ea0b0ad72 --- /dev/null +++ b/src/turbomind/models/model_root.h @@ -0,0 +1,58 @@ +// Copyright (c) OpenMMLab. All rights reserved. +#pragma once + +#include "src/turbomind/core/core.h" +#include "src/turbomind/core/module.h" +#include "src/turbomind/models/model_weight.h" + +namespace turbomind { + +/// Sentinel root for the weight tree. Lives in TurboMind::Impl::weights_ +/// and owns the CUDA stream + pool-backed allocator used during weight +/// loading. Python creates a ModelWeight via _tm.create_module and +/// attaches it as the `text_model` child via add_child_raw. +class ModelRoot: public core::Module { +public: + const char* type() const override + { + return "ModelRoot"; + } + + ModelRoot(); + ~ModelRoot() override; + + void prepare() override; + + core::ContextGuard context() const + { + return core::ContextGuard{stream_, alloca_}; + } + + const core::Stream& stream() const + { + return stream_; + } + const core::Allocator& allocator() const + { + return alloca_; + } + + /// Convenience accessor. Nullptr before Python attaches via + /// `add_child_raw('text_model', ...)`. + ModelWeight* text_model_ptr() const + { + return text_model.get(); + } + +#define MODEL_ROOT_CHILDREN(X) X(ModelWeight, text_model) + +#define MODEL_ROOT_PARAMS(X) + + TM_MODULE_DECLARE(ModelRoot, MODEL_ROOT_CHILDREN, MODEL_ROOT_PARAMS) + +private: + core::Stream stream_{}; + core::Allocator alloca_{}; +}; + +} // namespace turbomind diff --git a/src/turbomind/models/model_weight.cc b/src/turbomind/models/model_weight.cc new file mode 100644 index 0000000000..21f954e2cd --- /dev/null +++ b/src/turbomind/models/model_weight.cc @@ -0,0 +1,88 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "src/turbomind/models/model_weight.h" +#include "src/turbomind/core/registry.h" +#include "src/turbomind/models/attention_weight.h" +#include "src/turbomind/models/decoder_layer_weight.h" + +namespace turbomind { + +ModelWeight::ModelWeight(const core::ModelWeightConfig& cfg): + tp_size(cfg.tp_size), tp_rank(cfg.tp_rank), data_type(cfg.data_type), hidden_units(cfg.hidden_units) +{ +} + +void ModelWeight::prepare() +{ + for_each_child([](const char* /*name*/, Module* child) { + if (child) + child->prepare(); + }); + + auto* l0 = layer(0); + TM_CHECK(l0); + // Find first full-attention layer (linear-attn layers have no attention child) + DecoderLayerWeight* attn_layer = nullptr; + for (int i = 0; i < (int)layers->size(); ++i) { + if (layer(i)->attention) { + attn_layer = layer(i); + break; + } + } + TM_CHECK(attn_layer) << "No full-attention layer found"; + head_dim = attn_layer->attention->head_dim; + kv_head_num = attn_layer->attention->kv_head_num; + + vocab_size = tok_embeddings.shape(0); + embedding_size = vocab_size; + num_layer = layers->size(); + vocab_size_padded = TM_CHECK_NOTNULL(output)->output_dim * tp_size; + + layer_types.resize(num_layer); + for (int i = 0; i < num_layer; ++i) { + layer_types[i] = layer(i)->linear_attn ? 1 : 0; + } + + EnsureFloatDtype(tok_embeddings, data_type); +} + +DecoderLayerWeight* ModelWeight::layer(int i) const +{ + if (!layers) { + return nullptr; + } + return static_cast(layers->child(std::to_string(i))); +} + +std::vector ModelWeight::layers_list() const +{ + if (!layers_cache_.empty()) { + return layers_cache_; + } + if (!layers) { + return {}; + } + layers_cache_.resize(layers->size()); + for (int i = 0; i < layers->size(); ++i) { + layers_cache_[i] = static_cast(layers->child(std::to_string(i))); + } + return layers_cache_; +} + +bool ModelWeight::verify(std::vector& missing) +{ + Module::verify(missing); + if (!tok_embeddings) { + missing.push_back(full_path() + ": missing tok_embeddings"); + } + if (!norm) { + missing.push_back(full_path() + ": missing norm"); + } + return missing.empty(); +} + +TM_MODULE_REGISTER(ModelWeight, core::ModelWeightConfig); + +TM_MODULE_METHODS(ModelWeight, MODEL_WEIGHT_CHILDREN, MODEL_WEIGHT_PARAMS) + +} // namespace turbomind diff --git a/src/turbomind/models/model_weight.h b/src/turbomind/models/model_weight.h new file mode 100644 index 0000000000..b4b0f17864 --- /dev/null +++ b/src/turbomind/models/model_weight.h @@ -0,0 +1,83 @@ +// Copyright (c) OpenMMLab. All rights reserved. +#pragma once + +#include "src/turbomind/core/core.h" +#include "src/turbomind/core/module.h" +#include "src/turbomind/models/linear_weight.h" +#include "src/turbomind/models/norm_weight.h" +#include "src/turbomind/utils/memory_utils.h" + +#include + +namespace turbomind::core { + +struct ModelWeightConfig: ModuleConfig { + ModelWeightConfig(): ModuleConfig{"ModelWeight"} {} + +#define MODEL_WEIGHT_FIELDS(X) \ + X(int, tp_size) \ + X(int, tp_rank) \ + X(DataType, data_type) \ + X(int, hidden_units) + + MODEL_WEIGHT_FIELDS(TM_MEMBER) + TM_FOR_EACH(ModelWeightConfig, MODEL_WEIGHT_FIELDS) + +#undef MODEL_WEIGHT_FIELDS +}; + +} // namespace turbomind::core + +namespace turbomind { + +class DecoderLayerWeight; + +/// Root weight module for a model. Owns the full weight tree. +class ModelWeight: public core::Module { +public: + const char* type() const override + { + return "ModelWeight"; + } + + ModelWeight() = default; + + explicit ModelWeight(const core::ModelWeightConfig& cfg); + + void prepare() override; + bool verify(std::vector& missing) override; + + // --- X-macro field lists --- +#define MODEL_WEIGHT_CHILDREN(X) \ + X(LinearWeight, output) \ + X(NormWeight, norm) \ + X(core::ModuleList, layers) + +#define MODEL_WEIGHT_PARAMS(X) X(tok_embeddings) + + TM_MODULE_DECLARE(ModelWeight, MODEL_WEIGHT_CHILDREN, MODEL_WEIGHT_PARAMS) + + // --- Accessors --- + DecoderLayerWeight* layer(int i) const; + std::vector layers_list() const; + + // --- Derived in prepare() from children -- public for direct access --- + DataType data_type{}; + int hidden_units{}; + int vocab_size{}; + int vocab_size_padded{}; + int embedding_size{}; + int num_layer{}; + int head_dim{}; + int kv_head_num{}; + std::vector layer_types; + + // --- From ModelWeightConfig at construction --- + int tp_size{}; + int tp_rank{}; + +private: + mutable std::vector layers_cache_; +}; + +} // namespace turbomind diff --git a/src/turbomind/models/moe_weight.cc b/src/turbomind/models/moe_weight.cc new file mode 100644 index 0000000000..23353d8fd5 --- /dev/null +++ b/src/turbomind/models/moe_weight.cc @@ -0,0 +1,156 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "src/turbomind/models/moe_weight.h" + +#include "src/turbomind/core/check.h" +#include "src/turbomind/core/registry.h" +#include "src/turbomind/kernels/gemm/convert.h" +#include "src/turbomind/utils/cuda_utils.h" + +namespace turbomind { + +MoeWeight::MoeWeight(const core::MoeConfig& cfg) +{ + experts_per_token = cfg.experts_per_token; + norm_topk_prob = cfg.norm_topk_prob; + routed_scale = static_cast(cfg.routed_scale); + topk_group = cfg.topk_group; + topk_method = cfg.topk_method; + n_group = cfg.n_group; + scoring_func = cfg.scoring_func; + router_n_groups = cfg.router_n_groups; + data_type_ = cfg.data_type; + act_type_ = static_cast(cfg.act_type); + fuse_silu_act_ = cfg.fuse_silu; + expert_num = cfg.expert_num; +} + +// Adapted from LinkExperts for LinearWeight +static void LinkLinearExperts(std::function experts, int n, LinearWeight& d) +{ + const auto& e0 = *experts(0); + + e0.copy_metadata_to(d); + + d.k_desc.num = d.q_desc.num = n; + + if (e0.bias) { + d.bias = Tensor{{n, e0.output_dim}, e0.bias.dtype(), kDEVICE}; + } + + std::vector> weights; + std::vector> scales; + + for (int i = 0; i < n; ++i) { + auto& e = *experts(i); + weights.emplace_back(e.weight.raw_data(), e.k_desc.ld); + if (e.scales) { + scales.emplace_back(e.scales.raw_data(), e.q_desc.ld); + } + if (e.bias) { + Copy(e.bias, d.bias.slice(i, 1).squeeze(0)); + } + } + + auto stream = core::Context::stream().handle(); + + if (d.weight_format.dtype == kFloat8_e4m3 && d.input_dtype() == kFloat8_e4m3) { + auto make_blocked_ptr = [&](const auto& ptrs) { + return std::shared_ptr{gemm::MakeBlockedPtrs(ptrs, stream), [](auto p) { cudaFree(p); }}; + }; + d.weight = Tensor{make_blocked_ptr(weights), {n}, e0.weight.dtype(), kDEVICE}; + d.scales = Tensor{make_blocked_ptr(scales), {n}, e0.scales.dtype(), kDEVICE}; + d.k_desc.offsets = d.q_desc.offsets = (int*)1; + } + else { + auto make_strided_ptr = [&](const auto& ptrs) { + return std::shared_ptr{gemm::MakeStridedPtrs(ptrs, stream), [](auto p) { cudaFree(p); }}; + }; + d.weight = Tensor{make_strided_ptr(weights), {n}, d.weight_format.dtype, kDEVICE}; + if (e0.scales) { + d.scales = Tensor{make_strided_ptr(scales), {n}, e0.scales.dtype(), kDEVICE}; + } + d.k_desc.ld = d.q_desc.ld = 0; + } +} + +FfnWeight* MoeWeight::expert(int i) const +{ + if (!experts) { + return nullptr; + } + return static_cast(experts->child(std::to_string(i))); +} + +void MoeWeight::prepare() +{ + // First prepare all children (experts, gate, etc.) + Module::prepare(); + + // Create batched block view for fused MoE path + auto e0 = TM_CHECK_NOTNULL(expert(0)); // exemplar expert + + core::FfnConfig block_cfg; + block_cfg.hidden_dim = e0->hidden_dim; + block_cfg.inter_size = e0->inter_size; + // tp_size=1: expert weights are already TP-sharded — the block + // just batches them and must not divide inter_size a second time. + block_cfg.tp_size = 1; + block_cfg.tp_rank = 0; + block_cfg.data_type = data_type_; + block_cfg.act_type = static_cast(act_type_); + block_cfg.fuse_silu = fuse_silu_act_; + block_ = std::make_unique(block_cfg); + + // Link each linear in the block to the corresponding expert linears + auto get_expert_w1w3 = [this](int i) -> LinearWeight* { + auto* exp = expert(i); + return exp ? exp->w1w3.get() : nullptr; + }; + auto get_expert_w1 = [this](int i) -> LinearWeight* { + auto* exp = expert(i); + return exp ? exp->w1.get() : nullptr; + }; + auto get_expert_w3 = [this](int i) -> LinearWeight* { + auto* exp = expert(i); + return exp ? exp->w3.get() : nullptr; + }; + auto get_expert_w2 = [this](int i) -> LinearWeight* { + auto* exp = expert(i); + return exp ? exp->w2.get() : nullptr; + }; + + if (get_expert_w1w3(0)) { + // Fused w1w3 path: experts have a single fused gate+up projection + block_->add_child("w1w3", std::make_unique()); + LinkLinearExperts(get_expert_w1w3, expert_num, *block_->w1w3); + } + else { + // Separate w1/w3 path: link individually + block_->add_child("w1", std::make_unique()); + block_->add_child("w3", std::make_unique()); + if (get_expert_w1(0)) { + LinkLinearExperts(get_expert_w1, expert_num, *block_->w1); + } + if (get_expert_w3(0)) { + LinkLinearExperts(get_expert_w3, expert_num, *block_->w3); + } + } + + block_->add_child("w2", std::make_unique()); + if (get_expert_w2(0)) { + LinkLinearExperts(get_expert_w2, expert_num, *block_->w2); + } + + // Propagate the actual fused-silu state from the first expert to + // the block. Each expert's prepare() has already run above, so + // is_fused_silu() now reflects whether the GEMM epilogue applies + // SiLU. + block_->is_fused_silu = e0->is_fused_silu; +} + +TM_MODULE_REGISTER(MoeWeight, core::MoeConfig); + +TM_MODULE_METHODS(MoeWeight, MOE_WEIGHT_CHILDREN, MOE_WEIGHT_PARAMS) + +} // namespace turbomind diff --git a/src/turbomind/models/moe_weight.h b/src/turbomind/models/moe_weight.h new file mode 100644 index 0000000000..4b767e6710 --- /dev/null +++ b/src/turbomind/models/moe_weight.h @@ -0,0 +1,95 @@ +// Copyright (c) OpenMMLab. All rights reserved. +#pragma once + +#include "src/turbomind/core/core.h" +#include "src/turbomind/core/module.h" +#include "src/turbomind/models/ffn_weight.h" + +namespace turbomind { + +} // namespace turbomind + +namespace turbomind::core { + +struct MoeConfig: ModuleConfig { + MoeConfig(): ModuleConfig{"MoeWeight"} {} + +#define MOE_FIELDS(X) \ + X(int, expert_num) \ + X(int, experts_per_token) \ + X(int, act_type) \ + X(bool, fuse_silu) \ + X(bool, norm_topk_prob) \ + X(std::string, topk_method) \ + X(std::string, scoring_func) \ + X(int, topk_group) \ + X(int, n_group) \ + X(int, router_n_groups) \ + X(double, routed_scale) \ + X(DataType, data_type) + + MOE_FIELDS(TM_MEMBER) + TM_FOR_EACH(MoeConfig, MOE_FIELDS) + +#undef MOE_FIELDS +}; + +} // namespace turbomind::core + +namespace turbomind { + +class MoeWeight: public core::Module { +public: + const char* type() const override + { + return "MoeWeight"; + } + + MoeWeight() = default; + + MoeWeight(const core::MoeConfig& cfg); + + void prepare() override; + int num_experts() const + { + return expert_num; + } + + // --- X-macro child members --- +#define MOE_WEIGHT_CHILDREN(X) \ + X(LinearWeight, gate) \ + X(LinearWeight, shared_gate) \ + X(core::ModuleList, experts) + +#define MOE_WEIGHT_PARAMS(X) X(score_correction_bias) + + TM_MODULE_DECLARE(MoeWeight, MOE_WEIGHT_CHILDREN, MOE_WEIGHT_PARAMS) + + // --- Typed accessors --- + FfnWeight* expert(int i) const; + FfnWeight* block() const + { + return block_.get(); + } + + // --- Config fields (public for runtime access) --- + int expert_num{}; + int experts_per_token{}; + bool norm_topk_prob{}; + float routed_scale{}; + int topk_group{}; + std::string topk_method; + int n_group{}; + std::string scoring_func; + int router_n_groups{}; + +private: + ActivationType act_type_{}; + bool fuse_silu_act_{}; + + DataType data_type_{}; + + std::unique_ptr block_; +}; + +} // namespace turbomind diff --git a/src/turbomind/models/norm_weight.cc b/src/turbomind/models/norm_weight.cc new file mode 100644 index 0000000000..a1f234f50a --- /dev/null +++ b/src/turbomind/models/norm_weight.cc @@ -0,0 +1,21 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "src/turbomind/models/norm_weight.h" + +#include "src/turbomind/core/registry.h" +#include "src/turbomind/utils/memory_utils.h" + +namespace turbomind { + +NormWeight::NormWeight(const core::NormConfig& cfg): shape_{cfg.dim}, dtype_{cfg.data_type}, norm_eps_{cfg.norm_eps} {} + +void NormWeight::prepare() +{ + EnsureFloatDtype(weight, dtype_); +} + +TM_MODULE_REGISTER(NormWeight, core::NormConfig); + +TM_MODULE_METHODS(NormWeight, NORM_WEIGHT_CHILDREN, NORM_WEIGHT_PARAMS) + +} // namespace turbomind diff --git a/src/turbomind/models/norm_weight.h b/src/turbomind/models/norm_weight.h new file mode 100644 index 0000000000..ccf78ecaf4 --- /dev/null +++ b/src/turbomind/models/norm_weight.h @@ -0,0 +1,54 @@ +// Copyright (c) OpenMMLab. All rights reserved. +#pragma once + +#include "src/turbomind/core/core.h" +#include "src/turbomind/core/module.h" + +namespace turbomind::core { + +struct NormConfig: ModuleConfig { + NormConfig(): ModuleConfig{"NormWeight"} {} + +#define NORM_FIELDS(X) \ + X(int, dim) \ + X(DataType, data_type) \ + X(float, norm_eps, 0.f) + + NORM_FIELDS(TM_MEMBER) + TM_FOR_EACH(NormConfig, NORM_FIELDS) + +#undef NORM_FIELDS +}; + +} // namespace turbomind::core + +namespace turbomind { + +class NormWeight: public core::Module { +public: + const char* type() const override + { + return "NormWeight"; + } + + NormWeight() = default; + + explicit NormWeight(const core::NormConfig& cfg); + + /// Post-load: cast weight to configured dtype if needed. + void prepare() override; + +#define NORM_WEIGHT_CHILDREN(X) + +#define NORM_WEIGHT_PARAMS(X) X(weight) + + TM_MODULE_DECLARE(NormWeight, NORM_WEIGHT_CHILDREN, NORM_WEIGHT_PARAMS) + + float norm_eps_{}; + +private: + std::vector shape_; + DataType dtype_{}; +}; + +} // namespace turbomind diff --git a/src/turbomind/models/output_processor.cc b/src/turbomind/models/output_processor.cc index 92f943de40..a0fdce788d 100644 --- a/src/turbomind/models/output_processor.cc +++ b/src/turbomind/models/output_processor.cc @@ -22,15 +22,8 @@ struct OutputProcessor::Impl { std::function lm_head_; - Impl(const ModelParam& model, - int max_logits_len, - int tp_rank, - int phases, - std::function lm_head): - vocab_size_{(int)model.vocab_size}, - max_logits_len_{max_logits_len}, - tp_rank_{tp_rank}, - lm_head_{std::move(lm_head)} + Impl(int vocab_size, int max_logits_len, int tp_rank, int phases, std::function lm_head): + vocab_size_{vocab_size}, max_logits_len_{max_logits_len}, tp_rank_{tp_rank}, lm_head_{std::move(lm_head)} { for (int i = 0; i < phases; ++i) { data_.emplace_back(); @@ -286,8 +279,8 @@ struct OutputProcessor::Impl { OutputProcessor::~OutputProcessor() = default; OutputProcessor::OutputProcessor( - const ModelParam& model, int max_logits_len, int tp_rank, int phases, std::function lm_head): - impl_{std::make_unique(model, max_logits_len, tp_rank, phases, std::move(lm_head))} + int vocab_size, int max_logits_len, int tp_rank, int phases, std::function lm_head): + impl_{std::make_unique(vocab_size, max_logits_len, tp_rank, phases, std::move(lm_head))} { } diff --git a/src/turbomind/models/output_processor.h b/src/turbomind/models/output_processor.h index 661cb72c74..2dcd569d4c 100644 --- a/src/turbomind/models/output_processor.h +++ b/src/turbomind/models/output_processor.h @@ -1,7 +1,6 @@ #pragma once #include "src/turbomind/engine/batch.h" -#include "src/turbomind/models/llama/llama_params.h" namespace turbomind { @@ -9,11 +8,8 @@ class OutputProcessor { public: ~OutputProcessor(); - OutputProcessor(const ModelParam& model, // - int max_logits_len, - int tp_rank, - int phases, - std::function lm_head); + OutputProcessor( + int vocab_size, int max_logits_len, int tp_rank, int phases, std::function lm_head); void Run(BatchOp op, int phase, TensorMap& env); diff --git a/src/turbomind/python/CMakeLists.txt b/src/turbomind/python/CMakeLists.txt index 2b4ceb557f..2df6e8fc52 100644 --- a/src/turbomind/python/CMakeLists.txt +++ b/src/turbomind/python/CMakeLists.txt @@ -14,6 +14,12 @@ endif() pybind11_add_module(${PROJECT_NAME} bind.cpp) target_link_libraries(${PROJECT_NAME} PRIVATE turbomind xgrammar) +# Force-link all objects from static libraries that contain self-registration +# globals (module type registrars). Without --whole-archive, the linker +# strips registrar objects because nobody references them directly. +set_property(TARGET ${PROJECT_NAME} APPEND PROPERTY LINK_OPTIONS + "-Wl,--whole-archive" "$" "$" "-Wl,--no-whole-archive") +target_link_libraries(${PROJECT_NAME} PRIVATE models) pybind11_add_module(_xgrammar xgrammar_bind.cpp) target_link_libraries(_xgrammar PRIVATE core xgrammar) diff --git a/src/turbomind/python/bind.cpp b/src/turbomind/python/bind.cpp index 535c284535..1e36f35bad 100644 --- a/src/turbomind/python/bind.cpp +++ b/src/turbomind/python/bind.cpp @@ -14,9 +14,20 @@ #include "xgrammar/compiler.h" +#include "src/turbomind/core/data_format.h" #include "src/turbomind/core/data_type.h" +#include "src/turbomind/core/module.h" #include "src/turbomind/core/tensor.h" +#include "src/turbomind/engine/engine_config.h" #include "src/turbomind/engine/model_request.h" +#include "src/turbomind/models/attention_weight.h" +#include "src/turbomind/models/decoder_layer_weight.h" +#include "src/turbomind/models/delta_net_weight.h" +#include "src/turbomind/models/ffn_weight.h" +#include "src/turbomind/models/linear_weight.h" +#include "src/turbomind/models/model_weight.h" +#include "src/turbomind/models/moe_weight.h" +#include "src/turbomind/models/norm_weight.h" #include "src/turbomind/python/dlpack.h" #include "src/turbomind/turbomind.h" #include "src/turbomind/utils/cuda_utils.h" @@ -292,6 +303,25 @@ struct ScopedGIL { } // namespace +// --- Generic config binding helper --- + +template +void bind_config(py::module_& m, const char* name) +{ + py::class_ cls(m, name); + cls.def(py::init<>()); + Config::for_each([&](const char* fname, auto member_ptr) { cls.def_readwrite(fname, member_ptr); }); + cls.def("clone", [](const Config& c) { return Config(c); }); +} + +template +void bind_struct(py::module_& m, const char* name) +{ + py::class_ cls(m, name); + cls.def(py::init<>()); + T::for_each([&](const char* fname, auto member_ptr) { cls.def_readwrite(fname, member_ptr); }); +} + PYBIND11_MODULE(_turbomind, m) { py::class_>(m, "RequestMetrics") @@ -378,7 +408,10 @@ PYBIND11_MODULE(_turbomind, m) .value("TYPE_FP16", kFloat16) .value("TYPE_FP32", kFloat32) .value("TYPE_FP64", kFloat64) - .value("TYPE_BF16", kBfloat16); + .value("TYPE_BF16", kBfloat16) + .value("TYPE_FP8_E4M3", kFloat8_e4m3) + .value("TYPE_FP4_E2M1", kFloat4_e2m1) + .value("TYPE_UINT4", kUint4); // memory type py::enum_(m, "MemoryType") @@ -387,12 +420,53 @@ PYBIND11_MODULE(_turbomind, m) .value("MEMORY_GPU", ft::DeviceType::kDEVICE); } + // DataFormat descriptors + py::class_(m, "QuantParamDesc") + .def_readonly("dtype", &turbomind::QuantParamDesc::dtype) + .def_readonly("transposed", &turbomind::QuantParamDesc::transposed) + .def("present", &turbomind::QuantParamDesc::present); + + py::class_(m, "DataFormat") + .def_readonly("dtype", &turbomind::DataFormat::dtype) + .def_readonly("block_sizes", &turbomind::DataFormat::block_sizes) + .def_readonly("scales", &turbomind::DataFormat::scales) + .def_readonly("zeros", &turbomind::DataFormat::zeros) + .def("is_quantized", &turbomind::DataFormat::is_quantized) + .def("rank", &turbomind::DataFormat::rank); + + m.def("ResolveLinearWeightFormat", + &turbomind::ResolveLinearWeightFormat, + py::arg("data_type"), + py::arg("weight_dtype"), + py::arg("block_in"), + py::arg("block_out")); + + // --- Config struct bindings --- + py::class_(m, "ModuleConfig") + .def_property_readonly("module_type", [](const turbomind::core::ModuleConfig& c) -> std::string { + return std::string(c.module_type); + }); + + bind_config(m, "LinearConfig"); + bind_struct(m, "RopeConfig"); + bind_struct(m, "EngineConfig"); + bind_config(m, "AttentionConfig"); + bind_config(m, "FfnConfig"); + bind_config(m, "MoeConfig"); + bind_config(m, "DeltaNetConfig"); + bind_config(m, "ModuleListConfig"); + bind_config(m, "NormConfig"); + bind_config(m, "DecoderLayerConfig"); + bind_config(m, "ModelWeightConfig"); + // tensor py::class_>(m, "Tensor") .def_property_readonly("where", [](const Tensor& t) { return t.device().type; }) .def_property_readonly("type", [](const Tensor& t) { return t.dtype(); }) .def_property_readonly("shape", [](const Tensor& t) { return t.shape(); }) .def_property_readonly("data", [](const Tensor& t) { return t.raw_data(); }) + .def_property_readonly("byte_size", [](const Tensor& t) { return t.byte_size(); }) + .def("__bool__", [](const Tensor& t) { return t.byte_size() > 0; }) .def( "copy_from", [](Tensor& self, py::object obj) { @@ -502,12 +576,112 @@ PYBIND11_MODULE(_turbomind, m) py::call_guard(), "grammar"_a); + // Python context manager wrapper for ContextGuard. + // Stores copies of Stream + Allocator; constructs the real guard + // in-place on __enter__ and destroys it on __exit__. + struct PyContextGuard { + ft::core::Stream stream; + ft::core::Allocator alloc; + std::unique_ptr guard; + + PyContextGuard(ft::core::Stream s, ft::core::Allocator a): stream(std::move(s)), alloc(std::move(a)) {} + + void enter() + { + guard = std::make_unique(stream, alloc); + } + void exit() + { + guard.reset(); + } + }; + + py::class_(m, "ContextGuard") + .def("__enter__", + [](PyContextGuard& g) -> PyContextGuard& { + g.enter(); + return g; + }) + .def("__exit__", [](PyContextGuard& g, py::object, py::object, py::object) { g.exit(); }); + + // Param — lightweight handle to a Module parameter slot + py::class_(m, "Param") + .def( + "alloc", + [](ft::core::Param& p, std::vector shape, ft::DataType dtype) { + return std::make_shared(p.alloc(shape, dtype)); + }, + "shape"_a, + "dtype"_a) + .def("get", [](ft::core::Param& p) { return std::make_shared(p.get()); }) + .def("__bool__", [](ft::core::Param& p) { return static_cast(p); }); + + // Module class — navigation and allocation interface + py::class_(m, "Module") + .def( + "get", + [](ft::core::Module& m, const std::string& segment) -> ft::core::Module* { return m.get(segment); }, + py::return_value_policy::reference, + "segment"_a) + .def( + "param", + [](ft::core::Module& m, const std::string& name) -> ft::core::Param { return m.param(name); }, + "name"_a) + .def("prepare", [](ft::core::Module& m) { m.prepare(); }) + .def( + "child", + [](ft::core::Module& m, const std::string& name) -> ft::core::Module* { return m.child(name); }, + py::return_value_policy::reference, + "name"_a) + // Config-based create_child: accepts any ModuleConfig subclass + .def( + "create_child", + [](ft::core::Module& m, + const std::string& name, + turbomind::core::ModuleConfig& config) -> ft::core::Module* { return m.create_child(name, config); }, + py::return_value_policy::reference, + "name"_a, + "config"_a) + .def("type", [](ft::core::Module& m) -> const char* { return m.type(); }) + .def("full_path", [](ft::core::Module& m) -> std::string { return m.full_path(); }) + .def( + "__getitem__", + [](ft::core::Module& m, const std::string& key) -> ft::core::Module* { return m.get(key); }, + py::return_value_policy::reference) + .def( + "__getitem__", + [](ft::core::Module& m, int idx) -> ft::core::Module* { return m.get(std::to_string(idx)); }, + py::return_value_policy::reference) + // Deferred parent binding — transfer ownership of a previously created module + .def( + "add_child_raw", + [](ft::core::Module& parent, const std::string& name, ft::core::Module* child) -> ft::core::Module* { + auto owned = std::unique_ptr(child); + return parent.add_child(name, std::move(owned)); + }, + py::return_value_policy::reference, + "name"_a, + "child"_a); + + // Standalone module creation (no parent needed) + m.def( + "create_module", + [](turbomind::core::ModuleConfig& config) -> ft::core::Module* { + auto mod = ft::core::Module::create(config); + return mod.release(); + }, + py::return_value_policy::reference, + "config"_a); + + // LinearWeight — specific interface for weight loading + py::class_(m, "LinearWeight"); + // transformer model using ft::TurboMind; py::class_>(m, "TurboMind") .def_static( "create", - [](std::string model_dir, std::string config, std::string weight_type) -> std::shared_ptr { + [](std::string model_dir, turbomind::EngineConfig config) -> std::shared_ptr { auto gil_factory = [] { // // erase the type return std::static_pointer_cast(std::make_shared()); @@ -517,22 +691,35 @@ PYBIND11_MODULE(_turbomind, m) delete ptr; }; - std::shared_ptr model(new TurboMind(model_dir, config, gil_factory), no_gil_deleter); + std::shared_ptr model(new TurboMind(model_dir, std::move(config), gil_factory), + no_gil_deleter); return model; }, "model_dir"_a, - "config"_a = "", - "weight_type"_a = "half") + "engine_config"_a) .def( "create_request", [](TurboMind* model) { return model->CreateRequest(); }, py::call_guard()) - .def("create_weights", &TurboMind::CreateWeights, py::call_guard(), "index"_a) + .def("create_context", &TurboMind::CreateContext, py::call_guard(), "index"_a) .def( - "get_weights", - [](TurboMind* model, int index) { return model->GetWeights(index); }, + "create_root", + [](TurboMind* model, int index) -> ft::core::Module* { return model->CreateRoot(index); }, + py::return_value_policy::reference, py::call_guard(), "index"_a) + .def( + "root", + [](TurboMind* model, int index) -> ft::core::Module* { return model->root(index); }, + py::return_value_policy::reference, + "index"_a) + .def( + "context", + [](ft::TurboMind* model, int index) -> std::unique_ptr { + auto [stream, alloc] = model->weight_context(index); + return std::make_unique(std::move(stream), std::move(alloc)); + }, + "index"_a) .def( "process_weight", [](TurboMind* model, int index) { model->ProcessWeights(index); }, @@ -548,17 +735,8 @@ PYBIND11_MODULE(_turbomind, m) [](TurboMind* model, int index) { return model->GetScheduleMetrics(index); }, py::call_guard(), "index"_a) - .def( - "sleep", - [](TurboMind* model, int index, int level) { model->Sleep(index, level); }, - py::call_guard(), - "index"_a, - "level"_a) - .def( - "wakeup", - [](TurboMind* model, int index, const std::vector& tags) { model->WakeUp(index, tags); }, - py::call_guard(), - "index"_a, - "tags"_a) - .def("is_dummy_node", [](TurboMind* model) { return model->is_dummy_node(); }); + .def("is_dummy_node", [](TurboMind* model) { return model->is_dummy_node(); }) + .def("attn_tp_rank", &TurboMind::GetAttnTpRank, "index"_a) + .def("mlp_tp_rank", &TurboMind::GetMlpTpRank, "index"_a) + .def("model_tp_rank", &TurboMind::GetModelTpRank, "index"_a); } diff --git a/src/turbomind/turbomind.cc b/src/turbomind/turbomind.cc index 1529269fde..664e6ca0bf 100644 --- a/src/turbomind/turbomind.cc +++ b/src/turbomind/turbomind.cc @@ -1,6 +1,5 @@ // Copyright (c) OpenMMLab. All rights reserved. -#include #include #include @@ -18,18 +17,16 @@ #include "src/turbomind/engine/model_request.h" #include "src/turbomind/models/language_model.h" -#include "src/turbomind/models/llama/LlamaWeight.h" #include "src/turbomind/models/llama/context.h" #include "src/turbomind/models/llama/llama_params.h" -#include "src/turbomind/models/llama/llama_utils.h" +#include "src/turbomind/models/model_root.h" +#include "src/turbomind/models/model_weight.h" #include "src/turbomind/kernels/gemm/tuner/params.h" #include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/metrics.h" -#include - // #include "dbg.h" namespace turbomind { @@ -39,140 +36,10 @@ using std::string; using std::shared_ptr; using std::unique_ptr; -static std::optional get_moe_method() -{ - static const auto value = []() -> std::optional { - const auto p = std::getenv("TM_MOE_METHOD"); - if (p) { - std::string str(p); - for (auto& x : str) { - x = std::tolower(x); - } - if (str == "naive") { - return MoeParam::kNaive; - } - else if (str == "fused") { - return MoeParam::kFused; - } - else { - std::cerr << "[WARNING] unrecognised MoE method: " << str << "\n"; - } - } - return {}; - }(); - return value; -} - -/// TODO: move config parsing to suitable place -static void parse_default_rope_param(const YAML::Node& node, RopeParam& param) -{ - param.base = node["base"].as(); - param.dim = node["dim"].as(); - if (param.base == 0.f || param.dim == 0) { - TM_LOG_ERROR("invalid rope param: base = {}, dim = {}", param.base, param.dim); - FT_CHECK(0); - } -} - -static void parse_linear_rope_param(const YAML::Node& node, RopeParam& param) -{ - parse_default_rope_param(node, param); - param.factor = node["factor"].as(); -} - -static void parse_dynamic_rope_param(const YAML::Node& node, RopeParam& param) -{ - parse_linear_rope_param(node, param); - param.max_position_embeddings = node["max_position_embeddings"].as(); -} - -static void parse_yarn_rope_param(const YAML::Node& node, RopeParam& param) -{ - parse_dynamic_rope_param(node, param); - param.yarn.attention_factor = node["attention_factor"].as(); - param.yarn.beta_fast = node["beta_fast"].as(); - param.yarn.beta_slow = node["beta_slow"].as(); -} - -static void parse_llama3_rope_param(const YAML::Node& node, RopeParam& param) -{ - parse_linear_rope_param(node, param); - param.llama3.low_freq_factor = node["low_freq_factor"].as(); - param.llama3.high_freq_factor = node["high_freq_factor"].as(); - param.llama3.original_max_position_embeddings = node["original_max_position_embeddings"].as(); -} - -static void parse_mrope_rope_param(const YAML::Node& node, RopeParam& param) -{ - parse_default_rope_param(node, param); - auto mrope_section = node["mrope_section"].as>(); - FT_CHECK(mrope_section.size() == 3); - param.mrope.section = {mrope_section[0], mrope_section[1], mrope_section[2]}; -} - -static void parse_rope_param(const YAML::Node& node, RopeParam& rope) -{ - rope.type = GetRoPEType(node["type"].as()); - - switch (rope.type) { - case RopeType::kDefault: - parse_default_rope_param(node, rope); - break; - case RopeType::kLinear: - parse_linear_rope_param(node, rope); - break; - case RopeType::kDynamic: - parse_dynamic_rope_param(node, rope); - break; - case RopeType::kYarn: - parse_yarn_rope_param(node, rope); - break; - case RopeType::kLlama3: - parse_llama3_rope_param(node, rope); - break; - case RopeType::kMrope: - parse_mrope_rope_param(node, rope); - break; - default: - FT_CHECK(0); - break; - } -} - -static DataType data_type_from_string(std::string str) -{ - if (str == "fp16" || str == "float16") { - return kFloat16; - } - else if (str == "bf16" || str == "bfloat16") { - return kBfloat16; - } - else if (str == "fp32") { - return kFloat32; - } - else if (str == "int8") { - return kUint8; - } - else if (str == "int4") { - return kUint4; - } - else if (str == "fp8") { - return kFloat8_e4m3; - } - else if (str == "e2m1") { - return kFloat4_e2m1; - } - TM_CHECK(0) << "unsupported weight type: " << str; - return {}; -} - struct TurboMind::Impl { - DataType data_type_; - ModelParam model_param_; - AttentionParam attn_param_; - MoeParam moe_param_; - EngineParam engine_param_; - size_t comm_size_; + DataType data_type_; + EngineParam engine_param_; + size_t comm_size_; vector engine_params_; @@ -187,11 +54,10 @@ struct TurboMind::Impl { vector global_rank_; // Weights & engine instances for the ranks - vector> weights_; - vector> contexts_; - vector engines_; + vector> weights_; + vector> contexts_; + vector engines_; - string model_name_; string model_dir_; vector queue_id_; @@ -202,37 +68,23 @@ struct TurboMind::Impl { ~Impl(); - Impl(string model_dir, string config, FFICtxFactory ffi_ctx_factory); + Impl(string model_dir, EngineConfig config, FFICtxFactory ffi_ctx_factory); unique_ptr CreateRequest() { return std::make_unique(gateway_.get(), // data_type_, engine_param_.session_len, - model_param_.vocab_size, - model_param_.hidden_units); + weights_[0]->text_model_ptr()->vocab_size, + weights_[0]->text_model_ptr()->hidden_units); } - void CreateWeights(int index) + core::Module* CreateRoot(int index) { CudaDeviceGuard dev_guard(engine_param_.devices[index]); - - CreateContext(index); - - weights_[index] = std::make_shared(data_type_, // - model_param_, - engine_params_.at(index), - moe_param_); - } - - TensorMap GetWeights(int index) - { - const auto& tensor_ptr_map = TM_CHECK_NOTNULL(weights_[index])->get_parameters(); - TensorMap params; - for (const auto& [name, tensor_ptr] : tensor_ptr_map) { - params[name] = *tensor_ptr; - } - return params; + TM_CHECK(contexts_[index] != nullptr) << "CreateContext(" << index << ") must run before CreateRoot"; + weights_[index] = std::make_shared(); + return weights_[index].get(); } void ProcessWeights(int index) @@ -240,10 +92,8 @@ struct TurboMind::Impl { CudaDeviceGuard dev_guard(engine_param_.devices[index]); FT_CHECK(weights_[index] != nullptr); - cudaDeviceProp props{}; - check_cuda_error(cudaGetDeviceProperties(&props, engine_param_.devices[index])); - - weights_[index]->prepare(props); + auto ctx_guard = weights_[index]->context(); + weights_[index]->prepare(); sync_check_cuda_error(); } @@ -255,54 +105,12 @@ struct TurboMind::Impl { void Sleep(int index, int level) { - CudaDeviceGuard dev_guard(engine_param_.devices[index]); - - if (level == 2) { - // free weights - weights_[index]->release(); - } - else { - // offload weights to CPU - TM_CHECK(moe_param_.experts_per_token == 0) << "level 1 sleep not supported for MoE model"; - weights_[index]->to_device(kCPU); - } - - // free model (kv cache and buffer) - if (index == 0) { - gateway_->shutdown(); - gateway_.reset(); - } - - engines_[index] = {}; - contexts_[index]->allocator->trim(0); - - trim_default_mempool(engine_param_.devices[index]); + // Sleep/wakeup is broken — disabled } void WakeUp(int index, const std::vector& tags) { - CudaDeviceGuard dev_guard(engine_param_.devices[index]); - - std::set keys(tags.begin(), tags.end()); - - auto& ctx = *TM_CHECK_NOTNULL(contexts_[index]); - - if (keys.find("weights") != keys.end()) { - TM_CHECK(weights_[index] != nullptr); - if (weights_[index]->is_initialized()) { - weights_[index]->to_device(kDEVICE); - } - else { - weights_[index]->initialize(); - } - } - - if (keys.find("kv_cache") != keys.end()) { - if (index == 0) { - gateway_ = std::make_shared(n_queues_, ffi_ctx_factory_); - } - CreateEngine(index); - } + // Sleep/wakeup is broken — disabled } void HandleMissingParams() @@ -337,136 +145,20 @@ TurboMind::Impl::~Impl() } } -TurboMind::Impl::Impl(string model_dir, string config, FFICtxFactory ffi_ctx_factory): - data_type_{}, model_param_{}, attn_param_{}, moe_param_{}, engine_param_{}, ffi_ctx_factory_{ffi_ctx_factory} +TurboMind::Impl::Impl(string model_dir, EngineConfig config, FFICtxFactory ffi_ctx_factory): + data_type_{}, engine_param_{}, ffi_ctx_factory_{ffi_ctx_factory} { - TM_CHECK(!config.empty()); - - YAML::Node node; - try { - node = YAML::Load(config); - } - catch (const YAML::Exception& e) { - TM_CHECK(0) << "Error loading YAML config: " << e.what() << "\nconfig:\n" << config; - } - - /// TODO: move config parsing to suitable place - const auto model = node["model_config"]; - const auto attention = node["attention_config"]; - const auto engine = node["engine_config"]; - - data_type_ = model_param_.data_type = data_type_from_string(model["data_type"].as()); + data_type_ = config.data_type; TM_CHECK(data_type_ == kBfloat16 || data_type_ == kHalf); - model_name_ = model["model_name"].as(); - model_param_.head_num = model["head_num"].as(); - model_param_.head_dim = model["size_per_head"].as(); - model_param_.kv_head_num = model["kv_head_num"].as(0); - model_param_.hidden_units = model["hidden_units"].as(); - model_param_.layer_num = model["num_layer"].as(); - model_param_.vocab_size = model["vocab_size"].as(); - model_param_.embedding_size = model["embedding_size"].as(); - model_param_.norm_eps = model["norm_eps"].as(); - model_param_.tune_layer_num = model["tune_layer_num"].as(1); - model_param_.mla.q_lora_rank = model["q_lora_rank"].as(); - model_param_.mla.kv_lora_rank = model["kv_lora_rank"].as(); - model_param_.mla.qk_rope_dim = model["qk_rope_dim"].as(); - model_param_.mla.v_head_dim = model["v_head_dim"].as(); - attn_param_.cache_block_seq_len = attention["cache_block_seq_len"].as(0); - model_param_.quant_policy = engine["quant_policy"].as(0); - - auto inter_size = model["inter_size"]; - for (auto it = inter_size.begin(); it != inter_size.end(); ++it) { - model_param_.inter_size.push_back(it->as()); - } - - if (auto layer_types = model["layer_types"]) { - for (auto it = layer_types.begin(); it != layer_types.end(); ++it) { - auto type_str = it->as(""); - if (type_str == "linear_attention") { - model_param_.layer_types.push_back(1); - } - else if (type_str == "full_attention" || type_str.empty()) { - model_param_.layer_types.push_back(0); - } - else { - TM_LOG_WARN("Unknown layer_type '{}', treating as full_attention.", type_str); - model_param_.layer_types.push_back(0); - } - } - } + // Copy config into the EngineConfig base of engine_param_ + static_cast(engine_param_) = config; - // Qwen3.5 Gated DeltaNet linear attention parameters - model_param_.linear_key_head_dim = model["linear_key_head_dim"].as(0); - model_param_.linear_value_head_dim = model["linear_value_head_dim"].as(0); - model_param_.linear_conv_kernel_dim = model["linear_conv_kernel_dim"].as(0); - model_param_.linear_num_key_heads = model["linear_num_key_heads"].as(0); - model_param_.linear_num_value_heads = model["linear_num_value_heads"].as(0); - model_param_.attn_output_gate = model["attn_output_gate"].as(false); - model_param_.linear_state_dtype = data_type_; - - if (auto uqel = model["unquantized_expert_layers"]) { - for (auto it = uqel.begin(); it != uqel.end(); ++it) { - model_param_.unquantized_expert_layers.insert(it->as()); - } - } - model_param_.attn_sink = model["attn_sink"].as(); - model_param_.mlp_bias = model["mlp_bias"].as(); - if (model["activation_type"].as("") == "gpt-oss") { - model_param_.act_type = ActivationType::kSiluGptOss; - } - - auto window_size = model["window_size"]; - for (auto it = window_size.begin(); it != window_size.end(); ++it) { - model_param_.window_size.push_back(it->as()); - } - - model_param_.attn_bias = model["attn_bias"].as(0); - model_param_.qk_norm = model["qk_norm"].as(); - model_param_.group_size = model["group_size"].as(0); + phases_ = config.async_ ? 2 : 1; - attn_param_.softmax_scale = attention["softmax_scale"].as(0); - // logn attn for qwen model - attn_param_.use_logn_attn = attention["use_logn_attn"].as(0); - attn_param_.max_position_embeddings = attention["max_position_embeddings"].as(0); - // rotary embedding parameters - parse_rope_param(attention["rope_param"], attn_param_.rope); - - engine_param_.max_batch_size = engine["max_batch_size"].as(0); - auto max_forward_token_num = engine["max_prefill_token_num"].as(0); + auto max_forward_token_num = config.max_prefill_token_num; max_forward_token_num += engine_param_.max_batch_size; - engine_param_.max_context_token_num = engine["max_context_token_num"].as(0); - engine_param_.session_len = model["session_len"].as(0); - - engine_param_.cache_max_block_count = engine["cache_max_entry_count"].as(0); - engine_param_.cache_chunk_size = engine["cache_chunk_size"].as(0); - engine_param_.enable_prefix_caching = engine["enable_prefix_caching"].as(false); - engine_param_.enable_metrics = engine["enable_metrics"].as(false); - - if (engine_param_.enable_prefix_caching && HasLinearAttention(model_param_)) { - TM_CHECK(0) << "Prefix caching is unsupported when linear attention is present"; - } - - engine_param_.num_tokens_per_iter = engine["num_tokens_per_iter"].as(0); - engine_param_.max_prefill_iters = engine["max_prefill_iters"].as(1); - - phases_ = engine["async_"].as() ? 2 : 1; - - engine_param_.outer_dp_size = engine["outer_dp_size"].as(); - - engine_param_.attn_dp_size = engine["attn_dp_size"].as(); - engine_param_.attn_tp_size = engine["attn_tp_size"].as(); - engine_param_.attn_cp_size = engine["attn_cp_size"].as(); - - engine_param_.mlp_tp_size = engine["mlp_tp_size"].as(); - - engine_param_.devices = engine["devices"].as>(); - - // multi-node information - engine_param_.nnodes = engine["nnodes"].as(); - engine_param_.node_rank = engine["node_rank"].as(); - { auto sp = engine_param_.attn_tp_size * engine_param_.attn_cp_size; engine_param_.max_forward_token_num = ((size_t)max_forward_token_num + sp - 1) / sp * sp; @@ -475,23 +167,7 @@ TurboMind::Impl::Impl(string model_dir, string config, FFICtxFactory ffi_ctx_fac comm_size_ = engine_param_.attn_dp_size * engine_param_.attn_tp_size * engine_param_.attn_cp_size; FT_CHECK(engine_param_.mlp_tp_size == comm_size_); - communicator_type_ = engine["communicator"].as(); - - moe_param_.experts_per_token = model["experts_per_token"].as(0); - moe_param_.inter_size = model["expert_inter_size"].as(0); - moe_param_.shared_gate = model["moe_shared_gate"].as(); - moe_param_.norm_topk_prob = model["norm_topk_prob"].as(); - moe_param_.routed_scale = model["routed_scale"].as(1.f); - moe_param_.topk_group = model["topk_group"].as(1); - moe_param_.topk_method = model["topk_method"].as("greedy"); - moe_param_.n_group = model["moe_group_num"].as(1); - moe_param_.scoring_func = model["scoring_func"].as("softmax"); - moe_param_.router_n_groups = model["router_n_groups"].as(-1); - moe_param_.router_bias = model["expert_router_bias"].as(); - YAML::Node expert_num = model["expert_num"]; - for (auto it = expert_num.begin(); it != expert_num.end(); ++it) { - moe_param_.expert_num.push_back(it->as()); - } + communicator_type_ = std::move(config.communicator); HandleMissingParams(); @@ -499,18 +175,6 @@ TurboMind::Impl::Impl(string model_dir, string config, FFICtxFactory ffi_ctx_fac engines_.resize(engine_param_.devices.size()); contexts_.resize(engine_param_.devices.size()); - model_param_.weight_type = data_type_from_string(model["weight_type"].as()); - model_param_.expert_weight_type = data_type_from_string(model["expert_weight_type"].as()); - model_param_.ffn_weight_type = - data_type_from_string(model["ffn_weight_type"].as(model["weight_type"].as())); - - if (auto method = get_moe_method()) { - moe_param_.method = *method; - } - else { - moe_param_.method = MoeParam::kFused; - } - // NOTE: This runs on Python main thread group_id_ = comm::CreateHostGroupId((engine_param_.nnodes == 1) ? "" : "hybrid"); group_id_->Initialize(); @@ -575,8 +239,9 @@ void TurboMind::Impl::CreateContext(int index) p.attn_cp_rank = c.d_comm->rank(c.d_cp_group); } - p.attn_tp_rank = c.d_comm->rank(c.d_tp_group) / p.attn_cp_size; - p.mlp_tp_rank = c.d_comm->rank(0); + p.model_tp_rank = c.d_comm->rank(c.d_tp_group); + p.attn_tp_rank = p.model_tp_rank / p.attn_cp_size; + p.mlp_tp_rank = c.d_comm->rank(0); } if (c.h_tp_group->rank() == 0) { @@ -609,19 +274,12 @@ void TurboMind::Impl::CreateEngine(int index) ctx.comm.h_comm->Sync(); // create model - LanguageModel model{data_type_, // - model_param_, - param, - attn_param_, - moe_param_, - ctx, - *weights_[index], - phases_}; + LanguageModel model{param, ctx, *weights_[index]->text_model_ptr(), phases_}; // create engine - engines_[index] = Engine{data_type_, // - param, + engines_[index] = Engine{param, std::move(model), + *weights_[index]->text_model_ptr(), ctx, *gateway_, engine_param_.devices[index], @@ -703,7 +361,7 @@ void TurboMind::Impl::WarmUp(int index) const auto max_bs = *std::max_element(bss.begin(), bss.end()); Buffer_ input_ids(max_bs, kCPU); std::mt19937 g{}; - std::uniform_int_distribution d{0, (int)model_param_.vocab_size - 1}; + std::uniform_int_distribution d{0, (int)weights_[index]->text_model_ptr()->vocab_size - 1}; for (auto& x : input_ids) { x = d(g); } @@ -777,19 +435,31 @@ void TurboMind::Impl::WarmUp(int index) TurboMind::~TurboMind() = default; -TurboMind::TurboMind(string model_dir, string config, FFICtxFactory ffi_ctx_factory): - impl_{std::make_unique(model_dir, config, ffi_ctx_factory)} +TurboMind::TurboMind(string model_dir, EngineConfig config, FFICtxFactory ffi_ctx_factory): + impl_{std::make_unique(model_dir, std::move(config), ffi_ctx_factory)} { } -void TurboMind::CreateWeights(int index) +void TurboMind::CreateContext(int index) { - return impl_->CreateWeights(index); + return impl_->CreateContext(index); } -TensorMap TurboMind::GetWeights(int index) +core::Module* TurboMind::CreateRoot(int index) { - return impl_->GetWeights(index); + return impl_->CreateRoot(index); +} + +core::Module* TurboMind::root(int index) +{ + return impl_->weights_[index].get(); +} + +std::pair TurboMind::weight_context(int index) +{ + auto& root = impl_->weights_.at(index); + TM_CHECK(root != nullptr); + return {root->stream(), root->allocator()}; } void TurboMind::ProcessWeights(int index) @@ -827,4 +497,19 @@ bool TurboMind::is_dummy_node() const noexcept return impl_->n_queues_ == 0; } +int TurboMind::GetAttnTpRank(int index) +{ + return impl_->engine_params_.at(index).attn_tp_rank; +} + +int TurboMind::GetMlpTpRank(int index) +{ + return impl_->engine_params_.at(index).mlp_tp_rank; +} + +int TurboMind::GetModelTpRank(int index) +{ + return impl_->engine_params_.at(index).model_tp_rank; +} + } // namespace turbomind diff --git a/src/turbomind/turbomind.h b/src/turbomind/turbomind.h index ede7c7c2e3..4d19d12641 100644 --- a/src/turbomind/turbomind.h +++ b/src/turbomind/turbomind.h @@ -5,8 +5,11 @@ #include #include #include +#include #include "src/turbomind/core/core.h" +#include "src/turbomind/core/module.h" +#include "src/turbomind/engine/engine_config.h" #include "src/turbomind/engine/model_request.h" #include "src/turbomind/utils/metrics.h" @@ -18,11 +21,16 @@ class TurboMind { ~TurboMind(); - TurboMind(std::string model_dir, std::string config, FFICtxFactory ffi_ctx_factory); + TurboMind(std::string model_dir, EngineConfig config, FFICtxFactory ffi_ctx_factory); - void CreateWeights(int index); + void CreateContext(int index); + core::Module* CreateRoot(int index); - TensorMap GetWeights(int index); + /// Returns the root `Module` for GPU `index`'s weight tree. + core::Module* root(int index); + + /// Returns the Stream and Allocator for GPU `index`'s weight tree. + std::pair weight_context(int index); void ProcessWeights(int index); @@ -38,6 +46,15 @@ class TurboMind { std::unique_ptr CreateRequest(); + /// Attention TP rank for GPU *index*. + int GetAttnTpRank(int index); + + /// MLP TP rank for GPU *index*. + int GetMlpTpRank(int index); + + /// Model-level TP rank (rank within d_tp_group) for GPU *index*. + int GetModelTpRank(int index); + private: struct Impl; std::unique_ptr impl_; diff --git a/src/turbomind/utils/memory_utils.cu b/src/turbomind/utils/memory_utils.cu index a31bfd631d..0eee78e544 100644 --- a/src/turbomind/utils/memory_utils.cu +++ b/src/turbomind/utils/memory_utils.cu @@ -14,6 +14,9 @@ * limitations under the License. */ +#include "src/turbomind/core/context.h" +#include "src/turbomind/core/data_format.h" +#include "src/turbomind/core/data_type.h" #include "src/turbomind/macro.h" #include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/memory_utils.h" @@ -59,4 +62,69 @@ template void invokeInPlaceTranspose102(uint16_t* data, bool copy, cudaStream_t stream); +// ----------------------------------------------------------------------- +// Element-wise dtype cast kernel (fp32 <-> fp16 <-> bf16) +// ----------------------------------------------------------------------- + +template +__global__ void dtype_cast_kernel(To* dst, const Ti* src, size_t n) +{ + for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += blockDim.x * gridDim.x) { + dst[i] = static_cast(src[i]); + } +} + +void invokeDtypeCast( + void* dst, const void* src, size_t count, DataType dst_dtype, DataType src_dtype, cudaStream_t stream) +{ + const int block = 512; + const int grid = std::min((count + block - 1) / block, (size_t)8192); + + using half_t = turbomind::half_t; + using bf16_t = turbomind::bfloat16_t; + + // fp32 -> fp16 + if (src_dtype == turbomind::kFloat32 && dst_dtype == turbomind::kFloat16) { + dtype_cast_kernel<<>>((half_t*)dst, (const float*)src, count); + } + // fp32 -> bf16 + else if (src_dtype == turbomind::kFloat32 && dst_dtype == turbomind::kBfloat16) { + dtype_cast_kernel<<>>((bf16_t*)dst, (const float*)src, count); + } + // fp16 -> fp32 + else if (src_dtype == turbomind::kFloat16 && dst_dtype == turbomind::kFloat32) { + dtype_cast_kernel<<>>((float*)dst, (const half_t*)src, count); + } + // bf16 -> fp32 + else if (src_dtype == turbomind::kBfloat16 && dst_dtype == turbomind::kFloat32) { + dtype_cast_kernel<<>>((float*)dst, (const bf16_t*)src, count); + } + // fp16 -> bf16 + else if (src_dtype == turbomind::kFloat16 && dst_dtype == turbomind::kBfloat16) { + dtype_cast_kernel<<>>((bf16_t*)dst, (const half_t*)src, count); + } + // bf16 -> fp16 + else if (src_dtype == turbomind::kBfloat16 && dst_dtype == turbomind::kFloat16) { + dtype_cast_kernel<<>>((half_t*)dst, (const bf16_t*)src, count); + } +} + +// ----------------------------------------------------------------------- +// EnsureFloatDtype — cast tensor to target dtype if both are trivial float +// ----------------------------------------------------------------------- + +void EnsureFloatDtype(core::Tensor& tensor, DataType target_dtype) +{ + if (!tensor || tensor.dtype() == target_dtype) { + return; + } + if (!IsTrivialFloatType(tensor.dtype()) || !IsTrivialFloatType(target_dtype)) { + return; + } + auto stream = core::Context::stream().handle(); + core::Tensor casted{tensor.shape(), target_dtype, tensor.device()}; + invokeDtypeCast(casted.raw_data(), tensor.raw_data(), tensor.size(), target_dtype, tensor.dtype(), stream); + tensor = std::move(casted); +} + } // namespace turbomind diff --git a/src/turbomind/utils/memory_utils.h b/src/turbomind/utils/memory_utils.h index a61408281f..e49bfc6cf4 100644 --- a/src/turbomind/utils/memory_utils.h +++ b/src/turbomind/utils/memory_utils.h @@ -16,6 +16,8 @@ #pragma once +#include "src/turbomind/core/data_type.h" +#include "src/turbomind/core/tensor.h" #include namespace turbomind { @@ -24,4 +26,13 @@ template void invokeInPlaceTranspose102( T* data, T* workspace, const int dim0, const int dim1, const int dim2, bool copy = true, cudaStream_t stream = 0); +/// Element-wise dtype cast kernel. Supports fp32 <-> fp16 <-> bf16. +void invokeDtypeCast( + void* dst, const void* src, size_t count, DataType dst_dtype, DataType src_dtype, cudaStream_t stream = 0); + +/// If *tensor* is a trivial float type that differs from *target_dtype*, cast +/// it in-place (allocates a temporary, casts, move-assigns). Uses +/// Context::stream() internally — no stream parameter needed. +void EnsureFloatDtype(core::Tensor& tensor, DataType target_dtype); + } // namespace turbomind diff --git a/tests/test_lmdeploy/test_converter.py b/tests/test_lmdeploy/test_converter.py new file mode 100644 index 0000000000..07b04af4c6 --- /dev/null +++ b/tests/test_lmdeploy/test_converter.py @@ -0,0 +1,49 @@ +import logging + +import pytest + +from lmdeploy.turbomind.converter import _deep_merge + + +@pytest.fixture(autouse=True) +def _caplog_lmdeploy(caplog): + caplog.set_level(logging.WARNING, logger='lmdeploy') + logger = logging.getLogger('lmdeploy') + logger.propagate = True + yield + logger.propagate = False + + +class TestDeepMerge: + + def test_flat_override(self): + base = {'a': 1, 'b': 2} + _deep_merge(base, {'b': 99}) + assert base == {'a': 1, 'b': 99} + + def test_nested_override(self): + base = {'rope_scaling': {'rope_type': 'default', 'factor': 1.0}} + _deep_merge(base, {'rope_scaling': {'factor': 4.0}}) + assert base == {'rope_scaling': {'rope_type': 'default', 'factor': 4.0}} + + def test_new_key_warns(self, caplog): + base = {'a': 1} + _deep_merge(base, {'nonexistent_key': 'val'}) + assert base['nonexistent_key'] == 'val' + assert 'nonexistent_key' in caplog.text + + def test_nested_new_key_warns(self, caplog): + base = {'rope_scaling': {'factor': 1.0}} + _deep_merge(base, {'rope_scaling': {'brand_new': 'yes'}}) + assert base['rope_scaling']['brand_new'] == 'yes' + assert 'brand_new' in caplog.text + + def test_empty_override_is_noop(self): + base = {'a': 1} + _deep_merge(base, {}) + assert base == {'a': 1} + + def test_scalar_overrides_dict(self): + base = {'a': {'nested': 1}} + _deep_merge(base, {'a': 'flat'}) + assert base == {'a': 'flat'} diff --git a/tests/test_lmdeploy/test_turbomind/test_compressed_tensors.py b/tests/test_lmdeploy/test_turbomind/test_compressed_tensors.py index 7b8b75d4b1..8c565aab89 100644 --- a/tests/test_lmdeploy/test_turbomind/test_compressed_tensors.py +++ b/tests/test_lmdeploy/test_turbomind/test_compressed_tensors.py @@ -1,22 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. -import pytest import torch - -from lmdeploy.turbomind.deploy import converter from lmdeploy.turbomind.deploy.parameter import QuantWeightOnly, pack_u4_row from lmdeploy.turbomind.deploy.source_model.qwen import Qwen3_5ReaderMixin -class _FakeModelConfig: - - def __init__(self, dtype=torch.float16): - self.dtype = dtype - - def to_dict(self): - return {'architectures': ['Qwen3_5ForConditionalGeneration']} - - class _DummyQwen35Reader(Qwen3_5ReaderMixin): def __init__(self, params): @@ -41,35 +29,6 @@ def _reference_compressed_tensors_dequant(weight_packed: torch.Tensor, weight_sc weight_scale.to(torch.float16).unsqueeze(-1)).reshape(weight.shape[0], -1) -def test_compressed_tensors_support_matrix(monkeypatch): - fake_cfg = _FakeModelConfig() - monkeypatch.setattr(converter, 'get_model_arch', lambda _: ('Qwen3_5ForConditionalGeneration', fake_cfg)) - monkeypatch.setattr(converter, '_get_and_verify_max_len', lambda *args, **kwargs: 4096) - monkeypatch.setattr(converter, 'is_bf16_supported', lambda: False) - - _, default_cfg = converter.get_output_model_registered_name_and_config('dummy', - model_format='compressed-tensors', - dtype='auto', - group_size=0) - assert default_cfg.model_config.group_size == 128 - - _, gs32_cfg = converter.get_output_model_registered_name_and_config('dummy', - model_format='compressed-tensors', - dtype='auto', - group_size=32) - assert gs32_cfg.model_config.group_size == 32 - assert gs32_cfg.model_config.model_format == 'awq' - - with pytest.raises(ValueError, match='Unsupported group_size=64'): - converter.get_output_model_registered_name_and_config('dummy', - model_format='compressed-tensors', - dtype='auto', - group_size=64) - - with pytest.raises(ValueError, match='model_format=\"awq\"'): - converter.get_output_model_registered_name_and_config('dummy', model_format='awq', dtype='auto', group_size=32) - - def test_quant_weight_only_synthesizes_compressed_tensor_zero_points_from_scales(): scales = ( torch.rand(4, 8, dtype=torch.float32), diff --git a/tests/test_lmdeploy/test_turbomind/test_converter.py b/tests/test_lmdeploy/test_turbomind/test_converter.py index e91e0fefaa..689276934b 100644 --- a/tests/test_lmdeploy/test_turbomind/test_converter.py +++ b/tests/test_lmdeploy/test_turbomind/test_converter.py @@ -1,31 +1,8 @@ # yapf: disable -from lmdeploy import TurbomindEngineConfig -from lmdeploy.turbomind import update_parallel_config -from lmdeploy.turbomind.deploy.converter import ( - get_input_model_registered_name, - get_output_model_registered_name_and_config, -) -from lmdeploy.turbomind.deploy.source_model.base import INPUT_MODELS # yapf: enable -def test_torch_dtype_fallback(): - """torch_dtype is deprecated in transformers v5+; dtype should be - preferred. - - This test ensures get_output_model_registered_name_and_config still works - for models whose config exposes either `dtype` or `torch_dtype`. - """ - _, config = get_output_model_registered_name_and_config( - 'internlm/internlm2-chat-7b', - model_format='hf', - dtype='auto', - group_size=0, - ) - assert config.weight_type in ('float16', 'bfloat16') - - def test_ffn_reader_kind_none(): """FFN readers must handle kind=None (returns filter list, not tensors). @@ -67,97 +44,3 @@ def test_ffn_reader_kind_none(): assert len(result2) > 0 assert all(isinstance(k, str) for k in result2) assert all(re.search(r'feed_forward', k) for k in result2) - - -def test_registered_models(): - for model, model_format, group_size, weight_type, register_name in [ - ('internlm/internlm2-7b', 'hf', 0, 'bfloat16', 'tm'), ('baichuan-inc/Baichuan-7B', 'hf', 0, 'float16', 'tm'), - ('baichuan-inc/Baichuan2-7B-Chat', 'hf', 0, 'bfloat16', 'tm'), - ('baichuan-inc/Baichuan-13B-Chat', 'hf', 0, 'bfloat16', 'tm'), - ('baichuan-inc/Baichuan2-13B-Chat', 'hf', 0, 'bfloat16', 'tm'), - ('internlm/internlm-chat-7b', 'hf', 0, 'float16', 'tm'), - ('internlm/internlm2-chat-7b', 'hf', 0, 'bfloat16', 'tm'), - ('internlm/internlm-xcomposer2-4khd-7b', 'hf', 0, 'bfloat16', 'tm'), - ('internlm/internlm-xcomposer2-vl-7b', 'hf', 0, 'bfloat16', 'tm'), - ('internlm/internlm-xcomposer2-7b', 'hf', 0, 'bfloat16', 'tm'), - ('lmsys/vicuna-7b-v1.5', 'hf', 0, 'float16', 'tm'), ('01-ai/Yi-1.5-9B', 'hf', 0, 'bfloat16', 'tm'), - ('deepseek-ai/deepseek-coder-6.7b-instruct', 'hf', 0, 'bfloat16', 'tm'), - ('deepseek-ai/deepseek-llm-7b-chat', 'hf', 0, 'bfloat16', 'tm'), - ('Qwen/Qwen-7B-Chat', 'hf', 0, 'bfloat16', 'tm'), ('Qwen/Qwen1.5-7B-Chat', 'hf', 0, 'bfloat16', 'tm'), - ('Qwen/Qwen2-7B-Instruct', 'hf', 0, 'bfloat16', 'tm'), ('Qwen/Qwen-VL-Chat', 'hf', 0, 'bfloat16', 'tm'), - ('liuhaotian/llava-v1.6-34b', 'hf', 0, 'bfloat16', 'tm'), - ('liuhaotian/llava-v1.6-mistral-7b', 'hf', 0, 'bfloat16', 'tm'), - ('liuhaotian/llava-v1.6-vicuna-13b', 'hf', 0, 'bfloat16', 'tm'), - ('OpenGVLab/InternVL-Chat-V1-5', 'hf', 0, 'bfloat16', 'tm'), - ('deepseek-ai/deepseek-vl-7b-chat', 'hf', 0, 'float16', 'tm'), - ('Qwen/Qwen1.5-4B-Chat-AWQ', 'awq', 128, 'int4', 'tm'), - ('solidrust/Meta-Llama-3-8B-Instruct-hf-AWQ', 'awq', 128, 'int4', 'tm'), - ('internlm/internlm2-chat-20b-4bits', 'awq', 128, 'int4', 'tm'), - ('internlm/internlm-xcomposer2-vl-7b-4bit', 'awq', 128, 'int4', 'tm') - ]: - input_name = get_input_model_registered_name(model, model_format=model_format) - assert input_name in list(INPUT_MODELS.module_dict.keys()) - - output_name, config = get_output_model_registered_name_and_config(model, - model_format=model_format, - dtype='auto', - group_size=0) - assert output_name == register_name - assert config.model_config.group_size == group_size - assert config.session_len > 0 - assert config.model_config.model_arch is not None - - -def test_update_from_engine_config(): - import copy - _, _config = get_output_model_registered_name_and_config('internlm/internlm2-chat-7b', - model_format='hf', - dtype='auto', - group_size=0) - config = copy.deepcopy(_config) - config.update_from_engine_config(None) - assert (config == _config) - - config = copy.deepcopy(_config) - engine_config = TurbomindEngineConfig() - update_parallel_config(engine_config) - config.update_from_engine_config(engine_config) - assert config.model_config.attn_tp_size == 1 - assert config.session_len == 32768 - - config = copy.deepcopy(_config) - engine_config = TurbomindEngineConfig(model_format='hf', - tp=2, - device_num=2, - session_len=4000, - max_batch_size=100, - cache_max_entry_count=0.5, - quant_policy=8, - rope_scaling_factor=3.0, - use_logn_attn=True, - max_prefill_iters=64, - num_tokens_per_iter=256) - update_parallel_config(engine_config) - config.update_from_engine_config(engine_config) - - assert (config.model_config.attn_tp_size == engine_config.attn_tp_size) - assert (config.session_len == engine_config.session_len) - assert (config.attention_config.rope_param.type == 'dynamic') - assert (config.attention_config.rope_param.factor == engine_config.rope_scaling_factor) - assert (config.attention_config.use_logn_attn == engine_config.use_logn_attn) - - -def test_dtype(): - testsets = [('auto', 'bfloat16'), ('float16', 'float16'), ('bfloat16', 'bfloat16')] - for specified_dtype, expected_dtype in testsets: - _, _config = get_output_model_registered_name_and_config('internlm/internlm2-chat-7b', - model_format='hf', - dtype=specified_dtype, - group_size=0) - assert _config.weight_type == expected_dtype - for specified_dtype in ['auto', 'float16', 'bfloat16']: - _, _config = get_output_model_registered_name_and_config('internlm/internlm2_5-20b-chat-4bit-awq', - model_format='awq', - dtype=specified_dtype, - group_size=128) - assert _config.weight_type == 'int4' diff --git a/tests/test_lmdeploy/test_turbomind/test_internvl3_5.py b/tests/test_lmdeploy/test_turbomind/test_internvl3_5.py new file mode 100644 index 0000000000..200832cb80 --- /dev/null +++ b/tests/test_lmdeploy/test_turbomind/test_internvl3_5.py @@ -0,0 +1,184 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from unittest.mock import Mock + +import _turbomind as _tm +import pytest +from transformers import PretrainedConfig + +from lmdeploy.turbomind.models.qwen3 import Qwen3TextModel + + +def _make_qwen3_stub(): + model = Qwen3TextModel.__new__(Qwen3TextModel) + model.cfg = PretrainedConfig(hidden_size=4, vocab_size=8, tie_word_embeddings=False) + model._contexts = [] + model._root_handles = [] + model._model_tp_ranks = [] + model._layer_prefix = 'model.layers' + model._tie_embeddings = False + model._get = Mock(side_effect=lambda key: f'tensor:{key}') + model._linear = Mock(side_effect=lambda key: f'linear:{key}') + model.norm = Mock(side_effect=lambda weight: f'norm:{weight}') + model.layers = Mock(side_effect=lambda pfx: f'layers:{pfx}') + return model + + +class _FakeRoot: + + last = None + + def __init__(self, *args, **kwargs): + self.add_token_embeds = Mock() + self.add_lm_head = Mock() + self.norm = None + self.layers = None + self.build = Mock() + _FakeRoot.last = self + + +def test_qwen3_model_uses_optional_checkpoint_prefix(monkeypatch): + import lmdeploy.turbomind.models.qwen3 as qwen3_mod + + monkeypatch.setattr(qwen3_mod, 'TextModelBuilder', _FakeRoot) + model = _make_qwen3_stub() + + model.model(pfx='language_model.') + + model._get.assert_any_call('language_model.model.embed_tokens.weight') + model._get.assert_any_call('language_model.model.norm.weight') + model._linear.assert_called_once_with('language_model.lm_head') + model.layers.assert_called_once_with('language_model.model.layers') + + root = _FakeRoot.last + root.add_token_embeds.assert_called_once_with( + 'tensor:language_model.model.embed_tokens.weight') + root.add_lm_head.assert_called_once_with('linear:language_model.lm_head') + root.build.assert_called_once_with() + + +def test_qwen3_model_default_prefix_preserves_plain_keys(monkeypatch): + import lmdeploy.turbomind.models.qwen3 as qwen3_mod + + monkeypatch.setattr(qwen3_mod, 'TextModelBuilder', _FakeRoot) + model = _make_qwen3_stub() + + model.model() + + model._get.assert_any_call('model.embed_tokens.weight') + model._get.assert_any_call('model.norm.weight') + model._linear.assert_called_once_with('lm_head') + model.layers.assert_called_once_with('model.layers') + + +def _internvl_cfg(inner_arch='Qwen3ForCausalLM'): + return PretrainedConfig( + architectures=['InternVLChatModel'], + llm_config=PretrainedConfig( + architectures=[inner_arch], + num_hidden_layers=1, + vocab_size=8, + rms_norm_eps=1e-6, + tie_word_embeddings=False, + model_type='qwen3', + num_attention_heads=2, + hidden_size=4, + head_dim=2, + num_key_value_heads=2, + max_position_embeddings=16, + intermediate_size=8, + attention_bias=False, + ), + ) + + +def test_internvl35_model_creates_qwen3_text_model(): + from lmdeploy.turbomind.models.internvl3_5 import InternVL3_5Model + + model = InternVL3_5Model( + _internvl_cfg(), + resolver=Mock(data_type=_tm.DataType.TYPE_FP16)) + + assert isinstance(model.text_model, Qwen3TextModel) + assert model.vision_model is None + + +def test_internvl35_model_delegates_runtime_params_and_export(monkeypatch): + from lmdeploy.turbomind.models.internvl3_5 import InternVL3_5Model + + fake_text_model = Mock() + fake_text_cls = Mock(return_value=fake_text_model) + monkeypatch.setattr( + 'lmdeploy.turbomind.models.internvl3_5.Qwen3TextModel', + fake_text_cls) + + resolver = Mock() + model = InternVL3_5Model(_internvl_cfg(), resolver=resolver) + + assert fake_text_cls.call_args.args[0].architectures == ['Qwen3ForCausalLM'] + assert fake_text_cls.call_args.kwargs == {'resolver': resolver} + + attn_tp = Mock() + mlp_tp = Mock() + model_tp = Mock() + model.bind_runtime( + ctx='ctx', + root_handles=['root'], + attn_tp=attn_tp, + mlp_tp=mlp_tp, + model_tp=model_tp, + ) + fake_text_model.bind_runtime.assert_called_once_with( + ctx='ctx', + root_handles=['root'], + attn_tp=attn_tp, + mlp_tp=mlp_tp, + model_tp=model_tp, + ) + + params = {'language_model.lm_head.weight': object()} + model.set_params(params) + fake_text_model.set_params.assert_called_once_with(params) + + model.model() + fake_text_model.model.assert_called_once_with(pfx='language_model.') + + fake_text_model.cfg.vocab_size = 32000 + assert model._vocab_size == 32000 + + +def test_internvl35_model_requires_llm_config(): + from lmdeploy.turbomind.models.internvl3_5 import InternVL3_5Model + + cfg = {'architectures': ['InternVLChatModel']} + + with pytest.raises(ValueError, match='llm_config'): + InternVL3_5Model(cfg, resolver=Mock()) + + +def test_internvl35_model_requires_inner_architecture(): + from lmdeploy.turbomind.models.internvl3_5 import InternVL3_5Model + + cfg = {'architectures': ['InternVLChatModel'], 'llm_config': {}} + + with pytest.raises(ValueError, match='llm_config.architectures'): + InternVL3_5Model(cfg, resolver=Mock()) + + +def test_internvl35_model_rejects_unsupported_inner_architecture(): + from lmdeploy.turbomind.models.internvl3_5 import InternVL3_5Model + + with pytest.raises(ValueError, match='GptOssForCausalLM'): + InternVL3_5Model(_internvl_cfg('GptOssForCausalLM'), resolver=Mock()) + + +def test_supported_archs_maps_internvl_chat_model(): + from lmdeploy.turbomind.supported_models import SUPPORTED_ARCHS + + assert SUPPORTED_ARCHS['InternVLChatModel'] == 'internvl3_5' + + +def test_internvl35_model_is_registered(): + from lmdeploy.turbomind.models import InternVL3_5Model # noqa: F401 + from lmdeploy.turbomind.models.base import INPUT_MODELS + + assert INPUT_MODELS.get('internvl3_5') is InternVL3_5Model diff --git a/tests/test_lmdeploy/test_turbomind/test_loader.py b/tests/test_lmdeploy/test_turbomind/test_loader.py new file mode 100644 index 0000000000..b9d8144a45 --- /dev/null +++ b/tests/test_lmdeploy/test_turbomind/test_loader.py @@ -0,0 +1,88 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from queue import Queue +from unittest.mock import Mock + +from lmdeploy.turbomind.loader import BaseLoader, StateDictLoader, create_loader +from lmdeploy.turbomind.model_loader import ModelLoader + + +class _FakeModelComm: + + def attn_tp_rank(self, gpu): + return gpu + + def mlp_tp_rank(self, gpu): + return gpu + + def model_tp_rank(self, gpu): + return gpu + + def context(self, gpu): + return f'ctx:{gpu}' + + def root(self, gpu): + return f'root:{gpu}' + + +def test_base_loader_defaults_to_no_layer_pattern(): + class _TestLoader(BaseLoader): + def items(self): + raise NotImplementedError + + def all_items(self): + raise NotImplementedError + + loader = _TestLoader('model-path') + + assert loader.model_path == 'model-path' + assert loader.pattern is None + assert loader.mappings == [] + + +def test_state_dict_loader_can_be_created_without_pattern_or_mappings(): + queue = Queue() + + loader = create_loader(queue) + + assert isinstance(loader, StateDictLoader) + assert loader.pattern is None + + +def test_model_loader_export_uses_all_params_loader_without_model_metadata(monkeypatch): + import lmdeploy.turbomind.model_loader as model_loader_mod + + loader = Mock() + loader.all_items.return_value = {'weight': object()} + create_loader = Mock(return_value=loader) + monkeypatch.setattr(model_loader_mod, 'create_loader', create_loader) + + model = Mock() + model._loader_mappings = [] + model_loader = ModelLoader(model, _FakeModelComm(), 1, 'model-path', + data_type=Mock(), engine_config=Mock(attn_tp_size=1, attn_cp_size=1, mlp_tp_size=1)) + + model_loader.export() + + create_loader.assert_called_once_with('model-path', None, []) + model.set_params.assert_called_once_with(loader.all_items.return_value) + model.model.assert_called_once_with() + + +def test_model_loader_export_iter_uses_all_params_loader_without_model_metadata(monkeypatch): + import lmdeploy.turbomind.model_loader as model_loader_mod + + loader = Mock() + loader.all_items.return_value = {'weight': object()} + create_loader = Mock(return_value=loader) + monkeypatch.setattr(model_loader_mod, 'create_loader', create_loader) + + model = Mock() + model._loader_mappings = [] + model_loader = ModelLoader(model, _FakeModelComm(), 1, 'model-path', + data_type=Mock(), engine_config=Mock(attn_tp_size=1, attn_cp_size=1, mlp_tp_size=1)) + + assert list(model_loader.export_iter()) == [-1] + + create_loader.assert_called_once_with('model-path', None, []) + model.set_params.assert_called_once_with(loader.all_items.return_value) + model.model.assert_called_once_with() diff --git a/tests/test_lmdeploy/test_turbomind/test_local_autoconfig.py b/tests/test_lmdeploy/test_turbomind/test_local_autoconfig.py new file mode 100644 index 0000000000..72c5020f24 --- /dev/null +++ b/tests/test_lmdeploy/test_turbomind/test_local_autoconfig.py @@ -0,0 +1,353 @@ +from types import SimpleNamespace + +import _turbomind as _tm +from transformers import PretrainedConfig + + +class DummyConfig(PretrainedConfig): + model_type = 'dummy' + + +def test_load_model_config_returns_text_config_object(monkeypatch): + from lmdeploy.turbomind.models import utils + + text_cfg = DummyConfig( + hidden_size=16, + num_attention_heads=2, + num_hidden_layers=1, + vocab_size=32, + rms_norm_eps=1e-6, + ) + outer_cfg = DummyConfig(text_config=text_cfg) + + monkeypatch.setattr(utils, 'get_model_arch', lambda model_path: ('DummyForCausalLM', outer_cfg)) + + assert utils.load_model_config('/fake/model') is text_cfg + + +def test_load_model_config_returns_outer_object_without_text_config(monkeypatch): + from lmdeploy.turbomind.models import utils + + cfg = DummyConfig( + hidden_size=16, + num_attention_heads=2, + num_hidden_layers=1, + vocab_size=32, + rms_norm_eps=1e-6, + ) + + monkeypatch.setattr(utils, 'get_model_arch', lambda model_path: ('DummyForCausalLM', cfg)) + + assert utils.load_model_config('/fake/model') is cfg + + +def test_apply_hf_overrides_updates_config_object(): + from lmdeploy.turbomind.converter import _apply_hf_overrides + + cfg = DummyConfig(hidden_size=16, rope_scaling={'type': 'linear', 'factor': 2.0}) + + _apply_hf_overrides(cfg, { + 'hidden_size': 32, + 'rope_scaling': {'factor': 4.0}, + 'new_field': 'kept', + }) + + assert cfg.hidden_size == 32 + assert cfg.rope_scaling == {'type': 'linear', 'factor': 4.0} + assert cfg.new_field == 'kept' + + +def test_apply_hf_overrides_updates_nested_config_object(): + from lmdeploy.turbomind.converter import _apply_hf_overrides + + cfg = DummyConfig(llm_config=DummyConfig(hidden_size=16, rope_scaling={'type': 'linear', 'factor': 2.0})) + + _apply_hf_overrides(cfg, { + 'llm_config': { + 'hidden_size': 32, + 'rope_scaling': {'factor': 4.0}, + }, + }) + + assert cfg.llm_config.hidden_size == 32 + assert cfg.llm_config.rope_scaling == {'type': 'linear', 'factor': 4.0} + + +def test_parse_rope_param_reads_config_object_fields(): + from lmdeploy.turbomind.models.utils import parse_rope_param + + cfg = DummyConfig( + rope_theta=500000.0, + max_position_embeddings=4096, + rope_scaling={ + 'rope_type': 'llama3', + 'factor': 8.0, + 'low_freq_factor': 1.0, + 'high_freq_factor': 4.0, + 'original_max_position_embeddings': 8192, + }, + ) + + rope, max_pos = parse_rope_param(cfg, head_dim=128) + + assert max_pos == 4096 + assert rope.type == 'llama3' + assert rope.base == 500000.0 + assert rope.dim == 128 + assert rope.factor == 8.0 + assert rope.low_freq_factor == 1.0 + assert rope.high_freq_factor == 4.0 + assert rope.original_max_position_embeddings == 8192 + + +def test_make_attention_config_reads_only_common_attention_fields(): + from lmdeploy.turbomind.models.utils import make_attention_config + + cfg = DummyConfig( + hidden_size=16, + num_attention_heads=4, + num_key_value_heads=2, + head_dim=8, + rope_theta=10000.0, + max_position_embeddings=128, + ) + engine_cfg = SimpleNamespace(attn_tp_size=2) + + attn_cfg = make_attention_config( + cfg, + engine_cfg, + data_type=_tm.DataType.TYPE_FP16, + ) + + assert attn_cfg.hidden_dim == 16 + assert attn_cfg.head_num == 4 + assert attn_cfg.kv_head_num == 2 + assert attn_cfg.head_dim == 8 + assert attn_cfg.tp_size == 2 + assert attn_cfg.data_type == _tm.DataType.TYPE_FP16 + assert attn_cfg.rope.dim == 8 + + +def test_make_attention_config_applies_rope_scaling_factor_override(): + from lmdeploy.turbomind.models.utils import make_attention_config + + cfg = DummyConfig( + hidden_size=16, + num_attention_heads=4, + num_key_value_heads=4, + head_dim=4, + rope_theta=10000.0, + max_position_embeddings=128, + ) + engine_cfg = SimpleNamespace(attn_tp_size=2, rope_scaling_factor=2.0) + + attn_cfg = make_attention_config( + cfg, + engine_cfg, + data_type=_tm.DataType.TYPE_FP16, + ) + + assert attn_cfg.rope.factor == 2.0 + assert attn_cfg.rope.max_position_embeddings == 128 + + +def test_model_weight_and_ffn_helpers_read_module_fields(): + from lmdeploy.turbomind.builders import _act_type_id + from lmdeploy.turbomind.models.utils import make_ffn_config, make_model_weight_config + + cfg = DummyConfig(hidden_size=16, intermediate_size=64) + engine_cfg = SimpleNamespace(attn_tp_size=2, attn_cp_size=1, mlp_tp_size=4) + + root_cfg = make_model_weight_config( + cfg, + engine_cfg, + data_type=_tm.DataType.TYPE_FP16, + ) + ffn_cfg = make_ffn_config( + cfg, + engine_cfg, + data_type=_tm.DataType.TYPE_FP16, + act_type=_act_type_id('silu'), + ) + + assert root_cfg.hidden_units == 16 + assert root_cfg.tp_size == 2 + assert root_cfg.data_type == _tm.DataType.TYPE_FP16 + assert ffn_cfg.hidden_dim == 16 + assert ffn_cfg.inter_size == 64 + assert ffn_cfg.tp_size == 4 + assert ffn_cfg.data_type == _tm.DataType.TYPE_FP16 + assert ffn_cfg.act_type == _act_type_id('silu') + + +def test_make_moe_config_returns_populated_config(): + from lmdeploy.turbomind.models.utils import make_moe_config + + cfg = DummyConfig(hidden_size=16) + engine_cfg = SimpleNamespace(mlp_tp_size=4) + + moe_cfg = make_moe_config( + cfg, engine_cfg, + data_type=_tm.DataType.TYPE_FP16, + experts_per_token=4, + ) + + assert moe_cfg.method == 1 + assert moe_cfg.experts_per_token == 4 + assert moe_cfg.norm_topk_prob is True + assert moe_cfg.shared_gate is False + assert moe_cfg.routed_scale == 1.0 + assert moe_cfg.router_bias is False + assert moe_cfg.topk_group == 1 + assert moe_cfg.topk_method == 'greedy' + assert moe_cfg.n_group == 1 + assert moe_cfg.scoring_func == 'softmax' + assert moe_cfg.router_n_groups == 0 + assert moe_cfg.hidden_dim == 16 + assert moe_cfg.mlp_bias is False + assert moe_cfg.data_type == _tm.DataType.TYPE_FP16 + assert moe_cfg.tp_size == 4 + assert moe_cfg.act_type == 0 # silu + assert moe_cfg.fuse_silu is True + + +def test_make_moe_config_overrides_defaults(): + from lmdeploy.turbomind.models.utils import make_moe_config + + cfg = DummyConfig(hidden_size=32) + engine_cfg = SimpleNamespace(mlp_tp_size=2) + + moe_cfg = make_moe_config( + cfg, engine_cfg, + data_type=_tm.DataType.TYPE_BF16, + experts_per_token=8, + act_type=1, + norm_topk_prob=False, + shared_gate=True, + router_bias=True, + mlp_bias=True, + topk_method='noaux_tc', + scoring_func='sigmoid', + routed_scale=2.0, + topk_group=2, + n_group=2, + router_n_groups=4, + ) + + assert moe_cfg.experts_per_token == 8 + assert moe_cfg.act_type == 1 + assert moe_cfg.norm_topk_prob is False + assert moe_cfg.shared_gate is True + assert moe_cfg.router_bias is True + assert moe_cfg.mlp_bias is True + assert moe_cfg.topk_method == 'noaux_tc' + assert moe_cfg.scoring_func == 'sigmoid' + assert moe_cfg.routed_scale == 2.0 + assert moe_cfg.topk_group == 2 + assert moe_cfg.n_group == 2 + assert moe_cfg.router_n_groups == 4 + assert moe_cfg.hidden_dim == 32 + assert moe_cfg.data_type == _tm.DataType.TYPE_BF16 + assert moe_cfg.tp_size == 2 + +def _engine_cfg(): + return SimpleNamespace( + rope_scaling_factor=0, + attn_tp_size=2, + attn_cp_size=1, + mlp_tp_size=4, + ) + + +def _resolver(): + return SimpleNamespace(data_type=_tm.DataType.TYPE_FP16) + + +def test_llama_constructor_preserves_common_config_fields(): + from lmdeploy.turbomind.models.llama import LlamaModel + + cfg = DummyConfig( + num_hidden_layers=2, + vocab_size=128, + rms_norm_eps=1e-6, + tie_word_embeddings=False, + model_type='llama', + num_attention_heads=4, + num_key_value_heads=2, + hidden_size=32, + head_dim=8, + max_position_embeddings=1024, + intermediate_size=64, + attention_bias=False, + ) + + model = LlamaModel(cfg, _engine_cfg(), resolver=_resolver()) + + assert model.cfg is cfg + assert model.cfg.num_hidden_layers == 2 + assert model.cfg.vocab_size == 128 + assert model.cfg.rms_norm_eps == 1e-6 + assert model._attn_cfg.hidden_dim == 32 + assert model._attn_cfg.head_num == 4 + assert model._attn_cfg.kv_head_num == 2 + assert not hasattr(model, '_head_dim') + assert not hasattr(model, '_head_num') + assert not hasattr(model, '_kv_head_num') + assert not hasattr(model, '_rope') + assert model._attn_cfg.has_bias is False + assert model._ffn_cfg.inter_size == 64 + assert model._ffn_cfg.tp_size == 4 + + +def test_qwen2_constructor_keeps_qkv_bias_local(): + from lmdeploy.turbomind.models.qwen2 import Qwen2Model + + cfg = DummyConfig( + num_hidden_layers=2, + vocab_size=128, + rms_norm_eps=1e-6, + tie_word_embeddings=False, + model_type='qwen2', + num_attention_heads=4, + num_key_value_heads=2, + hidden_size=32, + head_dim=8, + max_position_embeddings=1024, + intermediate_size=64, + qkv_bias=True, + ) + + model = Qwen2Model(cfg, _engine_cfg(), resolver=_resolver()) + + assert model._attn_cfg.has_bias is True + assert model._ffn_cfg.inter_size == 64 + + +def test_gpt_oss_constructor_keeps_sliding_window_local(): + from lmdeploy.turbomind.models.gpt_oss import GptOssModel + + cfg = DummyConfig( + num_hidden_layers=2, + vocab_size=128, + rms_norm_eps=1e-6, + tie_word_embeddings=False, + model_type='gpt-oss', + num_attention_heads=4, + num_key_value_heads=2, + hidden_size=32, + head_dim=8, + max_position_embeddings=1024, + intermediate_size=64, + attention_bias=True, + num_local_experts=4, + experts_per_token=2, + layer_types=['sliding_attention', 'full_attention'], + sliding_window=256, + ) + + model = GptOssModel(cfg, _engine_cfg(), resolver=_resolver()) + + assert model._attn_cfg.attn_sink is True + assert model._attn_cfg.has_bias == 1 + assert model._window_sizes == [256, 0] + assert model._expert_nums == [4, 4] diff --git a/tests/test_lmdeploy/test_turbomind/test_transform_tensors.py b/tests/test_lmdeploy/test_turbomind/test_transform_tensors.py new file mode 100644 index 0000000000..541bb1c4da --- /dev/null +++ b/tests/test_lmdeploy/test_turbomind/test_transform_tensors.py @@ -0,0 +1,362 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""Tests for the @transform_output_dim decorator.""" + +from __future__ import annotations + +import importlib +import importlib.util +import os +import sys +import types + +import torch + +# --------------------------------------------------------------------------- +# Bootstrap: make _turbomind available as a lightweight stub so that +# ``lmdeploy.turbomind.linear`` and ``_base`` can be imported without +# the real C extension. +# --------------------------------------------------------------------------- + +_repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..')) + + +def _setup_fake_tm(): + """Register a minimal ``_turbomind`` stub in ``sys.modules``.""" + if '_turbomind' in sys.modules: + return + + tm = types.ModuleType('_turbomind') + + class DataType: + TYPE_FP32 = 0 + TYPE_FP16 = 1 + TYPE_BF16 = 2 + TYPE_INVALID = 3 + TYPE_INT32 = 4 + TYPE_INT64 = 5 + TYPE_INT8 = 6 + TYPE_UINT8 = 7 + TYPE_UINT4 = 8 + TYPE_FP8_E4M3 = 9 + TYPE_FP4_E2M1 = 10 + + tm.DataType = DataType + + # Stub functions / classes referenced throughout turbomind/ + tm.create_module = lambda cfg: None + tm.LinearConfig = type('LinearConfig', (), {})() + tm.ResolveLinearWeightFormat = lambda *a, **kw: None + + sys.modules['_turbomind'] = tm + + +_setup_fake_tm() + +# --------------------------------------------------------------------------- +# Import modules under test by loading their files directly so we avoid +# triggering the ``lmdeploy.turbomind`` package __init__ (which drags in +# the real TurboMind runtime). +# --------------------------------------------------------------------------- + +# Ensure ``lmdeploy`` top-level is importable. +import lmdeploy # noqa: F401 + +# Register the sub-package stubs so that relative imports resolve. +_turbomind_pkg = sys.modules.get('lmdeploy.turbomind') +if _turbomind_pkg is None: + _turbomind_pkg = types.ModuleType('lmdeploy.turbomind') + _turbomind_pkg.__path__ = [os.path.join(_repo_root, 'lmdeploy', 'turbomind')] + _turbomind_pkg.__package__ = 'lmdeploy.turbomind' + sys.modules['lmdeploy.turbomind'] = _turbomind_pkg + +# (No longer need 'lmdeploy.turbomind.deploy' stub -- deploy/ was promoted.) + + +def _load_module_from_file(mod_name: str, file_path: str): + """Load a Python module from *file_path* and register it as *mod_name*.""" + spec = importlib.util.spec_from_file_location(mod_name, file_path) + mod = importlib.util.module_from_spec(spec) + sys.modules[mod_name] = mod + spec.loader.exec_module(mod) + return mod + + +# Load linear.py first — weight_format.py imports from .linear at module level. +_linear_path = os.path.join(_repo_root, 'lmdeploy', 'turbomind', 'linear.py') +_linear_mod = _load_module_from_file('lmdeploy.turbomind.linear', _linear_path) +Linear = _linear_mod.Linear + +# Load weight_format (needed by _base for TrivialFormat) +_wf_path = os.path.join(_repo_root, 'lmdeploy', 'turbomind', 'weight_format.py') +_load_module_from_file('lmdeploy.turbomind.weight_format', _wf_path) + +# Load builder/_base.py +_base_path = os.path.join(_repo_root, 'lmdeploy', 'turbomind', 'builders', '_base.py') +_base_mod = _load_module_from_file('lmdeploy.turbomind.builders._base', _base_path) +transform_output_dim = _linear_mod.transform_output_dim +transform_input_dim = _linear_mod.transform_input_dim + +# Register builder sub-package +_builder_pkg = sys.modules.get('lmdeploy.turbomind.builders') +if _builder_pkg is None: + _builder_pkg = types.ModuleType('lmdeploy.turbomind.builders') + _builder_pkg.__path__ = [os.path.join(_repo_root, 'lmdeploy', 'turbomind', 'builders')] + _builder_pkg.__package__ = 'lmdeploy.turbomind.builders' + sys.modules['lmdeploy.turbomind.builders'] = _builder_pkg + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_linear(out_dim: int, in_dim: int | None = None, + has_bias: bool = False) -> Linear: + """Create a trivial Linear for testing. + + If *in_dim* is given the weight is 2-D (in_dim, out_dim); otherwise it is 1-D (out_dim,) -- simulating a bias-only + tensor. + """ + tensors: dict[str, torch.Tensor] = {} + if in_dim is not None: + tensors['weight'] = torch.randn(in_dim, out_dim) + else: + tensors['weight'] = torch.randn(out_dim) + if has_bias: + tensors['bias'] = torch.randn(out_dim) + return Linear(tensors=tensors, + weight_format='placeholder') + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +class TestTransformTensors: + + # -- 1-in / 1-out ------------------------------------------------------- + + def test_1in_1out_2d_weight_only(self): + """1-in/1-out with a 2-D weight tensor.""" + + @transform_output_dim + def double(x: torch.Tensor) -> torch.Tensor: + return x * 2 + + lin = _make_linear(out_dim=8, in_dim=4) + result = double(lin) + assert isinstance(result, Linear) + assert set(result.tensors) == {'weight'} + assert result.tensors['weight'].shape == (4, 8) + assert torch.allclose(result.tensors['weight'], + lin.tensors['weight'] * 2) + + def test_1in_1out_1d_bias_only(self): + """1-in/1-out with a 1-D tensor (bias-only shape).""" + + @transform_output_dim + def add_one(x: torch.Tensor) -> torch.Tensor: + return x + 1.0 + + lin = _make_linear(out_dim=6) # 1-D weight + result = add_one(lin) + assert isinstance(result, Linear) + assert result.tensors['weight'].shape == (6,) + assert torch.allclose(result.tensors['weight'], + lin.tensors['weight'] + 1.0) + + def test_1in_1out_mixed_dims(self): + """1-in/1-out with 2-D weight + 1-D bias.""" + + @transform_output_dim + def negate(x: torch.Tensor) -> torch.Tensor: + return -x + + lin = _make_linear(out_dim=5, in_dim=3, has_bias=True) + result = negate(lin) + assert isinstance(result, Linear) + assert set(result.tensors) == {'weight', 'bias'} + # weight stays 2-D + assert result.tensors['weight'].shape == (3, 5) + assert torch.allclose(result.tensors['weight'], + -lin.tensors['weight']) + # bias stays 1-D + assert result.tensors['bias'].shape == (5,) + assert torch.allclose(result.tensors['bias'], + -lin.tensors['bias']) + + # -- 1-in / 2-out (split) ----------------------------------------------- + + def test_1in_2out_split(self): + """1-in/2-out: split one Linear into two along last dim.""" + + @transform_output_dim + def split_in_half(x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + mid = x.shape[-1] // 2 + return x[..., :mid], x[..., mid:] + + lin = _make_linear(out_dim=8, in_dim=4, has_bias=True) + a, b = split_in_half(lin) + assert isinstance(a, Linear) + assert isinstance(b, Linear) + assert a.tensors['weight'].shape == (4, 4) + assert b.tensors['weight'].shape == (4, 4) + assert a.tensors['bias'].shape == (4,) + assert b.tensors['bias'].shape == (4,) + + def test_1in_2out_1d_only(self): + """1-in/2-out with 1-D tensors.""" + + @transform_output_dim + def split_1d(x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + mid = x.shape[-1] // 2 + return x[..., :mid], x[..., mid:] + + lin = _make_linear(out_dim=6) # 1-D + a, b = split_1d(lin) + assert a.tensors['weight'].shape == (3,) + assert b.tensors['weight'].shape == (3,) + + # -- multi-in / 1-out (concat) ------------------------------------------ + + def test_multi_in_1out_concat(self): + """Multi-in/1-out: concatenate three Linears along last dim.""" + + @transform_output_dim + def concat3(a: torch.Tensor, b: torch.Tensor, + c: torch.Tensor) -> torch.Tensor: + return torch.cat([a, b, c], dim=-1) + + la = _make_linear(out_dim=4, in_dim=3, has_bias=True) + lb = _make_linear(out_dim=4, in_dim=3, has_bias=True) + lc = _make_linear(out_dim=4, in_dim=3, has_bias=True) + result = concat3(la, lb, lc) + assert isinstance(result, Linear) + assert result.tensors['weight'].shape == (3, 12) + assert result.tensors['bias'].shape == (12,) + + # -- optional tensor arg ------------------------------------------------- + + def test_optional_tensor_none(self): + """Optional tensor arg passed as None -> inner fn receives None.""" + + @transform_output_dim + def maybe_add(x: torch.Tensor, + y: torch.Tensor | None) -> torch.Tensor: + if y is None: + return x + return x + y + + lin = _make_linear(out_dim=4, in_dim=3) + result = maybe_add(lin, None) + assert isinstance(result, Linear) + assert torch.allclose(result.tensors['weight'], + lin.tensors['weight']) + + def test_optional_tensor_provided(self): + """Optional tensor arg provided -> inner fn receives the tensor.""" + + @transform_output_dim + def maybe_add(x: torch.Tensor, + y: torch.Tensor | None) -> torch.Tensor: + return x + y + + la = _make_linear(out_dim=4, in_dim=3) + lb = _make_linear(out_dim=4, in_dim=3) + result = maybe_add(la, lb) + assert isinstance(result, Linear) + assert torch.allclose(result.tensors['weight'], + la.tensors['weight'] + lb.tensors['weight']) + + # -- format propagation -------------------------------------------------- + + def test_format_propagation(self): + """Output inherits weight_format from first input.""" + + @transform_output_dim + def identity(x: torch.Tensor) -> torch.Tensor: + return x + + lin = _make_linear(out_dim=4, in_dim=3) + object.__setattr__(lin, 'weight_format', 'fake_fmt') + result = identity(lin) + assert result.weight_format == 'fake_fmt' + + # -- kwargs passthrough --------------------------------------------------- + + def test_kwargs_passthrough(self): + """Non-tensor kwargs are forwarded unchanged.""" + + @transform_output_dim + def scale(x: torch.Tensor, factor: float) -> torch.Tensor: + return x * factor + + lin = _make_linear(out_dim=4, in_dim=3) + result = scale(lin, factor=3.0) + assert isinstance(result, Linear) + assert torch.allclose(result.tensors['weight'], + lin.tensors['weight'] * 3.0) + + +class TestTransformInputDim: + + def test_2d_transformed(self): + """2-D tensors are passed through the inner function.""" + + @transform_input_dim + def pad_first_dim(tensor: torch.Tensor, + *, target: int) -> torch.Tensor: + return torch.nn.functional.pad( + tensor, [0, 0, 0, target - tensor.size(0)]) + + lin = _make_linear(out_dim=4, in_dim=2) + result = pad_first_dim(lin, target=6) + assert isinstance(result, Linear) + assert result.tensors['weight'].shape == (6, 4) + + def test_1d_passthrough(self): + """1-D tensors (bias) pass through unchanged.""" + + @transform_input_dim + def pad_first_dim(tensor: torch.Tensor, + *, target: int) -> torch.Tensor: + return torch.nn.functional.pad( + tensor, [0, 0, 0, target - tensor.size(0)]) + + lin = _make_linear(out_dim=4) # 1-D weight + result = pad_first_dim(lin, target=6) + assert isinstance(result, Linear) + assert result.tensors['weight'].shape == (4,) # unchanged + + def test_mixed_dims_2d_transformed_1d_passthrough(self): + """2-D weight is transformed; 1-D bias passes through.""" + + @transform_input_dim + def double_input_dim(tensor: torch.Tensor) -> torch.Tensor: + return tensor.repeat(2, 1) + + lin = _make_linear(out_dim=4, in_dim=3, has_bias=True) + result = double_input_dim(lin) + assert isinstance(result, Linear) + assert set(result.tensors) == {'weight', 'bias'} + assert result.tensors['weight'].shape == (6, 4) # doubled + assert result.tensors['bias'].shape == (4,) # unchanged + + def test_1in_2out_distributes_1d(self): + """Multi-output: 1-D tensors duplicated into all output buckets.""" + + @transform_input_dim + def split_input(tensor: torch.Tensor + ) -> tuple[torch.Tensor, torch.Tensor]: + mid = tensor.size(0) // 2 + return tensor[:mid], tensor[mid:] + + lin = _make_linear(out_dim=4, in_dim=6, has_bias=True) + a, b = split_input(lin) + assert isinstance(a, Linear) + assert isinstance(b, Linear) + assert a.tensors['weight'].shape == (3, 4) + assert b.tensors['weight'].shape == (3, 4) + assert a.tensors['bias'].shape == (4,) # duplicated + assert b.tensors['bias'].shape == (4,) # duplicated diff --git a/tests/test_lmdeploy/test_turbomind/test_weight_format_resolver.py b/tests/test_lmdeploy/test_turbomind/test_weight_format_resolver.py new file mode 100644 index 0000000000..2810205ad4 --- /dev/null +++ b/tests/test_lmdeploy/test_turbomind/test_weight_format_resolver.py @@ -0,0 +1,253 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""Tests for WeightFormatResolver dispatch logic. + +Uses a lightweight fake WeightFormat subclass that stubs out +``make_data_format`` so the resolver can be exercised without the real +``_turbomind`` extension. +""" +from __future__ import annotations + +import importlib +import importlib.util +import os +import sys +import types + +import pytest +import torch + +# --------------------------------------------------------------------------- +# _turbomind stub (same pattern as test_transform_tensors.py) +# --------------------------------------------------------------------------- + +_repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..')) + + +def _setup_fake_tm(): + """Ensure ``_turbomind`` in sys.modules has every attribute the resolver + and weight_format class bodies touch. + + Idempotent: augments whatever is + already there so running after test_transform_tensors.py (which sets up + a minimal stub) still leaves a usable module. + """ + tm = sys.modules.get('_turbomind') + if tm is None: + tm = types.ModuleType('_turbomind') + sys.modules['_turbomind'] = tm + + dt = getattr(tm, 'DataType', None) + if dt is None: + class DataType: + pass + dt = DataType + tm.DataType = dt + + # Class-body references in weight_format.py and builder/_base.py + # (_STR_TO_DTYPE, _TORCH_TO_CPP) need these specific names present at + # module load time. + for name, val in (('TYPE_FP32', 0), ('TYPE_FP16', 1), ('TYPE_BF16', 2), + ('TYPE_INVALID', 3), ('TYPE_INT32', 4), + ('TYPE_INT64', 5), ('TYPE_INT8', 6), + ('TYPE_UINT8', 7), ('TYPE_UINT4', 10), + ('TYPE_FP8_E4M3', 11), ('TYPE_FP4_E2M1', 12)): + if not hasattr(dt, name): + setattr(dt, name, val) + + if not hasattr(tm, 'ResolveLinearWeightFormat'): + tm.ResolveLinearWeightFormat = lambda d, w, bi, bo: ('DataFormat', d, w, bi, bo) + + +_setup_fake_tm() + +# Register package stubs. +import lmdeploy # noqa: F401 + +for _pkg in ('lmdeploy.turbomind',): + if _pkg not in sys.modules: + mod = types.ModuleType(_pkg) + mod.__path__ = [os.path.join(_repo_root, *_pkg.split('.'))] + mod.__package__ = _pkg + sys.modules[_pkg] = mod + + +def _load(mod_name, file_rel_path): + path = os.path.join(_repo_root, *file_rel_path.split('/')) + spec = importlib.util.spec_from_file_location(mod_name, path) + mod = importlib.util.module_from_spec(spec) + sys.modules[mod_name] = mod + spec.loader.exec_module(mod) + return mod + + +_linear_mod = _load('lmdeploy.turbomind.linear', + 'lmdeploy/turbomind/linear.py') +_wf_mod = _load('lmdeploy.turbomind.weight_format', + 'lmdeploy/turbomind/weight_format.py') + +Linear = _linear_mod.Linear +WeightFormat = _wf_mod.WeightFormat +WeightFormatResolver = _wf_mod.WeightFormatResolver + + +# --------------------------------------------------------------------------- +# Fake format used by the tests +# --------------------------------------------------------------------------- + + +class _FakeQuant(WeightFormat): + """Accepts when a ``.qfoo`` tensor is present. + + ``normalize`` is identity. + """ + name = 'fakeq' + suffix_map = {'.qfoo': 'weight', '.scales': 'scales', '.bias': 'bias'} + weight_dtype = 0 # TYPE_FP32 from our stub + has_zero_point = False + + def __init__(self, *, block_in=None, block_out=None): + super().__init__(block_in=block_in, block_out=block_out) + + def accepts(self, available): + return '.qfoo' in available + + def normalize(self, x, kind): + return x + + +class _FakeQuantWithZeros(_FakeQuant): + name = 'fakeqz' + suffix_map = {'.qfoo': 'weight', '.scales': 'scales', + '.qzeros': 'zeros', '.bias': 'bias'} + has_zero_point = True + + def synthesize_zeros(self, scales): + return torch.zeros_like(scales) + + +class _FakeTrivial(WeightFormat): + name = 'faketr' + suffix_map = {'.weight': 'weight', '.bias': 'bias'} + weight_dtype = None + has_zero_point = False + + def accepts(self, available): + return available.keys() <= {'.weight', '.bias'} and '.weight' in available + + def normalize(self, x, kind): + return x + + def dequant(self, tensors, data_type): + return tensors + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +class TestResolveQuantized: + + def _make_resolver(self): + return WeightFormatResolver( + data_type=0, + formats=[_FakeQuant(), _FakeTrivial()]) + + def test_quant_prefix_picks_quant_format(self): + params = { + 'layer.qfoo': torch.randn(4, 4), + 'layer.scales': torch.randn(1, 4), + } + lin = self._make_resolver().resolve(params, 'layer') + assert isinstance(lin.weight_format, _FakeQuant) + assert set(lin.tensors) == {'weight', 'scales'} + + def test_trivial_prefix_falls_through(self): + params = {'layer.weight': torch.randn(4, 4)} + lin = self._make_resolver().resolve(params, 'layer') + assert isinstance(lin.weight_format, _FakeTrivial) + + +class TestResolveFailureModes: + + def _make_resolver(self): + return WeightFormatResolver( + data_type=0, + formats=[_FakeQuant(), _FakeTrivial()]) + + def test_missing_prefix_default_raises_key_error(self): + with pytest.raises(KeyError, match='no checkpoint tensors found'): + self._make_resolver().resolve({}, 'missing.prefix') + + def test_missing_prefix_optional_returns_none(self): + assert self._make_resolver().resolve( + {}, 'missing.prefix', optional=True) is None + + def test_tensors_present_no_match_raises_value_error(self): + class _PickyTrivial(_FakeTrivial): + def accepts(self, available): + return False + + resolver = WeightFormatResolver( + data_type=0, + formats=[_FakeQuant(), _PickyTrivial()]) + params = {'layer.weight': torch.randn(4, 4)} + with pytest.raises(ValueError, match='no weight format accepts'): + resolver.resolve(params, 'layer') + + +class TestIndexedProbe: + + def test_index_slices_available_tensors(self): + resolver = WeightFormatResolver( + data_type=0, formats=[_FakeTrivial()]) + params = {'experts.weight': torch.arange(24).reshape(3, 4, 2).float()} + lin = resolver.resolve(params, 'experts', index=1) + assert lin.tensors['weight'].shape == (4, 2) + torch.testing.assert_close( + lin.tensors['weight'], + torch.arange(8, 16).reshape(4, 2).float()) + + +class TestZerosSynthesis: + + def test_synthesize_zeros_called_when_missing(self): + params = { + 'layer.qfoo': torch.randn(4, 4), + 'layer.scales': torch.ones(1, 4), + } + resolver = WeightFormatResolver( + data_type=0, formats=[_FakeQuantWithZeros()]) + lin = resolver.resolve(params, 'layer') + assert 'zeros' in lin.tensors + torch.testing.assert_close( + lin.tensors['zeros'], torch.zeros(1, 4)) + + def test_synthesize_zeros_skipped_when_present(self): + scales = torch.ones(1, 4) + supplied = torch.full_like(scales, 5.0) + params = { + 'layer.qfoo': torch.randn(4, 4), + 'layer.scales': scales, + 'layer.qzeros': supplied, + } + resolver = WeightFormatResolver( + data_type=0, formats=[_FakeQuantWithZeros()]) + lin = resolver.resolve(params, 'layer') + torch.testing.assert_close(lin.tensors['zeros'], supplied) + + +class TestEquality: + + def test_same_class_same_blocks_equal(self): + a = _FakeQuant(block_in=128) + b = _FakeQuant(block_in=128) + assert a == b + assert hash(a) == hash(b) + assert {a, b} == {a} + + def test_different_blocks_unequal(self): + assert _FakeQuant(block_in=128) != _FakeQuant(block_in=64) + + def test_different_classes_unequal(self): + assert _FakeQuant() != _FakeTrivial()