diff --git a/.gitignore b/.gitignore
index 3e9609afe1..5a8a9866a4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -45,6 +45,7 @@ htmlcov/
 .coverage.*
 .cache
 *build*/
+!lmdeploy/turbomind/builders
 !builder/
 lmdeploy/lib/
 lmdeploy/bin/
@@ -83,3 +84,6 @@ work_dir*/
 
 !CMakeLists.txt
 proxy_config.yml
+
+# Claude Code local config
+CLAUDE.local.md
diff --git a/lmdeploy/turbomind/builders/__init__.py b/lmdeploy/turbomind/builders/__init__.py
new file mode 100644
index 0000000000..922386d9c4
--- /dev/null
+++ b/lmdeploy/turbomind/builders/__init__.py
@@ -0,0 +1,31 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""Builder sub-package — spec-driven module loading for TurboMind."""
+from __future__ import annotations
+
+from ._base import Builder, BuiltModule, SplitSide, _act_type_id, _cpp_dtype, _torch_dtype_to_cpp
+from .attention import AttentionBuilder
+from .decoder_layer import DecoderLayerBuilder, DecoderLayerConfig
+from .deltanet import DeltaNetBuilder
+from .ffn import FfnBuilder, fuse_w1w3
+from .mla import MLABuilder
+from .module_list import ModuleListBuilder, ModuleListConfig
+from .moe import MoeBuilder
+from .norm import NormBuilder, make_norm_config
+from .text_model import TextModelBuilder
+
+__all__ = [
+    # Base
+    'Builder', 'BuiltModule', 'TextModelBuilder', 'SplitSide',
+    '_cpp_dtype', '_act_type_id', '_torch_dtype_to_cpp',
+    # Builders
+    'AttentionBuilder', 'FfnBuilder', 'MoeBuilder',
+    'DeltaNetBuilder', 'MLABuilder',
+    'DecoderLayerBuilder', 'ModuleListBuilder',
+    'NormBuilder',
+    # Primitive config wrappers
+    'make_norm_config',
+    # C++ config re-exports
+    'DecoderLayerConfig', 'ModuleListConfig',
+    # Helper functions
+    'fuse_w1w3',
+]
diff --git a/lmdeploy/turbomind/builders/_base.py b/lmdeploy/turbomind/builders/_base.py
new file mode 100644
index 0000000000..f2c8d08d5a
--- /dev/null
+++ b/lmdeploy/turbomind/builders/_base.py
@@ -0,0 +1,425 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+import enum
+
+import _turbomind as _tm
+import torch
+
+from ..linear import Linear
+
+# ---------------------------------------------------------------------------
+# SplitSide enum (internal -- not exposed to specs)
+# ---------------------------------------------------------------------------
+
+
+class SplitSide(enum.Enum):
+    """Semantic TP split direction for commit operations.
+
+    OUTPUT -- column-parallel: split along the output dimension (axis -1)
+    INPUT  -- row-parallel:    split along the input dimension  (axis  0)
+    """
+
+    OUTPUT = 'output'
+    INPUT = 'input'
+
+
+# ---------------------------------------------------------------------------
+# Canonical dtype mappings (moved from commit.py)
+# ---------------------------------------------------------------------------
+
+_STR_TO_DTYPE: dict[str, _tm.DataType] = {
+    'float32':  _tm.DataType.TYPE_FP32,
+    'float16':  _tm.DataType.TYPE_FP16,
+    'bfloat16': _tm.DataType.TYPE_BF16,
+}
+
+_TORCH_TO_CPP: dict[torch.dtype, _tm.DataType] = {
+    torch.float32:  _tm.DataType.TYPE_FP32,
+    torch.float16:  _tm.DataType.TYPE_FP16,
+    torch.bfloat16: _tm.DataType.TYPE_BF16,
+    torch.int32:    _tm.DataType.TYPE_INT32,
+    torch.int64:    _tm.DataType.TYPE_INT64,
+    torch.int8:     _tm.DataType.TYPE_INT8,
+    torch.uint8:    _tm.DataType.TYPE_UINT8,
+}
+
+_CPP_TO_TORCH: dict[_tm.DataType, torch.dtype] = {v: k for k, v in _TORCH_TO_CPP.items()}
+
+_SPLIT_SIDE_TO_DIM: dict[SplitSide, int] = {SplitSide.OUTPUT: -1, SplitSide.INPUT: 0}
+
+
+# ---------------------------------------------------------------------------
+# Dtype / format helpers (moved from commit.py)
+# ---------------------------------------------------------------------------
+
+
+def _cpp_dtype(dtype_str: str):
+    """Convert a model-config data_type string to C++ DataType enum."""
+    return _STR_TO_DTYPE[dtype_str]
+
+
+def _act_type_id(act_str: str) -> int:
+    """Convert activation_type string to C++ ActivationType enum value."""
+    return {'silu': 0, 'gpt-oss': 1}.get(act_str, 0)
+
+
+def _torch_dtype_to_cpp(dtype: torch.dtype):
+    """Convert a torch dtype to the C++ ``DataType`` enum, or ``None``."""
+    return _TORCH_TO_CPP.get(dtype)
+
+
+def _cast_shard_for_tm(shard: torch.Tensor, tm_tensor) -> torch.Tensor:
+    """Cast *shard* dtype to match *tm_tensor*'s C++ dtype when needed."""
+    if tm_tensor.type == _tm.DataType.TYPE_FP32 and shard.dtype in (torch.float16, torch.bfloat16):
+        return shard.float()
+    if tm_tensor.type == _tm.DataType.TYPE_FP16 and shard.dtype != torch.float16:
+        return shard.half()
+    if tm_tensor.type == _tm.DataType.TYPE_BF16 and shard.dtype != torch.bfloat16:
+        return shard.to(torch.bfloat16)
+    return shard
+
+
+
+def _copy_shard_to_param(handle, param_name: str, shard: torch.Tensor, *,
+                         alloc_shape: list[int] | None = None,
+                         alloc_dtype=None) -> None:
+    """Move shard to GPU, allocate the C++ param slot, cast, and copy.
+
+    Invariant: ``dst.byte_size == shard.nbytes`` after the cast.  Upstream
+    is responsible for any padding/reshape needed to satisfy this.  A
+    mismatch raises immediately.
+
+    ``alloc_shape`` / ``alloc_dtype`` default to the shard's own shape /
+    dtype.  Override only to express shape/dtype *relabels* where byte
+    size is preserved (e.g. quantized weight: physical int32
+    [in, out/8] stored in a logical UINT4 [in, out] C++ slot).
+    """
+    if not shard.is_cuda:
+        shard = shard.cuda(0).contiguous()
+    elif not shard.is_contiguous():
+        shard = shard.contiguous()
+
+    if alloc_shape is None:
+        alloc_shape = list(shard.shape)
+    if alloc_dtype is None:
+        alloc_dtype = _torch_dtype_to_cpp(shard.dtype)
+
+    dst = handle.param(param_name).alloc(alloc_shape, alloc_dtype)
+    shard = _cast_shard_for_tm(shard, dst)
+    assert dst.byte_size == shard.nbytes, (
+        f'{param_name}: alloc byte_size={dst.byte_size} != '
+        f'shard.nbytes={shard.nbytes}')
+    dst.copy_from(shard)
+
+
+def _shard(tensor: torch.Tensor, split_dim: int | None, tp: int,
+           rank: int) -> torch.Tensor:
+    """Return the ``rank``-th split along ``split_dim``, or the tensor
+    unchanged.
+
+    Used wherever a TP shard is selected from a broadcast-by-default
+    tensor.  A ``split_dim`` of ``None`` or ``tp <= 1`` returns the tensor
+    untouched.
+    """
+    if split_dim is None or tp <= 1:
+        return tensor
+    return tensor.split(tensor.shape[split_dim] // tp, dim=split_dim)[rank]
+
+
+# ---------------------------------------------------------------------------
+# Builder base class
+# ---------------------------------------------------------------------------
+
+
+class BuiltModule:
+    """Opaque handle bundle returned by ``Builder.build()``.
+
+    Wraps a list of per-GPU C++ module handles.  Iteration and len delegate
+    to the underlying list so callers can ``zip(BuiltModule, contexts)`` etc.
+    """
+
+    __slots__ = ('handles',)
+
+    def __init__(self, handles):
+        self.handles = handles
+
+    def __iter__(self):
+        return iter(self.handles)
+
+    def __len__(self):
+        return len(self.handles)
+
+
+class Context:
+    """Bundle of per-GPU contexts and the model compute dtype."""
+    def __init__(self, devices, data_type):
+        self.devices = devices
+        self.data_type = data_type
+
+
+class ParallelGroup:
+    """Bundle a parallelism size with per-GPU TP ranks."""
+    def __init__(self, size, ranks):
+        self.size = size
+        self.ranks = ranks
+
+
+class Builder:
+    """Wraps N GPU handles for a single logical module.
+
+    Distributes module creation, child binding, and weight commits
+    across all GPUs with bound TP configuration.
+
+    Subclasses specialize for particular module types (e.g. attention,
+    FFN, MoE).
+
+    Lifecycle: stage commits -> build() -> BuiltModule (frozen).
+    After ``build()`` the Builder is inert — further commits or child
+    attachments raise.
+    """
+
+    def __init__(self, config, ctx):
+        """Initialise the builder with staging dicts.
+
+        Parameters
+        ----------
+        config : C++ config struct
+            Config with ``clone()`` method.
+        ctx : Context
+            Per-GPU context handles + model compute dtype.
+        """
+        # `_built` must be set first: __setattr__ reads it inside the
+        # BuiltModule branch.  Bool is not a BuiltModule, so the normal
+        # fall-through assigns it via object.__setattr__ at the end of
+        # __setattr__.
+        self._built = False
+        self._ctx = ctx
+        self.tp = ParallelGroup(1, None)   # default: no TP
+        self.config = config
+        if hasattr(self.config, 'data_type'):
+            self.config.data_type = ctx.data_type
+        self._pending_tensors = {}
+        self._pending_children = {}
+        self._handles = None
+
+    # ------------------------------------------------------------------
+    # Child binding via attribute assignment
+    # ------------------------------------------------------------------
+
+    def __setattr__(self, name: str, value):
+        if isinstance(value, Builder):
+            raise TypeError(
+                f'{type(self).__name__}.{name}: assign .build() output '
+                f'(BuiltModule), not the Builder itself')
+        if isinstance(value, BuiltModule):
+            if self._built:
+                raise RuntimeError(
+                    f'{type(self).__name__} is built; '
+                    f'cannot assign {name!r}')
+            self._add_child(name, value.handles)
+            return
+        object.__setattr__(self, name, value)
+
+    # ------------------------------------------------------------------
+    # Internal helpers
+    # ------------------------------------------------------------------
+
+    @property
+    def tp_size(self):
+        return self.tp.size
+
+    def _rank_for(self, gpu_idx: int) -> int:
+        if self.tp.ranks and self.tp.size > 1:
+            return self.tp.ranks[gpu_idx]
+        return 0
+
+    # ------------------------------------------------------------------
+    # Add methods — stage into pending dicts (pre-build only)
+    # ------------------------------------------------------------------
+
+    def _add_linear(self, name: str, linear: Linear,
+                       split_side: SplitSide | None = None):
+        """Create standalone LinearWeight modules and copy tensor data.
+
+        Creates per-GPU LinearWeight modules via ``_tm.create_module``
+        at commit time.  Attachment to the parent module is deferred to
+        ``build()`` via ``_commit_child``.
+        """
+        assert not self._built, (
+            f"{type(self).__name__} is built; commit '{name}' rejected")
+
+        w = linear.tensors.get('weight')
+        if w is None:
+            return
+
+        # --- GPU-invariant preparation -------------------------------------
+        fmt = linear.weight_format
+
+        tp = self.tp.size if split_side else 1
+        split_dim = _SPLIT_SIDE_TO_DIM.get(split_side) if split_side else None
+
+        in_dim, out_dim = w.shape[0], w.shape[-1]
+        if split_side == SplitSide.OUTPUT:
+            out_dim //= tp
+        elif split_side == SplitSide.INPUT:
+            in_dim //= tp
+
+        compute_dtype = self.config.data_type
+        lin_cfg = _tm.LinearConfig()
+        lin_cfg.input_dim  = in_dim
+        lin_cfg.output_dim = out_dim
+        lin_cfg.data_type  = compute_dtype or _tm.DataType.TYPE_INVALID
+        lin_cfg.format     = linear.weight_format.make_data_format(compute_dtype)
+        lin_cfg.has_bias   = 'bias' in linear.tensors
+
+        packed = {k: fmt.pack(t, k) for k, t in linear.tensors.items()}
+        tensors = {k: p.tensor for k, p in packed.items()}
+
+        kind_split_dims = {
+            kind: None if (kind == 'bias' and split_side == SplitSide.INPUT)
+                  else split_dim
+            for kind in tensors
+        }
+
+        if tp > 1 and split_dim is not None:
+            for kind, tensor in tensors.items():
+                kind_split_dim = kind_split_dims[kind]
+                if kind_split_dim is not None:
+                    d = tensor.shape[kind_split_dim]
+                    assert d % tp == 0, (
+                        f'TP split: {name}.{kind} dim {kind_split_dim} '
+                        f'has size {d}, not divisible by tp={tp}.')
+
+        # --- Per-GPU: standalone creation + tensor copy --------------------
+        handles = []
+        for i, ctx in enumerate(self._ctx.devices):
+            with ctx:
+                rank = self._rank_for(i) if tp > 1 else 0
+
+                mod = _tm.create_module(lin_cfg)
+
+                for kind, tensor in tensors.items():
+                    shard = _shard(tensor, kind_split_dims[kind], tp, rank)
+
+                    alloc_shape, alloc_dtype = packed[kind].alloc_shape, \
+                                               packed[kind].alloc_dtype
+                    if alloc_shape is not None and split_dim is not None \
+                            and tp > 1:
+                        alloc_shape = list(alloc_shape)
+                        alloc_shape[split_dim] //= tp
+                    if alloc_dtype is None and kind == 'weight':
+                        alloc_dtype = self.config.data_type
+
+                    _copy_shard_to_param(mod, kind, shard,
+                                         alloc_shape=alloc_shape,
+                                         alloc_dtype=alloc_dtype)
+
+                handles.append(mod)
+
+        self._add_child(name, handles)
+
+    def _add_tensor(self, name: str, tensor: torch.Tensor | None,
+                       split_side: SplitSide | None = None):
+        """Stage a raw-tensor commit under ``name``.
+
+        Applied during
+        ``build()`` in ``_commit_tensor``.
+        """
+        assert not self._built, (
+            f"{type(self).__name__} is built; commit '{name}' rejected")
+        if tensor is not None:
+            self._pending_tensors[name] = (tensor, split_side)
+
+    # ------------------------------------------------------------------
+    # Add helpers
+    # ------------------------------------------------------------------
+
+    def _add_child(self, name: str, handles: list):
+        """Stage pre-created per-GPU ``Module*`` handles under ``name``.
+
+        Applied during ``build()`` in ``_commit_child``.
+        """
+        assert not self._built, (
+            f"{type(self).__name__} is built; commit '{name}' rejected")
+        assert name not in self._pending_children, (
+            f"{type(self).__name__}: duplicate child commit '{name}'")
+        self._pending_children[name] = handles
+
+    # ------------------------------------------------------------------
+    # build() — create handles, drain staged state, return BuiltModule
+    # ------------------------------------------------------------------
+
+    def build(self) -> BuiltModule:
+        """Create C++ module handles and drain all staged state.
+
+        Idempotent on second call — returns the same ``BuiltModule``.
+        """
+        if self._built:
+            return BuiltModule(self._handles)
+
+        self._create_handles()
+
+        # True is not BuiltModule; falls through to plain assignment.
+        self._built = True
+
+        # Drain staged children (linear weights + sub-builder output)
+        for name, handles in self._pending_children.items():
+            self._commit_child(name, handles)
+
+        # Drain staged tensors
+        for name, (tensor, split_side) in self._pending_tensors.items():
+            self._commit_tensor(name, tensor, split_side)
+
+        return BuiltModule(self._handles)
+
+    def _create_handles(self):
+        """Create one C++ module per context via ``_tm.create_module(cfg)``."""
+        handles = []
+        for i, ctx in enumerate(self._ctx.devices):
+            with ctx:
+                cfg = self._cfg_for_rank(i)
+                handle = _tm.create_module(cfg)
+                handles.append(handle)
+        self._handles = handles
+
+    def _cfg_for_rank(self, gpu_idx: int):
+        """Clone config and set tp_rank if tp > 1."""
+        if self.tp.size > 1 and hasattr(self.config, 'tp_rank'):
+            cfg = self.config.clone()
+            cfg.tp_rank = self.tp.ranks[gpu_idx]
+            return cfg
+        return self.config
+
+    def _commit_child(self, name: str, handles: list):
+        """Attach pre-created per-GPU child handles to parent handles."""
+        for i, (parent_h, child_h) in enumerate(
+                zip(self._handles, handles)):
+            with self._ctx.devices[i]:
+                parent_h.add_child_raw(name, child_h)
+
+    # ------------------------------------------------------------------
+    # Commit methods — drain pending dicts to C++ modules
+    # ------------------------------------------------------------------
+
+    def _commit_tensor(self, name: str, tensor: torch.Tensor,
+                      split_side: SplitSide | None = None):
+        """Commit a raw tensor to a named parameter on all GPUs.
+
+        Parameters
+        ----------
+        name : str
+            Parameter name within the module.
+        tensor : torch.Tensor
+            The tensor data.
+        split_side : SplitSide | None
+            TP split semantics.  ``None`` means broadcast.
+        """
+        tp = self.tp.size if split_side else 1
+        split_dim = _SPLIT_SIDE_TO_DIM.get(split_side) if split_side else None
+
+        for i, handle in enumerate(self._handles):
+            with self._ctx.devices[i]:
+                rank = self._rank_for(i) if tp > 1 else 0
+                shard = _shard(tensor, split_dim, tp, rank)
+                _copy_shard_to_param(handle, name, shard,
+                                     alloc_dtype=None)
diff --git a/lmdeploy/turbomind/builders/attention.py b/lmdeploy/turbomind/builders/attention.py
new file mode 100644
index 0000000000..91f38821fe
--- /dev/null
+++ b/lmdeploy/turbomind/builders/attention.py
@@ -0,0 +1,120 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""Attention weight loading builder and QKV fusion pipeline.
+
+Provides ``AttentionBuilder`` for committing attention weights (QKV fusion,
+O-proj, QK-norm, direct params) and pipeline functions (``dequant_mixed``,
+``repeat_kv_for_tp``, ``split_output_gate``, ``fuse_qkv``) for fusing Q/K/V
+Linear bundles into a single interleaved w_qkv with KV head padding and
+output-gate splitting.
+"""
+from __future__ import annotations
+
+import torch
+
+from ..linear import Linear, dequant_mixed, transform_output_dim
+from ._base import Builder, ParallelGroup, SplitSide
+
+# ---------------------------------------------------------------------------
+# New pipeline functions (replacing merge_qkv_linear)
+# ---------------------------------------------------------------------------
+
+
+def _infer_heads(linear: Linear, head_dim: int) -> int:
+    """Derive head count from the weight tensor's output dimension."""
+    w = linear.tensors.get('weight')
+    if w is None:
+        return 0
+    return w.size(-1) // head_dim
+
+
+@transform_output_dim
+def _repeat_kv_heads(tensor: torch.Tensor, *, tp: int,
+                     heads: int) -> torch.Tensor:
+    """Repeat KV heads to reach a TP-divisible count."""
+    if heads % tp == 0:
+        return tensor
+    target_heads = ((heads + tp - 1) // tp) * tp
+    assert target_heads % heads == 0, (
+        f'target_heads={target_heads} must be divisible by heads={heads}')
+    n_repeat = target_heads // heads
+    per_head = tensor.size(-1) // heads
+    t = tensor.view(tensor.size(0), heads, per_head)
+    return t.repeat(1, n_repeat, 1).reshape(tensor.size(0), target_heads * per_head)
+
+
+def repeat_kv_for_tp(k: Linear, v: Linear, *,
+                     tp: int, head_dim: int) -> tuple[Linear, Linear]:
+    """Repeat KV heads to reach a TP-divisible count."""
+    k = _repeat_kv_heads(k, tp=tp, heads=_infer_heads(k, head_dim))
+    v = _repeat_kv_heads(v, tp=tp, heads=_infer_heads(v, head_dim))
+    return k, v
+
+
+@transform_output_dim
+def split_output_gate(tensor: torch.Tensor, *, head_num: int
+                      ) -> tuple[torch.Tensor, torch.Tensor]:
+    """Split output gate from Q projection (Qwen3.5).
+
+    Q's output dim is 2 * head_num * head_dim. Reshape to [batch, head_num, 2, head_dim], split into q_real and gate.
+    """
+    per_head = tensor.size(-1) // (head_num * 2)
+    q, gate = tensor.view(-1, head_num, 2, per_head).unbind(2)
+    return q.reshape(-1, head_num * per_head), gate.reshape(-1, head_num * per_head)
+
+
+@transform_output_dim
+def fuse_qkv(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
+             *, tp: int, gate: torch.Tensor | None = None) -> torch.Tensor:
+    """Fuse Q, K, V (and optionally gate) into a single w_qkv Linear.
+
+    Concatenates output channels with TP interleaving.
+    Layout per tp-shard: [Q | K | V] or [Q | K | V | Gate].
+    """
+    tensors = [t for t in (q, k, v, gate) if t is not None]
+    parts = [t.view(t.size(0), tp, -1) for t in tensors]
+    merged = torch.cat(parts, dim=-1)
+    return merged.view(-1, merged.size(-1) * tp)
+
+
+# ---------------------------------------------------------------------------
+# AttentionBuilder
+# ---------------------------------------------------------------------------
+
+
+class AttentionBuilder(Builder):
+    """Attention weight loading builder."""
+
+    _PARAM_TP_RULES: dict[str, SplitSide] = {
+        'sinks': SplitSide.OUTPUT,
+    }
+
+    def __init__(self, config, ctx, tp: ParallelGroup):
+        super().__init__(config, ctx)
+        self.tp = tp
+        self.config.tp_size = tp.size
+
+    def add_qkv_proj(self, q, k, v, *, gate=None):
+        """Fuse Q/K/V into a single w_qkv with TP interleave, commit.
+
+        Pipeline: dequant_mixed -> repeat_kv_for_tp -> fuse_qkv -> commit.
+        """
+        q, k, v, gate = dequant_mixed(q, k, v, gate, data_type=self.config.data_type)
+        k, v = repeat_kv_for_tp(k, v, tp=self.tp.size,
+                                head_dim=self.config.head_dim)
+        # After KV head repeat, push the padded-global kv_head_num onto
+        # config so that C++ module creation sees the correct head count.
+        self.config.kv_head_num = _infer_heads(k, self.config.head_dim)
+        merged = fuse_qkv(q, k, v, tp=self.tp.size, gate=gate)
+        self._add_linear('w_qkv', merged, SplitSide.OUTPUT)
+
+    def add_o_proj(self, o):
+        """Shard along input dim, commit."""
+        self._add_linear('wo', o, SplitSide.INPUT)
+
+    def add_param(self, name, tensor):
+        """Commit a direct parameter.
+
+        Builder determines split side.
+        """
+        split_side = self._PARAM_TP_RULES.get(name)
+        self._add_tensor(name, tensor, split_side)
diff --git a/lmdeploy/turbomind/builders/decoder_layer.py b/lmdeploy/turbomind/builders/decoder_layer.py
new file mode 100644
index 0000000000..9d3ab8fc69
--- /dev/null
+++ b/lmdeploy/turbomind/builders/decoder_layer.py
@@ -0,0 +1,11 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import _turbomind as _tm
+
+from ._base import Builder
+
+DecoderLayerConfig = _tm.DecoderLayerConfig
+
+
+class DecoderLayerBuilder(Builder):
+    """Pure container builder for decoder layers."""
+    pass
diff --git a/lmdeploy/turbomind/builders/deltanet.py b/lmdeploy/turbomind/builders/deltanet.py
new file mode 100644
index 0000000000..2e1e5ad63a
--- /dev/null
+++ b/lmdeploy/turbomind/builders/deltanet.py
@@ -0,0 +1,146 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""DeltaNet weight loading builder and GDN input-projection fusion helpers.
+
+Provides ``DeltaNetBuilder`` for committing DeltaNet weights (GDN input
+projections, scalar params, conv1d) and helper functions ``split_qkv`` and
+``fuse_gdn`` for merging in_proj_qkv/z/b/a into a single ``in_proj_all``
+with TP interleaving.
+"""
+from __future__ import annotations
+
+import torch
+
+from ..linear import Linear, concat_out_dim, dequant_mixed
+from ._base import Builder, ParallelGroup, SplitSide
+
+
+def tp_interleave_tensor(t: torch.Tensor, tp: int, d: int) -> torch.Tensor:
+    """Reshape dim *d* as [tp, per_tp] for TP-rank interleaving."""
+    shape = list(t.shape)
+    return t.reshape(shape[:d] + [tp, shape[d] // tp] + shape[d + 1:])
+
+
+def split_qkv(linear: Linear,
+              qkv_split: tuple[int, int, int]) -> tuple[Linear, Linear, Linear]:
+    """Split combined QKV linear into Q, K, V linears along output dim."""
+    wfmt = linear.weight_format
+    block_out = (wfmt.block_out or 0) if wfmt is not None else 0
+    new_linears = []
+    offset = 0
+    for dim in qkv_split:
+        tensors = {}
+        for kind, t in linear.tensors.items():
+            out_dim = t.dim() - 1
+            if kind in ('scales', 'zeros') and block_out > 0:
+                block_offset = offset // block_out
+                block_len = dim // block_out
+                tensors[kind] = t.narrow(out_dim, block_offset, block_len).contiguous()
+            else:
+                tensors[kind] = t.narrow(out_dim, offset, dim).contiguous()
+        new_linears.append(Linear(tensors=tensors,
+                                  weight_format=linear.weight_format))
+        offset += dim
+    return tuple(new_linears)
+
+
+def fuse_gdn(q: Linear, k: Linear, v: Linear,
+             z: Linear, b: Linear, a: Linear, *,
+             tp: int) -> Linear:
+    """Fuse GDN input projections with TP interleaving.
+
+    Layout per tp-shard: [Q | K | V | Z | B | A].
+    For tp=1 reduces to simple concat along output dim.
+    """
+    components = [q, k, v, z, b, a]
+
+    if tp <= 1:
+        return concat_out_dim(components)
+
+    first = components[0]
+    fused_tensors: dict[str, torch.Tensor] = {}
+    for kind in first.tensors:
+        parts = []
+        all_1d = True
+        d = -1
+        for lin in components:
+            t = lin.tensors.get(kind)
+            if t is None:
+                continue
+            if t.dim() > 1:
+                this_d = t.dim() - 1
+                if d >= 0 and this_d != d:
+                    raise ValueError(
+                        f'Inconsistent tensor dims for kind={kind}: '
+                        f'{this_d} vs {d}')
+                d = this_d
+                all_1d = False
+                parts.append(tp_interleave_tensor(t, tp, d))
+            else:
+                # 1-D tensors (bias): simple concat
+                parts.append(t)
+        if not parts:
+            continue
+        if all_1d:
+            fused_tensors[kind] = torch.cat(parts, dim=0)
+        else:
+            fused = torch.cat(parts, dim=d + 1)
+            shape = list(fused.shape)
+            final = shape[:d] + [shape[d] * shape[d + 1]] + shape[d + 2:]
+            fused_tensors[kind] = fused.reshape(final)
+
+    return Linear(tensors=fused_tensors, weight_format=first.weight_format)
+
+
+def fuse_qkv_conv1d(t: torch.Tensor, qkv_split: tuple[int, int, int],
+                     tp: int) -> torch.Tensor:
+    """Split conv1d into Q/K/V parts, TP-interleave each, concatenate back."""
+    q_dim, k_dim, _ = qkv_split
+    d_conv = t.shape[0]
+    q_part = tp_interleave_tensor(t[:, :q_dim], tp, 1)
+    k_part = tp_interleave_tensor(t[:, q_dim:q_dim + k_dim], tp, 1)
+    v_part = tp_interleave_tensor(t[:, q_dim + k_dim:], tp, 1)
+    return torch.cat([q_part, k_part, v_part], dim=2).reshape(d_conv, -1).contiguous()
+
+
+# ---------------------------------------------------------------------------
+# DeltaNetBuilder -- Gated Delta Net input projections, scalar params, conv1d
+# ---------------------------------------------------------------------------
+
+
+class DeltaNetBuilder(Builder):
+    """DeltaNet (Gated Delta Net) weight loading builder."""
+
+    def __init__(self, config, ctx, tp: ParallelGroup):
+        super().__init__(config, ctx)
+        self.tp = tp
+        self.config.tp_size = tp.size
+
+    def add_input_projections(self, *, in_proj_qkv, in_proj_z=None,
+                              in_proj_b=None, in_proj_a=None, out_proj=None,
+                              qkv_split):
+        """Fuse GDN input projections via pipeline, commit all linears.
+
+        Pipeline: split_qkv -> dequant_mixed -> fuse_gdn -> commit.
+        """
+        q, k, v = split_qkv(in_proj_qkv, qkv_split)
+        q, k, v, z, b, a = dequant_mixed(q, k, v, in_proj_z, in_proj_b, in_proj_a,
+                                           data_type=self.config.data_type)
+        fused = fuse_gdn(q, k, v, z, b, a, tp=self.tp.size)
+        self._add_linear('in_proj_all', fused, SplitSide.OUTPUT)
+        if out_proj is not None:
+            self._add_linear('out_proj', out_proj, SplitSide.INPUT)
+
+    def add_scalar_params(self, a_log=None, dt_bias=None):
+        """Commit A_log and dt_bias as OUTPUT-split tensors."""
+        if a_log is not None:
+            self._add_tensor('A_log', a_log, split_side=SplitSide.OUTPUT)
+        if dt_bias is not None:
+            self._add_tensor('dt_bias', dt_bias, split_side=SplitSide.OUTPUT)
+
+    def add_conv1d(self, conv1d, qkv_split):
+        """Transpose HF layout to TM layout, TP-interleave Q/K/V, commit."""
+        if conv1d.ndim == 3 and conv1d.shape[1] == 1:
+            conv1d = conv1d.squeeze(1)
+        conv1d = conv1d.t().contiguous()
+        conv1d = fuse_qkv_conv1d(conv1d, qkv_split, self.tp.size)
+        self._add_tensor('conv1d', conv1d, split_side=SplitSide.OUTPUT)
diff --git a/lmdeploy/turbomind/builders/ffn.py b/lmdeploy/turbomind/builders/ffn.py
new file mode 100644
index 0000000000..90f6f46104
--- /dev/null
+++ b/lmdeploy/turbomind/builders/ffn.py
@@ -0,0 +1,201 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""FFN weight loading builder and w1+w3 fusion helpers.
+
+Provides ``FfnBuilder`` for committing FFN weights (w1/w2/w3 with optional
+w1+w3 fusion) and helper functions for determining whether SiLU fusion
+(interleave vs chunk) should be used and whether w1+w3 fusion is safe for
+the given TP configuration.
+"""
+from __future__ import annotations
+
+import math
+
+import torch
+
+from ..linear import (Linear, round_up_input_groups, round_up_output_groups,
+                       transform_output_dim)
+from ._base import Builder, ParallelGroup, SplitSide
+
+__all__ = [
+    'FfnBuilder',
+    'fuse_w1w3',
+]
+
+# ---------------------------------------------------------------------------
+# @transform_output_dim / @transform_input_dim helpers
+# ---------------------------------------------------------------------------
+
+
+@transform_output_dim
+def _interleave_w1w3(w1: torch.Tensor, w3: torch.Tensor) -> torch.Tensor:
+    """Interleave w1 and w3 along output dim for fused SiLU epilogue."""
+    return torch.stack([w1, w3], dim=-1).reshape(w1.shape[:-1] + (-1,)).contiguous()
+
+
+@transform_output_dim
+def _chunk_w1w3(w1: torch.Tensor, w3: torch.Tensor, *,
+                tp: int) -> torch.Tensor:
+    """Concatenate w1 and w3 along output dim with TP interleaving."""
+    if tp <= 1:
+        return torch.cat([w1, w3], dim=-1).contiguous()
+    d = w1.dim() - 1
+    r1 = w1.reshape(w1.shape[:d] + (tp, w1.shape[d] // tp))
+    r3 = w3.reshape(w3.shape[:d] + (tp, w3.shape[d] // tp))
+    combined = torch.cat([r1, r3], dim=d + 1)
+    return combined.reshape(w1.shape[:d] + (-1,)).contiguous()
+
+
+# ---------------------------------------------------------------------------
+# FFN fusion helpers
+# ---------------------------------------------------------------------------
+
+
+def _should_fuse_silu(w1_linear: Linear, act_type: str, is_moe: bool = False) -> bool:
+    """Determine if fused SiLU (interleave) should be used for w1+w3 fusion.
+
+    Gold standard condition (from GEMM kernel constraints — trust it):
+
+    act_type == SiLU && (int4 || mxfp4 || fp8 || moe) && !(fp8 && SM90)
+    """
+    if act_type not in ('', 'silu', 'SiLU'):
+        return False
+
+    # Dense bf16/fp16 without MoE -> chunk, not interleave
+    weight = w1_linear.tensors.get('weight')
+    is_quantized = weight is not None and weight.element_size() < 2
+    if not is_quantized and not is_moe:
+        return False
+
+    # FP8 on SM90 -> chunk
+    fmt = w1_linear.weight_format
+    if fmt is not None and fmt.name == 'fp8':
+        if torch.cuda.is_available():
+            cap = torch.cuda.get_device_capability()
+            if cap == (9, 0):
+                return False
+
+    return True
+
+
+def _can_fuse_w1w3(w1: Linear, tp: int) -> bool:
+    """Check whether w1+w3 fusion is safe for the given TP.
+
+    Fusion (interleave or chunk) concatenates w1 and w3 along the output dim.
+    For block-quantized formats (e.g. FP8 with block_out=128), the fused
+    scale count ``2 * cdiv(N/tp, block_out)`` must equal
+    ``cdiv(2*N/tp, block_out)``.  This holds iff ``(N/tp) % block_out == 0``.
+    When it doesn't, the fused module's C++ allocation won't match the
+    concatenated scales and we must commit w1/w3 separately.
+    """
+    if tp <= 1:
+        return True
+    fmt = w1.weight_format
+    if fmt is None or fmt.block_out is None:
+        return True
+    w = w1.tensors.get('weight')
+    if w is None:
+        return True
+    return (w.size(-1) // tp) % fmt.block_out == 0
+
+
+def fuse_w1w3(
+    w1: Linear,
+    w3: Linear,
+    tp: int,
+    act_type: str,
+    is_moe: bool = False,
+) -> tuple[Linear | None, bool]:
+    """Optionally fuse w1/w3 on full (unsharded) tensors for FFN.
+
+    Returns (fused_w1w3_or_none, fused_silu).
+    When fusion is possible, fused_w1w3 is set.
+    When block-scale boundaries prevent fusion, returns (None, fused_silu).
+
+    TP sharding is NOT done here — the caller's commit path handles it
+    via split_side=SplitSide.OUTPUT.  ``tp`` is only used for the
+    block-scale alignment check in ``_can_fuse_w1w3``.
+    """
+    fused_silu = _should_fuse_silu(w1, act_type, is_moe)
+    can_fuse = _can_fuse_w1w3(w1, tp)
+
+    if can_fuse:
+        if fused_silu:
+            w1w3 = _interleave_w1w3(w1, w3)
+        else:
+            w1w3 = _chunk_w1w3(w1, w3, tp=tp)
+        return (w1w3, fused_silu)
+    else:
+        return (None, fused_silu)
+
+
+# ---------------------------------------------------------------------------
+# TP padding
+# ---------------------------------------------------------------------------
+
+# Minimum CTA_K across all registered grouped-GEMM kernels (SM75–SM90).
+# Included in effective_block via lcm so the padded intermediate is always
+# GEMM-aligned.
+_GEMM_K_ALIGN = 32
+
+
+def _pad_ffn_for_tp(w1: Linear, w2: Linear, w3: Linear,
+                     tp: int) -> tuple[Linear, Linear, Linear]:
+    """Pad w1/w3 output dim and w2 input dim for TP sharding."""
+    raw_inter = w1.tensors['weight'].size(-1)
+
+    if tp <= 1:
+        return w1, w2, w3
+
+    fmt = w1.weight_format
+    effective_block = math.lcm(fmt.block_in or 1, fmt.block_out or 1,
+                               _GEMM_K_ALIGN)
+
+    groups = raw_inter // effective_block
+    w1 = round_up_output_groups(w1, groups, tp)
+    w3 = round_up_output_groups(w3, groups, tp)
+    w2 = round_up_input_groups(w2, groups, tp)
+    return w1, w2, w3
+
+
+# ---------------------------------------------------------------------------
+# FfnBuilder -- w1+w3 fusion, w2 commit
+# ---------------------------------------------------------------------------
+
+
+class FfnBuilder(Builder):
+    """FFN weight loading builder with w1+w3 fusion."""
+
+    def __init__(self, config, ctx, tp: ParallelGroup):
+        super().__init__(config, ctx)
+        self.tp = tp
+        self.config.tp_size = tp.size
+
+    def add_ffn(self, w1, w2, w3):
+        """Pad weights for TP alignment, fuse w1+w3 if possible, then shard and
+        commit.
+
+        The fusion result determines ``fuse_silu`` on the C++ module config.
+        Updating ``self.config.fuse_silu`` **before** any ``_add_linear``
+        call ensures the C++ module is lazily created with the correct flag.
+        """
+        # Pad weights for TP alignment before any fusion or sharding.
+        # After padding, push the padded-global inter_size onto config so
+        # that C++ module creation sees the correct dimension.
+        w1, w2, w3 = _pad_ffn_for_tp(w1, w2, w3, self.tp.size)
+        self.config.inter_size = w1.tensors['weight'].size(-1)
+
+        act_type = getattr(self.config, 'act_type', 0)
+        if isinstance(act_type, int):
+            act_type = {0: 'silu', 1: 'gpt-oss'}.get(act_type, 'silu')
+        fused, fused_silu = fuse_w1w3(
+            w1, w3, self.tp.size, act_type,
+            is_moe=self.config.is_expert)
+
+        self.config.fuse_silu = fused_silu
+
+        if fused is not None:
+            self._add_linear('w1w3', fused, SplitSide.OUTPUT)
+        else:
+            self._add_linear('w1', w1, SplitSide.OUTPUT)
+            self._add_linear('w3', w3, SplitSide.OUTPUT)
+        self._add_linear('w2', w2, SplitSide.INPUT)
diff --git a/lmdeploy/turbomind/builders/mla.py b/lmdeploy/turbomind/builders/mla.py
new file mode 100644
index 0000000000..738596c7b0
--- /dev/null
+++ b/lmdeploy/turbomind/builders/mla.py
@@ -0,0 +1,89 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from __future__ import annotations
+
+import torch
+
+from ..linear import Linear
+from ._base import Builder, ParallelGroup, SplitSide
+
+# ---------------------------------------------------------------------------
+# MLA fold+pad pipeline (standalone functions)
+# ---------------------------------------------------------------------------
+
+
+def fold_kv_b(q_b: Linear, kv_b: Linear, wo: Linear, *,
+              cfg) -> tuple[Linear, Linear]:
+    """Fold kv_b into q_b and wo. Returns (q_b_folded, wo_folded).
+
+    Splits kv_b into key-compressed (kc) and value-compressed (vc) parts. Folds kc into q_b via matmul (q_nope @ kc^T
+    per head). Folds vc into wo via matmul (vc @ wo per head). All arithmetic in TM layout [in, out].
+    """
+    H = cfg.head_num
+    P = cfg.qk_nope_dim
+    S = cfg.qk_rope_dim
+    R_q = cfg.q_lora_rank   # q_b input dim
+    R = cfg.kv_lora_rank    # kv_b input dim, also fold expansion target
+    V = wo.tensors['weight'].shape[0] // H  # v_head_dim (cfg value overridden)
+
+    q_b_h = q_b.tensors['weight'].reshape(R_q, H, P + S)
+    kc, vc = kv_b.tensors['weight'].reshape(R, H, P + V).split([P, V], dim=-1)
+    q_nope, q_rope = q_b_h.split([P, S], dim=-1)
+
+    # q_nope @ kc^T per head: [R_q, H, P] × [R, H, P] → [R_q, H, R]
+    q_folded = torch.cat([
+        torch.einsum('ihp,jhp->ihj', q_nope, kc),  # [R_q, H, R]
+        q_rope,                                      # [R_q, H, S]
+    ], dim=-1).reshape(R_q, H * (R + S))
+
+    # vc @ wo per head
+    o_folded = torch.einsum('rhv,hvn->hrn', vc,
+                            wo.tensors['weight'].reshape(H, V, -1)
+                            ).reshape(H * R, -1)
+
+    return (Linear(tensors={'weight': q_folded.contiguous()},
+                   weight_format=q_b.weight_format),
+            Linear(tensors={'weight': o_folded.contiguous()},
+                   weight_format=wo.weight_format))
+
+
+def pad_wo_input(wo: Linear, *, cfg) -> Linear:
+    """Pad wo input dim from head_num * cur_dim to head_num * size_per_head."""
+    head_num = cfg.head_num
+    size_per_head = cfg.head_dim
+    w = wo.tensors['weight']
+    cur_dim = w.shape[0] // head_num
+    w = w.reshape(head_num, cur_dim, -1)
+    w = torch.nn.functional.pad(w, (0, 0, size_per_head - cur_dim, 0))
+    w = w.reshape(head_num * size_per_head, -1)
+    return Linear(tensors={'weight': w.contiguous()},
+                  weight_format=wo.weight_format)
+
+
+# ---------------------------------------------------------------------------
+# MLABuilder -- MLA projections, fold+pad, norms
+# ---------------------------------------------------------------------------
+
+
+class MLABuilder(Builder):
+    """MLA (Multi-head Latent Attention) weight loading builder."""
+
+    def __init__(self, config, ctx, tp: ParallelGroup):
+        super().__init__(config, ctx)
+        self.tp = tp
+        self.config.tp_size = tp.size
+        if config.kv_lora_rank > 0 and config.kv_head_num < tp.size:
+            config.kv_head_num = tp.size
+
+    def add_projections(self, *, q_a_proj, q_b_proj, kv_a_proj, kv_b_proj,
+                        wo):
+        """Apply MLA fold+pad, then commit each projection."""
+        q_b_proj, wo = fold_kv_b(q_b_proj, kv_b_proj, wo, cfg=self.config)
+        wo = pad_wo_input(wo, cfg=self.config)
+
+        for name, lin, side in [
+            ('q_a_proj', q_a_proj, None),
+            ('q_b_proj', q_b_proj, SplitSide.OUTPUT),
+            ('kv_a_proj', kv_a_proj, None),
+            ('wo', wo, SplitSide.INPUT),
+        ]:
+            self._add_linear(name, lin, split_side=side)
diff --git a/lmdeploy/turbomind/builders/module_list.py b/lmdeploy/turbomind/builders/module_list.py
new file mode 100644
index 0000000000..5fabf64ed6
--- /dev/null
+++ b/lmdeploy/turbomind/builders/module_list.py
@@ -0,0 +1,24 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import _turbomind as _tm
+
+from ._base import Builder, BuiltModule
+
+ModuleListConfig = _tm.ModuleListConfig
+
+
+class ModuleListBuilder(Builder):
+    """Builder for ModuleList containers."""
+
+    def __setitem__(self, index: int, value):
+        if isinstance(value, Builder):
+            raise TypeError(
+                f'{type(self).__name__}[{index}]: call .build() first')
+        if isinstance(value, BuiltModule):
+            if self._built:
+                raise RuntimeError(
+                    f'{type(self).__name__} is built; '
+                    f'cannot set index {index}')
+            self._add_child(str(index), value.handles)
+            return
+        raise TypeError(
+            f'{type(self).__name__}[{index}] requires a BuiltModule')
diff --git a/lmdeploy/turbomind/builders/moe.py b/lmdeploy/turbomind/builders/moe.py
new file mode 100644
index 0000000000..a4a1bcb36d
--- /dev/null
+++ b/lmdeploy/turbomind/builders/moe.py
@@ -0,0 +1,22 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from __future__ import annotations
+
+from ._base import Builder, SplitSide
+
+# ---------------------------------------------------------------------------
+# MoeBuilder -- gate, non-expert params
+# ---------------------------------------------------------------------------
+
+
+class MoeBuilder(Builder):
+    """MoE weight loading builder."""
+
+    def add_gate(self, name, linear):
+        """Commit a gate linear (broadcast, no split)."""
+        self._add_linear(name, linear, split_side=None)
+
+    def add_param(self, name, tensor, split_side=None):
+        """Commit a non-expert MoE parameter."""
+        if split_side is not None and not isinstance(split_side, SplitSide):
+            split_side = None  # specs may pass None for broadcast
+        self._add_tensor(name, tensor, split_side)
diff --git a/lmdeploy/turbomind/builders/norm.py b/lmdeploy/turbomind/builders/norm.py
new file mode 100644
index 0000000000..ab8e4df9ad
--- /dev/null
+++ b/lmdeploy/turbomind/builders/norm.py
@@ -0,0 +1,20 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import _turbomind as _tm
+import torch
+
+from ._base import Builder
+
+
+def make_norm_config(*, dim, norm_eps):
+    cfg = _tm.NormConfig()
+    cfg.dim = dim
+    cfg.norm_eps = norm_eps
+    return cfg
+
+
+class NormBuilder(Builder):
+    """Builder for a single norm weight module."""
+
+    def set_weight(self, tensor: torch.Tensor):
+        """Commit the norm weight tensor to all GPU handles."""
+        self._add_tensor('weight', tensor)
diff --git a/lmdeploy/turbomind/builders/text_model.py b/lmdeploy/turbomind/builders/text_model.py
new file mode 100644
index 0000000000..e3c4ab5598
--- /dev/null
+++ b/lmdeploy/turbomind/builders/text_model.py
@@ -0,0 +1,54 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+from ..linear import round_up_output_groups
+from ._base import Builder, BuiltModule, ParallelGroup, SplitSide
+
+
+class TextModelBuilder(Builder):
+    """Builder for the root ModelWeight.
+
+    Constructs a ModelWeight via ``_tm.create_module(ModelWeightConfig)``
+    on each context (inherited Builder machinery), then attaches it to
+    externally-owned ``ModelRoot`` sentinel handles as their
+    ``text_model`` child during ``build()``.
+
+    Owns ``tok_embeddings`` (Tensor param) and ``output`` (LinearWeight
+    child) commits on the ModelWeight via ``add_token_embeds`` /
+    ``add_lm_head``.
+    """
+
+    def __init__(self, config, ctx, *, root_handles,
+                 tp: ParallelGroup, vocab_size):
+        super().__init__(config, ctx)
+        self.tp = tp
+        self.config.tp_size = tp.size
+        self._root_handles = root_handles
+        self._vocab_size = vocab_size
+
+    def build(self) -> BuiltModule:
+        """Create ModelWeight via _tm.create_module (via super), then attach
+        each per-GPU ModelWeight handle to its sentinel root via
+        add_child_raw."""
+        built = super().build()
+        for i, (root, text_model) in enumerate(
+                zip(self._root_handles, built.handles)):
+            with self._ctx.devices[i]:
+                root.add_child_raw('text_model', text_model)
+        return built
+
+    def add_token_embeds(self, tensor):
+        """Commit the raw embedding lookup as the ``tok_embeddings`` root
+        param.
+
+        Shards along hidden (output) dim by ``self.tp.size``. No vocab padding —
+        embedding lookup never indexes past ``vocab - 1``.
+        """
+        self._add_tensor('tok_embeddings', tensor,
+                            split_side=SplitSide.OUTPUT)
+
+    def add_lm_head(self, linear):
+        """Pad output dim to ``round_up(vocab_size, tp)`` and commit to the
+        ``output`` LinearWeight root child."""
+        linear = round_up_output_groups(linear, self._vocab_size,
+                                        self.tp.size)
+        self._add_linear('output', linear, split_side=SplitSide.OUTPUT)
diff --git a/lmdeploy/turbomind/converter.py b/lmdeploy/turbomind/converter.py
new file mode 100644
index 0000000000..56dace37bd
--- /dev/null
+++ b/lmdeploy/turbomind/converter.py
@@ -0,0 +1,240 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+import torch
+
+from lmdeploy.archs import get_model_arch, search_nested_config
+from lmdeploy.messages import TurbomindEngineConfig
+from lmdeploy.pytorch.config import override_hf_config
+from lmdeploy.utils import get_logger
+
+from ..utils import _get_and_verify_max_len, is_bf16_supported
+from .builders import _cpp_dtype
+from .models.base import INPUT_MODELS
+from .models.utils import source_model_config
+from .supported_models import SUPPORTED_ARCHS
+from .weight_format import (
+    AWQFormat,
+    CompressedTensorFormat,
+    FP8Format,
+    GPTQFormat,
+    MXFP4Format,
+    TrivialFormat,
+    WeightFormat,
+    WeightFormatResolver,
+)
+
+logger = get_logger('lmdeploy')
+
+
+def _build_resolver(model_format: str | None,
+                    group_size: int | None,
+                    data_type: '_tm.DataType') -> WeightFormatResolver:
+    """Build the active resolver: quantized format (if any) + trivial fallback.
+
+    Called after the int4 fp16 force but before the ``compressed-tensors →
+    awq`` rename, so compressed-tensors models get ``CompressedTensorFormat``.
+    """
+    formats: list[WeightFormat] = []
+    if model_format in (None, 'hf'):
+        pass
+    elif model_format == 'awq':
+        formats.append(AWQFormat(block_in=group_size))
+    elif model_format == 'gptq':
+        formats.append(GPTQFormat(block_in=group_size))
+    elif model_format == 'compressed-tensors':
+        formats.append(CompressedTensorFormat(block_in=group_size))
+    elif model_format == 'fp8':
+        formats.append(FP8Format())
+    elif model_format == 'mxfp4':
+        formats.append(MXFP4Format())
+    else:
+        raise ValueError(f'unknown model_format: {model_format!r}')
+    formats.append(TrivialFormat())
+    return WeightFormatResolver(data_type=data_type, formats=formats)
+
+
+def _deep_merge(base: dict, override: dict, path: str = '') -> dict:
+    """Recursively merge override into base, mutating base in-place."""
+    for k, v in override.items():
+        key_path = f'{path}.{k}' if path else k
+        if k in base and isinstance(base[k], dict) and isinstance(v, dict):
+            _deep_merge(base[k], v, key_path)
+        else:
+            if k not in base:
+                logger.warning(f'hf_overrides key "{key_path}" not found in config, applying anyway')
+            base[k] = v
+    return base
+
+
+def _apply_hf_overrides(cfg, override: dict):
+    """Apply hf_overrides to a Transformers config object or nested dict."""
+    override_hf_config(cfg, override)
+    return cfg
+
+
+_DEFAULT_GROUP_SIZES = {
+    'awq': 128,
+    'gptq': 128,
+    'compressed-tensors': 128,
+    'fp8': 128,
+    'mxfp4': 32,
+}
+
+_SUPPORTED_GROUP_SIZES = {
+    'awq': frozenset({128}),
+    'gptq': frozenset({128}),
+    'compressed-tensors': frozenset({32, 128}),
+    'fp8': frozenset({128}),
+    'mxfp4': frozenset({32}),
+}
+
+
+def _validate_quant_group_size(model_format: str | None, group_size: int | None) -> int | None:
+    """Normalize and validate quantized group sizes.
+
+    The low-level int4 kernels can be shared across formats, but we only expose the format/group-size combinations that
+    are verified end to end.
+    """
+    if group_size in (None, 0):
+        group_size = _DEFAULT_GROUP_SIZES.get(model_format, group_size)
+
+    supported_group_sizes = _SUPPORTED_GROUP_SIZES.get(model_format)
+    if supported_group_sizes is not None and group_size not in supported_group_sizes:
+        supported = ', '.join(map(str, sorted(supported_group_sizes)))
+        raise ValueError(f'Unsupported group_size={group_size} for model_format="{model_format}". '
+                         f'Supported group_size values: {supported}.')
+
+    return group_size
+
+
+def get_registered_name(model_path: str, model_format: str, arch: str = None):
+    """Get the registered name of a model. The name will be used to access the
+    INPUT_MODELS registry.
+
+    Args:
+        model_path (str): the path of the input model
+        model_format (str): the format of the model, which can be one of
+            ['hf', 'awq', 'gptq', 'compressed-tensors', 'fp8', 'mxfp4']
+        arch (str): optional architecture string, to avoid reloading config
+    """
+    if arch is None:
+        arch = get_model_arch(model_path)[0]
+    register_name = SUPPORTED_ARCHS[arch]
+    return register_name
+
+
+def _resolve_dtype(requested: str, hf_model_cfg) -> str:
+    """Resolve 'auto' dtype against the HF config and the current device.
+
+    Prefers `dtype` over the deprecated `torch_dtype` key. Falls back to
+    float16 on hardware that does not support bfloat16.
+    """
+    has_bf16 = is_bf16_supported()
+    dtype = requested
+    if dtype == 'auto':
+        dtype = 'bfloat16' if has_bf16 else 'float16'
+        torch_dtype = getattr(hf_model_cfg, 'dtype', None)
+        if torch_dtype is None:
+            torch_dtype = getattr(hf_model_cfg, 'torch_dtype', None)
+        TORCH_DTYPE_MAP = {torch.bfloat16: 'bfloat16', torch.float16: 'float16'}
+        dtype = TORCH_DTYPE_MAP.get(torch_dtype, dtype)
+
+    if dtype == 'bfloat16' and not has_bf16:
+        logger.warning('data type fallback to float16 since '
+                       'torch.cuda.is_bf16_supported is False')
+        dtype = 'float16'
+    return dtype
+
+
+def get_tm_config(model_path,
+                  engine_config: TurbomindEngineConfig,
+                  group_size: int = None):
+    """Resolve dtype/model_format/group_size/session_len, mutate engine_config
+    in place, build the text model.
+
+    Returns:
+        tuple: (text_model, model_path, data_type)
+    """
+    # 1. Load HF config once; reused for quant_config, dtype, and session_len.
+    arch, hf_model_cfg = get_model_arch(model_path)
+
+    # 2. Reconcile quant_config (unchanged logic from the prior flow).
+    quant_config = search_nested_config(
+        hf_model_cfg.to_dict(), 'quantization_config')
+    if quant_config:
+        quant_method = quant_config.get('quant_method')
+        _group_size = int(quant_config.get('group_size', 0))
+        version = quant_config.get('version')
+        assert engine_config.model_format is None or engine_config.model_format == quant_method, (
+            f'mismatched quant method: user input "{engine_config.model_format}" '
+            f'vs model quant_config "{quant_method}"')
+        assert not group_size or group_size == _group_size, (
+            f'mismatched quant group size: user input "{group_size}" '
+            f'vs model quant_config "{_group_size}"')
+
+        if quant_method == 'awq':
+            assert version == 'gemm', f'unsupported quant config: {quant_config}'
+        elif quant_method == 'gptq':
+            assert not quant_config.get('desc_act', False) and quant_config.get(
+                'sym', True), f'unsupported quant config: {quant_config}'
+        elif quant_method == 'fp8':
+            pass
+        elif quant_method == 'mxfp4':
+            _group_size = 32
+        elif quant_method == 'compressed-tensors':
+            _format = quant_config['config_groups']['group_0']['format']
+            assert _format == 'pack-quantized', (
+                'compressed-tensors only supports pack-quantized format, '
+                f'but got {_format}')
+            _weights = quant_config['config_groups']['group_0']['weights']
+            _group_size = _weights['group_size']
+            _num_bits = _weights['num_bits']
+            _type = _weights['type']
+            assert _num_bits == 4 and _type == 'int', (
+                'pack-quantized requires 4-bit int, '
+                f'but got {_num_bits}-bit {_type}')
+        else:
+            assert 0, f'unsupported quant_config: {quant_config}'
+
+        engine_config.model_format = quant_method
+        group_size = _group_size
+
+    group_size = _validate_quant_group_size(engine_config.model_format, group_size)
+    if engine_config.model_format is None:
+        engine_config.model_format = 'hf'
+
+    # 3. Resolve dtype and format overrides.
+    dtype = _resolve_dtype(engine_config.dtype, hf_model_cfg)
+    if engine_config.model_format in ('awq', 'gptq', 'compressed-tensors'):
+        dtype = 'float16'
+    engine_config.dtype = dtype
+
+    # Build resolver after dtype is finalized but before the CT→AWQ rename,
+    # so compressed-tensors models instantiate CompressedTensorFormat.
+    resolver = _build_resolver(engine_config.model_format,
+                               group_size, _cpp_dtype(dtype))
+
+    # C++-side label rename (does not affect resolver).
+    if engine_config.model_format == 'compressed-tensors':
+        engine_config.model_format = 'awq'
+
+    # 4. Resolve session_len default.
+    session_len_default = _get_and_verify_max_len(hf_model_cfg, None)
+
+    # 5. Mutate engine_config with remaining resolved values.
+    if engine_config.session_len is None:
+        engine_config.session_len = session_len_default
+    engine_config.attn_tp_size = engine_config.attn_tp_size or 1
+    engine_config.attn_cp_size = engine_config.attn_cp_size or 1
+    engine_config.mlp_tp_size = engine_config.mlp_tp_size or 1
+
+    # 6. Build text model.
+    cfg = source_model_config(hf_model_cfg)
+    if engine_config.hf_overrides:
+        logger.warning(f'Overriding HF config with {engine_config.hf_overrides}')
+        _apply_hf_overrides(cfg, engine_config.hf_overrides)
+    registered_name = get_registered_name(model_path, engine_config.model_format, arch=arch)
+    model_cls = INPUT_MODELS.get(registered_name)
+    text_model = model_cls(cfg, resolver=resolver)
+
+    return text_model, model_path, _cpp_dtype(dtype)
diff --git a/lmdeploy/turbomind/deploy/__init__.py b/lmdeploy/turbomind/deploy/__init__.py
deleted file mode 100644
index ef101fec61..0000000000
--- a/lmdeploy/turbomind/deploy/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
diff --git a/lmdeploy/turbomind/deploy/config.py b/lmdeploy/turbomind/deploy/config.py
deleted file mode 100644
index ba6d632242..0000000000
--- a/lmdeploy/turbomind/deploy/config.py
+++ /dev/null
@@ -1,242 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-import inspect
-import json
-from dataclasses import asdict, field, fields
-
-# use pydantic.dataclasses.dataclass to check data type
-from pydantic.dataclasses import dataclass
-
-from lmdeploy.messages import TurbomindEngineConfig
-from lmdeploy.utils import get_logger
-
-logger = get_logger('lmdeploy')
-
-
-def config_from_dict(cls, env):
-    """Initiate an instance of a config class from a dict."""
-    params = inspect.signature(cls).parameters
-    used = {k: v for k, v in env.items() if k in params and v is not None}
-
-    def _remove_none(d: dict):
-        for k, v in d.items():
-            if isinstance(v, dict):
-                d[k] = _remove_none(v)
-        return {k: v for k, v in d.items() if v is not None}
-
-    used = _remove_none(used)
-    return cls(**used)
-
-
-def config_to_dict(config):
-    """Export config to a dict."""
-    if not config:
-        return dict()
-    assert isinstance(config, (ModelConfig, AttentionConfig, LoraConfig)), \
-        f'A dataclass is expected, but got {type(config)}'
-
-    return asdict(config)
-
-
-@dataclass
-class ModelConfig:
-    model_name: str = ''
-    chat_template: str = ''
-    model_arch: str = None
-    head_num: int = None
-    kv_head_num: int = None
-    hidden_units: int = None
-    vocab_size: int = None
-    # Turbomind used to assume token_embedding and lm_head has the same size
-    # at vocab dim, i.e. `vocab_size`
-    # But in molmo, embedding.shape is [vocab_size + 128, hidden_units]
-    # while lm_head shape is [hidden_units, vocab_size].
-    # Therefore, we add a new attr "embedding_size" to represent the vocab dim
-    # of token_embedding
-    embedding_size: int = 0
-    num_layer: int = None
-    inter_size: list[int] = None
-    norm_eps: float = None
-    attn_bias: int = 0
-    mlp_bias: bool = False
-    window_size: list[int] = field(default_factory=list)
-    attn_sink: bool = False
-    qk_norm: bool = False
-    size_per_head: int = 128
-    group_size: int = 32
-    data_type: str = None
-    weight_type: str = None
-    expert_weight_type: str = None
-    ffn_weight_type: str = None
-    session_len: int = None
-    attn_tp_size: int = 1
-    attn_cp_size: int = 1
-    mlp_tp_size: int = 1
-    model_format: str = 'hf'
-    expert_num: list[int] = field(default_factory=list)
-    expert_router_bias: bool = False
-    expert_inter_size: int = 0
-    experts_per_token: int = 0
-    activation_type: str = ''
-    moe_shared_gate: bool = False
-    norm_topk_prob: bool = False
-    routed_scale: float = 1.0
-    topk_group: int = 1
-    topk_method: str = 'greedy'
-    moe_group_num: int = 1
-    scoring_func: str = 'softmax'
-    router_n_groups: int = -1
-    # MLA
-    q_lora_rank: int = 0
-    kv_lora_rank: int = 0
-    qk_rope_dim: int = 0
-    v_head_dim: int = 0
-    # Qwen 3.5
-    layer_types: list[str] = field(default_factory=list)
-    linear_key_head_dim: int = 0
-    linear_value_head_dim: int = 0
-    linear_conv_kernel_dim: int = 0
-    linear_num_key_heads: int = 0
-    linear_num_value_heads: int = 0
-    attn_output_gate: bool = False
-    # Per-layer expert weight type override: layer indices whose
-    # MoE experts are unquantized (fp16) despite expert_weight_type=int4.
-    # Populated from modules_to_not_convert patterns like 'model.layers.0.'.
-    unquantized_expert_layers: list[int] = field(default_factory=list)
-    # tuning
-    tune_layer_num: int = 1
-
-    def verify(self):
-        invalid = {}
-        for k, v in self.__dict__.items():
-            if v is None:
-                invalid[k] = v
-        assert not invalid, f'incomplete model config: {invalid}'
-
-
-@dataclass
-class RopeParam:
-    type: str
-    base: float
-    dim: int
-    factor: float = 1.0
-    max_position_embeddings: int = None
-    attention_factor: float = 1.0
-    beta_fast: float = 32
-    beta_slow: float = 1
-    low_freq_factor: float = None
-    high_freq_factor: float = None
-    original_max_position_embeddings: int = None
-    mrope_section: list[int] = None
-
-
-@dataclass
-class AttentionConfig:
-    softmax_scale: float = 0
-    cache_block_seq_len: int = 64
-    use_logn_attn: int = 0
-    max_position_embeddings: int = 0
-    rope_param: RopeParam = None
-
-
-@dataclass
-class LoraConfig:
-    lora_policy: str = ''
-    lora_r: int = 0
-    lora_scale: float = 0.0
-    lora_max_wo_r: int = 0
-    lora_rank_pattern: str = ''
-    lora_scale_pattern: str = ''
-
-
-@dataclass
-class TurbomindModelConfig:
-    """Config for turbomind model."""
-    model_config: ModelConfig = None
-    attention_config: AttentionConfig = None
-    lora_config: LoraConfig = None
-
-    def update_from_engine_config(self, config: TurbomindEngineConfig):
-        """Update the attributes of this instance with the attributes from
-        TurbomindEngineConfig.
-
-        Args:
-            config (TurbomindEngineConfig): The turbomind engine config
-        """
-        if config is None:
-            return
-        for key, value in asdict(config).items():
-            if not value:
-                continue
-
-            if hasattr(self.model_config, key):
-                setattr(self.model_config, key, value)
-            if hasattr(self.attention_config, key):
-                setattr(self.attention_config, key, value)
-
-        # update from hf_overrides
-        if hasattr(config, 'hf_overrides') and config.hf_overrides:
-            hf_overrides = config.hf_overrides
-
-            if hf_overrides.get('rope_scaling'):
-                override_params = hf_overrides.get('rope_scaling')
-
-                rope_param = self.attention_config.rope_param or RopeParam(type='', base=0, dim=0)
-                rope_param.type = override_params.get('rope_type', '')
-                if rope_param.type == 'yarn' and 'original_max_position_embeddings' in override_params:
-                    rope_param.factor = self.attention_config.max_position_embeddings / override_params[
-                        'original_max_position_embeddings']
-                    rope_param.max_position_embeddings = override_params['original_max_position_embeddings']
-                else:
-                    rope_param.factor = override_params.get('factor', 1.0)
-                    rope_param.max_position_embeddings = override_params.get('original_max_position_embeddings', None)
-
-                self.attention_config.rope_param = rope_param
-            logger.warning(f'Overriding HF config with {hf_overrides}')
-
-        # use dynamic ntk
-        if config.rope_scaling_factor:
-            # some ut will create empty RopeParam, will check base/dim in src code
-            rope_param = self.attention_config.rope_param or RopeParam(type='', base=0, dim=0)
-            rope_param.type = 'dynamic'
-            rope_param.factor = config.rope_scaling_factor
-            rope_param.max_position_embeddings = self.attention_config.max_position_embeddings
-
-            self.attention_config.rope_param = rope_param
-            logger.warning(
-                '`--rope-scaling-factor` will be removed in a future release. Please instead use `--hf-overrides`.')
-
-    @classmethod
-    def from_dict(cls, config: dict | None = None):
-        """Construct TurbomindModelConfig instance from config in a dict."""
-        if config is None:
-            config = {}
-        _cfg = {field.name: config.get(field.name, {}) for field in fields(TurbomindModelConfig)}
-
-        return TurbomindModelConfig(model_config=config_from_dict(ModelConfig, _cfg['model_config']),
-                                    attention_config=config_from_dict(AttentionConfig, _cfg['attention_config']),
-                                    lora_config=config_from_dict(LoraConfig, _cfg['lora_config']))
-
-    def to_dict(self):
-        """Export to a dict."""
-        return dict(model_config=config_to_dict(self.model_config),
-                    attention_config=config_to_dict(self.attention_config),
-                    lora_config=config_to_dict(self.lora_config))
-
-    @property
-    def session_len(self):
-        return self.model_config.session_len
-
-    @property
-    def weight_type(self):
-        return self.model_config.weight_type
-
-    @property
-    def group_size(self):
-        return self.model_config.group_size
-
-    @property
-    def vocab_size(self):
-        return self.model_config.vocab_size
-
-    def __str__(self):
-        return json.dumps(self.to_dict(), indent=2)
diff --git a/lmdeploy/turbomind/deploy/converter.py b/lmdeploy/turbomind/deploy/converter.py
deleted file mode 100644
index 05b1ba526f..0000000000
--- a/lmdeploy/turbomind/deploy/converter.py
+++ /dev/null
@@ -1,285 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-import torch
-
-from lmdeploy.archs import get_model_arch, search_nested_config
-from lmdeploy.messages import TurbomindEngineConfig
-from lmdeploy.utils import get_logger
-
-from ...utils import _get_and_verify_max_len, is_bf16_supported
-from ..supported_models import SUPPORTED_ARCHS
-from .config import TurbomindModelConfig
-from .module import Transformer
-from .policy import get_input_policy
-from .source_model.base import INPUT_MODELS
-from .target_model.base import OUTPUT_MODELS, BaseOutputModel
-
-SUPPORTED_FORMATS = ['hf', 'awq', 'gptq', 'compressed-tensors', 'fp8', 'mxfp4', None]
-logger = get_logger('lmdeploy')
-
-_DEFAULT_GROUP_SIZES = {
-    'awq': 128,
-    'gptq': 128,
-    'compressed-tensors': 128,
-    'fp8': 128,
-    'mxfp4': 32,
-}
-
-_SUPPORTED_GROUP_SIZES = {
-    'awq': frozenset({128}),
-    'gptq': frozenset({128}),
-    'compressed-tensors': frozenset({32, 128}),
-    'fp8': frozenset({128}),
-    'mxfp4': frozenset({32}),
-}
-
-
-def _validate_quant_group_size(model_format: str | None, group_size: int | None) -> int | None:
-    """Normalize and validate quantized group sizes.
-
-    The low-level int4 kernels can be shared across formats, but we only expose the format/group-size combinations that
-    are verified end to end.
-    """
-    if group_size in (None, 0):
-        group_size = _DEFAULT_GROUP_SIZES.get(model_format, group_size)
-
-    supported_group_sizes = _SUPPORTED_GROUP_SIZES.get(model_format)
-    if supported_group_sizes is not None and group_size not in supported_group_sizes:
-        supported = ', '.join(map(str, sorted(supported_group_sizes)))
-        raise ValueError(f'Unsupported group_size={group_size} for model_format="{model_format}". '
-                         f'Supported group_size values: {supported}.')
-
-    return group_size
-
-
-def get_input_model_registered_name(model_path: str, model_format: str):
-    """Get the registered name of a model. The name will be used to access the
-    INPUT_MODELS registry.
-
-    Args:
-        model_path (str): the path of the input model
-        model_format (str): the format of the model, which can be one of
-            ['hf', 'awq', 'gptq', 'compressed-tensors', 'fp8', 'mxfp4']
-    """
-    arch = get_model_arch(model_path)[0]
-    register_name = SUPPORTED_ARCHS[arch]
-    return register_name
-
-
-def get_output_model_registered_name_and_config(model_path: str, model_format: str, dtype: str, group_size: int):
-    """Get the registered name of the turbomind model and its configuration
-    according to the input model path, format and user-input config. The name
-    will be used to access the OUTPUT_MODELS registry.
-
-    Args:
-        model_path (str): the path of the input model
-        model_format (str): the format of the model, which can be one of
-            ['hf', 'awq', 'gptq', 'compressed-tensors', 'fp8', 'mxfp4']
-        dtype (str): the data type of the model's weights and activations
-        group_size (int): the quantization group size used by grouped formats
-    """
-    register_name = 'tm'
-
-    has_bf16 = is_bf16_supported()
-
-    model_arch, model_config = get_model_arch(model_path)
-
-    # infer dtype from device and model config
-    if dtype == 'auto':
-        # pick dtype by device as default
-        dtype = 'bfloat16' if has_bf16 else 'float16'
-        # dtype from model (prefer `dtype` over deprecated `torch_dtype`)
-        torch_dtype = getattr(model_config, 'dtype', None)
-        if torch_dtype is None:
-            torch_dtype = getattr(model_config, 'torch_dtype', None)
-        if not torch_dtype:
-            if model_arch in ['QWenLMHeadModel', 'GptOssForCausalLM']:
-                torch_dtype = torch.bfloat16
-        TORCH_DTYPE_MAP = {torch.bfloat16: 'bfloat16', torch.float16: 'float16'}
-        dtype = TORCH_DTYPE_MAP.get(torch_dtype, dtype)
-
-    if dtype == 'bfloat16' and not has_bf16:
-        logger.warning('data type fallback to float16 since '
-                       'torch.cuda.is_bf16_supported is False')
-        dtype = 'float16'
-
-    weight_type = dtype
-
-    config = TurbomindModelConfig.from_dict()
-
-    session_len = _get_and_verify_max_len(model_config, None)
-
-    group_size = _validate_quant_group_size(model_format, group_size)
-
-    if model_format in ['awq', 'gptq', 'compressed-tensors']:
-        weight_type = 'int4'
-        dtype = 'float16'  # force float16 for int4 quantized weights
-        if model_format == 'compressed-tensors':
-            # TurboMind reuses the AWQ int4 export path for pack-quantized
-            # compressed-tensors weights after the format-specific checks above.
-            model_format = 'awq'
-    elif model_format == 'fp8':
-        weight_type = 'fp8'
-    elif model_format == 'mxfp4':
-        weight_type = 'e2m1'
-
-    expert_weight_type = weight_type
-
-    # ONLY experts are in mxfp4
-    if model_arch == 'GptOssForCausalLM':
-        weight_type = dtype
-
-    # Three weight types control allocation for mixed quantization:
-    #   weight_type        - attention weights
-    #   ffn_weight_type    - dense FFN / shared expert weights
-    #   expert_weight_type - MoE routed expert weights
-    #
-    # The assignment order matters:
-    #   1. expert_weight_type = original weight_type (before any overrides)
-    #   2. GptOss override:   weight_type -> dtype  (attn + shared experts are fp16)
-    #   3. ffn_weight_type  = weight_type           (captures post-GptOss value)
-    #   4. Mixed AWQ override: weight_type -> dtype  (only attn becomes fp16)
-    #
-    #                  weight_type   ffn_weight_type   expert_weight_type
-    #  Pure fp16       float16       float16           float16
-    #  Full AWQ        int4          int4              int4
-    #  Mixed AWQ       float16       int4              int4
-    #  GptOss mxfp4    bfloat16      bfloat16          e2m1
-    ffn_weight_type = weight_type
-
-    # When attention weights are not quantized (e.g. AWQ with self_attn in
-    # modules_to_not_convert), weight_type becomes fp16 for attention.
-    # ffn_weight_type and expert_weight_type retain int4.
-    if model_format in ['awq', 'gptq'] and weight_type != dtype:
-        quant_config = getattr(model_config, 'quantization_config', None)
-        if quant_config is None:
-            quant_config = {}
-        if isinstance(quant_config, dict):
-            modules_to_not_convert = quant_config.get('modules_to_not_convert') or []
-        else:
-            modules_to_not_convert = getattr(quant_config, 'modules_to_not_convert', None) or []
-        if any('self_attn' in m for m in modules_to_not_convert):
-            weight_type = dtype
-        if any('shared_expert' in m for m in modules_to_not_convert):
-            ffn_weight_type = dtype
-        # Detect per-layer exclusions like 'model.layers.0.' which mean
-        # ALL weights in that layer (including MoE experts) are fp16.
-        import re as _re
-        unquantized_expert_layers = []
-        for m in modules_to_not_convert:
-            _m = _re.match(r'model\.layers\.(\d+)\.?$', m)
-            if _m:
-                unquantized_expert_layers.append(int(_m.group(1)))
-        config.model_config.unquantized_expert_layers = unquantized_expert_layers
-
-    config.model_config.model_arch = model_arch
-    config.model_config.data_type = dtype
-    config.model_config.weight_type = weight_type
-    config.model_config.expert_weight_type = expert_weight_type
-    config.model_config.ffn_weight_type = ffn_weight_type
-    config.model_config.model_format = model_format
-    config.model_config.group_size = group_size
-    config.model_config.session_len = session_len
-
-    return register_name, config
-
-
-def get_tm_model(model_path,
-                 model_name,
-                 chat_template_name,
-                 engine_config: TurbomindEngineConfig,
-                 group_size: int = None,
-                 out_dir: str = None) -> BaseOutputModel:
-    """Create turbomind model.
-
-    Args:
-        model_path (str): the path of the input model, which is supposed
-            to be a local path, or huggingface hub repo_id, or modelscope
-            hub repo_id
-        model_name (str): user customized model name
-        chat_template_name (str): the name of the chat template of
-            the input model
-        engine_config(TurbomindEngineConfig): user input engine config
-        group_size(int): refers to the group_size if the input model
-            is a grouped quantized model
-        out_dir(str): the output directory where to save to turbomind model.
-            If it is None, the turbomind model won't be saved
-    """
-    _, cfg = get_model_arch(model_path)
-    quant_config = search_nested_config(cfg.to_dict(), 'quantization_config')
-    mixed_awq = False
-    if quant_config:
-        quant_method = quant_config.get('quant_method')
-        _group_size = int(quant_config.get('group_size', 0))
-        version = quant_config.get('version')
-        assert engine_config.model_format is None or engine_config.model_format == quant_method, (
-            f'mismatched quant method: user input "{engine_config.model_format}" '
-            f'vs model quant_config "{quant_method}"')
-        assert not group_size or group_size == _group_size, (f'mismatched quant group size: user input "{group_size}" '
-                                                             f'vs model quant_config "{_group_size}"')
-
-        if quant_method == 'awq':
-            assert version == 'gemm', f'unsupported quant config: {quant_config}'
-            modules_to_not_convert = quant_config.get('modules_to_not_convert') or []
-            if any('self_attn' in name for name in modules_to_not_convert):
-                mixed_awq = True
-        elif quant_method == 'gptq':
-            assert not quant_config.get('desc_act', False) and quant_config.get(
-                'sym', True), f'unsupported quant config: {quant_config}'
-        elif quant_method == 'fp8':
-            pass
-        elif quant_method == 'mxfp4':
-            _group_size = 32
-        elif quant_method == 'compressed-tensors':
-            _format = quant_config['config_groups']['group_0']['format']
-            assert _format == 'pack-quantized', ('compressed-tennsors only supports pack-quantized format, '
-                                                 f'but got {_format}')
-            _weights = quant_config['config_groups']['group_0']['weights']
-            _group_size = _weights['group_size']
-            _num_bits = _weights['num_bits']
-            _type = _weights['type']
-            assert _num_bits == 4 and _type == 'int', ('pack-quantized requires 4-bit int, '
-                                                       f'but got {_num_bits}-bit {_type}')
-        else:
-            assert 0, f'unsupported quant_config: {quant_config}'
-
-        engine_config.model_format = quant_method
-        group_size = _group_size
-
-    group_size = _validate_quant_group_size(engine_config.model_format, group_size)
-
-    input_model_name = get_input_model_registered_name(model_path, engine_config.model_format)
-
-    fp8_quant = (engine_config.model_format == 'fp8' and not quant_config)
-    input_policy = get_input_policy(engine_config.model_format)
-    input_model = INPUT_MODELS.get(input_model_name)(model_path=model_path,
-                                                     tokenizer_path=model_path,
-                                                     input_policy=input_policy,
-                                                     fp8_quant=fp8_quant)
-
-    output_model_name, tm_cfg = get_output_model_registered_name_and_config(model_path=model_path,
-                                                                            model_format=engine_config.model_format,
-                                                                            dtype=engine_config.dtype,
-                                                                            group_size=group_size)
-
-    if mixed_awq:
-        # Mixed-precision AWQ: attention weights are fp16 (not quantized),
-        # but expert weights remain as int4 AWQ for efficient inference.
-        tm_cfg.model_config.weight_type = tm_cfg.model_config.data_type
-        # expert_weight_type stays as 'int4' (set by get_output_model_registered_name_and_config)
-
-    tm_cfg.model_config.chat_template = chat_template_name
-    tm_cfg.model_config.model_name = model_name
-
-    if engine_config.attn_tp_size is not None:
-        tm_cfg.model_config.attn_tp_size = engine_config.attn_tp_size
-    if engine_config.attn_cp_size is not None:
-        tm_cfg.model_config.attn_cp_size = engine_config.attn_cp_size
-    if engine_config.mlp_tp_size is not None:
-        tm_cfg.model_config.mlp_tp_size = engine_config.mlp_tp_size
-
-    output_model = OUTPUT_MODELS.get(output_model_name)(input_model=input_model,
-                                                        cfg=tm_cfg,
-                                                        model_cls=Transformer,
-                                                        out_dir=out_dir)
-
-    return output_model
diff --git a/lmdeploy/turbomind/deploy/module.py b/lmdeploy/turbomind/deploy/module.py
deleted file mode 100644
index b62d9aead0..0000000000
--- a/lmdeploy/turbomind/deploy/module.py
+++ /dev/null
@@ -1,639 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-from abc import ABC, abstractmethod
-from functools import partial
-
-import torch
-
-from .parameter import get_params
-from .source_model.base import BaseReader
-from .target_model.base import BaseOutputModel
-
-
-def permute_v2(x: torch.Tensor, size_per_head: int = 128):
-    """
-        Contract: x.size(-1) is output dims
-    """
-
-    assert x.size(-1) > 1
-
-    output_dims = x.size(-1)
-    head_num = output_dims // size_per_head
-
-    return x.view(-1, head_num, 2, size_per_head // 2).transpose(2, 3).reshape(x.shape)
-
-
-def permute_v2_partial(x: torch.Tensor, size_per_head: int, rotary_dim: int):
-    """Permute only the first rotary_dim elements of each head.
-
-    Used when partial_rotary_factor < 1.0: only the rotary portion needs interleaving for TurboMind's RoPE kernel
-    layout.
-    """
-    assert x.size(-1) > 1
-    assert rotary_dim % 2 == 0, f'rotary_dim must be even, got {rotary_dim}'
-    assert rotary_dim <= size_per_head, f'rotary_dim ({rotary_dim}) must be <= size_per_head ({size_per_head})'
-    output_dims = x.size(-1)
-    assert output_dims % size_per_head == 0, (f'output_dims ({output_dims}) must be divisible by '
-                                              f'size_per_head ({size_per_head})')
-    head_num = output_dims // size_per_head
-    orig_shape = x.shape
-    if x.dim() == 1:
-        x = x.unsqueeze(0)
-    x = x.view(x.size(0), head_num, size_per_head)
-    rotary = x[:, :, :rotary_dim]
-    passthrough = x[:, :, rotary_dim:]
-    # Interleave rotary part: [2, rotary_dim//2] -> [rotary_dim//2, 2]
-    rotary = rotary.view(x.size(0), head_num, 2, rotary_dim // 2).transpose(2, 3).contiguous()
-    rotary = rotary.view(x.size(0), head_num, rotary_dim)
-    x = torch.cat([rotary, passthrough], dim=-1)
-    return x.reshape(orig_shape)
-
-
-def merge_qkv_v2(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, tp: int):
-    """
-        Contract: x.size(-1) is output dims
-    """
-
-    def reshape(x):
-        return x.view(x.size(0), tp, -1) if q.dim() == 2 else x.view(tp, -1)
-
-    qkv = torch.cat(tuple(map(reshape, (q, k, v))), dim=-1)
-
-    qkv = qkv.view(-1, qkv.size(-1) * tp)
-    if q.dim() == 1:
-        qkv.squeeze_()
-
-    return qkv
-
-
-def merge_qkvg_v2(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, gate: torch.Tensor, tp: int):
-    """Merge Q, K, V, and Gate with gate appended after V.
-
-    Layout per tp-shard: [Q | K | V | Gate].
-    """
-
-    def reshape(x):
-        return x.view(x.size(0), tp, -1) if q.dim() == 2 else x.view(tp, -1)
-
-    qkvg = torch.cat(tuple(map(reshape, (q, k, v, gate))), dim=-1)
-
-    qkvg = qkvg.view(-1, qkvg.size(-1) * tp)
-    if q.dim() == 1:
-        qkvg.squeeze_()
-
-    return qkvg
-
-
-def transpose(x):
-    return x.t() if x is not None else x
-
-
-def pad_out_dims(x: torch.Tensor, dims: int):
-    pad = dims - x.size(-1)
-    assert pad >= 0
-    return torch.nn.functional.pad(x, (0, pad), 'constant', 0)
-
-
-def pad_in_dims(x: torch.Tensor, dims: int):
-    if x.dim() == 1:  # 1-dim object does not have input dim (e.g. bias)
-        return x
-    pad = dims - x.size(0)
-    assert x.dim() == 2
-    assert pad >= 0
-    return torch.nn.functional.pad(x, (0, 0, 0, pad), 'constant', 0)
-
-
-# split out dims -> copy A, split-out-dims B (qkv, w1, w3)
-# split  in dims -> split-in-dims A,  copy B (  o, w2)
-def get_lora_flags(kind: str):
-    return ('lora_a' in kind, 'lora_b' in kind)
-
-
-class Module(ABC):
-
-    def __init__(self, model: BaseOutputModel):
-        self.model = model
-
-    def __call__(self, *args, **kwargs):
-        return self.apply(*args, **kwargs)
-
-    @abstractmethod
-    def apply(self, idx: int, r: BaseReader):
-        pass
-
-
-class LayerNorm(Module):
-
-    def apply(self, i: int, r: BaseReader):
-        attn_norm = r.attn_norm(i)
-        ffn_norm = r.ffn_norm(i)
-        self.model.save_split(attn_norm, f'layers.{i}.attention_norm.weight')
-        self.model.save_split(ffn_norm, f'layers.{i}.ffn_norm.weight')
-
-
-class Ffn(Module):
-    """
-    requires:
-        r.ffn(i, kind)
-    """
-
-    _ffn = 'layers.{0}.feed_forward.{1}.{2}'
-
-    def __init__(self, model: BaseOutputModel):
-        self.model = model
-        self.tp = model.mlp_tp_size
-        # inter_sizes in config are padded and may be different from what's
-        # in the weights
-        self.inter_size = model.model_config.inter_size
-        self.group_size = max(1, model.model_config.group_size)
-
-    def _export(self, inter_size: int, fmt: str, idx: int, w123, kind: str, pack_fn, apply_gs=None, **kwargs):
-        if apply_gs is None:
-            apply_gs = []
-        is_lora_a, is_lora_b = get_lora_flags(kind)
-        w1, w2, w3 = map(transpose, w123)
-
-        gs1 = self.group_size if 'w1' in apply_gs else 1
-        w1 = pad_out_dims(w1, inter_size // gs1)
-
-        gs3 = self.group_size if 'w3' in apply_gs else 1
-        w3 = pad_out_dims(w3, inter_size // gs3)
-
-        gs2 = self.group_size if 'w2' in apply_gs else 1
-        w2 = pad_in_dims(w2, inter_size // gs2)
-
-        w1, w2, w3 = map(pack_fn, (w1, w2, w3))
-        self.model.save_split(w1, fmt.format(idx, 'w1', kind), split_dim=-1, split_num=self.tp, copy=is_lora_a)
-        self.model.save_split(w3, fmt.format(idx, 'w3', kind), split_dim=-1, split_num=self.tp, copy=is_lora_a)
-        self.model.save_split(w2, fmt.format(idx, 'w2', kind), split_dim=0, split_num=self.tp, copy=is_lora_b)
-
-    def apply(self, i: int, r: BaseReader):
-        if i >= len(self.inter_size) or not self.inter_size[i]:
-            return
-        keys = r.ffn(i, None)
-
-        for e in get_params(keys):
-            e(partial(self._export, self.inter_size[i], self._ffn), partial(r.ffn, i), i)
-
-
-class MoeFfn(Ffn):
-    """
-    requires:
-        r.moe_ffn_expert(e, i, kind)
-        r.moe_ffn_gate(i)
-        r.moe_ffn_shared_gate(i)
-    """
-
-    _moe_ffn_expert = 'layers.{0}.moe_ffn.experts.E.{1}.{2}'
-    _moe_ffn_gate = 'layers.{0}.moe_ffn.gate.{1}'
-    _moe_ffn_shared_gate = 'layers.{0}.moe_ffn.shared_gate.weight'
-
-    def __init__(self, model: BaseOutputModel):
-        super().__init__(model)
-        self.expert_num = model.model_config.expert_num
-        self.inter_size = model.model_config.expert_inter_size
-        self.shared_gate = model.model_config.moe_shared_gate
-
-    def apply(self, i: int, r: BaseReader):
-        if i >= len(self.expert_num) or self.expert_num[i] == 0:
-            return
-
-        # Export expert weights with outer loop over experts (not params)
-        # to ensure each expert's full weight set is grouped together
-        for e in range(self.expert_num[i]):
-            for p in get_params(r.moe_ffn_expert(), 1):
-                fmt = self._moe_ffn_expert.replace('E', str(e))
-                p(partial(self._export, self.inter_size, fmt), partial(r.moe_ffn_expert, e, i), i)
-
-        # router
-        gate = transpose(r.moe_ffn_gate(i, 'weight'))
-        self.model.save_split(gate, self._moe_ffn_gate.format(i, 'weight'))
-        bias = r.moe_ffn_gate(i, 'bias')
-        if bias is not None:
-            self.model.save_split(bias, self._moe_ffn_gate.format(i, 'bias'))
-
-        # Export score_correction_bias for noaux_tc routing (GLM 4.7 Flash)
-        correction_bias = getattr(r, 'moe_ffn_gate_correction_bias', None)
-        if callable(correction_bias):
-            correction = correction_bias(i)
-            if correction is not None:
-                self.model.save_split(correction, self._moe_ffn_gate.format(i, 'score_correction_bias'))
-
-        if self.shared_gate:
-            shared_gate = transpose(r.moe_ffn_shared_gate(i))
-            self.model.save_split(shared_gate, self._moe_ffn_shared_gate.format(i))
-
-
-class Attn(Module):
-    """
-    requires:
-        r.attn(i, kind)
-    """
-
-    _attn = 'layers.{0}.attention.{1}.{2}'
-
-    def __init__(self, model: BaseOutputModel):
-        self.model = model
-        self.tp = model.attn_tp_size
-        self.head_dim = model.model_config.size_per_head
-        self.attn_bias = model.model_config.attn_bias
-        self.qk_norm = model.model_config.qk_norm
-        self.attn_sink = model.model_config.attn_sink
-        self.group_size = max(1, model.model_config.group_size)
-        self.attn_output_gate = model.model_config.attn_output_gate
-        rope_param = model.attention_config.rope_param
-        self.rope_dim = rope_param.dim if rope_param else self.head_dim
-        self.head_num = model.model_config.head_num
-
-    def _split_q_gate(self, q):
-        """Split interleaved Q+gate tensor into separate Q and gate.
-
-        HF layout: [Q_head0, Gate_head0, Q_head1, Gate_head1, ...]
-        Returns: (q_real, gate) each with shape [..., num_heads * head_dim]
-        """
-        output_dims = q.size(-1)
-        head_num = output_dims // (self.head_dim * 2)
-        orig_shape = list(q.shape)
-        if q.dim() == 1:
-            q = q.unsqueeze(0)
-        q = q.view(q.size(0), head_num, 2, self.head_dim)
-        q_real = q[:, :, 0, :].contiguous()
-        gate = q[:, :, 1, :].contiguous()
-        new_last_dim = head_num * self.head_dim
-        q_real = q_real.reshape(-1, new_last_dim)
-        gate = gate.reshape(-1, new_last_dim)
-        if len(orig_shape) == 1:
-            q_real = q_real.squeeze(0)
-            gate = gate.squeeze(0)
-        return q_real, gate
-
-    def _reorder_and_merge(self, qkvo, gs: int):
-        q, k, v, o = qkvo
-        gate = None
-        # When attn_output_gate, Q is interleaved [Q0, G0, Q1, G1, ...]
-        # Split into separate Q and gate before permuting
-        if self.attn_output_gate and q is not None:
-            q, gate = self._split_q_gate(q)
-        # reorder output dim for tm's rotary embedding layout
-        if self.model.permute_qk:
-            if gs == 1:
-                if self.rope_dim < self.head_dim:
-                    q = permute_v2_partial(q, self.head_dim, self.rope_dim)
-                    k = permute_v2_partial(k, self.head_dim, self.rope_dim)
-                else:
-                    q = permute_v2(q, self.head_dim)
-                    k = permute_v2(k, self.head_dim)
-            else:
-                assert gs % self.head_dim == 0
-        # Merge QKV with gate appended at end if present
-        if gate is not None:
-            qkv = merge_qkvg_v2(q, k, v, gate, self.tp)
-        else:
-            qkv = merge_qkv_v2(q, k, v, self.tp)
-        # zero bias for `wo` when `w_qkv` has bias but `wo` doesn't
-        if o is None and q.dim() == 1:
-            o = torch.zeros_like(q)
-        return qkv, o
-
-    def _repeat_kv(self, qkvo, gs: int, kind: str):
-        """Replicate kv."""
-        q, k, v, o = qkvo
-        head_dim = self.model.model_config.size_per_head // gs
-        kv_head_num = self.model.model_config.kv_head_num // self.model.repeat_kv
-        hidden_dim = self.model.model_config.hidden_units
-
-        def _repeat(x):
-            n = self.model.repeat_kv
-
-            x = x.reshape(-1, kv_head_num, head_dim)
-            x = x.repeat(1, 1, n)
-            x = x.reshape(-1, kv_head_num * n * head_dim)
-
-            return x
-
-        k, v = map(_repeat, (k, v))
-
-        if kind == 'bias':
-            if o is None:
-                o = torch.zeros(hidden_dim, dtype=q.dtype, device=q.device)
-            q, k, v, o = map(torch.squeeze, (q, k, v, o))
-
-        return (q, k, v, o)
-
-    def _export(self, idx: int, qkvo, kind: str, pack_fn, apply_gs=None, **kwargs):
-        if apply_gs is None:
-            apply_gs = []
-        if all(x is None for x in qkvo):
-            return
-        is_lora_a, is_lora_b = get_lora_flags(kind)
-        assert not (is_lora_a or is_lora_b)
-
-        qkvo = tuple(map(transpose, qkvo))
-
-        gs = self.group_size if ('w1' in apply_gs) else 1
-
-        if self.model.repeat_kv:
-            qkvo = self._repeat_kv(qkvo, gs, kind)
-
-        qkv, o = self._reorder_and_merge(qkvo, gs)
-
-        self.model.save_split(pack_fn(qkv),
-                              self._attn.format(idx, 'w_qkv', kind),
-                              split_dim=-1,
-                              split_num=self.tp,
-                              copy=is_lora_a)
-        self.model.save_split(pack_fn(o),
-                              self._attn.format(idx, 'wo', kind),
-                              split_dim=0,
-                              split_num=self.tp,
-                              copy=is_lora_b)
-
-    def apply(self, i: int, r: BaseReader):
-        for e in get_params(r.attn(i, None), bias=self.attn_bias):
-            e(self._export, partial(r.attn, i), i)
-        if self.qk_norm:
-            q, k = r.qk_norm(i)
-            if q is not None and k is not None:
-                if self.model.permute_qk:
-                    if self.rope_dim < self.head_dim:
-                        q = permute_v2_partial(q, self.head_dim, self.rope_dim)
-                        k = permute_v2_partial(k, self.head_dim, self.rope_dim)
-                    else:
-                        q = permute_v2(q, self.head_dim)
-                        k = permute_v2(k, self.head_dim)
-                self.model.save_split(q, self._attn.format(i, 'q_norm', '')[:-1])
-                self.model.save_split(k, self._attn.format(i, 'k_norm', '')[:-1])
-        if self.attn_sink:
-            sinks = r.attn_sinks(i)
-            self.model.save_split(sinks, self._attn.format(i, 'sinks', '')[:-1], split_dim=-1, split_num=self.tp)
-
-
-class MLA(Module):
-    """
-    requires:
-        r.mla(i, kind)
-        r.mla_norm(i)
-    """
-
-    _mla = 'layers.{0}.attention.{1}.{2}'
-
-    def __init__(self, model: BaseOutputModel):
-        self.model = model
-
-    def _export(self, idx: int, xs, kind: str, pack_fn, **kwargs):
-        if all(x is None for x in xs):
-            return
-        q_a, q_b, q, kv_a, kv_b, o = xs
-
-        cfg = self.model.model_config
-        head_num = cfg.head_num
-        kv_lora_rank = cfg.kv_lora_rank
-        qk_rope_dim = cfg.qk_rope_dim
-        size_per_head = cfg.size_per_head
-        v_head_dim = cfg.v_head_dim
-
-        # ========== MLA Weight Folding for Dimension Mismatch ==========
-        # When kv_lora_rank != qk_nope_dim (e.g., GLM 4.7 Flash: 512 != 512+64=576),
-        # fold the kc/vc compression/decompression BMMs into q_b_proj/o_proj weights
-        # at conversion time to avoid runtime overhead.
-        if kind == 'weight' and kv_lora_rank and q is None and q_b is not None and kv_b is not None and o is not None:
-            if not (torch.is_floating_point(q_b) and torch.is_floating_point(kv_b) and torch.is_floating_point(o)):
-                raise ValueError('MLA weight folding requires floating-point attention weights.')
-
-            orig_q_head_dim = q_b.size(0) // head_num
-            orig_qk_nope_dim = orig_q_head_dim - qk_rope_dim
-            orig_kv_dim_total = kv_b.size(0) // head_num
-            orig_v_head_dim = o.size(1) // head_num
-            actual_orig_qk_nope_dim = orig_kv_dim_total - orig_v_head_dim
-
-            if abs(orig_qk_nope_dim - actual_orig_qk_nope_dim) > 1:
-                raise ValueError(f'Dimension mismatch: inferred qk_nope from q_b ({orig_qk_nope_dim}) != '
-                                 f'inferred from kv_b ({actual_orig_qk_nope_dim})')
-
-            orig_qk_nope_dim = actual_orig_qk_nope_dim
-            target_nope_dim = size_per_head - qk_rope_dim
-            target_v_head_dim = v_head_dim
-
-            if orig_qk_nope_dim != target_nope_dim or orig_v_head_dim != target_v_head_dim:
-                if target_nope_dim != kv_lora_rank or target_v_head_dim != kv_lora_rank:
-                    raise ValueError(f'MLA folding expects v_head_dim and nope_dim to equal kv_lora_rank, '
-                                     f'got nope={target_nope_dim}, v_head={target_v_head_dim}, rank={kv_lora_rank}')
-
-                if kv_b.size(1) != kv_lora_rank:
-                    raise ValueError(f'kv_b_proj second dim must equal kv_lora_rank for MLA folding, '
-                                     f'got {kv_b.size(1)} != {kv_lora_rank}')
-
-                # Split kv_b into kc and vc
-                kv_b_per_head = kv_b.reshape(head_num, orig_qk_nope_dim + orig_v_head_dim, kv_lora_rank)
-                kc_w = kv_b_per_head[:, :orig_qk_nope_dim, :]
-                vc_w = kv_b_per_head[:, orig_qk_nope_dim:, :]
-
-                # Fold kc into q_b_proj
-                q_b_per_head = q_b.reshape(head_num, orig_q_head_dim, q_b.size(1))
-                q_nope_w = q_b_per_head[:, :orig_qk_nope_dim, :]
-                q_rope_w = q_b_per_head[:, orig_qk_nope_dim:, :]
-                q_nope_expanded = torch.bmm(kc_w.transpose(1, 2), q_nope_w)
-                q_b_folded = torch.cat([q_nope_expanded, q_rope_w], dim=1)
-                q_b = q_b_folded.reshape(head_num * size_per_head, q_b.size(1))
-
-                # Fold vc into o_proj
-                o_per_head = o.reshape(o.size(0), head_num, orig_v_head_dim)
-                o_folded = torch.bmm(o_per_head.permute(1, 0, 2), vc_w)
-                o = o_folded.permute(1, 0, 2).reshape(o.size(0), head_num * kv_lora_rank)
-
-                # Set kv_b to identity (kc/vc are now absorbed)
-                eye = torch.eye(kv_lora_rank, dtype=kv_b.dtype, device=kv_b.device)
-                kv_b = torch.cat([eye, eye], dim=0).repeat(head_num, 1)
-        # ========== End MLA Weight Folding ==========
-
-        # Transpose after folding
-        q_a, q_b, q, kv_a, kv_b, o = map(transpose, (q_a, q_b, q, kv_a, kv_b, o))
-
-        if q is not None:
-            q_b = q
-
-        # Pad o_proj to size_per_head if present
-        if o is not None:
-            o = o.reshape(head_num, v_head_dim, -1)
-            o = torch.nn.functional.pad(o, (0, 0, size_per_head - v_head_dim, 0, 0, 0))
-            o = o.view(head_num * size_per_head, cfg.hidden_units)
-
-        tp = self.model.attn_tp_size
-
-        # Export MLA weights (handle None for folded-away tensors)
-        if q_a is not None:
-            self.model.save_split(pack_fn(q_a), self._mla.format(idx, 'q_a_proj', kind))
-        q_b_name = 'q_proj' if q_a is None else 'q_b_proj'
-        if q_b is not None:
-            self.model.save_split(pack_fn(q_b), self._mla.format(idx, q_b_name, kind), split_dim=-1, split_num=tp)
-        if kv_a is not None:
-            self.model.save_split(pack_fn(kv_a), self._mla.format(idx, 'kv_a_proj', kind))
-        # if kv_b is not None:
-        #     self.model.save_split(pack_fn(kv_b), self._mla.format(idx, 'kv_b_proj', kind), split_dim=-1, split_num=tp)
-        if o is not None:
-            self.model.save_split(pack_fn(o), self._mla.format(idx, 'wo', kind), split_dim=0, split_num=tp)
-
-    _layernorm = 'layers.{0}.attention.{1}_a_layernorm'
-
-    def apply(self, i: int, r: BaseReader):
-
-        for f in get_params(r.attn(i, None), bias=False):
-            f(self._export, partial(r.mla, i), i)
-
-        q, k = r.mla_norm(i)
-        if q is not None:
-            self.model.save_split(q, self._layernorm.format(i, 'q'))
-        self.model.save_split(k, self._layernorm.format(i, 'kv'))
-
-
-class LinearAttn(Module):
-    _linear_attn = 'layers.{0}.linear_attn.{1}.{2}'
-
-    def __init__(self, model: BaseOutputModel):
-        self.model = model
-        self.tp = model.attn_tp_size
-        cfg = model.model_config
-        self.key_dim = cfg.linear_num_key_heads * cfg.linear_key_head_dim
-        self.value_dim = cfg.linear_num_value_heads * cfg.linear_value_head_dim
-
-    def _tp_interleave_qkv(self, tensor, dim):
-        """Split a concatenated [Q, K, V] tensor into components, reshape each
-        for TP interleaving, and re-concatenate.
-
-        in_proj_qkv layout along ``dim``: Q(key_dim) | K(key_dim) | V(value_dim).
-        A naive split doesn't respect component boundaries when key_dim and
-        value_dim differ.  This method splits Q/K/V, reshapes each to
-        ``(tp, -1)`` along ``dim``, concatenates per-TP-shard, then flattens
-        so that a subsequent ``save_split(split_dim=dim)`` gives each rank the
-        correct portion.
-        """
-        if dim < 0:
-            dim = tensor.dim() + dim
-        q, k, v = torch.split(tensor, [self.key_dim, self.key_dim, self.value_dim], dim=dim)
-
-        def reshape(x):
-            # Move TP axis to a new dimension right after ``dim``
-            shape = list(x.shape)
-            d = shape[dim]
-            new_shape = shape[:dim] + [self.tp, d // self.tp] + shape[dim + 1:]
-            return x.view(new_shape)
-
-        parts = torch.cat([reshape(q), reshape(k), reshape(v)], dim=dim + 1)
-        # Collapse tp and per-shard dims back
-        shape = list(parts.shape)
-        final_shape = shape[:dim] + [shape[dim] * shape[dim + 1]] + shape[dim + 2:]
-        return parts.reshape(final_shape)
-
-    def apply(self, i: int, r: BaseReader):
-        layer_types = getattr(self.model.model_config, 'layer_types', [])
-        if i >= len(layer_types) or layer_types[i] != 'linear_attention':
-            return
-
-        for kind in ['weight', 'bias']:
-            weights = r.linear_attn(i, kind)
-            if not weights:
-                continue
-
-            names = ['conv1d', 'in_proj_qkv', 'in_proj_z', 'in_proj_b', 'in_proj_a', 'out_proj', 'A_log', 'dt_bias']
-            for name, tensor in zip(names, weights):
-                if tensor is None:
-                    continue
-                if name == 'conv1d':
-                    # conv1d shape: (conv_dim, 1, d_conv) where
-                    # conv_dim = key_dim*2 + value_dim.  Interleave Q/K/V
-                    # portions along dim 0 before splitting for TP.
-                    tensor = self._tp_interleave_qkv(tensor, dim=0)
-                    self.model.save_split(tensor,
-                                          self._linear_attn.format(i, name, kind),
-                                          split_dim=0,
-                                          split_num=self.tp)
-                elif name in ['A_log', 'dt_bias']:
-                    # Split per-head params across TP ranks (use -1 to
-                    # avoid the 1-D copy shortcut in save_split).
-                    self.model.save_split(tensor,
-                                          self._linear_attn.format(i, name, kind),
-                                          split_dim=-1,
-                                          split_num=self.tp)
-                elif name == 'out_proj':
-                    self.model.save_split(transpose(tensor),
-                                          self._linear_attn.format(i, name, kind),
-                                          split_dim=0,
-                                          split_num=self.tp)
-                elif name == 'in_proj_qkv':
-                    # in_proj_qkv: (conv_dim, hidden) where conv_dim =
-                    # key_dim*2 + value_dim.  After transpose the QKV
-                    # components are along dim -1.  Interleave for TP so
-                    # each shard gets the correct Q/K/V slice.
-                    t = transpose(tensor)
-                    t = self._tp_interleave_qkv(t, dim=-1)
-                    self.model.save_split(t, self._linear_attn.format(i, name, kind), split_dim=-1, split_num=self.tp)
-                else:
-                    self.model.save_split(transpose(tensor),
-                                          self._linear_attn.format(i, name, kind),
-                                          split_dim=-1,
-                                          split_num=self.tp)
-
-        norm = r.linear_norm(i, 'weight')
-        if norm is not None:
-            self.model.export_weight(norm, f'layers.{i}.linear_attn.norm.weight')
-
-
-class Misc(Module):
-    """
-    requires:
-        r.tok_embeddings()
-        r.norm_weight()
-        r.output_weight()
-    """
-
-    def apply(self, i: int, r: BaseReader):
-        """Export embedding, norm, output weight."""
-        emb = r.tok_embeddings()
-        norm_weight = r.norm_weight()
-        output_weight = r.output_weight()
-
-        def pad_weight(tensor: torch.Tensor, tp: int):
-            pad_size = None
-            vocab_size = self.model.model_config.vocab_size
-            if vocab_size % tp != 0:
-                pad_size = (vocab_size + tp - 1) // tp * tp - vocab_size
-            if pad_size is None:
-                return tensor
-            return torch.nn.functional.pad(tensor, (0, 0, 0, pad_size), 'constant', 0)
-
-        tp = self.model.attn_tp_size * self.model.attn_cp_size
-        if emb is not None:
-            emb = pad_weight(emb, tp=tp)
-            self.model.save_split(emb, 'tok_embeddings.weight', split_dim=1, split_num=tp)
-        if norm_weight is not None:
-            self.model.export_weight(norm_weight, 'norm.weight')
-        if output_weight is not None:
-            output_weight = pad_weight(output_weight, tp=tp)
-            # transpose
-            self.model.save_split(output_weight.t(), 'output.weight', split_dim=1, split_num=tp)
-
-
-class Transformer:
-
-    def __init__(self, model: BaseOutputModel):
-        self.model = model
-        modules = [LayerNorm]
-        if model.model_config.kv_lora_rank:
-            modules.append(MLA)
-        else:
-            modules.append(Attn)
-        if getattr(model.model_config, 'layer_types', []):
-            modules.append(LinearAttn)
-        if model.model_config.inter_size:
-            modules.append(Ffn)
-        if model.model_config.expert_num:
-            modules.append(MoeFfn)
-        self.modules = [c(model) for c in modules]
-        self.misc = Misc(model)
-
-    def __call__(self, i: int, r: BaseReader):
-        if i >= 0:
-            for m in self.modules:
-                m(i, r)
-            return 1
-        else:
-            self.misc(i, r)
diff --git a/lmdeploy/turbomind/deploy/parameter.py b/lmdeploy/turbomind/deploy/parameter.py
deleted file mode 100644
index 59c6f0158f..0000000000
--- a/lmdeploy/turbomind/deploy/parameter.py
+++ /dev/null
@@ -1,160 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-from abc import abstractmethod
-
-import torch
-
-
-def identity(x):
-    return x
-
-
-def to_half(x: torch.Tensor):
-    return x.to(torch.half)
-
-
-def to_float(x: torch.Tensor):
-    return x.to(torch.float)
-
-
-def to_fp8(x: torch.Tensor):
-    assert x.dtype == torch.uint8
-    return x.view(dtype=torch.float8_e4m3fn)
-
-
-def pack_u4_row(x: torch.Tensor) -> torch.Tensor:
-    assert x.dtype == torch.uint8, f'x.dtype: {x.dtype}'
-    xs = x.view(*x.shape[:-1], -1, 8).split(1, dim=-1)
-    a = torch.zeros(xs[0].shape, dtype=torch.int32, device=x.device)
-    for t in reversed(xs):
-        a = (a << 4) | t
-    return a.squeeze(dim=-1)
-
-
-def generate_zero_point(scales):
-    """Synthesize symmetric int4 zero-points from exported scale shapes."""
-    return tuple(torch.full(s.shape, 8, dtype=torch.uint8) for s in scales)
-
-
-class Parameter:
-    KEY = ()
-
-    @classmethod
-    def take(cls, keys: list[str]):
-        if not any(k.endswith(cls.KEYS[0]) for k in keys):
-            return False
-        xs = []
-        for k in keys:
-            if any(k.endswith(p) for p in cls.KEYS):
-                xs.append(k)
-        for x in xs:
-            keys.remove(x)
-        return xs
-
-    @abstractmethod
-    def __call__(cls, f, g, i):
-        pass
-
-
-class QuantWeightOnly(Parameter):
-    AWQ_KEYS = '.qweight', '.scales', '.qzeros'
-    COMPRESSED_KEYS = '.weight_packed', '.weight_scale', '.weight_zero_point'
-    KEYS = AWQ_KEYS + COMPRESSED_KEYS
-
-    @classmethod
-    def take(cls, keys: list[str]):
-        if any(k.endswith(cls.AWQ_KEYS[0]) for k in keys):
-            suffixes = cls.AWQ_KEYS
-        elif any(k.endswith(cls.COMPRESSED_KEYS[0]) for k in keys):
-            suffixes = cls.COMPRESSED_KEYS
-        else:
-            return False
-
-        xs = []
-        for k in keys:
-            if any(k.endswith(p) for p in suffixes):
-                xs.append(k)
-        for x in xs:
-            keys.remove(x)
-        return xs
-
-    def __init__(self, xs):
-        self.compressed_tensors = any(key.endswith(self.COMPRESSED_KEYS[0]) for key in xs)
-        self.has_zero_point = any(key.endswith(self.COMPRESSED_KEYS[2]) for key in xs)
-
-    def _get(self, g, kind: str):
-        if not self.compressed_tensors:
-            return g(kind)
-
-        mapping = {
-            'qweight': 'weight_packed',
-            'scales': 'weight_scale',
-            'qzeros': 'weight_zero_point',
-        }
-        return g(mapping[kind])
-
-    def __call__(self, f, g, i):
-        f(i, self._get(g, 'qweight'), 'qweight', pack_u4_row)
-        scales = self._get(g, 'scales')
-        f(i, scales, 'scales', to_half, apply_gs=['w2'])
-        if self.compressed_tensors and not self.has_zero_point:
-            zeros = generate_zero_point(scales)
-        else:
-            zeros = self._get(g, 'qzeros')
-        f(i, zeros, 'zeros', to_half, apply_gs=['w2'])
-
-
-class WeightScaleInv(Parameter):
-    KEYS = '.weight_scale_inv', '.weight'
-
-    # TODO: flag any operations crossing the quant blocks as illegal
-    def __call__(self, f, g, i):
-        f(i, g('weight_scale_inv'), 'scales', to_float, apply_gs=['w1', 'w3', 'w2'])
-        f(i, g('weight'), 'weight', identity)
-
-
-class Mxfp4Weight(Parameter):
-    KEYS = '.blocks', '.scales'
-
-    def __call__(self, f, g, i):
-        f(i, g('blocks'), 'weight', pack_u4_row)
-        f(i, g('scales'), 'scales', identity, apply_gs=['w2'])
-
-
-class Weight(Parameter):
-    KEYS = '.weight',
-
-    def __call__(self, f, g, i):
-        f(i, g('weight'), 'weight', identity)
-
-
-class Bias(Parameter):
-    KEYS = '.bias',
-
-    def __call__(self, f, g, i):
-        f(i, g('bias'), 'bias', identity)
-
-
-class PLora(Parameter):
-    KEYS = '.Plora_A.weight', '.Plora_B.weight'
-
-    def __call__(self, f, g, i):
-        f(i, g('Plora_A.weight'), 'lora_a.weight', identity)
-        f(i, g('Plora_B.weight'), 'lora_b.weight', identity)
-
-
-def get_params(keys: list[str], bias=0):
-    ps = []
-    if PLora.take(keys):
-        ps.append(PLora())
-    xs = QuantWeightOnly.take(keys)
-    if xs:
-        ps.append(QuantWeightOnly(xs))
-    if WeightScaleInv.take(keys):
-        ps.append(WeightScaleInv())
-    if Mxfp4Weight.take(keys):
-        ps.append(Mxfp4Weight())
-    if Weight.take(keys):
-        ps.append(Weight())
-    if bias and Bias.take(keys):
-        ps.append(Bias())
-    return ps
diff --git a/lmdeploy/turbomind/deploy/policy.py b/lmdeploy/turbomind/deploy/policy.py
deleted file mode 100644
index 0e4c061c0d..0000000000
--- a/lmdeploy/turbomind/deploy/policy.py
+++ /dev/null
@@ -1,93 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-
-import torch.cuda
-
-
-def to_cuda(x: torch.Tensor, *args):
-    return x.cuda()
-
-
-def get_u4_slices(x: torch.Tensor, dtype: torch.dtype) -> list[torch.Tensor]:
-    MAP = {torch.int32: 8, torch.uint8: 2}
-    xs = []
-    for _ in range(MAP[x.dtype]):
-        xs.append((x & 15).to(dtype))
-        x = x >> 4
-    return xs
-
-
-def unpack_awq_gemm(x: torch.Tensor) -> torch.Tensor:
-    xs = get_u4_slices(x, torch.uint8)
-    order = [0, 4, 1, 5, 2, 6, 3, 7]
-    ys = [xs[i] for i in order]
-    return torch.stack(ys, dim=-1).view(*x.shape[:-1], -1)
-
-
-def process_awq_gemm(x: torch.Tensor, kind: str):
-    x = x.cuda()
-    if x.dtype == torch.int32:
-        x = unpack_awq_gemm(x)
-    if kind in ['qweight', 'qzeros', 'scales']:
-        x = x.t()
-    return x
-
-
-def process_gptq(x: torch.Tensor, kind: str):
-    x = x.cuda()
-    if x.dtype == torch.int32:
-        xs = get_u4_slices(x, torch.uint8)
-        if kind == 'qweight':  # (k/8,n)
-            x = torch.stack(xs, dim=1).view(-1, x.size(-1))
-        else:  # 'qzeros' (k/g,n/8)
-            x = torch.stack(xs, dim=-1).view(x.size(0), -1) + 1
-    if kind in ['qweight', 'qzeros', 'scales']:
-        x = x.t()
-    return x
-
-
-def process_mxfp4(x: torch.Tensor, kind: str):
-    # print(x.shape, x.dtype, kind)
-    x = x.cuda()
-    if kind == 'blocks':
-        xs = get_u4_slices(torch.flatten(x, start_dim=-2), torch.uint8)
-        x = torch.flatten(torch.stack(xs, dim=-1), start_dim=-2)
-    if kind == 'scales':
-        pass
-    return x
-
-
-def process_fp8(x: torch.Tensor, kind: str):
-    x = x.cuda()
-    if x.dtype == torch.float8_e4m3fn:
-        # some ops (e.g. torch.cat) for fp8 is not implemented in pytorch
-        return x.view(dtype=torch.uint8)
-    elif kind != 'weight_scale_inv' and x.dtype == torch.float:
-        return x.to(dtype=torch.bfloat16)
-    else:
-        return x.to(dtype=torch.bfloat16)
-
-
-def process_compressed_tensor(x: torch.Tensor, kind: str):
-    x = x.cuda()
-    if x.dtype == torch.int32:
-        xs = get_u4_slices(x, torch.uint8)
-        if kind == 'weight_packed':  # (out_channels, in_channels // 8)
-            x = torch.stack(xs, dim=-1).view(*x.shape[:-1], -1)
-        elif kind == 'weight_zero_point':  # (out_channels // 8, in_channels // group_size)
-            x = torch.stack(xs, dim=1).view(-1, x.size(-1))
-    return x
-
-
-def get_input_policy(model_format):
-    if model_format == 'awq':
-        return process_awq_gemm
-    elif model_format == 'gptq':
-        return process_gptq
-    elif model_format == 'mxfp4':
-        return process_mxfp4
-    elif model_format == 'fp8':
-        return process_fp8
-    elif model_format == 'compressed-tensors':
-        return process_compressed_tensor
-    else:
-        return to_cuda
diff --git a/lmdeploy/turbomind/deploy/source_model/__init__.py b/lmdeploy/turbomind/deploy/source_model/__init__.py
deleted file mode 100644
index 11a17bea9d..0000000000
--- a/lmdeploy/turbomind/deploy/source_model/__init__.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-from .baichuan import Baichuan2Model, BaichuanModel  # noqa: F401
-from .deepseek2 import DeepSeek2Model  # noqa: F401
-from .deepseek_vl import DeepSeekVLModel  # noqa: F401
-from .glm4 import Glm4Model  # noqa: F401
-from .glm4_moe_lite import Glm4MoeLiteModel  # noqa: F401
-from .gpt_oss import GptOssModel  # noqa: F401
-from .internlm2 import InternLM2Model  # noqa: F401
-from .internvl import InternVLModel  # noqa: F401
-from .llama import LlamaModel  # noqa: F401
-from .llava import LlavaModel  # noqa: F401
-from .minicpmv import MiniCPMVModel  # noqa: F401
-from .mixtral import MixtralModel  # noqa: F401
-from .molmo import MolmoModel  # noqa: F401
-from .qwen import QwenModel  # noqa: F401
-from .xcomposer2 import Xcomposer2Model  # noqa: F401
diff --git a/lmdeploy/turbomind/deploy/source_model/baichuan.py b/lmdeploy/turbomind/deploy/source_model/baichuan.py
deleted file mode 100644
index 51ca34b55a..0000000000
--- a/lmdeploy/turbomind/deploy/source_model/baichuan.py
+++ /dev/null
@@ -1,48 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-
-import torch
-
-from .base import INPUT_MODELS
-from .llama import LlamaModel, LlamaReader
-
-
-class BaichuanReader(LlamaReader):
-    """BaichuanReader."""
-
-    def _attn(self, i: int, kind: str):
-        """Get q, k, v, o kind for layer i."""
-        q, k, v, o = (None, ) * 4
-        pack_key = f'model.layers.{i}.self_attn.W_pack.{kind}'
-        qkv = self.transform(self.params.get(pack_key), kind)
-        if qkv is not None:
-            q, k, v = torch.split(qkv, qkv.shape[0] // 3, dim=0)
-        o = self.params.get(f'model.layers.{i}.self_attn.o_proj.{kind}')
-        o = self.transform(o, kind)
-        return q, k, v, o
-
-
-@INPUT_MODELS.register_module(name='baichuan')
-class BaichuanModel(LlamaModel):
-    """Llama model in baichuan format."""
-
-    Reader = BaichuanReader
-
-
-class Baichuan2Reader(BaichuanReader):
-    """Baichuan2Reader."""
-
-    def output_weight(self):
-        """Get output."""
-        # https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat/blob/main/modeling_baichuan.py#L507
-        tensor = self.params.get('lm_head.weight', None)
-        if tensor is not None:
-            tensor = tensor.cuda()
-            tensor = torch.nn.functional.normalize(tensor)
-        return tensor
-
-
-@INPUT_MODELS.register_module(name='baichuan2')
-class Baichuan2Model(LlamaModel):
-    """Llama model in baichuan format."""
-
-    Reader = Baichuan2Reader
diff --git a/lmdeploy/turbomind/deploy/source_model/base.py b/lmdeploy/turbomind/deploy/source_model/base.py
deleted file mode 100644
index 9bc6ca3bbc..0000000000
--- a/lmdeploy/turbomind/deploy/source_model/base.py
+++ /dev/null
@@ -1,46 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-from abc import ABC, abstractmethod
-from collections.abc import Iterator
-
-import torch
-from mmengine import Registry
-
-INPUT_MODELS = Registry('source model', locations=['lmdeploy.turbomind.deploy.source_model.base'])
-
-
-class BaseReader(ABC):
-    """Mapping between TM modules and source modules."""
-
-    def __init__(self):
-        pass
-
-    def transform(self, x: torch.Tensor | None, kind: str) -> torch.Tensor | None:
-        return None if x is None else self._transform(x, kind)
-
-    @abstractmethod
-    def _transform(self, x: torch.Tensor, kind: str):
-        """Transform x."""
-        pass
-
-
-class BaseInputModel(ABC):
-    """Base class for input model."""
-
-    def __init__(self, model_path: str, tokenizer_path: str, **kwargs):
-        """Constructor for BaseInputModel.
-
-        Args:
-            model_path (str): the path of the model.
-            tokenizer_path (str): the path of the tokenizer model.
-        """
-        self.model_path = model_path
-        self.tokenizer_path = tokenizer_path
-
-    @abstractmethod
-    def model_info(self) -> dict:
-        """Read model info."""
-        pass
-
-    @abstractmethod
-    def readers(self) -> Iterator[BaseReader]:
-        pass
diff --git a/lmdeploy/turbomind/deploy/source_model/deepseek2.py b/lmdeploy/turbomind/deploy/source_model/deepseek2.py
deleted file mode 100644
index 79b6d3c354..0000000000
--- a/lmdeploy/turbomind/deploy/source_model/deepseek2.py
+++ /dev/null
@@ -1,156 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-import math
-import os
-
-from ..config import RopeParam
-from .base import INPUT_MODELS
-from .llama import LlamaModel, LlamaReader
-
-
-class DeepSeek2Reader(LlamaReader):
-
-    def moe_ffn_gate(self, i, kind):
-        return self.params.get(f'model.layers.{i}.mlp.gate.{kind}')
-
-    def moe_ffn_expert(self, e=None, i=None, kind=None):
-        if not kind:
-            return self.filter(r'experts', i)
-        result = []
-        for key in ['gate', 'down', 'up']:
-            name = f'model.layers.{i}.mlp.experts.{e}.{key}_proj.{kind}'
-            tensor = self.params.get(name)
-            tensor = self.transform(tensor, kind)
-            result.append(tensor)
-        return (*result, )
-
-    def _ffn(self, i: int, kind: str):
-        """Get ffn kind for layer i."""
-        if not kind:
-            # Filter by layer number to get only keys for this specific layer
-            if i == 0:
-                pattern = rf'model\.layers\.{i}\.mlp\.'
-            else:
-                pattern = rf'model\.layers\.{i}\.mlp\.shared_experts\.'
-            return self.filter(pattern, None)
-        result = []
-        for key in ['gate', 'down', 'up']:
-            name = f'model.layers.{i}.mlp.shared_experts.{key}_proj.{kind}'
-            if i == 0:
-                name = name.replace('shared_experts.', '')
-            tensor = self.params.get(name)
-            tensor = self.transform(tensor, kind)
-            result.append(tensor)
-        return (*result, )
-
-    def ffn(self, i: int, kind: str):
-        return self._ffn(i, kind)
-
-    def mla(self, i: int, kind: str):
-        if not kind:
-            return self.filter(r'self_attn.*proj', i)
-        result = []
-        for key in ['q_a_proj', 'q_b_proj', 'q_proj', 'kv_a_proj_with_mqa', 'kv_b_proj', 'o_proj']:
-            tensor = self.params.get(f'{self.attn_layer_prefix}.{i}.self_attn.{key}.{kind}')
-            tensor = self.transform(tensor, kind)
-            result.append(tensor)
-        return (*result, )
-
-    def mla_norm(self, i: int):
-        result = []
-        for k in ['q', 'kv']:
-            name = f'{self.attn_layer_prefix}.{i}.self_attn.{k}_a_layernorm.weight'  # noqa: E501
-            result.append(self.params.get(name))
-        return (*result, )
-
-
-def get_yarn_params(rope_scaling: dict):
-
-    scaling_factor = float(rope_scaling['factor'])
-    mscale = rope_scaling['mscale']
-    mscale_all_dim = rope_scaling['mscale_all_dim']
-
-    def yarn_get_mscale(scale=1, mscale=1):
-        if scale <= 1:
-            return 1.0
-        return 0.1 * mscale * math.log(scale) + 1.0
-
-    _mscale = float(yarn_get_mscale(scaling_factor, mscale) / yarn_get_mscale(scaling_factor, mscale_all_dim))
-
-    softmax_scale = 0
-    if mscale_all_dim:
-        scale = yarn_get_mscale(scaling_factor, mscale_all_dim)
-        softmax_scale = scale * scale
-
-    return _mscale, softmax_scale
-
-
-@INPUT_MODELS.register_module(name='deepseek2')
-class DeepSeek2Model(LlamaModel):
-
-    Reader = DeepSeek2Reader
-
-    def model_info(self):
-        cfg = self.model_config
-        info = super().model_info()
-        qk_nope_dim = cfg['qk_nope_head_dim']
-        qk_rope_dim = cfg['qk_rope_head_dim']
-        kv_lora_rank = cfg['kv_lora_rank']
-        q_head_dim = qk_nope_dim + qk_rope_dim
-        num_layer = cfg['num_hidden_layers']
-        expert_num = cfg['n_routed_experts']
-        expert_num = [expert_num] * num_layer
-        expert_num[0] = 0
-        n_shared_experts = cfg['n_shared_experts']
-        expert_inter_size = cfg['moe_intermediate_size']
-        experts_per_token = cfg['num_experts_per_tok']
-        inter_size = [n_shared_experts * expert_inter_size] * num_layer
-        inter_size[0] = cfg['intermediate_size']
-        norm_topk_prob = cfg['norm_topk_prob']
-        size_per_head = qk_rope_dim + qk_nope_dim
-        v_head_dim = cfg['v_head_dim']
-        softmax_scale = 0.0
-        disable_mla_fold = os.getenv('LMDEPLOY_MLA_FOLD', '1').lower() in ('0', 'false', 'no')
-        if kv_lora_rank and kv_lora_rank != qk_nope_dim and not disable_mla_fold:
-            # MLA folding: remap to kv_lora_rank-based head dims and fold
-            # kc/vc BMMs into q_b_proj/o_proj at conversion time.
-            size_per_head = kv_lora_rank + qk_rope_dim
-            v_head_dim = kv_lora_rank
-            softmax_scale = q_head_dim**(-0.5)
-        elif kv_lora_rank and kv_lora_rank != qk_nope_dim:
-            softmax_scale = q_head_dim**(-0.5)
-
-        info.update(kv_lora_rank=kv_lora_rank,
-                    q_lora_rank=cfg['q_lora_rank'] or 0,
-                    qk_rope_dim=qk_rope_dim,
-                    v_head_dim=v_head_dim,
-                    size_per_head=size_per_head,
-                    kv_head_num=1,
-                    expert_num=expert_num,
-                    expert_inter_size=expert_inter_size,
-                    experts_per_token=experts_per_token,
-                    inter_size=inter_size,
-                    norm_topk_prob=norm_topk_prob,
-                    routed_scale=cfg['routed_scaling_factor'],
-                    topk_method=cfg['topk_method'],
-                    topk_group=cfg['topk_group'],
-                    moe_group_num=cfg['n_group'],
-                    scoring_func=cfg.get('scoring_func', 'softmax'),
-                    tune_layer_num=2)
-        if 'router_n_groups' in cfg and cfg['router_n_groups'] > 0:
-            info['router_n_groups'] = cfg['router_n_groups']
-        rope_param: RopeParam = info['rope_param']
-        rope_param.dim = qk_rope_dim
-        if 'rope_parameters' in cfg:
-            # transformers v5.0.0 aggregates all rope-related parameters into 'rope_parameters'
-            rope_scaling = cfg['rope_parameters']
-        else:
-            rope_scaling = cfg.get('rope_scaling')
-        if rope_scaling and rope_scaling.get('type') == 'yarn':
-            attention_factor, yarn_scale = get_yarn_params(rope_scaling)
-            yarn_scale *= q_head_dim**(-0.5)
-            rope_param.max_position_embeddings = rope_scaling['original_max_position_embeddings']
-            rope_param.attention_factor = attention_factor
-            info.update(rope_param=rope_param, softmax_scale=yarn_scale)
-        elif softmax_scale:
-            info.update(softmax_scale=softmax_scale)
-        return info
diff --git a/lmdeploy/turbomind/deploy/source_model/deepseek_vl.py b/lmdeploy/turbomind/deploy/source_model/deepseek_vl.py
deleted file mode 100644
index 8fa8a4c85a..0000000000
--- a/lmdeploy/turbomind/deploy/source_model/deepseek_vl.py
+++ /dev/null
@@ -1,78 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-import json
-import os.path as osp
-
-from ..config import RopeParam
-from .base import INPUT_MODELS
-from .llama import LlamaModel, LlamaReader
-
-
-class DeepSeekVLReader(LlamaReader):
-    """DeepSeekVL model reader."""
-
-    attn_layer_prefix = 'language_model.model.layers'
-    attn_layer_patten = r'language_model\.model\.layers\.([0-9]+).'
-    tok_embeddings_key = 'language_model.model.embed_tokens.weight'
-    norm_weight_key = 'language_model.model.norm.weight'
-    output_weight_key = 'language_model.lm_head.weight'
-
-    def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, model_cfg: dict, **kwargs):
-        model_cfg = model_cfg['language_config']
-        super().__init__(new_params, unused_params, last_bin, model_cfg, **kwargs)
-
-    def attn_norm(self, i: int):
-        """Get attn norm for layer i."""
-        return self.params[f'language_model.model.layers.{i}.input_layernorm.weight']
-
-    def ffn_norm(self, i: int):
-        """Get ffn norm for layer i."""
-        return self.params[f'language_model.model.layers.{i}.post_attention_layernorm.weight']
-
-
-@INPUT_MODELS.register_module(name='deepseekvl')
-class DeepSeekVLModel(LlamaModel):
-    """DeepSeekVL model in hf format."""
-
-    Reader = DeepSeekVLReader
-
-    def model_info(self):
-        """Read model info."""
-        params_path = osp.join(self.model_path, 'config.json')
-        with open(params_path) as f:
-            model_arg = json.load(f)
-            if 'language_config' in model_arg and model_arg['language_config'].get('model_type', None) == 'llama':
-                model_arg = model_arg['language_config']  # depseek-vl
-            num_layer = model_arg['num_hidden_layers']
-            hidden_units = model_arg.get('hidden_size', 4096)
-            inter_size = model_arg.get('intermediate_size', 11008)
-            vocab_size = model_arg.get('vocab_size', 102400)
-            norm_eps = model_arg.get('rms_norm_eps', 1e-06)
-            attn_head_num = model_arg.get('num_attention_heads', 32)
-            if 'num_key_value_heads' in model_arg:
-                kv_head_num = model_arg['num_key_value_heads']
-            else:
-                kv_head_num = model_arg.get('num_attention_heads', 32)
-            rope_theta = float(model_arg.get('rope_theta', 10000.0))
-            max_position_embeddings = int(model_arg.get('max_position_embeddings', 0))
-            rope_scaling = model_arg.get('rope_scaling', None)
-            scaling_factor = 0.0
-            scaling_type = 'default'
-            if isinstance(rope_scaling, dict):
-                scaling_type = model_arg['rope_scaling'].get('type', 'default')
-                scaling_factor = model_arg['rope_scaling'].get('factor', '')
-            head_dim = model_arg.get('head_dim', hidden_units // attn_head_num)
-            rope_param = RopeParam(type=scaling_type,
-                                   base=rope_theta,
-                                   dim=head_dim,
-                                   max_position_embeddings=max_position_embeddings,
-                                   factor=scaling_factor)
-
-        return dict(num_layer=num_layer,
-                    norm_eps=norm_eps,
-                    head_num=attn_head_num,
-                    kv_head_num=kv_head_num,
-                    hidden_units=hidden_units,
-                    inter_size=inter_size,
-                    vocab_size=vocab_size,
-                    max_position_embeddings=max_position_embeddings,
-                    rope_param=rope_param)
diff --git a/lmdeploy/turbomind/deploy/source_model/glm4.py b/lmdeploy/turbomind/deploy/source_model/glm4.py
deleted file mode 100644
index df6c2f574a..0000000000
--- a/lmdeploy/turbomind/deploy/source_model/glm4.py
+++ /dev/null
@@ -1,98 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-import json
-import os.path as osp
-
-import torch
-
-from ..config import RopeParam
-from .base import INPUT_MODELS
-from .llama import LlamaModel, LlamaReader
-
-
-class Glm4Reader(LlamaReader):
-    """Glm4Reader."""
-
-    attn_layer_patten = r'transformer\.encoder\.layers\.([0-9]+).'
-    tok_embeddings_key = 'transformer.embedding.word_embeddings.weight'
-    norm_weight_key = 'transformer.encoder.final_layernorm.weight'
-    output_weight_key = 'transformer.output_layer.weight'
-
-    attn_pattern = r'self_attention'
-
-    def _attn(self, i: int, kind: str):
-        """Get q, k, v, o kind for layer i."""
-        qkv = self.params[f'transformer.encoder.layers.{i}'
-                          f'.self_attention.query_key_value.{kind}']
-        qkv = self.transform(qkv, kind)
-        attn_head_num = self.model_cfg['num_attention_heads']
-        kv_head_num = attn_head_num
-        if self.model_cfg.get('multi_query_attention', False):
-            kv_head_num = self.model_cfg['multi_query_group_num']
-        HEAD_DIM = 128
-        q, k, v = torch.split(qkv, [attn_head_num * HEAD_DIM, kv_head_num * HEAD_DIM, kv_head_num * HEAD_DIM], dim=0)
-        o = self.params.get(f'transformer.encoder.layers.{i}.self_attention.dense.{kind}')
-        o = self.transform(o, kind)
-        if o is None:  # handle the case when qkv has bias but o doesn't
-            o = torch.zeros_like(q)
-        return q, k, v, o
-
-    def attn_norm(self, i: int):
-        """Get attn norm for layer i."""
-        return self.params[f'transformer.encoder.layers.{i}.input_layernorm.weight']
-
-    def _ffn(self, i: int, kind: str):
-        """Get ffn kind for layer i."""
-        up_and_gate = self.params[f'transformer.encoder.layers.{i}.mlp.dense_h_to_4h.{kind}']
-        up_and_gate = self.transform(up_and_gate, kind)
-        up, gate = up_and_gate.chunk(2, dim=0)
-        down = self.params[f'transformer.encoder.layers.{i}.mlp.dense_4h_to_h.{kind}']
-        down = self.transform(down, kind)
-        return (up, down, gate)
-
-    def ffn_norm(self, i: int):
-        """Get ffn norm for layer i."""
-        return self.params[f'transformer.encoder.layers.{i}.post_attention_layernorm.weight']
-
-
-@INPUT_MODELS.register_module(name='glm4')
-class Glm4Model(LlamaModel):
-    """Glm2/3/4 model in hf format."""
-
-    Reader = Glm4Reader
-
-    def __init__(self, model_path: str, tokenizer_path: str, **kwargs):
-        super().__init__(model_path, tokenizer_path, **kwargs)
-        config_path = osp.join(self.model_path, 'config.json')
-        with open(config_path) as f:
-            self.config = json.load(f)
-
-    def model_info(self):
-        """Read model info."""
-        config = self.config
-        hidden_units = config.get('hidden_size', None)
-        num_layer = config.get('num_hidden_layers', None)
-        num_layer = config.get('num_layers', num_layer)
-        norm_eps = config['layernorm_epsilon']
-        rope_theta = float(config.get('rotary_emb_base', 10000.0))
-        rope_ratio = float(config.get('rope_ratio', 1.0))
-        rope_theta *= rope_ratio
-        attn_head_num = config['num_attention_heads']
-        kv_head_num = attn_head_num
-        inter_size = config['ffn_hidden_size']
-        vocab_size = config['padded_vocab_size']
-        attn_bias = config['add_qkv_bias']
-        if config['multi_query_attention']:
-            kv_head_num = config['multi_query_group_num']
-        seq_length = config['seq_length']
-        rope_param = RopeParam(type='default', base=rope_theta, dim=64)
-        return dict(num_layer=num_layer,
-                    norm_eps=norm_eps,
-                    head_num=attn_head_num,
-                    kv_head_num=kv_head_num,
-                    hidden_units=hidden_units,
-                    attn_bias=int(attn_bias),
-                    inter_size=inter_size,
-                    vocab_size=vocab_size,
-                    rope_param=rope_param,
-                    max_position_embeddings=seq_length,
-                    permute_qk=False)  # head layout is same as TM
diff --git a/lmdeploy/turbomind/deploy/source_model/glm4_moe_lite.py b/lmdeploy/turbomind/deploy/source_model/glm4_moe_lite.py
deleted file mode 100644
index 9e4eeedebd..0000000000
--- a/lmdeploy/turbomind/deploy/source_model/glm4_moe_lite.py
+++ /dev/null
@@ -1,58 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-"""GLM-4 MoE Lite (e.g. GLM-4.7-Flash) source model for TurboMind.
-
-Architecture: MLA (Multi-head Latent Attention) + MoE with dense first layer.
-Weight layout follows HuggingFace checkpoint with model.layers.* (same family as DeepSeek2).
-"""
-
-from .base import INPUT_MODELS
-from .deepseek2 import DeepSeek2Model, DeepSeek2Reader
-
-
-class Glm4MoeLiteReader(DeepSeek2Reader):
-    """Reader for Glm4MoeLiteForCausalLM (GLM-4.7-Flash).
-
-    Uses same key layout as DeepSeek2: model.layers.{i}.self_attn.*, model.layers.{i}.mlp.*
-    Supports noaux_tc via e_score_correction_bias.
-    """
-
-    attn_layer_prefix = 'model.layers'
-    attn_layer_patten = r'model\.layers\.([0-9]+).'
-    tok_embeddings_key = 'model.embed_tokens.weight'
-    norm_weight_key = 'model.norm.weight'
-    output_weight_key = 'lm_head.weight'
-
-    def moe_ffn_gate_correction_bias(self, i: int):
-        """Per-expert score correction bias for noaux_tc routing."""
-        return self.params.get(f'{self.attn_layer_prefix}.{i}.mlp.gate.e_score_correction_bias')
-
-
-@INPUT_MODELS.register_module(name='glm4-moe-lite')
-class Glm4MoeLiteModel(DeepSeek2Model):
-    """GLM-4 MoE Lite (e.g. GLM-4.7-Flash) in HF format.
-
-    MLA + MoE with first_k_dense_replace; config mapping aligned to DeepSeek2.
-    """
-
-    Reader = Glm4MoeLiteReader
-
-    def model_info(self):
-        cfg = self.model_config
-        # Set default MoE routing config for GLM-4 MoE Lite if not in HF config
-        if 'topk_method' not in cfg:
-            cfg['topk_method'] = 'noaux_tc'
-        if 'topk_group' not in cfg:
-            cfg['topk_group'] = 1
-        if 'n_group' not in cfg:
-            cfg['n_group'] = 1
-        if 'scoring_func' not in cfg:
-            cfg['scoring_func'] = 'sigmoid'
-
-        info = super().model_info()
-        # GLM4 MoE Lite uses noaux_tc routing with sigmoid scoring
-        info['topk_method'] = 'noaux_tc'
-        info['scoring_func'] = 'sigmoid'
-        if 'router_n_groups' in cfg and cfg['router_n_groups'] > 0:
-            info['router_n_groups'] = cfg['router_n_groups']
-
-        return info
diff --git a/lmdeploy/turbomind/deploy/source_model/gpt_oss.py b/lmdeploy/turbomind/deploy/source_model/gpt_oss.py
deleted file mode 100644
index c6bfdb06b1..0000000000
--- a/lmdeploy/turbomind/deploy/source_model/gpt_oss.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-
-import re
-
-from .base import INPUT_MODELS
-from .llama import LlamaModel, LlamaReader
-
-
-def map_experts(str):
-    s = re.sub(r'(experts.*proj)$', r'\1.weight', str)
-    s = re.sub(r'(experts.*proj)_bias$', r'\1.bias', s)
-    s = re.sub(r'(experts.*proj)_blocks$', r'\1.blocks', s)
-    s = re.sub(r'(experts.*proj)_scales$', r'\1.scales', s)
-    return s
-
-
-class GptOssReader(LlamaReader):
-
-    mappings = [map_experts]
-
-    def moe_ffn_expert(self, e=None, i=None, kind=None):
-        if not kind:
-            return self.filter(r'experts', i)
-        result = []
-        for key in ['gate_up', 'down']:
-            name = f'{self.attn_layer_prefix}.{i}.mlp.experts.{key}_proj.{kind}'
-            tensor = self.params.get(name)[e]
-            if kind == 'weight':  # experts in BF16 models are in M-major
-                tensor = tensor.cuda().t()
-            if key == 'gate_up':
-                gate, up = tensor[::2], tensor[1::2]
-                result.append(self.transform(gate, kind))
-                result.append(self.transform(up, kind))
-            else:
-                result.append(self.transform(tensor, kind))
-        return (result[0], result[2], result[1])
-
-    def moe_ffn_gate(self, i, kind):
-        return self.transform(self.params.get(f'{self.attn_layer_prefix}.{i}.mlp.router.{kind}'), kind)
-
-    def attn_sinks(self, i):
-        return self.params.get(f'{self.attn_layer_prefix}.{i}.self_attn.sinks')
-
-
-@INPUT_MODELS.register_module(name='gpt-oss')
-class GptOssModel(LlamaModel):
-
-    Reader = GptOssReader
-
-    def model_info(self):
-        cfg = self.model_config
-        types = cfg['layer_types']
-        sliding_window = cfg['sliding_window']
-        info = super().model_info()
-        info.update(attn_bias=int(cfg['attention_bias']),
-                    mlp_bias=True,
-                    expert_router_bias=True,
-                    expert_num=cfg['num_local_experts'],
-                    expert_inter_size=cfg['intermediate_size'],
-                    experts_per_token=cfg['experts_per_token'],
-                    norm_topk_prob=True,
-                    inter_size=0,
-                    window_size=[sliding_window if x == 'sliding_attention' else 0 for x in types],
-                    attn_sink=True,
-                    activation_type='gpt-oss')
-        return info
diff --git a/lmdeploy/turbomind/deploy/source_model/internlm2.py b/lmdeploy/turbomind/deploy/source_model/internlm2.py
deleted file mode 100644
index 21ccf9d9f8..0000000000
--- a/lmdeploy/turbomind/deploy/source_model/internlm2.py
+++ /dev/null
@@ -1,112 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-
-import re
-
-import torch
-
-from .base import INPUT_MODELS
-from .llama import LlamaModel, LlamaReader
-
-
-class InternLM2Reader(LlamaReader):
-    """InternLM2 model reader."""
-
-    attn_layer_prefix = 'model.layers'
-    attn_layer_patten = r'model\.layers\.([0-9]+).'
-    tok_embeddings_key = 'model.tok_embeddings.weight'
-    norm_weight_key = 'model.norm.weight'
-    output_weight_key = 'output.weight'
-
-    attn_pattern = r'attention'
-    ffn_pattern = r'feed_forward'
-
-    proj_pattern = 'w'
-
-    def filter(self, pattern: str, i: int | None):
-        params = []
-        for k in self.params.keys():
-            if re.search(pattern, k):
-                params.append(k)
-
-        if self.fp8_quant and pattern == self.attn_pattern:
-            from lmdeploy.lite.quantization.weight.quant_utils import quant_blocked_fp8
-            q, k, v = (None, ) * 3
-            kv_head_num = self.model_cfg['num_key_value_heads']
-            gs = int(self.model_cfg['num_attention_heads'] / kv_head_num)
-            qkv = self.params.get(f'{self.attn_layer_prefix}.{i}.attention.wqkv.weight')
-
-            if qkv is not None:
-                qkv = qkv.view(kv_head_num, gs + 2, 128, -1)
-                hidden_dim = qkv.shape[-1]
-                q, k, v = torch.split(qkv, [gs, 1, 1], dim=1)
-
-                tensors = [q.reshape(-1, hidden_dim), k.reshape(-1, hidden_dim), v.reshape(-1, hidden_dim)]
-                split_sizes = [gs, 1, 1]
-                keys = ['q', 'k', 'v']
-                qkv_weight = []
-                for tensor, split_size, key in zip(tensors, split_sizes, keys):
-                    qweight, scale = quant_blocked_fp8(tensor, torch.float8_e4m3fn, block_size=128)
-                    qweight = qweight.reshape(kv_head_num, split_size, 128, -1)
-                    qkv_weight.append(qweight)
-
-                    self.params[f'{self.attn_layer_prefix}.{i}.{self.attn_pattern}.w{key}.weight_scale_inv'] = scale
-                    params.append(f'{self.attn_layer_prefix}.{i}.{self.attn_pattern}.w{key}.weight_scale_inv')
-
-                qkv_weight = torch.cat(qkv_weight, dim=1)
-                qkv_weight = qkv_weight.reshape(-1, hidden_dim)
-                self.params[f'{self.attn_layer_prefix}.{i}.{self.attn_pattern}.wqkv.weight'] = qkv_weight
-
-            return params
-        else:
-            return params
-
-    def _attn(self, i: int, kind: str):
-        """Get q, k, v, o kind for layer i."""
-        if self.fp8_quant and kind == 'weight_scale_inv':
-            result = []
-            for key in ['q', 'k', 'v', 'o']:
-                tensor = self.params.get(f'{self.attn_layer_prefix}.{i}.{self.attn_pattern}.w{key}.{kind}')
-                tensor = self.transform(tensor, kind)
-                result.append(tensor)
-            return (*result, )
-        q, k, v = (None, ) * 3
-        kv_head_num = self.model_cfg['num_key_value_heads']
-        gs = int(self.model_cfg['num_attention_heads'] / kv_head_num)
-        qkv = self.params.get(f'{self.attn_layer_prefix}.{i}.attention.wqkv.{kind}')
-        qkv = self.transform(qkv, kind)
-        if qkv is not None:
-            qkv = qkv.view(kv_head_num, gs + 2, 128, -1)
-            hidden_dim = qkv.shape[-1]
-            q, k, v = torch.split(qkv, [gs, 1, 1], dim=1)
-            q = q.reshape(-1, hidden_dim)
-            k = k.reshape(-1, hidden_dim)
-            v = v.reshape(-1, hidden_dim)
-        o = self.params.get(f'{self.attn_layer_prefix}.{i}.attention.wo.{kind}')
-        o = self.transform(o, kind)
-        return (q, k, v, o)
-
-    def attn_norm(self, i: int):
-        """Get attn norm for layer i."""
-        return self.params[f'{self.attn_layer_prefix}.{i}.attention_norm.weight']
-
-    def _ffn(self, i: int, kind: str):
-        """Get ffn kind for layer i."""
-        if not kind:
-            return self.filter(self.ffn_pattern, i)
-        result = []
-        for key in ['w1', 'w2', 'w3']:
-            tensor = self.params[f'{self.attn_layer_prefix}.{i}.feed_forward.{key}.{kind}']
-            tensor = self.transform(tensor, kind)
-            result.append(tensor)
-        return (*result, )
-
-    def ffn_norm(self, i: int):
-        """Get ffn norm for layer i."""
-        return self.params[f'{self.attn_layer_prefix}.{i}.ffn_norm.weight']
-
-
-@INPUT_MODELS.register_module(name='internlm2')
-class InternLM2Model(LlamaModel):
-    """InternLM2 model in hf format."""
-
-    Reader = InternLM2Reader
diff --git a/lmdeploy/turbomind/deploy/source_model/internvl.py b/lmdeploy/turbomind/deploy/source_model/internvl.py
deleted file mode 100644
index 575507c2a9..0000000000
--- a/lmdeploy/turbomind/deploy/source_model/internvl.py
+++ /dev/null
@@ -1,103 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-from .base import INPUT_MODELS
-from .gpt_oss import GptOssReader
-from .internlm2 import InternLM2Reader
-from .llama import LlamaModel, LlamaReader
-from .qwen import Qwen3MoeReader, Qwen3Reader
-
-
-class InternVLReader(LlamaReader):
-    """InternVLReader for llama model."""
-
-    attn_layer_prefix = 'language_model.model.layers'
-    attn_layer_patten = r'language_model\.model\.layers\.([0-9]+).'
-    tok_embeddings_key = 'language_model.model.embed_tokens.weight'
-    norm_weight_key = 'language_model.model.norm.weight'
-    output_weight_key = 'language_model.lm_head.weight'
-
-
-# Note the subtle difference in keys
-class InternVL2Reader(InternLM2Reader):
-    """InternVLReader for InternLM2 model."""
-
-    attn_layer_prefix = 'language_model.model.layers'
-    attn_layer_patten = r'language_model\.model\.layers\.([0-9]+).'
-    tok_embeddings_key = 'language_model.model.tok_embeddings.weight'
-    norm_weight_key = 'language_model.model.norm.weight'
-    output_weight_key = 'language_model.output.weight'
-
-
-class InternVL3d5Reader(Qwen3Reader):
-    attn_layer_prefix = 'language_model.model.layers'
-    attn_layer_patten = r'language_model\.model\.layers\.([0-9]+).'
-    tok_embeddings_key = 'language_model.model.embed_tokens.weight'
-    norm_weight_key = 'language_model.model.norm.weight'
-    output_weight_key = 'language_model.lm_head.weight'
-
-
-class InternVL3d5Qwen3MoEReader(Qwen3MoeReader):
-    attn_layer_prefix = 'language_model.model.layers'
-    attn_layer_patten = r'language_model\.model\.layers\.([0-9]+).'
-    tok_embeddings_key = 'language_model.model.embed_tokens.weight'
-    norm_weight_key = 'language_model.model.norm.weight'
-    output_weight_key = 'language_model.lm_head.weight'
-
-
-class InternVL3d5GptOSSReader(GptOssReader):
-    attn_layer_prefix = 'language_model.model.layers'
-    attn_layer_patten = r'language_model\.model\.layers\.([0-9]+).'
-    tok_embeddings_key = 'language_model.model.embed_tokens.weight'
-    norm_weight_key = 'language_model.model.norm.weight'
-    output_weight_key = 'language_model.lm_head.weight'
-
-
-class InternS1Reader(Qwen3MoeReader):
-    """InternS1Reader for internlm/InternS1 model."""
-
-    attn_layer_prefix = 'model.language_model.layers'
-    attn_layer_patten = r'model\.language_model\.layers\.([0-9]+).'
-    tok_embeddings_key = 'model.language_model.embed_tokens.weight'
-    norm_weight_key = 'model.language_model.norm.weight'
-    output_weight_key = 'lm_head.weight'
-
-
-class InternS1MiniReader(Qwen3Reader):
-
-    attn_layer_prefix = 'model.language_model.layers'
-    attn_layer_patten = r'model\.language_model\.layers\.([0-9]+).'
-    tok_embeddings_key = 'model.language_model.embed_tokens.weight'
-    norm_weight_key = 'model.language_model.norm.weight'
-    output_weight_key = 'lm_head.weight'
-
-
-@INPUT_MODELS.register_module(name='internvl')
-class InternVLModel(LlamaModel):
-    """InternVL model in hf format."""
-
-    def __init__(self, model_path: str, tokenizer_path: str, **kwargs):
-        super().__init__(model_path, tokenizer_path, **kwargs)
-        from transformers import AutoConfig
-        config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
-
-        arch = config.architectures[0]
-        if arch == 'InternVLChatModel' or arch == 'InternVLForConditionalGeneration':
-            relations = dict(InternLM2ForCausalLM=('internlm2', InternVL2Reader),
-                             LlamaForCausalLM=('llama', InternVLReader),
-                             Qwen2ForCausalLM=('qwen2', InternVLReader),
-                             Qwen3MoeForCausalLM=('qwen3-moe', InternVL3d5Qwen3MoEReader),
-                             Qwen3ForCausalLM=('qwen3', InternVL3d5Reader),
-                             GptOssForCausalLM=('gpt-oss', InternVL3d5GptOSSReader))
-        elif arch == 'InternS1ForConditionalGeneration':
-            relations = dict(Qwen3MoeForCausalLM=('qwen3-moe', InternS1Reader),
-                             Qwen3ForCausalLM=('qwen3', InternS1MiniReader))
-        else:
-            raise ValueError('unsupported model arch {arch}')
-        self.llm_config = getattr(config, 'llm_config', None) or getattr(config, 'text_config', None)
-        arch = self.llm_config.architectures[0]
-        llm_model, self.Reader = relations[arch]
-        self.llm_model = INPUT_MODELS.get(llm_model)(model_path=model_path, tokenizer_path=tokenizer_path, **kwargs)
-
-    def model_info(self):
-        """Read model info."""
-        self.llm_model.model_config = self.llm_config.to_dict()
-        return self.llm_model.model_info()
diff --git a/lmdeploy/turbomind/deploy/source_model/llama.py b/lmdeploy/turbomind/deploy/source_model/llama.py
deleted file mode 100644
index 339b084f9a..0000000000
--- a/lmdeploy/turbomind/deploy/source_model/llama.py
+++ /dev/null
@@ -1,237 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-import math
-import re
-
-import torch
-
-from lmdeploy.archs import get_model_arch
-
-from ..config import RopeParam
-from ..loader import create_loader
-from .base import INPUT_MODELS, BaseInputModel, BaseReader
-
-
-class LlamaReader(BaseReader):
-    """LlamaReader."""
-
-    attn_layer_prefix = 'model.layers'
-    attn_layer_patten = r'model\.layers\.([0-9]+).'
-    tok_embeddings_key = 'model.embed_tokens.weight'
-    norm_weight_key = 'model.norm.weight'
-    output_weight_key = 'lm_head.weight'
-
-    attn_pattern = r'self_attn'
-    ffn_pattern = r'mlp'
-
-    proj_pattern = 'proj'
-    scale_inv_suffix = '_scale_inv'
-
-    def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, model_cfg: dict, policy, fp8_quant=False):
-        super().__init__()
-        self.params = unused_params
-        self.params.update(new_params)
-        self.last_bin = last_bin
-        self.model_cfg = model_cfg
-        tie_word_embeddings = self.model_cfg.get('tie_word_embeddings', False)
-        if tie_word_embeddings:
-            self.output_weight_key = self.tok_embeddings_key
-        self.processor = policy
-        self.fp8_quant = fp8_quant
-        if self.fp8_quant:
-            quant_params = self.quant_weight_fp8()
-            self.params.update(quant_params)
-
-    def quant_weight_fp8(self):
-        from lmdeploy.lite.quantization.weight.quant_utils import quant_blocked_fp8
-        pattern_str = fr'({self.attn_pattern}|{self.ffn_pattern}).*{self.proj_pattern}.*\.weight'
-        target_pattern = re.compile(pattern_str)
-
-        if self.__class__.__name__ == 'InternLM2Reader':
-            skip_pattern = re.compile(r'wqkv.*\.weight')
-        else:
-            skip_pattern = None
-
-        quant_params = {}
-        for name, weight in self.params.items():
-            if target_pattern.search(name) and name.endswith('.weight'):
-                if skip_pattern and skip_pattern.search(name):
-                    continue
-                q_weight, scale = quant_blocked_fp8(weight, torch.float8_e4m3fn, block_size=128)
-                quant_params[name] = q_weight
-                quant_params[f'{name}{self.scale_inv_suffix}'] = scale.to(weight.dtype)
-
-        return quant_params
-
-    def filter(self, pattern: str, i: int | None):
-        params = []
-        for k in self.params.keys():
-            if re.search(pattern, k):
-                params.append(k)
-        return params
-
-    def tok_embeddings(self):
-        """Get embeddings."""
-        return self.transform(self.params.get(self.tok_embeddings_key, None), 'weight')
-
-    def norm_weight(self):
-        """Get norm."""
-        return self.transform(self.params.get(self.norm_weight_key, None), 'weight')
-
-    def output_weight(self):
-        """Get output."""
-        return self.transform(self.params.get(self.output_weight_key, None), 'weight')
-
-    def _transform(self, x: torch.Tensor, kind: str):
-        return self.processor(x, kind)
-
-    def _attn(self, i: int, kind: str):
-        """Get q, k, v, o kind for layer i."""
-        result = []
-        for key in ['q', 'k', 'v', 'o']:
-            tensor = self.params.get(f'{self.attn_layer_prefix}.{i}.self_attn.{key}_proj.{kind}')
-            tensor = self.transform(tensor, kind)
-            result.append(tensor)
-        return (*result, )
-
-    def attn(self, i: int, kind: str):
-        if not kind:
-            return self.filter(self.attn_pattern, i)
-        return self._attn(i, kind)
-
-    def attn_norm(self, i: int):
-        """Get attn norm for layer i."""
-        return self.transform(self.params[f'{self.attn_layer_prefix}.{i}.input_layernorm.weight'], 'weight')
-
-    def _ffn(self, i: int, kind: str):
-        """Get ffn kind for layer i."""
-        if not kind:
-            return self.filter(self.ffn_pattern, i)
-        result = []
-        for key in ['gate', 'down', 'up']:
-            tensor = self.params[f'{self.attn_layer_prefix}.{i}.mlp.{key}_proj.{kind}']
-            tensor = self.transform(tensor, kind)
-            result.append(tensor)
-        return (*result, )
-
-    def ffn(self, i: int, kind: str):
-        if not kind:
-            return self.filter(self.ffn_pattern, i)
-        return self._ffn(i, kind)
-
-    def ffn_norm(self, i: int):
-        """Get ffn norm for layer i."""
-        return self.transform(self.params[f'{self.attn_layer_prefix}.{i}.post_attention_layernorm.weight'], 'weight')
-
-
-@INPUT_MODELS.register_module(name='llama')
-class LlamaModel(BaseInputModel):
-    """Llama model in hf format."""
-
-    Reader = LlamaReader
-
-    def __init__(self, model_path: str, tokenizer_path: str, **kwargs: dict):
-        super().__init__(model_path, tokenizer_path)
-        self.policy = kwargs.get('input_policy')
-        _, model_config = get_model_arch(model_path)
-        if hasattr(model_config, 'text_config'):
-            model_config = model_config.text_config
-        elif hasattr(model_config, 'llm_config'):
-            model_config = model_config.llm_config
-        if hasattr(model_config, 'to_dict'):
-            self.model_config = model_config.to_dict()
-        else:
-            self.model_config = model_config
-        self.fp8_quant = kwargs.get('fp8_quant', False)
-
-    def readers(self):
-        mappings = getattr(self.Reader, 'mappings', [])
-        loader = create_loader(self.model_path, self.Reader.attn_layer_patten, mappings)
-        for i, param in loader.items():
-            reader = self.Reader(param, {}, False, self.model_config, policy=self.policy, fp8_quant=self.fp8_quant)
-            yield i, reader
-        torch.cuda.empty_cache()
-
-    def model_info(self):
-        """Read model info."""
-        model_arg = self.model_config
-        num_layer = model_arg['num_hidden_layers']
-        norm_eps = model_arg['rms_norm_eps']
-        attn_head_num = model_arg['num_attention_heads']
-        vocab_size = model_arg['vocab_size']
-        inter_size = model_arg.get('intermediate_size', 0)
-        if 'num_key_value_heads' in model_arg:
-            kv_head_num = model_arg['num_key_value_heads']
-        else:
-            kv_head_num = model_arg['num_attention_heads']
-        hidden_units = model_arg['hidden_size']
-        # head_dim could be none in config
-        head_dim = model_arg.get('head_dim', None)
-        head_dim = head_dim or hidden_units // attn_head_num
-        # compute rope param
-        if 'rope_parameters' in model_arg:
-            # transformers v5.0.0 aggregates rope settings into rope_parameters
-            rope_scaling = model_arg['rope_parameters']
-            rope_theta = float(rope_scaling.get('rope_theta', 10000.0))
-        else:
-            rope_theta = float(model_arg.get('rope_theta', 10000.0))
-            rope_scaling = model_arg.get('rope_scaling', None)
-        max_position_embeddings = int(model_arg.get('max_position_embeddings', 0))
-        rope_param = RopeParam(type='default', base=rope_theta, dim=head_dim)
-        if isinstance(rope_scaling, dict):
-            rope_type = rope_scaling.get('rope_type', '') or rope_scaling.get('type', '')
-            if rope_scaling.get('mrope_section') is not None:
-                # TODO: treat mrope as an option to the common rope functions
-                rope_type = 'mrope'
-            scaling_factor = rope_scaling.get('factor', 0.0)
-            if rope_type == 'default':
-                pass
-            elif rope_type == 'dynamic':
-                rope_param.type = 'dynamic'
-                rope_param.factor = scaling_factor
-                rope_param.max_position_embeddings = max_position_embeddings
-            elif rope_type == 'linear':
-                rope_param.type = 'linear'
-                rope_param.factor = scaling_factor
-            elif rope_type == 'llama3':
-                low_freq_factor = rope_scaling.get('low_freq_factor', 1.0)
-                high_freq_factor = rope_scaling.get('high_freq_factor', 1.0)
-                original_max_position_embeddings = rope_scaling.get('original_max_position_embeddings', 0)
-                rope_param.type = 'llama3'
-                rope_param.factor = scaling_factor
-                rope_param.low_freq_factor = low_freq_factor
-                rope_param.high_freq_factor = high_freq_factor
-                rope_param.original_max_position_embeddings = original_max_position_embeddings
-            elif rope_type == 'yarn':
-                attention_factor = rope_scaling.get('attention_factor', None)
-                if attention_factor is None:
-                    attention_factor = 0.1 * math.log(scaling_factor) + 1.0
-                beta_fast = rope_scaling.get('beta_fast', 32.0)
-                beta_slow = rope_scaling.get('beta_slow', 1.0)
-                rope_param.type = 'yarn'
-                if 'original_max_position_embeddings' in rope_scaling:
-                    original_max_position_embeddings = rope_scaling['original_max_position_embeddings']
-                    scaling_factor = max_position_embeddings / original_max_position_embeddings
-                else:
-                    original_max_position_embeddings = max_position_embeddings
-                rope_param.factor = scaling_factor
-                rope_param.max_position_embeddings = original_max_position_embeddings
-                rope_param.attention_factor = attention_factor
-                rope_param.beta_fast = beta_fast
-                rope_param.beta_slow = beta_slow
-            elif rope_type == 'mrope':
-                mrope_section = rope_scaling.get('mrope_section')
-                rope_param.type = 'mrope'
-                rope_param.mrope_section = mrope_section
-            else:
-                raise RuntimeError(f'Unsupported rope type: {rope_type}')
-
-        return dict(size_per_head=head_dim,
-                    num_layer=num_layer,
-                    norm_eps=norm_eps,
-                    head_num=attn_head_num,
-                    kv_head_num=kv_head_num,
-                    hidden_units=hidden_units,
-                    inter_size=inter_size,
-                    vocab_size=vocab_size,
-                    max_position_embeddings=max_position_embeddings,
-                    rope_param=rope_param)
diff --git a/lmdeploy/turbomind/deploy/source_model/llava.py b/lmdeploy/turbomind/deploy/source_model/llava.py
deleted file mode 100644
index a305f0ac9e..0000000000
--- a/lmdeploy/turbomind/deploy/source_model/llava.py
+++ /dev/null
@@ -1,88 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-import json
-import os.path as osp
-
-from ..config import RopeParam
-from .base import INPUT_MODELS
-from .llama import LlamaModel, LlamaReader
-
-
-class LlavaReader(LlamaReader):
-    """LlavaReader for llama model."""
-
-    attn_layer_prefix = 'language_model.model.layers'
-    attn_layer_patten = r'language_model\.model\.layers\.([0-9]+).'
-    tok_embeddings_key = 'language_model.model.embed_tokens.weight'
-    norm_weight_key = 'language_model.model.norm.weight'
-    output_weight_key = 'language_model.lm_head.weight'
-
-    def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, model_cfg: dict, policy):
-        model_cfg = model_cfg.get('text_config')
-        super().__init__(new_params, unused_params, last_bin, model_cfg, policy)
-
-
-@INPUT_MODELS.register_module(name='llava')
-class LlavaModel(LlamaModel):
-    """LlavaModel model in hf format."""
-
-    def __init__(self, model_path: str, tokenizer_path: str, **kwargs):
-        super().__init__(model_path, tokenizer_path, **kwargs)
-        from transformers import AutoConfig
-        config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
-        config = getattr(config, 'text_config', config)
-        arch = config.architectures[0]
-        _readers = dict(Qwen2ForCausalLM=LlavaReader, LlamaForCausalLM=LlavaReader)
-        self.Reader = _readers[arch]
-        self.arch = arch
-
-    def model_info(self):
-        """Read model info for LlavaForConditionalGeneration.
-
-        https://huggingface.co/llava-hf/llava-interleave-qwen-7b-hf
-        """
-        params_path = osp.join(self.model_path, 'config.json')
-        with open(params_path) as f:
-            model_arg = json.load(f)['text_config']
-            num_layer = model_arg.get('num_hidden_layers', 32)
-            norm_eps = model_arg.get('rms_norm_eps', 1e-6)
-            attn_head_num = model_arg.get('num_attention_heads', 32)
-            if 'num_key_value_heads' in model_arg:
-                kv_head_num = model_arg.get('num_key_value_heads', 32)
-            else:
-                kv_head_num = model_arg.get('num_attention_heads', 32)
-            rope_theta = float(model_arg.get('rope_theta', 10000.0))
-            max_position_embeddings = int(model_arg.get('max_position_embeddings', 0))
-            rope_scaling = model_arg.get('rope_scaling', None)
-            scaling_factor = 0.0
-            scaling_type = 'default'
-
-            # special for the model: llava-hf/llava-interleave-qwen-7b-hf
-            hidden_units = model_arg.get('hidden_size', 4096)
-            vocab_size = model_arg.get('vocab_size', 152000)
-            intermediate_size = model_arg.get('intermediate_size', 11008)
-            attn_bias = 1 if model_arg['architectures'][0] \
-                == 'Qwen2ForCausalLM' else 0
-            attn_bias = int(model_arg.get('attn_bias', attn_bias))
-            use_logn_attn = int(model_arg.get('use_logn_attn', 0))
-
-            if isinstance(rope_scaling, dict):
-                scaling_type = model_arg['rope_scaling'].get('type', '')
-                scaling_factor = model_arg['rope_scaling'].get('factor', '')
-
-            rope_param = RopeParam(type=scaling_type,
-                                   base=rope_theta,
-                                   dim=hidden_units // attn_head_num,
-                                   max_position_embeddings=max_position_embeddings,
-                                   factor=scaling_factor)
-
-        return dict(num_layer=num_layer,
-                    norm_eps=norm_eps,
-                    head_num=attn_head_num,
-                    hidden_units=hidden_units,
-                    kv_head_num=kv_head_num,
-                    rope_param=rope_param,
-                    max_position_embeddings=max_position_embeddings,
-                    inter_size=intermediate_size,
-                    use_logn_attn=use_logn_attn,
-                    attn_bias=attn_bias,
-                    vocab_size=vocab_size)
diff --git a/lmdeploy/turbomind/deploy/source_model/minicpmv.py b/lmdeploy/turbomind/deploy/source_model/minicpmv.py
deleted file mode 100644
index 6046dd3ac1..0000000000
--- a/lmdeploy/turbomind/deploy/source_model/minicpmv.py
+++ /dev/null
@@ -1,31 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-
-import json
-import os.path as osp
-
-from .base import INPUT_MODELS
-from .llama import LlamaModel, LlamaReader
-
-
-class MiniCPMVReader(LlamaReader):
-    """MiniCPMVReader for llama model."""
-
-    attn_layer_prefix = 'llm.model.layers'
-    attn_layer_patten = r'llm\.model\.layers\.([0-9]+).'
-    tok_embeddings_key = 'llm.model.embed_tokens.weight'
-    norm_weight_key = 'llm.model.norm.weight'
-    output_weight_key = 'llm.lm_head.weight'
-
-
-@INPUT_MODELS.register_module(name='minicpmv')
-class MiniCPMVModel(LlamaModel):
-    """MiniCPMV model in hf format."""
-    Reader = MiniCPMVReader
-
-    def model_info(self):
-        info = super().model_info()
-        with open(osp.join(self.model_path, 'config.json')) as f:
-            config = json.load(f)
-            if str(config.get('version')) == '2.6':
-                info['attn_bias'] = True
-        return info
diff --git a/lmdeploy/turbomind/deploy/source_model/mixtral.py b/lmdeploy/turbomind/deploy/source_model/mixtral.py
deleted file mode 100644
index 820a106956..0000000000
--- a/lmdeploy/turbomind/deploy/source_model/mixtral.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-
-from .base import INPUT_MODELS
-from .llama import LlamaModel, LlamaReader
-
-
-class MixtralReader(LlamaReader):
-
-    def moe_ffn_expert(self, e=None, i=None, kind=None):
-        if not kind:
-            return self.filter(r'experts', i)
-        result = []
-        for x in ['w1', 'w2', 'w3']:
-            name = f'model.layers.{i}.block_sparse_moe.experts.{e}.{x}.{kind}'
-            tensor = self.params.get(name)
-            tensor = self.transform(tensor, kind)
-            result.append(tensor)
-        return (*result, )
-
-    def moe_ffn_gate(self, i, kind):
-        return self.params.get(f'model.layers.{i}.block_sparse_moe.gate.{kind}')
-
-
-@INPUT_MODELS.register_module(name='mixtral')
-class MixtralModel(LlamaModel):
-
-    Reader = MixtralReader
-
-    def model_info(self):
-        cfg = self.model_config
-        info = super().model_info()
-        info['expert_num'] = cfg['num_local_experts']
-        info['expert_inter_size'] = cfg['intermediate_size']
-        info['experts_per_token'] = cfg['num_experts_per_tok']
-        info['norm_topk_prob'] = True
-        info['inter_size'] = 0
-        return info
diff --git a/lmdeploy/turbomind/deploy/source_model/molmo.py b/lmdeploy/turbomind/deploy/source_model/molmo.py
deleted file mode 100644
index 09e320e9de..0000000000
--- a/lmdeploy/turbomind/deploy/source_model/molmo.py
+++ /dev/null
@@ -1,115 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-import json
-import os.path as osp
-
-import torch
-
-from ..config import RopeParam
-from .base import INPUT_MODELS
-from .llama import LlamaModel, LlamaReader
-
-
-class MolmoReader(LlamaReader):
-    attn_layer_prefix = 'model.transformer.blocks'
-    attn_layer_patten = r'model\.transformer\.blocks\.([0-9]+).'
-    norm_weight_key = 'model.transformer.ln_f.weight'
-    output_weight_key = 'model.transformer.ff_out.weight'
-
-    # In molmo, names of attention parameters are "att_proj.bias",
-    # "att_proj.weight", "attn_norm.weight", "attn_out.weight", and names
-    # of ffn parameters are "ff_norm", "ff_out", "ff_proj", so we
-    # make the patterns are r'att' and r'ffn_', respectively.
-    attn_pattern = r'att'
-    ffn_pattern = r'ff_'
-
-    def tok_embeddings(self):
-        embed1 = self.params.get('model.transformer.wte.embedding', None)
-        embed2 = self.params.get('model.transformer.wte.new_embedding', None)
-        if embed1 is not None and embed2 is not None:
-            return torch.cat((embed1, embed2), dim=0)
-        else:
-            assert embed1 is None and embed2 is None
-            return None
-
-    def attn_norm(self, i: int):
-        """Get attn norm for layer i."""
-        return self.params[f'{self.attn_layer_prefix}.{i}.attn_norm.weight']
-
-    def _attn(self, i: int, kind: str):
-        """Get q, k, v, o kind(weight, bias, qweight) for layer i.
-
-        Args:
-            i (int): layer id
-            kind (str): can be one of ["weight", "bias", "qweight"]
-        """
-        q, k, v = (None, ) * 3
-        hidden_size = self.model_cfg['hidden_size']
-        head_num = self.model_cfg['num_attention_heads']
-        kv_head_num = self.model_cfg['num_key_value_heads']
-        head_dim = hidden_size // head_num
-        assert head_dim == 128
-        fused_dims = (hidden_size, kv_head_num * head_dim, kv_head_num * head_dim)
-        qkv = self.params.get(f'{self.attn_layer_prefix}.{i}.att_proj.{kind}')
-        qkv = self.transform(qkv, kind)
-        if qkv is not None:
-            q, k, v = qkv.split(fused_dims, dim=0)
-        o = self.params.get(f'{self.attn_layer_prefix}.{i}.attn_out.{kind}')
-        o = self.transform(o, kind)
-        if o is None:  # handle the case when qkv has bias but o doesn't
-            o = torch.zeros_like(q)
-        return (q, k, v, o)
-
-    def _ffn(self, i: int, kind: str):
-        """Get ffn kind(weight, qweight) for layer i."""
-        up_and_gate = self.params[f'{self.attn_layer_prefix}.{i}.ff_proj.{kind}']
-        up_and_gate = self.transform(up_and_gate, kind)
-        gate, up = up_and_gate.chunk(2, dim=0)
-        down = self.params[f'{self.attn_layer_prefix}.{i}.ff_out.{kind}']
-        down = self.transform(down, kind)
-        return (up, down, gate)
-
-    def ffn_norm(self, i: int):
-        """Get ffn norm for layer i."""
-        return self.params[f'{self.attn_layer_prefix}.{i}.ff_norm.weight']
-
-
-@INPUT_MODELS.register_module(name='molmo')
-class MolmoModel(LlamaModel):
-
-    Reader = MolmoReader
-
-    def __init__(self, model_path: str, tokenizer_path: str, **kwargs):
-        super().__init__(model_path, tokenizer_path, **kwargs)
-        config_path = osp.join(self.model_path, 'config.json')
-        with open(config_path) as f:
-            self.config = json.load(f)
-
-    def model_info(self):
-        config = self.config
-        num_layer = config['num_hidden_layers']
-        norm_eps = config['layer_norm_eps']
-        attn_head_num = config['num_attention_heads']
-        kv_head_num = config['num_key_value_heads']
-        hidden_units = config['hidden_size']
-        rope_theta = config['rope_theta']
-        max_position_embeddings = config['max_position_embeddings']
-        vocab_size = config['vocab_size']
-        # https://huggingface.co/allenai/Molmo-7B-D-0924/blob/main/modeling_molmo.py#L2041
-        additional_vocab_size = 128
-        inter_size = config['intermediate_size'] // 2
-        attn_bias = config['qkv_bias']
-        rope_param = RopeParam(type='default', base=rope_theta, dim=hidden_units // attn_head_num)
-        return dict(
-            num_layer=num_layer,
-            norm_eps=norm_eps,
-            head_num=attn_head_num,
-            kv_head_num=kv_head_num,
-            hidden_units=hidden_units,
-            attn_bias=int(attn_bias),
-            inter_size=inter_size,
-            vocab_size=vocab_size,
-            # https://huggingface.co/allenai/Molmo-7B-D-0924/blob/main/modeling_molmo.py#L564
-            embedding_size=vocab_size + additional_vocab_size,
-            rope_param=rope_param,
-            max_position_embeddings=max_position_embeddings,
-        )
diff --git a/lmdeploy/turbomind/deploy/source_model/qwen.py b/lmdeploy/turbomind/deploy/source_model/qwen.py
deleted file mode 100644
index 2223151e54..0000000000
--- a/lmdeploy/turbomind/deploy/source_model/qwen.py
+++ /dev/null
@@ -1,499 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-import json
-import os.path as osp
-import re
-
-import torch
-
-from ..config import RopeParam
-from ..loader import create_loader
-from .base import INPUT_MODELS
-from .llama import LlamaModel, LlamaReader
-
-
-class QwenReader(LlamaReader):
-    """QwenReader."""
-
-    attn_layer_patten = r'transformer\.h\.([0-9]+).'
-    tok_embeddings_key = 'transformer.wte.weight'
-    norm_weight_key = 'transformer.ln_f.weight'
-    output_weight_key = 'lm_head.weight'
-
-    attn_pattern = r'attn'
-    ffn_pattern = r'mlp'
-
-    def _attn(self, i: int, kind: str):
-        """Get q, k, v, o kind for layer i."""
-        q, k, v, o = (None, ) * 4
-        qkv = self.params[f'transformer.h.{i}.attn.c_attn.{kind}']
-        qkv = self.transform(qkv, kind)
-        if qkv is not None:
-            q, k, v = torch.split(qkv, qkv.size(0) // 3, dim=0)
-        o = self.params.get(f'transformer.h.{i}.attn.c_proj.{kind}')
-        o = self.transform(o, kind)
-        if o is None:
-            o = torch.zeros_like(q)
-        return q, k, v, o
-
-    def attn_norm(self, i: int):
-        """Get attn norm for layer i."""
-        return self.params[f'transformer.h.{i}.ln_1.weight']
-
-    def _ffn(self, i: int, kind: str):
-        """Get ffn kind for layer i."""
-        result = []
-        for key in ['w2', 'c_proj', 'w1']:
-            tensor = self.params[f'transformer.h.{i}.mlp.{key}.{kind}']
-            tensor = self.transform(tensor, kind)
-            result.append(tensor)
-        return (*result, )
-
-    def ffn_norm(self, i: int):
-        """Get ffn norm for layer i."""
-        return self.params[f'transformer.h.{i}.ln_2.weight']
-
-
-@INPUT_MODELS.register_module(name='qwen')
-class QwenModel(LlamaModel):
-    """Qwen model in hf format."""
-
-    Reader = QwenReader
-
-    def model_info(self):
-        """Read model info."""
-        params_path = osp.join(self.model_path, 'config.json')
-        with open(params_path) as f:
-            config = json.load(f)
-            hidden_units = config['hidden_size']
-            num_layer = config['num_hidden_layers']
-            norm_eps = config['layer_norm_epsilon']
-            kv_channels = config['kv_channels']
-            rope_theta = float(config.get('rotary_emb_base', 10000.0))
-            if 'num_key_value_heads' in config:
-                kv_head_num = config['num_key_value_heads']
-            else:
-                kv_head_num = config['num_attention_heads']
-            attn_head_num = config['num_attention_heads']
-            seq_length = config['seq_length']
-            use_dynamic_ntk = int(config['use_dynamic_ntk'])
-            use_logn_attn = int(config['use_logn_attn'])
-            vocab_size = config['vocab_size']
-            inter_size = config['intermediate_size']
-            scaling_type = 'dynamic' if use_dynamic_ntk else 'default'
-            # need setting rope_scaling_factor in TurbomindEngineConfig if scaling_type is dynamic
-            rope_param = RopeParam(type=scaling_type,
-                                   base=rope_theta,
-                                   dim=kv_channels,
-                                   max_position_embeddings=seq_length,
-                                   factor=0)
-
-        return dict(size_per_head=kv_channels,
-                    num_layer=num_layer,
-                    norm_eps=norm_eps,
-                    hidden_units=hidden_units,
-                    head_num=attn_head_num,
-                    kv_head_num=kv_head_num,
-                    vocab_size=vocab_size,
-                    inter_size=inter_size,
-                    attn_bias=1,
-                    rope_param=rope_param,
-                    max_position_embeddings=seq_length,
-                    use_dynamic_ntk=int(use_dynamic_ntk),
-                    use_logn_attn=use_logn_attn)
-
-
-@INPUT_MODELS.register_module(name='qwen2')
-class Qwen2Model(LlamaModel):
-    """Qwen model in hf format.
-
-    The weight of qwen2 model is similar to Llama, except its attention bias doesn't include o_proj bias.
-    """
-
-    Reader = LlamaReader
-
-    def model_info(self):
-        cfg = super().model_info()
-        cfg['attn_bias'] = 1
-        return cfg
-
-
-class Qwen2MoeReader(LlamaReader):
-
-    def moe_ffn_expert(self, e=None, i=None, kind=None):
-        if not kind:
-            return self.filter(r'experts', i)
-        result = []
-        for key in ['gate', 'down', 'up']:
-            name = f'{self.attn_layer_prefix}.{i}.mlp.experts.{e}.{key}_proj.{kind}'
-            tensor = self.params.get(name)
-            tensor = self.transform(tensor, kind)
-            result.append(tensor)
-        return (*result, )
-
-    def moe_ffn_gate(self, i, kind):
-        return self.transform(self.params.get(f'{self.attn_layer_prefix}.{i}.mlp.gate.{kind}'), kind)
-
-    def _ffn(self, i: int, kind: str):
-        """Get ffn kind for layer i."""
-        if not kind:
-            return self.filter(r'shared_expert\.', i)
-        result = []
-        for key in ['gate', 'down', 'up']:
-            tensor = self.params[f'{self.attn_layer_prefix}.{i}.mlp.shared_expert.{key}_proj.{kind}']
-            tensor = self.transform(tensor, kind)
-            result.append(tensor)
-        return (*result, )
-
-    def ffn(self, i: int, kind: str):
-        if not kind:
-            return self.filter(r'shared_expert\.', i)
-        return self._ffn(i, kind)
-
-    def moe_ffn_shared_gate(self, i):
-        return self.params.get(f'{self.attn_layer_prefix}.{i}.mlp.shared_expert_gate.weight')
-
-
-@INPUT_MODELS.register_module(name='qwen2-moe')
-class Qwen2MoeModel(LlamaModel):
-
-    Reader = Qwen2MoeReader
-
-    def model_info(self):
-        cfg = self.model_config
-        info = super().model_info()
-        info['expert_num'] = cfg['num_experts']
-        info['expert_inter_size'] = cfg['moe_intermediate_size']
-        info['experts_per_token'] = cfg['num_experts_per_tok']
-        info['inter_size'] = cfg['shared_expert_intermediate_size']
-        info['moe_shared_gate'] = True
-        info['norm_topk_prob'] = cfg['norm_topk_prob']
-        info['attn_bias'] = cfg.get('qkv_bias', 1)
-        return info
-
-
-class Qwen3Reader(LlamaReader):
-
-    def qk_norm(self, i: int):
-        result = []
-        for x in ['q', 'k']:
-            name = f'{self.attn_layer_prefix}.{i}.self_attn.{x}_norm.weight'
-            result.append(self.transform(self.params.get(name), 'weight'))
-        return (*result, )
-
-
-@INPUT_MODELS.register_module(name='qwen3')
-class Qwen3Model(LlamaModel):
-    Reader = Qwen3Reader
-
-    def model_info(self):
-        cfg = self.model_config
-        info = super().model_info()
-        info.update(qk_norm=True, attn_bias=cfg.get('attention_bias', 0))
-        return info
-
-
-class Qwen3MoeReader(Qwen2MoeReader):
-
-    def qk_norm(self, i: int):
-        result = []
-        for x in ['q', 'k']:
-            name = f'{self.attn_layer_prefix}.{i}.self_attn.{x}_norm.weight'
-            result.append(self.transform(self.params.get(name), 'weight'))
-        return (*result, )
-
-
-@INPUT_MODELS.register_module(name='qwen3-moe')
-class Qwen3MoeModel(LlamaModel):
-    Reader = Qwen3MoeReader
-
-    def model_info(self):
-        cfg = self.model_config
-        info = super().model_info()
-        info.update(
-            qk_norm=True,
-            expert_num=cfg.get('num_experts', 128),
-            experts_per_token=cfg.get('num_experts_per_tok', 8),
-            expert_inter_size=cfg.get('moe_intermediate_size', 768),
-            attn_bias=cfg.get('attention_bias', 0),
-            inter_size=0,  # no shared expert
-            norm_topk_prob=cfg.get('norm_topk_prob', False))
-        return info
-
-
-class Qwen3_5ReaderMixin:
-    """Mixin providing linear attention weight reading for Qwen3.5 models.
-
-    Qwen3.5 uses a zero-centered RMSNorm: ``output = norm(x) * (1 + weight)``
-    where weight is initialized to zeros.  TurboMind's RMSNorm kernel computes
-    ``norm(x) * weight`` (standard LLaMA style), so we add 1 to every
-    RMSNorm weight during export.  The GDN-internal norm
-    (``Qwen3_5MoeRMSNormGated``) uses standard weight and is NOT affected.
-    """
-
-    attn_layer_pattern = r'(?:model\.language_model\.|model\.)layers\.([0-9]+)\.'
-
-    _LINEAR_ATTN_KEYS = ['conv1d', 'in_proj_qkv', 'in_proj_z', 'in_proj_b', 'in_proj_a', 'out_proj', 'A_log', 'dt_bias']
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        if any(k.startswith('model.language_model.') for k in self.params.keys()):
-            self.attn_layer_prefix = 'model.language_model.layers'
-            self.tok_embeddings_key = 'model.language_model.embed_tokens.weight'
-            self.norm_weight_key = 'model.language_model.norm.weight'
-        tie_word_embeddings = self.model_cfg.get('tie_word_embeddings', False)
-        if tie_word_embeddings:
-            self.output_weight_key = self.tok_embeddings_key
-
-    # ---- zero-centered RMSNorm: add 1 to weights during export ----
-    def attn_norm(self, i: int):
-        w = super().attn_norm(i)
-        if w is not None:
-            w = w.float() + 1.0
-        return w
-
-    def ffn_norm(self, i: int):
-        w = super().ffn_norm(i)
-        if w is not None:
-            w = w.float() + 1.0
-        return w
-
-    def norm_weight(self):
-        w = super().norm_weight()
-        if w is not None:
-            w = w.float() + 1.0
-        return w
-
-    def qk_norm(self, i: int):
-        result = super().qk_norm(i)
-        return tuple(w.float() + 1.0 if w is not None else w for w in result)
-
-    # ---- handle mixed QKV(fp16) + O(AWQ) attention layers -------
-
-    def _attn(self, i: int, kind: str):
-        """Override to handle mixed QKV(fp16) + O(AWQ) attention layers.
-
-        Some AWQ-quantized Qwen3.5 models keep QKV in fp16 while quantizing only the O projection.  TurboMind requires
-        uniform weight types per layer, so we dequantize O to fp16 at export time.
-        """
-        prefix = f'{self.attn_layer_prefix}.{i}.self_attn'
-        q_is_fp16 = f'{prefix}.q_proj.weight' in self.params
-        o_is_awq = f'{prefix}.o_proj.qweight' in self.params
-
-        if not (q_is_fp16 and o_is_awq):
-            # Not a mixed-format layer, use standard behaviour.
-            return super()._attn(i, kind)
-
-        # Mixed format detected: QKV are fp16 but O is AWQ.
-        if kind == 'weight':
-            # Get fp16 QKV the normal way, then dequantize O.
-            q, k, v, _ = super()._attn(i, kind)
-            o = self._awq_dequant(f'{prefix}.o_proj')
-            o = self.transform(o, kind)
-            return (q, k, v, o)
-
-        # For any quant kind (qweight/scales/qzeros), return all None
-        # so that the AWQ handler skips this layer entirely — the O
-        # weight is already handled via dequantization above.
-        return (None, None, None, None)
-
-    def _awq_dequant(self, prefix: str):
-        """Dequantize an AWQ-quantized linear layer to fp16.
-
-        AWQ stores weights in transposed form relative to PyTorch's
-        convention ([in, out] vs [out, in]), so we transpose here to
-        match the fp16 ``.weight`` layout that downstream export
-        expects.
-        """
-        from lmdeploy.pytorch.backends.default.awq_modules import dequantize_gemm
-        qweight = self.params[f'{prefix}.qweight']
-        scales = self.params[f'{prefix}.scales']
-        qzeros = self.params[f'{prefix}.qzeros']
-        group_size = qweight.shape[0] // scales.shape[0]
-        w = dequantize_gemm(qweight, qzeros, scales, 4, group_size)
-        return w.t()  # [in, out] → [out, in] (PyTorch convention)
-
-    @staticmethod
-    def _compressed_tensors_dequant(weight_packed, weight_scale):
-        """Dequantize a compressed-tensors (pack-quantized, symmetric int4)
-        weight to fp16.
-
-        Args:
-            weight_packed: int32 tensor of shape (out_features, in_features//8).
-            weight_scale:  bf16/fp16 tensor of shape (out_features, in_features//group_size).
-        Returns:
-            fp16 tensor of shape (out_features, in_features).
-        """
-        out_features = weight_packed.shape[0]
-        num_groups = weight_scale.shape[1]
-        in_features = weight_packed.shape[1] * 8
-        group_size = in_features // num_groups
-
-        # Reinterpret the packed int32 buffer as bytes and unpack two nibbles
-        # per byte directly into the final fp16 tensor. This avoids creating
-        # eight temporary fp16 tensors before applying scales.
-        packed_bytes = weight_packed.contiguous().view(torch.uint8).reshape(out_features, -1)
-        weight = torch.empty((out_features, in_features), device=weight_packed.device, dtype=torch.float16)
-        weight[:, 0::2] = (packed_bytes & 0xF).to(torch.float16)
-        weight[:, 1::2] = (packed_bytes >> 4).to(torch.float16)
-
-        scales = weight_scale.to(torch.float16).unsqueeze(-1)
-        weight = weight.view(out_features, num_groups, group_size)
-        weight.sub_(8.0).mul_(scales)
-        return weight.reshape(out_features, in_features)
-
-    def linear_attn(self, i: int, kind: str):
-        if not kind:
-            return self.filter(r'linear_attn\.', i)
-        # Always return a fixed-length tuple with None placeholders to
-        # preserve positional alignment with the name list in module.py.
-        result = []
-        for key in self._LINEAR_ATTN_KEYS:
-            prefix = f'{self.attn_layer_prefix}.{i}.linear_attn.{key}'
-            tensor = self.params.get(f'{prefix}.{kind}')
-            # A_log and dt_bias are bare nn.Parameter (no .weight suffix)
-            if tensor is None:
-                tensor = self.params.get(prefix)
-            # If requesting weight but only AWQ qweight exists,
-            # dequantize on the fly so LinearAttn gets fp16 tensors.
-            if tensor is None and kind == 'weight':
-                if f'{prefix}.qweight' in self.params:
-                    tensor = self._awq_dequant(prefix)
-                elif f'{prefix}.weight_packed' in self.params:
-                    tensor = self._compressed_tensors_dequant(self.params[f'{prefix}.weight_packed'],
-                                                              self.params[f'{prefix}.weight_scale'])
-            if tensor is not None:
-                tensor = self.transform(tensor, kind)
-            result.append(tensor)  # keep None to preserve alignment
-        if all(t is None for t in result):
-            return tuple()
-        return tuple(result)
-
-    def linear_norm(self, i: int, kind: str = 'weight'):
-        tensor = self.params.get(f'{self.attn_layer_prefix}.{i}.linear_attn.norm.{kind}')
-        if tensor is not None:
-            return self.transform(tensor, kind)
-        return None
-
-
-class Qwen3_5Reader(Qwen3_5ReaderMixin, Qwen3Reader):
-    pass
-
-
-@INPUT_MODELS.register_module(name='qwen3_5')
-class Qwen3_5Model(Qwen3Model):
-    Reader = Qwen3_5Reader
-
-    def model_info(self):
-        if 'text_config' in self.model_config:
-            self.model_config = self.model_config['text_config']
-        cfg = self.model_config
-        info = super().model_info()
-        # MoE parameters (same as Qwen2MoeModel / Qwen3MoeModel)
-        info['expert_num'] = cfg.get('num_experts', 0)
-        info['expert_inter_size'] = cfg.get('moe_intermediate_size', 0)
-        info['experts_per_token'] = cfg.get('num_experts_per_tok', 0)
-        # For MoE models, inter_size is the shared expert intermediate size;
-        # for dense models, keep the value from super() (intermediate_size).
-        shared_expert_size = cfg.get('shared_expert_intermediate_size')
-        if shared_expert_size is not None:
-            info['inter_size'] = shared_expert_size
-        info['moe_shared_gate'] = True
-        # Qwen3.5 uses sigmoid MoE routing (not softmax)
-        info['scoring_func'] = 'softmax'
-        info['norm_topk_prob'] = True
-        # Fix RoPE dim for partial_rotary_factor
-        rope_params = cfg.get('rope_parameters', {})
-        partial_rotary_factor = rope_params.get('partial_rotary_factor', cfg.get('partial_rotary_factor', 1.0))
-        if partial_rotary_factor < 1.0:
-            info['rope_param'].dim = int(info['size_per_head'] * partial_rotary_factor)
-        # Linear attention parameters
-        info['layer_types'] = cfg.get('layer_types', [])
-        info['linear_key_head_dim'] = cfg.get('linear_key_head_dim', 0)
-        info['linear_value_head_dim'] = cfg.get('linear_value_head_dim', 0)
-        info['linear_conv_kernel_dim'] = cfg.get('linear_conv_kernel_dim', 0)
-        info['linear_num_key_heads'] = cfg.get('linear_num_key_heads', 0)
-        info['linear_num_value_heads'] = cfg.get('linear_num_value_heads', 0)
-        # attn_output_gate doubles Q projection for full-attention layers
-        info['attn_output_gate'] = cfg.get('attn_output_gate', False)
-        return info
-
-
-class Qwen3_5MoeReader(Qwen3_5ReaderMixin, Qwen3MoeReader):
-
-    def _unpacked_moe_expert(self, e: int, i: int, kind: str):
-        prefix = f'{self.attn_layer_prefix}.{i}.mlp.experts'
-        gate_up = self.params.get(f'{prefix}.gate_up_proj.{kind}')
-        down = self.params.get(f'{prefix}.down_proj.{kind}')
-        if gate_up is None or down is None:
-            return None
-
-        # Packed Qwen3.5 MoE checkpoints store all experts in the first
-        # dimension. Slice one expert before transform so quantized policies
-        # still see a 2D tensor.
-        gate_up = self.transform(gate_up[e], kind)
-        down = self.transform(down[e], kind)
-        gate, up = gate_up.chunk(2, dim=0)
-        return (gate, down, up)
-
-    def moe_ffn_expert(self, e=None, i=None, kind=None):
-        if not kind:
-            return self.filter(r'experts', i)
-        unpacked = self._unpacked_moe_expert(e, i, kind)
-        if unpacked is not None:
-            return unpacked
-
-        return super().moe_ffn_expert(e, i, kind)
-
-
-@INPUT_MODELS.register_module(name='qwen3_5-moe')
-class Qwen3_5MoeModel(Qwen3MoeModel):
-    Reader = Qwen3_5MoeReader
-
-    @staticmethod
-    def map_packed_qwen35_experts(name: str):
-        """Map packed expert names to weight names, i.e.,
-        "mlp.experts.gate_up_proj" -> "mlp.experts.gate_up_proj.weight" so that
-        class Weight in parameter.py can classify them."""
-        s = re.sub(r'(mlp\.experts\.(?:gate_up|down)_proj)$', r'\1.weight', name)
-        return s
-
-    def readers(self):
-        pattern = getattr(self.Reader, 'attn_layer_pattern', self.Reader.attn_layer_patten)
-        loader = create_loader(self.model_path, pattern, [])
-
-        has_packed_gate_up = any('mlp.experts.gate_up_proj' in k for k in loader.index.keys())
-        has_packed_down = any('mlp.experts.down_proj' in k for k in loader.index.keys())
-        if has_packed_gate_up and has_packed_down:
-            loader.mappings = [self.map_packed_qwen35_experts]
-
-        for i, param in loader.items():
-            reader = self.Reader(param, {}, False, self.model_config, policy=self.policy, fp8_quant=self.fp8_quant)
-            yield i, reader
-        torch.cuda.empty_cache()
-
-    def model_info(self):
-        if 'text_config' in self.model_config:
-            self.model_config = self.model_config['text_config']
-        cfg = self.model_config
-        info = super().model_info()
-        # Shared expert params (missing from Qwen3MoeModel base)
-        info['inter_size'] = cfg.get('shared_expert_intermediate_size', 0)
-        info['moe_shared_gate'] = True
-        # Qwen3.5 uses sigmoid MoE routing (not softmax)
-        info['scoring_func'] = 'softmax'
-        info['norm_topk_prob'] = True
-        # Fix RoPE dim for partial_rotary_factor
-        rope_params = cfg.get('rope_parameters', {})
-        partial_rotary_factor = rope_params.get('partial_rotary_factor', cfg.get('partial_rotary_factor', 1.0))
-        if partial_rotary_factor < 1.0:
-            info['rope_param'].dim = int(info['size_per_head'] * partial_rotary_factor)
-        # Linear attention parameters
-        info['layer_types'] = cfg.get('layer_types', [])
-        info['linear_key_head_dim'] = cfg.get('linear_key_head_dim', 0)
-        info['linear_value_head_dim'] = cfg.get('linear_value_head_dim', 0)
-        info['linear_conv_kernel_dim'] = cfg.get('linear_conv_kernel_dim', 0)
-        info['linear_num_key_heads'] = cfg.get('linear_num_key_heads', 0)
-        info['linear_num_value_heads'] = cfg.get('linear_num_value_heads', 0)
-        # attn_output_gate doubles Q projection for full-attention layers
-        info['attn_output_gate'] = cfg.get('attn_output_gate', False)
-        return info
diff --git a/lmdeploy/turbomind/deploy/source_model/xcomposer2.py b/lmdeploy/turbomind/deploy/source_model/xcomposer2.py
deleted file mode 100644
index 44d0b726b8..0000000000
--- a/lmdeploy/turbomind/deploy/source_model/xcomposer2.py
+++ /dev/null
@@ -1,53 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-
-from .base import INPUT_MODELS
-from .internlm2 import InternLM2Model, InternLM2Reader
-
-
-class Xcomposer2Reader(InternLM2Reader):
-    """Xcomposer2 model reader."""
-
-    # include only Plora and ignore other lora weights
-    attn_pattern = r'attention.\w+(.Plora_[AB])?.\w+$'
-    ffn_pattern = r'feed_forward.\w+(.Plora_[AB])?.\w+$'
-
-    def _attn(self, i, kind):
-        if 'Plora_A' in kind:
-            qkv = self.params[f'model.layers.{i}.attention.wqkv.Plora_A.weight']
-            o = self.params[f'model.layers.{i}.attention.wo.Plora_A.weight']
-            return qkv, o
-        return super()._attn(i, kind)
-
-
-@INPUT_MODELS.register_module(name='xcomposer2')
-class Xcomposer2Model(InternLM2Model):
-    """Xcomposer2 model in hf format."""
-
-    Reader = Xcomposer2Reader
-
-    def _lora_cfg_7b(self):
-        """Lora config for internlm-xcomposer2-7b."""
-        return dict(lora_r=256, lora_scale=1.0, lora_policy='plora', lora_max_wo_r=256)
-
-    def _lora_cfg_4khd_7b(self, model_info: dict):
-        """Lora config for internlm-xcomposer2-4khd-7b."""
-        rank_pattern = ['attention.w_qkv:8', 'attention.wo:256']
-        scale_pattern = ['attention.w_qkv:2.0', 'attention.wo:1.0']
-        rank_pattern = ','.join(rank_pattern)
-        scale_pattern = ','.join(scale_pattern)
-        return dict(lora_r=256,
-                    lora_scale=1.0,
-                    lora_max_wo_r=256,
-                    lora_policy='plora',
-                    lora_rank_pattern=rank_pattern,
-                    lora_scale_pattern=scale_pattern)
-
-    def model_info(self):
-        out = super().model_info()
-        from lmdeploy.vl.model.xcomposer2 import ModelType, get_xcomposer_type
-        model_type, _ = get_xcomposer_type(self.model_path)
-        if model_type == ModelType.XCOMPOSER2_4KHD:
-            out.update(self._lora_cfg_4khd_7b(out))
-        else:
-            out.update(self._lora_cfg_7b())
-        return out
diff --git a/lmdeploy/turbomind/deploy/target_model/__init__.py b/lmdeploy/turbomind/deploy/target_model/__init__.py
deleted file mode 100644
index 505c70de30..0000000000
--- a/lmdeploy/turbomind/deploy/target_model/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-from .fp import TurbomindModel  # noqa: F401
diff --git a/lmdeploy/turbomind/deploy/target_model/base.py b/lmdeploy/turbomind/deploy/target_model/base.py
deleted file mode 100644
index 95baad7cf2..0000000000
--- a/lmdeploy/turbomind/deploy/target_model/base.py
+++ /dev/null
@@ -1,239 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-
-import os.path as osp
-from abc import ABC
-from collections.abc import Sequence
-
-import torch
-import tqdm
-import yaml
-from mmengine import Registry
-
-from ..config import AttentionConfig, LoraConfig, ModelConfig, TurbomindModelConfig, config_from_dict, config_to_dict
-from ..source_model.base import BaseInputModel
-
-OUTPUT_MODELS = Registry('target model', locations=['lmdeploy.turbomind.deploy.target_model.base'])
-
-
-def tprint(*args, **kwargs):
-    to_file = kwargs.pop('to_file', False)
-    if not to_file:
-        return
-    from io import StringIO
-    s = StringIO()
-    print(*args, **kwargs, file=s, end='')
-    tqdm.tqdm.write(s.getvalue())
-
-
-def _weight_dtype_map(weight_type: str, default=None):
-    """Map literal data type to torch dtype."""
-
-    _WEIGHT_DTYPE_MAP = dict(int4=torch.float16, float16=torch.float16, float32=torch.float16, bfloat16=torch.bfloat16)
-
-    return _WEIGHT_DTYPE_MAP.get(weight_type, default)
-
-
-def _pad_inter_size(inter_size: int, group_size: int, tp: int):
-    group_size = max(1, group_size)
-    group_num = (inter_size + group_size - 1) // group_size
-    groups_per_rank = (group_num + tp - 1) // tp
-    inter_size_padded = groups_per_rank * group_size * tp
-    return inter_size_padded
-
-
-class BaseOutputModel(ABC):
-    """Base output model."""
-
-    def __init__(self, input_model: BaseInputModel, cfg: TurbomindModelConfig, model_cls, out_dir: str = ''):
-        super().__init__()
-        self.input_model = input_model
-        self.model_config = cfg.model_config
-        self.attention_config = cfg.attention_config
-        self.lora_config = cfg.lora_config
-        self.attn_tp_size = self.model_config.attn_tp_size
-        self.attn_cp_size = self.model_config.attn_cp_size
-        self.mlp_tp_size = self.model_config.mlp_tp_size
-        self.out_dir = out_dir
-        self.to_file = True if out_dir else False
-        self.tm_params = dict()
-
-        # get `model_info` at first, which will be updated to `self.model_config` and `self.attention_config`
-        self.input_model_info = self.input_model.model_info()
-        self.input_model_info = self.single_to_list(self.input_model_info, keys=['inter_size', 'expert_num'])
-        self.permute_qk = self.input_model_info.get('permute_qk', True)
-        self.update_model_config()
-        for i, v in enumerate(self.model_config.inter_size):
-            self.model_config.inter_size[i] = _pad_inter_size(v, self.model_config.group_size, self.mlp_tp_size)
-        if self.model_config.expert_num:
-            self.model_config.expert_inter_size = _pad_inter_size(self.model_config.expert_inter_size,
-                                                                  self.model_config.group_size, self.mlp_tp_size)
-
-        # head_num is divisble by tp but kv_head_num is not
-        # and tp is divisble by kv_head_num
-        assert self.model_config.head_num % self.attn_tp_size == 0
-        self.repeat_kv = 0
-        if (self.attn_tp_size > self.model_config.kv_head_num
-                and self.attn_tp_size % self.model_config.kv_head_num == 0):
-            self.repeat_kv = (self.attn_tp_size // self.model_config.kv_head_num)
-            self.model_config.kv_head_num = self.attn_tp_size
-
-        self.model_config.verify()
-        assert self.model_config.kv_head_num % self.attn_tp_size == 0
-
-        # print(self.model_config)
-
-        self.update_attention_config()
-        self.update_lora_config()
-        # ! Dependency on `self`
-        self.model = model_cls(self)
-
-    def single_to_list(self, config: dict, keys):
-        num_layer = int(config['num_layer'])
-        for k in keys:
-            v = config.get(k, None)
-            if v is not None and not isinstance(v, Sequence):
-                config[k] = [v] * num_layer
-        return config
-
-    def update_model_config(self):
-        """Update `self.model_config` according to the input_model's
-        `model_info`"""
-        final_cfg = config_to_dict(self.model_config)
-        final_cfg.update(self.input_model_info)
-        if 'embedding_size' not in self.input_model_info.keys():
-            final_cfg.update(embedding_size=self.input_model_info['vocab_size'])
-
-        self.model_config = config_from_dict(ModelConfig, final_cfg)
-
-    def update_attention_config(self):
-        """Update attention config according to input model's model info."""
-        final_cfg = config_to_dict(self.attention_config)
-        final_cfg.update(self.input_model_info)
-        self.attention_config = config_from_dict(AttentionConfig, final_cfg)
-
-    def update_lora_config(self):
-        """Update lora config according to input model's model info."""
-        final_cfg = config_to_dict(self.lora_config)
-        final_cfg.update(self.input_model_info)
-        self.lora_config = config_from_dict(LoraConfig, final_cfg)
-
-    def export_config(self) -> None:
-        """Export turbomind config."""
-        if self.to_file:
-            config_path = osp.join(self.out_dir, 'config.yaml')
-            with open(config_path, 'w') as f:
-                yaml.safe_dump(self.tm_config.to_dict(), f)
-
-    def export_weight(self, param: torch.Tensor, name: str) -> None:
-        """Export turbomind weight."""
-
-        def _tofile(tensor, path):
-            """To file."""
-            if tensor.dtype == torch.bfloat16:
-                tensor = tensor.view(torch.half)
-            tensor.contiguous().cpu().numpy().tofile(path)
-
-        if self.to_file:
-            if torch.is_floating_point(param):
-                torch_type = _weight_dtype_map(self.model_config.weight_type, torch.float16)
-                param = param.to(torch_type)
-            tprint(name, param.shape)
-            _tofile(param, osp.join(self.out_dir, name))
-        elif len(self.tm_params) > 0:
-            tm_params = self.tm_params
-            weight_type = self.model_config.weight_type
-            data_type = self.model_config.data_type
-            assert weight_type in ['float16', 'bfloat16', 'int4', 'fp8']
-
-            # currently, the tensor type should in
-            # [torch.float, torch.half, torch.bfloat16, torch.int32]
-            torch_tensor = param if param.is_contiguous() else param.contiguous()
-            torch_tensor = torch_tensor.cuda()
-            assert torch_tensor.dtype in [torch.int32, torch.float, torch.half, torch.bfloat16, torch.uint8]
-            FLOAT_TYPES = [torch.float, torch.half, torch.bfloat16]
-            if weight_type == 'fp8':
-                # avoid casting float scales to half
-                if torch_tensor.dtype == torch.bfloat16 and data_type == 'float16':
-                    torch_tensor = torch_tensor.half()
-            elif torch_tensor.dtype in FLOAT_TYPES:
-                if weight_type in ['float16', 'int4']:
-                    torch_tensor = torch_tensor.half()
-                elif weight_type == 'bfloat16':
-                    torch_tensor = torch_tensor.bfloat16()
-                else:
-                    torch_tensor = torch_tensor.half()
-            if name in tm_params:
-                try:
-                    import _turbomind as _tm
-                except ImportError:
-                    _tm = None
-                for tm_tensor in tm_params[name]:
-                    # Match TurboMind tensor dtype to avoid byte_size mismatch (e.g. f32 256b vs f16 128b)
-                    if _tm is not None:
-                        if tm_tensor.type == _tm.DataType.TYPE_FP32 and torch_tensor.dtype in [
-                                torch.float16, torch.bfloat16
-                        ]:
-                            torch_tensor = torch_tensor.float()
-                        elif tm_tensor.type == _tm.DataType.TYPE_FP16 and torch_tensor.dtype == torch.float32:
-                            torch_tensor = torch_tensor.half()
-                    tm_tensor.copy_from(torch_tensor)
-                tm_params.pop(name)
-        else:
-            tprint('skip export', name, param.shape)
-
-    def save_split(self, tensor: torch.Tensor, name: str, split_dim=None, split_num=1, copy=False) -> None:
-        """Save split.
-
-        - 2D input
-            shape must be (input_dims, output_dims)
-        - 1D input (bias)
-            shape must be (output_dims)
-            split is skipped when split_dim == 0
-        """
-
-        if copy or (tensor.dim() == 1 and split_dim == 0):
-            split_dim = None
-            copy = True
-
-        if split_dim is not None:
-            tprint(f'*** splitting {name}, shape={tensor.shape}, '
-                   f'split_dim={split_dim}, split_num={split_num}',
-                   to_file=self.to_file)
-            if tensor.shape[split_dim] % split_num != 0:
-                raise RuntimeError(f'{name}: shape={list(tensor.shape)}, split_num={split_num}')
-            split_size = tensor.shape[split_dim] // split_num
-            splits = torch.split(tensor, split_size, dim=split_dim)
-            for i, split in enumerate(splits):
-                prefix, ext = osp.splitext(name)
-                self.export_weight(split, f'{prefix}.{i}{ext}')
-        elif copy:
-            tprint(f'### copying {name}, shape={tensor.shape}', to_file=self.to_file)
-            copies = [tensor] * split_num
-            for i, copy in enumerate(copies):
-                prefix, ext = osp.splitext(name)
-                self.export_weight(copy, f'{prefix}.{i}{ext}')
-        else:
-            self.export_weight(tensor, name)
-
-    def export(self) -> None:
-        """Export to turbomind model format."""
-        num_layer = self.model_config.num_layer
-        from tqdm import tqdm
-        pbar = tqdm(total=num_layer, desc='Convert to turbomind format', leave=self.to_file)
-        self.export_config()
-        for i, reader in self.input_model.readers():
-            if self.model(i, reader):
-                pbar.update(1)
-        pbar.close()
-
-    def export_iter(self):
-        self.export_config()
-        for i, reader in self.input_model.readers():
-            self.model(i, reader)
-            yield i
-
-    @property
-    def tm_config(self):
-        return TurbomindModelConfig(model_config=self.model_config,
-                                    attention_config=self.attention_config,
-                                    lora_config=self.lora_config)
diff --git a/lmdeploy/turbomind/deploy/target_model/fp.py b/lmdeploy/turbomind/deploy/target_model/fp.py
deleted file mode 100644
index 11f1f78170..0000000000
--- a/lmdeploy/turbomind/deploy/target_model/fp.py
+++ /dev/null
@@ -1,9 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-
-from .base import OUTPUT_MODELS, BaseOutputModel
-
-
-@OUTPUT_MODELS.register_module(name='tm')
-class TurbomindModel(BaseOutputModel):
-    """Export to turbomind fp16 format."""
-    pass
diff --git a/lmdeploy/turbomind/linear.py b/lmdeploy/turbomind/linear.py
new file mode 100644
index 0000000000..c073a9fbb1
--- /dev/null
+++ b/lmdeploy/turbomind/linear.py
@@ -0,0 +1,265 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""Linear weight bundle and composable dimension operations.
+
+Two weight types flow through the TurboMind weight loading pipeline:
+
+- ``Linear`` -- a bundle of tensors for a single linear layer (weight +
+  optional scales, zeros, bias).
+- Raw ``torch.Tensor`` -- everything else (norms, embeddings, scalars).
+
+**concat_out_dim** joins ``Linear`` bundles along the output
+dimension, handling all component tensors correctly regardless of
+quantization-induced dimension scaling.
+"""
+
+from __future__ import annotations
+
+import functools
+import inspect
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING
+
+import torch
+from torch import Tensor
+
+if TYPE_CHECKING:
+    from .weight_format import WeightFormat
+
+
+# ---------------------------------------------------------------------------
+# Linear dataclass with methods
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class Linear:
+    """Bundle of tensors for a single linear layer.
+
+    ``tensors`` maps a closed-set TM weight kind (e.g. ``"weight"``,
+    ``"scales"``, ``"zeros"``, ``"bias"``, ``"qweight"``) to the actual
+    tensor.
+
+    **Layout contract**: all ``Linear`` objects are in TM layout with
+    axis 0 as the input dimension and axis -1 as the output dimension.
+    ``commit_linear`` assumes this layout and does not re-transpose.
+    1-D tensors (e.g. bias) only have an output dimension (axis 0).
+    """
+
+    tensors: dict[str, Tensor]
+    weight_format: WeightFormat = field(compare=False, repr=False)
+
+
+def concat_out_dim(xs: list[Linear]) -> Linear:
+    """Concatenate along output dim."""
+    first = xs[0]
+    result: dict[str, Tensor] = {}
+    for kind in first.tensors:
+        t = first.tensors[kind]
+        result[kind] = torch.cat([x.tensors[kind] for x in xs], dim=t.dim() - 1)
+    wfmts = {x.weight_format for x in xs}
+    assert len(wfmts) == 1, (
+        'concat_out_dim requires uniform weight_format; '
+        'call dequant_mixed first if formats differ.')
+    return Linear(tensors=result,
+                  weight_format=next(iter(wfmts)))
+
+
+# ---------------------------------------------------------------------------
+# Format / compatibility utilities
+# ---------------------------------------------------------------------------
+
+
+def _dequant_linear(linear: Linear, *, data_type) -> Linear:
+    """Dequantize a quantized Linear to trivial.
+
+    ``TrivialFormat.dequant`` is identity, so already-trivial inputs round-trip
+    safely.  ``AWQFormat.dequant`` and ``FP8Format.dequant`` do real work.
+    GPTQ / CompressedTensor / MXFP4 inherit the base-class
+    ``NotImplementedError`` — calling ``_dequant_linear`` on one of those is a
+    broken-fusion-group configuration, and the raise names it at the call site.
+    """
+    from .weight_format import TrivialFormat
+
+    fmt = linear.weight_format
+    new_tensors = fmt.dequant(linear.tensors, data_type)
+    trivial = TrivialFormat()
+    return Linear(tensors=new_tensors, weight_format=trivial)
+
+
+def dequant_mixed(*linears: Linear | None, data_type) -> tuple[Linear | None, ...]:
+    """Dequantize linears to a common trivial format when formats differ.
+
+    Trivial inputs round-trip safely through ``_dequant_linear``.
+    None args pass through unchanged.
+    """
+    formats = {l.weight_format.name for l in linears if l is not None}
+    if len(formats) <= 1:
+        return linears
+    return tuple(
+        _dequant_linear(l, data_type=data_type) if l is not None else l
+        for l in linears
+    )
+
+
+# ---------------------------------------------------------------------------
+# Linear-level transform decorators
+# ---------------------------------------------------------------------------
+
+
+def transform_output_dim(fn):
+    """Decorator that lifts a tensor-level transform to Linear-level.
+
+    For output-dim operations: 1-D tensors (bias) are unsqueezed to 2-D
+    before calling *fn*, then squeezed back.  Convention: args that are
+    ``Linear`` instances are treated as tensor inputs; all other args pass
+    through unchanged.  Return type is detected at runtime:
+    ``Tensor`` -> single ``Linear``, ``tuple`` -> tuple of ``Linear`` objects.
+    """
+    sig = inspect.signature(fn)
+
+    @functools.wraps(fn)
+    def wrapper(*args, **kwargs):
+        bound = sig.bind(*args, **kwargs)
+        bound.apply_defaults()
+
+        first = next(
+            v for v in bound.arguments.values() if isinstance(v, Linear)
+        )
+        out_buckets = None
+
+        for kind in first.tensors:
+            was_1d = False
+            fn_kwargs = {}
+
+            for name, val in bound.arguments.items():
+                if isinstance(val, Linear):
+                    t = val.tensors[kind]
+                    if t.dim() == 1:
+                        was_1d = True
+                        t = t.unsqueeze(0)
+                    fn_kwargs[name] = t
+                else:
+                    fn_kwargs[name] = val
+
+            result = fn(**fn_kwargs)
+            if not isinstance(result, tuple):
+                result = (result,)
+            if out_buckets is None:
+                out_buckets = [{} for _ in result]
+            for i, item in enumerate(result):
+                out_buckets[i][kind] = item.squeeze(0) if was_1d else item
+
+        outputs = tuple(
+            Linear(ts, weight_format=first.weight_format) for ts in out_buckets
+        )
+        return outputs if len(outputs) > 1 else outputs[0]
+
+    return wrapper
+
+
+def transform_input_dim(fn):
+    """Decorator that lifts a tensor-level transform to Linear-level.
+
+    For input-dim operations: 1-D tensors (bias) have no input dimension
+    and are **passed through unchanged**.  The inner function only ever
+    sees 2-D tensors for each kind.  For multi-output functions, 1-D
+    tensors are duplicated into every output bucket.
+    """
+    sig = inspect.signature(fn)
+
+    @functools.wraps(fn)
+    def wrapper(*args, **kwargs):
+        bound = sig.bind(*args, **kwargs)
+        bound.apply_defaults()
+
+        first = next(
+            v for v in bound.arguments.values() if isinstance(v, Linear)
+        )
+        out_buckets = None
+        deferred_1d: list[str] = []
+
+        for kind in first.tensors:
+            fn_kwargs = {}
+            is_1d = False
+
+            for name, val in bound.arguments.items():
+                if isinstance(val, Linear):
+                    t = val.tensors[kind]
+                    if t.dim() < 2:
+                        is_1d = True
+                        break
+                    fn_kwargs[name] = t
+                else:
+                    fn_kwargs[name] = val
+
+            if is_1d:
+                deferred_1d.append(kind)
+                continue
+
+            result = fn(**fn_kwargs)
+            if not isinstance(result, tuple):
+                result = (result,)
+            if out_buckets is None:
+                out_buckets = [{} for _ in result]
+            for i, item in enumerate(result):
+                out_buckets[i][kind] = item
+
+        if out_buckets is None:
+            out_buckets = [{}]
+        for kind in deferred_1d:
+            for bucket in out_buckets:
+                bucket[kind] = first.tensors[kind]
+
+        outputs = tuple(
+            Linear(ts, weight_format=first.weight_format) for ts in out_buckets
+        )
+        return outputs if len(outputs) > 1 else outputs[0]
+
+    return wrapper
+
+
+# ---------------------------------------------------------------------------
+# Group-based padding
+# ---------------------------------------------------------------------------
+
+
+@transform_output_dim
+def pad_output_groups(t: torch.Tensor, *, src_groups: int,
+                      dst_groups: int) -> torch.Tensor:
+    """Pad output dim by src_groups → dst_groups, viewing it as (groups, -1)."""
+    t = t.reshape(t.shape[:-1] + (src_groups, -1))
+    pad = t.new_zeros(t.shape[:-2] + (dst_groups - src_groups, t.shape[-1]))
+    return torch.cat([t, pad], dim=-2).reshape(t.shape[:-2] + (-1,))
+
+
+@transform_input_dim
+def pad_input_groups(t: torch.Tensor, *, src_groups: int,
+                     dst_groups: int) -> torch.Tensor:
+    """Pad input dim by src_groups → dst_groups, viewing it as (groups, -1)."""
+    t = t.reshape((src_groups, -1) + t.shape[1:])
+    block = t.shape[1]
+    pad = t.new_zeros((dst_groups - src_groups, block) + t.shape[2:])
+    return torch.cat([t, pad], dim=0).reshape((dst_groups * block,) + t.shape[2:])
+
+
+def _round_up(src_groups: int, div: int) -> int:
+    """Round *src_groups* up to the nearest multiple of *div*."""
+    return ((src_groups + div - 1) // div) * div
+
+
+def round_up_output_groups(linear: Linear, groups: int,
+                           div: int) -> Linear:
+    """Pad output-dim groups to ``round_up(groups, div)``."""
+    dst = _round_up(groups, div)
+    if dst == groups:
+        return linear
+    return pad_output_groups(linear, src_groups=groups, dst_groups=dst)
+
+
+def round_up_input_groups(linear: Linear, groups: int,
+                          div: int) -> Linear:
+    """Pad input-dim groups to ``round_up(groups, div)``."""
+    dst = _round_up(groups, div)
+    if dst == groups:
+        return linear
+    return pad_input_groups(linear, src_groups=groups, dst_groups=dst)
diff --git a/lmdeploy/turbomind/deploy/loader.py b/lmdeploy/turbomind/loader.py
similarity index 74%
rename from lmdeploy/turbomind/deploy/loader.py
rename to lmdeploy/turbomind/loader.py
index 2475a8a928..2602dfe61d 100644
--- a/lmdeploy/turbomind/deploy/loader.py
+++ b/lmdeploy/turbomind/loader.py
@@ -23,11 +23,11 @@
 
 class BaseLoader(ABC):
 
-    def __init__(self, model_path: str, pattern, mappings: list):
+    def __init__(self, model_path: str, pattern=None, mappings: list | None = None):
         self.model_path = model_path
         self.pattern = pattern
         self.item_count = defaultdict(int)
-        self.mappings = mappings
+        self.mappings = mappings or []
 
     def get_index(self, index_name: str, file_pattern: str) -> tuple[dict, list]:
         """Get shards and weight map (if possible) for the model."""
@@ -58,10 +58,17 @@ def map_key(self, key: str):
     def items(self) -> Iterator[tuple[int, dict]]:
         pass
 
+    @abstractmethod
+    def all_items(self) -> dict:
+        """Return ALL weights in a single dict."""
+        pass
+
 
 class SafetensorsLoader(BaseLoader):
 
-    def __init__(self, model_path: str, pattern: str, mappings: list, index_name=None, file_pattern=None):
+    def __init__(self, model_path: str, pattern=None,
+                 mappings: list | None = None, index_name=None,
+                 file_pattern=None):
         super().__init__(model_path, pattern, mappings)
         self.shards, index = self.get_index(index_name, file_pattern)
         if not index:
@@ -74,10 +81,11 @@ def __init__(self, model_path: str, pattern: str, mappings: list, index_name=Non
         # self.index maps weight names to their corresponding safetensors file name
         self.index = index
         # count layer-wise parameters
-        for k in index.keys():
-            match = re.findall(self.pattern, k)
-            if match:
-                self.item_count[int(match[0])] += 1
+        if self.pattern:
+            for k in index.keys():
+                match = re.findall(self.pattern, k)
+                if match:
+                    self.item_count[int(match[0])] += 1
 
     def items(self):
         params = defaultdict(dict)
@@ -91,7 +99,7 @@ def items(self):
                     # - Exclude duplicated weights (present in multiple files)
                     if k not in self.index or self.index[k] != filename:
                         continue
-                    match = re.findall(self.pattern, k)
+                    match = re.findall(self.pattern, k) if self.pattern else []
                     if not match:
                         misc.append(k)
                     else:
@@ -104,16 +112,31 @@ def items(self):
                     yield (-1, {k: f.get_tensor(k) for k in misc})
         assert not params
 
+    def all_items(self) -> dict:
+        """Return ALL weights in a single dict (mmap-backed, no eager load)."""
+        all_params = {}
+        for shard in self.shards:
+            with safe_open(shard, 'pt') as f:
+                filename = osp.basename(shard)
+                for k in f.keys():
+                    if k not in self.index or self.index[k] != filename:
+                        continue
+                    all_params[self.map_key(k)] = f.get_tensor(k)
+        return all_params
+
 
 class PytorchLoader(BaseLoader):
 
-    def __init__(self, model_path: str, pattern: str, mappings: list, index_name=None, file_pattern=None):
+    def __init__(self, model_path: str, pattern=None,
+                 mappings: list | None = None, index_name=None,
+                 file_pattern=None):
         super().__init__(model_path, pattern, mappings)
         self.shards, index = self.get_index(index_name, file_pattern)
-        for k in index.keys():
-            match = re.findall(self.pattern, k)
-            if match:
-                self.item_count[int(match[0])] += 1
+        if self.pattern:
+            for k in index.keys():
+                match = re.findall(self.pattern, k)
+                if match:
+                    self.item_count[int(match[0])] += 1
 
     def items(self):
         params = defaultdict(dict)
@@ -121,7 +144,7 @@ def items(self):
             misc = {}
             tmp = torch.load(shard, map_location='cpu', weights_only=True)
             for k, v in tmp.items():
-                match = re.findall(self.pattern, k)
+                match = re.findall(self.pattern, k) if self.pattern else []
                 if not match:
                     misc[k] = v
                 else:
@@ -144,6 +167,16 @@ def items(self):
         for idx in idxs:
             yield (idx, params.pop(idx))
 
+    def all_items(self) -> dict:
+        """Return ALL weights in a single dict."""
+        all_params = {}
+        for shard in self.shards:
+            tmp = torch.load(shard, map_location='cpu', weights_only=True)
+            for k, v in tmp.items():
+                all_params[self.map_key(k)] = v
+            del tmp
+        return all_params
+
 
 class StateDictLoader:
     """This loader is used for `update_params`.
@@ -152,7 +185,7 @@ class StateDictLoader:
     lm_head, norm).
     """
 
-    def __init__(self, queue: Queue, pattern: str, mappings: list):
+    def __init__(self, queue: Queue, pattern=None, mappings: list | None = None):
         self.que = queue
         self.pattern = pattern
 
@@ -160,9 +193,11 @@ def items(self):
         for data in iter(self.que.get, None):
             # If data is state dict of a decoder layer, any key will match the pattern.
             # Otherwise, none of the keys will match the pattern.
-            for k in data.keys():
-                match = re.findall(self.pattern, k)
-                break
+            match = []
+            if self.pattern:
+                for k in data.keys():
+                    match = re.findall(self.pattern, k)
+                    break
 
             if not match:
                 yield (-1, data)
@@ -173,8 +208,12 @@ def items(self):
             torch.cuda.empty_cache()
             self.que.task_done()
 
+    def all_items(self) -> dict:
+        raise NotImplementedError('StateDictLoader does not support all_items()')
+
 
-def create_loader(model_path: str | Queue, pattern: str, mappings: list) -> BaseLoader:
+def create_loader(model_path: str | Queue, pattern=None,
+                  mappings: list | None = None) -> BaseLoader:
     args = (model_path, pattern, mappings)
 
     if isinstance(model_path, Queue):
diff --git a/lmdeploy/turbomind/model_loader.py b/lmdeploy/turbomind/model_loader.py
new file mode 100644
index 0000000000..0d241b6922
--- /dev/null
+++ b/lmdeploy/turbomind/model_loader.py
@@ -0,0 +1,63 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""ModelLoader: coordinates loading a model's weights into the TurboMind runtime."""
+import torch
+
+from .builders._base import Context, ParallelGroup
+from .loader import create_loader
+
+
+class ModelLoader:
+    """Coordinates loading a model's weights into the TurboMind runtime.
+
+    Holds the model, model_comm handle, and model_path. Extracts GPU topology handles from model_comm and binds them
+    onto the model at construction time. Provides export() and export_iter() to load checkpoint weights and commit them
+    to the C++ runtime.
+    """
+
+    def __init__(self, model, model_comm, gpu_count, model_path,
+                 data_type, engine_config):
+        self.model = model
+        self.model_comm = model_comm
+        self.gpu_count = gpu_count
+        self.model_path = model_path
+        self.data_type = data_type
+        self.engine_config = engine_config
+        self._bind_runtime()
+
+    def _bind_runtime(self):
+        mc = self.model_comm
+        ctx = Context(
+            [mc.context(g) for g in range(self.gpu_count)],
+            data_type=self.data_type,
+        )
+        ec = self.engine_config
+
+        attn_tp = ParallelGroup(ec.attn_tp_size,
+                                [mc.attn_tp_rank(g) for g in range(self.gpu_count)])
+        mlp_tp = ParallelGroup(ec.mlp_tp_size,
+                               [mc.mlp_tp_rank(g) for g in range(self.gpu_count)])
+        model_tp = ParallelGroup(ec.attn_tp_size * ec.attn_cp_size,
+                                 [mc.model_tp_rank(g) for g in range(self.gpu_count)])
+
+        self.model.bind_runtime(
+            ctx=ctx,
+            root_handles=[mc.root(g) for g in range(self.gpu_count)],
+            attn_tp=attn_tp,
+            mlp_tp=mlp_tp,
+            model_tp=model_tp,
+        )
+
+    def export(self):
+        loader = create_loader(self.model_path, None,
+                               getattr(self.model, '_loader_mappings', []))
+        self.model.set_params(loader.all_items())
+        self.model.model()
+        torch.cuda.empty_cache()
+
+    def export_iter(self):
+        loader = create_loader(self.model_path, None,
+                               getattr(self.model, '_loader_mappings', []))
+        self.model.set_params(loader.all_items())
+        self.model.model()
+        yield -1
+        torch.cuda.empty_cache()
diff --git a/lmdeploy/turbomind/models/__init__.py b/lmdeploy/turbomind/models/__init__.py
new file mode 100644
index 0000000000..e3310bd183
--- /dev/null
+++ b/lmdeploy/turbomind/models/__init__.py
@@ -0,0 +1,9 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .glm4_moe_lite import Glm4MoeLiteModel  # noqa: F401
+from .gpt_oss import GptOssModel  # noqa: F401
+from .internlm2 import InternLM2Model  # noqa: F401
+from .internvl3_5 import InternVL3_5Model  # noqa: F401
+from .llama import LlamaModel  # noqa: F401
+from .qwen2 import Qwen2Model  # noqa: F401
+from .qwen3 import Qwen3TextModel  # noqa: F401
+from .qwen3_5 import Qwen3_5Model  # noqa: F401
diff --git a/lmdeploy/turbomind/models/base.py b/lmdeploy/turbomind/models/base.py
new file mode 100644
index 0000000000..9b317e8397
--- /dev/null
+++ b/lmdeploy/turbomind/models/base.py
@@ -0,0 +1,12 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""Source-model registry.
+
+The INPUT_MODELS registry maps an architecture name to its TextModel
+subclass. Models register themselves via ``@INPUT_MODELS.register_module(name=...)``.
+"""
+from __future__ import annotations
+
+from mmengine import Registry
+
+INPUT_MODELS = Registry('source model',
+                        locations=['lmdeploy.turbomind.models.base'])
diff --git a/lmdeploy/turbomind/models/glm4_moe_lite.py b/lmdeploy/turbomind/models/glm4_moe_lite.py
new file mode 100644
index 0000000000..5ed969412e
--- /dev/null
+++ b/lmdeploy/turbomind/models/glm4_moe_lite.py
@@ -0,0 +1,145 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""GLM-4 MoE Lite (GLM-4.7-Flash) TextModel for the new pipeline."""
+from __future__ import annotations
+
+import _turbomind as _tm
+from transformers import Glm4MoeLiteConfig
+
+from ..builders import (
+    DecoderLayerBuilder,
+    DecoderLayerConfig,
+    FfnBuilder,
+    MLABuilder,
+    ModuleListBuilder,
+    ModuleListConfig,
+    MoeBuilder,
+    TextModelBuilder,
+    _act_type_id,
+)
+from ..text_model import TextModel
+from .base import INPUT_MODELS
+from .utils import (
+    layer_progress,
+    make_mla_config,
+    make_model_weight_config,
+    make_moe_config,
+)
+
+
+@INPUT_MODELS.register_module(name='glm4-moe-lite')
+class Glm4MoeLiteModel(TextModel):
+    """Weight model for GLM-4 MoE Lite (e.g. GLM-4.7-Flash)."""
+
+    cfg: Glm4MoeLiteConfig
+
+    def __init__(self, cfg: Glm4MoeLiteConfig, *, resolver):
+        super().__init__(cfg, resolver=resolver)
+
+        self._attn_cfg = make_mla_config(cfg)
+
+        # ---- FFN template ----
+        self._ffn_cfg = _tm.FfnConfig()
+        self._ffn_cfg.hidden_dim = self.cfg.hidden_size
+        self._ffn_cfg.act_type   = _act_type_id('silu')
+
+        # ---- MoE template (GLM-specific: noaux_tc + sigmoid) ----
+        if cfg.n_routed_experts > 0:
+            self._moe_cfg = make_moe_config(
+                cfg,
+                experts_per_token=cfg.num_experts_per_tok,
+                topk_method='noaux_tc',
+                scoring_func='sigmoid',
+                routed_scale=cfg.routed_scaling_factor,
+                topk_group=cfg.topk_group,
+                n_group=cfg.n_group)
+            self._moe_cfg.expert_num = cfg.n_routed_experts
+
+        self._tune_layer_num = 2  # GLM-MoE recommends tuning 2 layers
+
+    # ------------------------------------------------------------------
+    # model() — same as old code
+    # ------------------------------------------------------------------
+
+    def model(self):
+        root_cfg = make_model_weight_config(self.cfg)
+        root = TextModelBuilder(
+            root_cfg, self._ctx,
+            root_handles=self._root_handles,
+            tp=self._model_tp,
+            vocab_size=self.cfg.vocab_size)
+        root.add_token_embeds(self._get('model.embed_tokens.weight'))
+        root.norm = self.norm(self._get('model.norm.weight'))
+        root.add_lm_head(self._linear('lm_head'))  # GLM: never tied
+        root.layers = self.layers('model.layers')
+        root.build()
+
+    # ------------------------------------------------------------------
+    # MLA attention (uses MLABuilder + self._attn_cfg clone)
+    # ------------------------------------------------------------------
+
+    def attn(self, pfx):
+        cfg = self._attn_cfg.clone()
+        m = MLABuilder(cfg, self._ctx, tp=self._attn_tp)
+
+        q_b = (self._linear(f'{pfx}.q_b_proj', optional=True) or
+               self._linear(f'{pfx}.q_proj'))
+        m.add_projections(
+            q_a_proj=self._linear(f'{pfx}.q_a_proj'),
+            q_b_proj=q_b,
+            kv_a_proj=self._linear(f'{pfx}.kv_a_proj_with_mqa'),
+            kv_b_proj=self._linear(f'{pfx}.kv_b_proj'),
+            wo=self._linear(f'{pfx}.o_proj'),
+        )
+        m.q_a_layernorm  = self.norm(self._get(f'{pfx}.q_a_layernorm.weight'))
+        m.kv_a_layernorm = self.norm(self._get(f'{pfx}.kv_a_layernorm.weight'))
+        return m.build()
+
+    # ------------------------------------------------------------------
+    # FFN / MoE factories
+    # ------------------------------------------------------------------
+
+    def ffn(self, pfx, inter_size, is_expert=False):
+        w1, w3, w2 = [self._linear(f'{pfx}.{x}_proj') for x in ('gate', 'up', 'down')]
+
+        cfg = self._ffn_cfg.clone()
+        cfg.inter_size = inter_size
+        cfg.is_expert  = is_expert
+
+        m = FfnBuilder(cfg, self._ctx, tp=self._mlp_tp)
+        m.add_ffn(w1, w2, w3)
+        return m.build()
+
+    def moe(self, pfx):
+        cfg = self._moe_cfg.clone()
+
+        m = MoeBuilder(cfg, self._ctx)
+
+        m.add_gate('gate', self._linear(f'{pfx}.gate'))
+
+        correction = self._get(f'{pfx}.gate.e_score_correction_bias')
+        m.add_param('score_correction_bias', correction)
+
+        experts = ModuleListBuilder(ModuleListConfig(), self._ctx)
+        for e in range(cfg.expert_num):
+            experts[e] = self.ffn(f'{pfx}.experts.{e}', 
+                                  self.cfg.moe_intermediate_size, is_expert=True)
+        m.experts = experts.build()
+
+        shared = self.ffn(f'{pfx}.shared_experts', 
+                          self.cfg.intermediate_size * self.cfg.n_shared_experts)
+
+        return m.build(), shared
+
+    def layers(self, pfx):
+        layers = ModuleListBuilder(ModuleListConfig(), self._ctx)
+        for i in layer_progress(self.cfg.num_hidden_layers):
+            d = DecoderLayerBuilder(DecoderLayerConfig(), self._ctx)
+            d.attention_norm = self.norm(self._get(f'{pfx}.{i}.input_layernorm.weight'))
+            d.attention = self.attn(f'{pfx}.{i}.self_attn')
+            d.ffn_norm = self.norm(self._get(f'{pfx}.{i}.post_attention_layernorm.weight'))
+            if self.cfg.mlp_layer_types[i] == 'sparse':
+                d.moe_ffn, d.feed_forward = self.moe(f'{pfx}.{i}.mlp')
+            else:
+                d.feed_forward = self.ffn(f'{pfx}.{i}.mlp', self.cfg.intermediate_size)
+            layers[i] = d.build()
+        return layers.build()
diff --git a/lmdeploy/turbomind/models/gpt_oss.py b/lmdeploy/turbomind/models/gpt_oss.py
new file mode 100644
index 0000000000..096345d958
--- /dev/null
+++ b/lmdeploy/turbomind/models/gpt_oss.py
@@ -0,0 +1,142 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""Gpt-oss TextModel for the new pipeline."""
+from __future__ import annotations
+
+import re
+
+from transformers import GptOssConfig
+
+from ..builders import (
+    AttentionBuilder,
+    DecoderLayerBuilder,
+    DecoderLayerConfig,
+    FfnBuilder,
+    ModuleListBuilder,
+    ModuleListConfig,
+    MoeBuilder,
+    TextModelBuilder,
+    _act_type_id,
+)
+from ..text_model import TextModel
+from .base import INPUT_MODELS
+from .utils import (
+    layer_progress,
+    make_attention_config,
+    make_ffn_config,
+    make_model_weight_config,
+    make_moe_config,
+    read_packed_moe_expert,
+    reorder_rotary_emb,
+)
+
+
+def map_experts(s: str) -> str:
+    s = re.sub(r'(experts.*proj)$', r'\1.weight', s)
+    s = re.sub(r'(experts.*proj)_bias$', r'\1.bias', s)
+    s = re.sub(r'(experts.*proj)_blocks$', r'\1.blocks', s)
+    s = re.sub(r'(experts.*proj)_scales$', r'\1.scales', s)
+    return s
+
+
+@INPUT_MODELS.register_module(name='gpt-oss')
+class GptOssModel(TextModel):
+    """Weight model for gpt-oss (MoE with packed experts)."""
+
+    cfg: GptOssConfig
+
+    _loader_mappings = [map_experts]
+
+    def __init__(self, cfg: GptOssConfig, *, resolver):
+        super().__init__(cfg, resolver=resolver)
+
+        self._attn_cfg = make_attention_config(cfg)
+
+        self._ffn_cfg = make_ffn_config(cfg,
+                                        act_type=_act_type_id('gpt-oss'))
+        self._ffn_cfg.inter_size = cfg.intermediate_size
+        self._ffn_cfg.is_expert = True
+
+        # ---- MoE template ----
+        self._moe_cfg = make_moe_config(
+            cfg,
+            act_type=_act_type_id('gpt-oss'),
+            experts_per_token=cfg.num_experts_per_tok)
+        self._moe_cfg.expert_num = cfg.num_local_experts
+
+    # ------------------------------------------------------------------
+    # model() — walks full hierarchy
+    # ------------------------------------------------------------------
+
+    def model(self):
+        embed_key = 'model.embed_tokens.weight'
+        root_cfg = make_model_weight_config(self.cfg)
+        root = TextModelBuilder(
+            root_cfg, self._ctx,
+            root_handles=self._root_handles,
+            tp=self._model_tp,
+            vocab_size=self.cfg.vocab_size)
+        root.add_token_embeds(self._get(embed_key))
+        root.norm = self.norm(self._get('model.norm.weight'))
+        lm_key = embed_key if self.cfg.tie_word_embeddings else 'lm_head.weight'
+        root.add_lm_head(self._linear(lm_key.removesuffix('.weight')))
+        root.layers = self.layers('model.layers')
+        root.build()
+
+    # ------------------------------------------------------------------
+    # Attention / FFN / MoE factories
+    # ------------------------------------------------------------------
+
+    def attn(self, pfx, layer):
+        q, k, v, o = [self._linear(f'{pfx}.{x}_proj') for x in 'qkvo']
+
+        cfg = self._attn_cfg.clone()
+        if self.cfg.layer_types[layer] == 'sliding_attention':
+            cfg.window_size = self.cfg.sliding_window
+
+        def reorder(x):
+            return reorder_rotary_emb(x, cfg.head_dim, cfg.rope.dim, resolver=self._resolver)
+
+        q, k = [reorder(x) for x in (q, k)]
+
+        m = AttentionBuilder(cfg, self._ctx, tp=self._attn_tp)
+        m.add_qkv_proj(q, k, v)
+        m.add_o_proj(o)
+
+        m.add_param('sinks', self._get(f'{pfx}.sinks'))
+        return m.build()
+
+    def moe(self, pfx):
+        cfg = self._moe_cfg.clone()
+        m = MoeBuilder(cfg, self._ctx)
+        m.add_gate('gate', self._linear(f'{pfx}.router'))
+        experts = ModuleListBuilder(ModuleListConfig(), self._ctx)
+        for e in range(cfg.expert_num):
+            experts[e] = self._packed_moe_ffn(f'{pfx}.experts', e)
+        m.experts = experts.build()
+        return m.build()
+
+    def layers(self, pfx):
+        layers = ModuleListBuilder(ModuleListConfig(), self._ctx)
+        for i in layer_progress(self.cfg.num_hidden_layers):
+            d = DecoderLayerBuilder(DecoderLayerConfig(), self._ctx)
+            d.attention = self.attn(f'{pfx}.{i}.self_attn', i)
+            d.moe_ffn = self.moe(f'{pfx}.{i}.mlp')
+            d.attention_norm = self.norm(self._get(f'{pfx}.{i}.input_layernorm.weight'))
+            d.ffn_norm = self.norm(self._get(f'{pfx}.{i}.post_attention_layernorm.weight'))
+            layers[i] = d.build()
+        return layers.build()
+
+    def _packed_moe_ffn(self, pfx, idx):
+        w1, w2, w3 = read_packed_moe_expert(
+            self.params,
+            f'{pfx}.gate_up_proj',
+            f'{pfx}.down_proj',
+            idx,
+            resolver=self._resolver,
+            interleaved=True,
+            trans=True,
+        )
+        cfg = self._ffn_cfg.clone()
+        m = FfnBuilder(cfg, self._ctx, tp=self._mlp_tp)
+        m.add_ffn(w1, w2, w3)
+        return m.build()
diff --git a/lmdeploy/turbomind/models/internlm2.py b/lmdeploy/turbomind/models/internlm2.py
new file mode 100644
index 0000000000..29c0c05ca9
--- /dev/null
+++ b/lmdeploy/turbomind/models/internlm2.py
@@ -0,0 +1,137 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""InternLM2 / InternLM2.5 TextModel for the new pipeline.
+
+Handles InternLM2 and InternLM2.5 decoder variants.  The key difference from Llama is the GQA-interleaved fused wqkv
+projection that must be deinterleaved into separate Q / K / V bundles before feeding to AttentionBuilder.
+"""
+from __future__ import annotations
+
+from transformers import PretrainedConfig
+
+from ..builders import (
+    AttentionBuilder,
+    DecoderLayerBuilder,
+    DecoderLayerConfig,
+    FfnBuilder,
+    ModuleListBuilder,
+    ModuleListConfig,
+    TextModelBuilder,
+    _act_type_id,
+)
+from ..linear import transform_output_dim
+from ..text_model import TextModel
+from .base import INPUT_MODELS
+from .utils import (
+    layer_progress,
+    make_attention_config,
+    make_ffn_config,
+    make_model_weight_config,
+    reorder_rotary_emb,
+)
+
+
+@transform_output_dim
+def _split_qkv_gqa(w_qkv, *, head_dim, q_heads, kv_heads):
+    """Deinterleave a GQA-fused QKV tensor into separate Q, K, V tensors.
+
+    InternLM2 layout: ``[Q0 Q1 Q2 Q3 K V]`` repeated per KV group.
+    ``per_head_elems`` self-adapts (128 for weights, 1 for block scales).
+    """
+    groups = kv_heads
+    q_per_group = q_heads // kv_heads
+    slots = q_per_group + 2               # Q-slots + K + V
+    total = groups * slots
+    n = w_qkv.size(-1) // total           # elems per head-equivalent
+
+    t = w_qkv.unflatten(-1, (groups, slots, n))
+    q = t[..., :q_per_group, :].flatten(-3, -2)
+    k = t[..., q_per_group, :].flatten(-2, -1)
+    v = t[..., q_per_group + 1, :].flatten(-2, -1)
+    return q.contiguous(), k.contiguous(), v.contiguous()
+
+
+@INPUT_MODELS.register_module(name='internlm2')
+class InternLM2Model(TextModel):
+    """Weight model for InternLM2 / InternLM2.5 decoder-only variants."""
+
+    cfg: PretrainedConfig
+
+    def __init__(self, cfg: PretrainedConfig, *, resolver):
+        super().__init__(cfg, resolver=resolver)
+
+        self._attn_cfg = make_attention_config(cfg)
+
+        self._ffn_cfg = make_ffn_config(cfg,
+                                        act_type=_act_type_id('silu'))
+
+    # ------------------------------------------------------------------
+    # model() — full topology
+    # ------------------------------------------------------------------
+
+    def model(self):
+        embed_key = 'model.tok_embeddings.weight'
+        root_cfg = make_model_weight_config(self.cfg)
+        root = TextModelBuilder(
+            root_cfg, self._ctx,
+            root_handles=self._root_handles,
+            tp=self._model_tp,
+            vocab_size=self.cfg.vocab_size)
+        root.add_token_embeds(self._get(embed_key))
+        root.norm = self.norm(self._get('model.norm.weight'))
+        lm_key = embed_key if self.cfg.tie_word_embeddings else 'output.weight'
+        root.add_lm_head(self._linear(lm_key.removesuffix('.weight')))
+        root.layers = self.layers('model.layers')
+        root.build()
+
+    # ------------------------------------------------------------------
+    # attn() — deinterleave fused wqkv then feed to AttentionBuilder
+    # ------------------------------------------------------------------
+
+    def attn(self, pfx):
+        wqkv = self._linear(f'{pfx}.wqkv')
+        cfg = self._attn_cfg.clone()
+        q, k, v = _split_qkv_gqa(
+            wqkv, head_dim=cfg.head_dim,
+            q_heads=cfg.head_num, kv_heads=cfg.kv_head_num)
+        o = self._linear(f'{pfx}.wo')
+
+        def reorder(x):
+            return reorder_rotary_emb(x, cfg.head_dim, cfg.rope.dim, resolver=self._resolver)
+
+        q, k = [reorder(x) for x in (q, k)]
+
+        m = AttentionBuilder(cfg, self._ctx, tp=self._attn_tp)
+        m.add_qkv_proj(q, k, v)
+        m.add_o_proj(o)
+
+        return m.build()
+
+    # ------------------------------------------------------------------
+    # ffn() — InternLM2 uses w1 / w3 / w2 naming
+    # ------------------------------------------------------------------
+
+    def ffn(self, pfx):
+        w1, w3, w2 = [self._linear(f'{pfx}.{x}') for x in ('w1', 'w3', 'w2')]
+
+        cfg = self._ffn_cfg.clone()
+
+        m = FfnBuilder(cfg, self._ctx, tp=self._mlp_tp)
+        m.add_ffn(w1, w2, w3)
+        return m.build()
+
+    # ------------------------------------------------------------------
+    # layers() — standard loop, InternLM2 norm names
+    # ------------------------------------------------------------------
+
+    def layers(self, pfx):
+        layers = ModuleListBuilder(ModuleListConfig(), self._ctx)
+        for i in layer_progress(self.cfg.num_hidden_layers):
+            d = DecoderLayerBuilder(DecoderLayerConfig(), self._ctx)
+            d.attention_norm = self.norm(
+                self._get(f'{pfx}.{i}.attention_norm.weight'))
+            d.attention = self.attn(f'{pfx}.{i}.attention')
+            d.ffn_norm = self.norm(
+                self._get(f'{pfx}.{i}.ffn_norm.weight'))
+            d.feed_forward = self.ffn(f'{pfx}.{i}.feed_forward')
+            layers[i] = d.build()
+        return layers.build()
diff --git a/lmdeploy/turbomind/models/internvl3_5.py b/lmdeploy/turbomind/models/internvl3_5.py
new file mode 100644
index 0000000000..9139714fc8
--- /dev/null
+++ b/lmdeploy/turbomind/models/internvl3_5.py
@@ -0,0 +1,61 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""InternVL3.5 aggregate source model for TurboMind."""
+from __future__ import annotations
+
+from transformers import PretrainedConfig
+
+from .base import INPUT_MODELS
+from .qwen3 import Qwen3TextModel
+
+
+def _cfg_get(cfg, name: str, default=None):
+    if isinstance(cfg, dict):
+        return cfg.get(name, default)
+    return getattr(cfg, name, default)
+
+
+@INPUT_MODELS.register_module(name='internvl3_5')
+class InternVL3_5Model:
+    """Aggregate source model for Qwen3-backed InternVL3.5 checkpoints."""
+
+    _text_pfx = 'language_model.'
+    _supported_inner_arch = 'Qwen3ForCausalLM'
+
+    def __init__(self, cfg: PretrainedConfig, *, resolver):
+        llm_cfg = _cfg_get(cfg, 'llm_config')
+        if llm_cfg is None:
+            raise ValueError('InternVL3.5 TurboMind requires llm_config.')
+
+        archs = _cfg_get(llm_cfg, 'architectures')
+        if not archs:
+            raise ValueError(
+                'InternVL3.5 TurboMind requires llm_config.architectures.')
+
+        inner_arch = archs[0]
+        if inner_arch != self._supported_inner_arch:
+            raise ValueError(
+                'InternVL3.5 TurboMind currently supports only '
+                f'{self._supported_inner_arch}, but got {inner_arch}.')
+
+        self.text_model = Qwen3TextModel(llm_cfg, resolver=resolver)
+        self.vision_model = None
+
+    def bind_runtime(self, *, ctx, root_handles,
+                     attn_tp, mlp_tp, model_tp):
+        self.text_model.bind_runtime(
+            ctx=ctx,
+            root_handles=root_handles,
+            attn_tp=attn_tp,
+            mlp_tp=mlp_tp,
+            model_tp=model_tp,
+        )
+
+    @property
+    def _vocab_size(self):
+        return self.text_model.cfg.vocab_size
+
+    def set_params(self, params: dict):
+        self.text_model.set_params(params)
+
+    def model(self):
+        self.text_model.model(pfx=self._text_pfx)
diff --git a/lmdeploy/turbomind/models/llama.py b/lmdeploy/turbomind/models/llama.py
new file mode 100644
index 0000000000..5a565843f5
--- /dev/null
+++ b/lmdeploy/turbomind/models/llama.py
@@ -0,0 +1,103 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""Llama TextModel for the new pipeline."""
+from __future__ import annotations
+
+from transformers import LlamaConfig
+
+from ..builders import (
+    AttentionBuilder,
+    DecoderLayerBuilder,
+    DecoderLayerConfig,
+    FfnBuilder,
+    ModuleListBuilder,
+    ModuleListConfig,
+    TextModelBuilder,
+    _act_type_id,
+)
+from ..text_model import TextModel
+from .base import INPUT_MODELS
+from .utils import (
+    layer_progress,
+    make_attention_config,
+    make_ffn_config,
+    make_model_weight_config,
+    reorder_rotary_emb,
+)
+
+
+@INPUT_MODELS.register_module(name='llama')
+class LlamaModel(TextModel):
+    """Weight model for Llama decoder-only variants."""
+
+    cfg: LlamaConfig
+
+    def __init__(self, cfg: LlamaConfig, *, resolver):
+        super().__init__(cfg, resolver=resolver)
+
+        self._attn_cfg = make_attention_config(cfg)
+
+        self._ffn_cfg = make_ffn_config(cfg,
+                                        act_type=_act_type_id('silu'))
+
+    # ------------------------------------------------------------------
+    # model() — walks full hierarchy
+    # ------------------------------------------------------------------
+
+    def model(self):
+        embed_key = 'model.embed_tokens.weight'
+        root_cfg = make_model_weight_config(self.cfg)
+        root = TextModelBuilder(
+            root_cfg, self._ctx,
+            root_handles=self._root_handles,
+            tp=self._model_tp,
+            vocab_size=self.cfg.vocab_size)
+        root.add_token_embeds(self._get(embed_key))
+        root.norm = self.norm(self._get('model.norm.weight'))
+        lm_key = embed_key if self.cfg.tie_word_embeddings else 'lm_head.weight'
+        root.add_lm_head(self._linear(lm_key.removesuffix('.weight')))
+        root.layers = self.layers('model.layers')
+        root.build()
+
+    # ------------------------------------------------------------------
+    # Attention / FFN factories
+    # ------------------------------------------------------------------
+
+    def attn(self, pfx):
+        q, k, v, o = [self._linear(f'{pfx}.{x}_proj') for x in 'qkvo']
+
+        cfg = self._attn_cfg.clone()
+
+        def reorder(x):
+            return reorder_rotary_emb(x, cfg.head_dim, cfg.rope.dim, resolver=self._resolver)
+
+        q, k = [reorder(x) for x in (q, k)]
+
+        # No QK-norm for Llama.
+        m = AttentionBuilder(cfg, self._ctx, tp=self._attn_tp)
+
+        m.add_qkv_proj(q, k, v)
+        m.add_o_proj(o)
+
+        return m.build()
+
+    def ffn(self, pfx):
+        w1, w3, w2 = [self._linear(f'{pfx}.{x}_proj') for x in ('gate', 'up', 'down')]
+
+        cfg = self._ffn_cfg.clone()
+
+        m = FfnBuilder(cfg, self._ctx, tp=self._mlp_tp)
+        m.add_ffn(w1, w2, w3)
+        return m.build()
+
+    def layers(self, pfx):
+        layers = ModuleListBuilder(ModuleListConfig(), self._ctx)
+        for i in layer_progress(self.cfg.num_hidden_layers):
+            d = DecoderLayerBuilder(DecoderLayerConfig(), self._ctx)
+            d.attention_norm = self.norm(
+                self._get(f'{pfx}.{i}.input_layernorm.weight'))
+            d.attention = self.attn(f'{pfx}.{i}.self_attn')
+            d.ffn_norm = self.norm(
+                self._get(f'{pfx}.{i}.post_attention_layernorm.weight'))
+            d.feed_forward = self.ffn(f'{pfx}.{i}.mlp')
+            layers[i] = d.build()
+        return layers.build()
diff --git a/lmdeploy/turbomind/models/qwen2.py b/lmdeploy/turbomind/models/qwen2.py
new file mode 100644
index 0000000000..0ad6258eb0
--- /dev/null
+++ b/lmdeploy/turbomind/models/qwen2.py
@@ -0,0 +1,144 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""Qwen2 TextModel for the new pipeline.
+
+Handles both dense Qwen2 and Qwen2-MoE variants. MoE detected via num_experts in HF config. Shared expert uses
+shared_gate pattern matching Qwen3.5. No QK-norm, no sliding window.
+"""
+from __future__ import annotations
+
+from transformers import Qwen2Config, Qwen2MoeConfig
+
+from ..builders import (
+    AttentionBuilder,
+    DecoderLayerBuilder,
+    DecoderLayerConfig,
+    FfnBuilder,
+    ModuleListBuilder,
+    ModuleListConfig,
+    MoeBuilder,
+    TextModelBuilder,
+    _act_type_id,
+)
+from ..text_model import TextModel
+from .base import INPUT_MODELS
+from .utils import (
+    layer_progress,
+    make_attention_config,
+    make_ffn_config,
+    make_model_weight_config,
+    make_moe_config,
+    reorder_rotary_emb,
+)
+
+
+@INPUT_MODELS.register_module(name='qwen2-moe')
+@INPUT_MODELS.register_module(name='qwen2')
+class Qwen2Model(TextModel):
+    """Weight model for Qwen2 (dense) and Qwen2-MoE."""
+
+    cfg: Qwen2Config | Qwen2MoeConfig
+
+    def __init__(self, cfg: Qwen2Config | Qwen2MoeConfig, *, resolver):
+        super().__init__(cfg, resolver=resolver)
+
+        self._attn_cfg = make_attention_config(cfg)
+
+        self._ffn_cfg = make_ffn_config(cfg,
+                                        act_type=_act_type_id('silu'))
+
+        self._n_experts = getattr(cfg, 'num_experts', 0)
+        # ---- MoE template (only if MoE variant) ----
+        if self._n_experts > 0:
+            self._moe_cfg = make_moe_config(
+                cfg,
+                experts_per_token=cfg.num_experts_per_tok,
+                norm_topk_prob=cfg.norm_topk_prob)
+            self._moe_cfg.expert_num = self._n_experts
+
+    # ------------------------------------------------------------------
+    # model() — walks full hierarchy
+    # ------------------------------------------------------------------
+
+    def model(self, pfx=''):
+        embed_key = 'model.embed_tokens.weight'
+        root_cfg = make_model_weight_config(self.cfg)
+        root = TextModelBuilder(
+            root_cfg, self._ctx,
+            root_handles=self._root_handles,
+            tp=self._model_tp,
+            vocab_size=self.cfg.vocab_size)
+        root.add_token_embeds(self._get(f'{pfx}{embed_key}'))
+        root.norm = self.norm(self._get(f'{pfx}model.norm.weight'))
+        lm_key = embed_key if self.cfg.tie_word_embeddings else 'lm_head.weight'
+        root.add_lm_head(self._linear(f'{pfx}{lm_key.removesuffix(".weight")}'))
+        root.layers = self.layers(f'{pfx}model.layers')
+        root.build()
+
+    # ------------------------------------------------------------------
+    # Attention / FFN / MoE factories
+    # ------------------------------------------------------------------
+
+    def attn(self, pfx):
+        q, k, v, o = [self._linear(f'{pfx}.{x}_proj') for x in 'qkvo']
+
+        cfg = self._attn_cfg.clone()
+
+        def reorder(x):
+            return reorder_rotary_emb(x, cfg.head_dim, cfg.rope.dim, resolver=self._resolver)
+
+        q, k = [reorder(x) for x in (q, k)]
+
+        # No QK-norm for Qwen2.
+        m = AttentionBuilder(cfg, self._ctx, tp=self._attn_tp)
+
+        m.add_qkv_proj(q, k, v)
+        m.add_o_proj(o)
+
+        return m.build()
+
+    def ffn(self, pfx, inter_size, is_expert=False):
+        w1, w3, w2 = [self._linear(f'{pfx}.{x}_proj') for x in ('gate', 'up', 'down')]
+
+        cfg = self._ffn_cfg.clone()
+        cfg.inter_size = inter_size
+        cfg.is_expert  = is_expert
+
+        m = FfnBuilder(cfg, self._ctx, tp=self._mlp_tp)
+        m.add_ffn(w1, w2, w3)
+        return m.build()
+
+    def moe(self, pfx):
+        cfg = self._moe_cfg.clone()
+
+        m = MoeBuilder(cfg, self._ctx)
+
+        m.add_gate('gate', self._linear(f'{pfx}.gate'))
+
+        experts = ModuleListBuilder(ModuleListConfig(), self._ctx)
+        for e in range(self.cfg.num_experts):
+            experts[e] = self.ffn(f'{pfx}.experts.{e}', self.cfg.moe_intermediate_size,
+                                  is_expert=True)
+        m.experts = experts.build()
+
+        m.add_gate('shared_gate', self._linear(f'{pfx}.shared_expert_gate'))
+        shared = self.ffn(f'{pfx}.shared_expert', self.cfg.shared_expert_intermediate_size)
+
+        return m.build(), shared
+
+    # ------------------------------------------------------------------
+    # layers() — layer dispatch loop
+    # ------------------------------------------------------------------
+
+    def layers(self, pfx):
+        layers = ModuleListBuilder(ModuleListConfig(), self._ctx)
+        for i in layer_progress(self.cfg.num_hidden_layers):
+            d = DecoderLayerBuilder(DecoderLayerConfig(), self._ctx)
+            d.attention = self.attn(f'{pfx}.{i}.self_attn')
+            if self._n_experts > 0:
+                d.moe_ffn, d.feed_forward = self.moe(f'{pfx}.{i}.mlp')
+            else:
+                d.feed_forward = self.ffn(f'{pfx}.{i}.mlp', self.cfg.intermediate_size)
+            d.attention_norm = self.norm(self._get(f'{pfx}.{i}.input_layernorm.weight'))
+            d.ffn_norm = self.norm(self._get(f'{pfx}.{i}.post_attention_layernorm.weight'))
+            layers[i] = d.build()
+        return layers.build()
diff --git a/lmdeploy/turbomind/models/qwen3.py b/lmdeploy/turbomind/models/qwen3.py
new file mode 100644
index 0000000000..2463e60010
--- /dev/null
+++ b/lmdeploy/turbomind/models/qwen3.py
@@ -0,0 +1,144 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""Qwen3 TextModel for the new pipeline.
+
+Qwen3 is a standard Llama-like model with QK norm and optional MoE. No shared expert in the MoE variant, no linear
+attention, no zero-centered norm.
+"""
+from __future__ import annotations
+
+from transformers import Qwen3Config, Qwen3MoeConfig
+
+from ..builders import (
+    AttentionBuilder,
+    DecoderLayerBuilder,
+    DecoderLayerConfig,
+    FfnBuilder,
+    ModuleListBuilder,
+    ModuleListConfig,
+    MoeBuilder,
+    TextModelBuilder,
+    _act_type_id,
+)
+from ..text_model import TextModel
+from .base import INPUT_MODELS
+from .utils import (
+    layer_progress,
+    make_attention_config,
+    make_ffn_config,
+    make_model_weight_config,
+    make_moe_config,
+    reorder_rotary_emb,
+)
+
+
+@INPUT_MODELS.register_module(name='qwen3-moe')
+@INPUT_MODELS.register_module(name='qwen3')
+class Qwen3TextModel(TextModel):
+    """Weight model for Qwen3 (dense) and Qwen3-MoE."""
+
+    cfg: Qwen3Config | Qwen3MoeConfig
+
+    def __init__(self, cfg: Qwen3Config | Qwen3MoeConfig, *, resolver):
+        super().__init__(cfg, resolver=resolver)
+
+        self._attn_cfg = make_attention_config(cfg)
+
+        self._ffn_cfg = make_ffn_config(cfg,
+                                        act_type=_act_type_id('silu'))
+
+        self._n_experts = getattr(cfg, 'num_experts', 0)
+
+        if self._n_experts > 0:
+            self._moe_cfg = make_moe_config(
+                cfg,
+                experts_per_token=cfg.num_experts_per_tok,
+                norm_topk_prob=cfg.norm_topk_prob)
+            self._moe_cfg.expert_num = self._n_experts
+
+    # ------------------------------------------------------------------
+    # model() — walks full hierarchy (same as existing code)
+    # ------------------------------------------------------------------
+
+    def model(self, pfx=''):
+        embed_key = 'model.embed_tokens.weight'
+        root_cfg = make_model_weight_config(self.cfg)
+        root = TextModelBuilder(
+            root_cfg, self._ctx,
+            root_handles=self._root_handles,
+            tp=self._model_tp,
+            vocab_size=self.cfg.vocab_size)
+        root.add_token_embeds(self._get(f'{pfx}{embed_key}'))
+        root.norm = self.norm(self._get(f'{pfx}model.norm.weight'))
+        lm_key = embed_key if self.cfg.tie_word_embeddings else 'lm_head.weight'
+        root.add_lm_head(self._linear(f'{pfx}{lm_key.removesuffix(".weight")}'))
+        root.layers = self.layers(f'{pfx}model.layers')
+        root.build()
+
+    # ------------------------------------------------------------------
+    # Attention / FFN / MoE factories
+    # ------------------------------------------------------------------
+
+    def attn(self, pfx):
+        q, k, v, o = [self._linear(f'{pfx}.{x}_proj') for x in 'qkvo']
+
+        cfg = self._attn_cfg.clone()
+
+        def reorder(x):
+            return reorder_rotary_emb(x, cfg.head_dim, cfg.rope.dim, resolver=self._resolver)
+
+        q, k = [reorder(x) for x in (q, k)]
+
+        # No per-layer attention fields for Qwen3 (no sliding window).
+        m = AttentionBuilder(cfg, self._ctx, tp=self._attn_tp)
+
+        m.add_qkv_proj(q, k, v)
+        m.add_o_proj(o)
+
+        q_norm, k_norm = [self._get(f'{pfx}.{x}_norm.weight') for x in 'qk']
+        m.q_norm = self.norm(reorder(q_norm))
+        m.k_norm = self.norm(reorder(k_norm))
+
+        return m.build()
+
+
+    def ffn(self, pfx, is_expert=False):
+        w1, w3, w2 = [self._linear(f'{pfx}.{x}_proj') for x in ('gate', 'up', 'down')]
+
+        cfg = self._ffn_cfg.clone()
+        cfg.is_expert  = is_expert
+
+        m = FfnBuilder(cfg, self._ctx, tp=self._mlp_tp)
+        m.add_ffn(w1, w2, w3)
+        return m.build()
+
+
+    def moe(self, pfx):
+        cfg = self._moe_cfg.clone()
+
+        m = MoeBuilder(cfg, self._ctx)
+
+        m.add_gate('gate', self._linear(f'{pfx}.gate'))
+
+        experts = ModuleListBuilder(ModuleListConfig(), self._ctx)
+        for e in range(self.cfg.num_experts):
+            experts[e] = self.ffn(f'{pfx}.experts.{e}', is_expert=True)
+        m.experts = experts.build()
+
+        return m.build()
+
+
+    def layers(self, pfx):
+        layers = ModuleListBuilder(ModuleListConfig(), self._ctx)
+
+        for i in layer_progress(self.cfg.num_hidden_layers):
+            d = DecoderLayerBuilder(DecoderLayerConfig(), self._ctx)
+            d.attention_norm = self.norm(self._get(f'{pfx}.{i}.input_layernorm.weight'))
+            d.attention = self.attn(f'{pfx}.{i}.self_attn')
+            d.ffn_norm = self.norm(self._get(f'{pfx}.{i}.post_attention_layernorm.weight'))
+            if self._n_experts:
+                d.moe_ffn = self.moe(f'{pfx}.{i}.mlp')
+            else:
+                d.feed_forward = self.ffn(f'{pfx}.{i}.mlp')
+            layers[i] = d.build()
+
+        return layers.build()
diff --git a/lmdeploy/turbomind/models/qwen3_5.py b/lmdeploy/turbomind/models/qwen3_5.py
new file mode 100644
index 0000000000..adde792f6f
--- /dev/null
+++ b/lmdeploy/turbomind/models/qwen3_5.py
@@ -0,0 +1,233 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""Qwen3.5 TextModel for the new pipeline."""
+from __future__ import annotations
+
+import re
+
+import _turbomind as _tm
+from transformers.models.qwen3_5.configuration_qwen3_5 import Qwen3_5TextConfig
+from transformers.models.qwen3_5_moe.configuration_qwen3_5_moe import Qwen3_5MoeTextConfig
+
+from ..builders import (
+    AttentionBuilder,
+    DecoderLayerBuilder,
+    DecoderLayerConfig,
+    DeltaNetBuilder,
+    FfnBuilder,
+    ModuleListBuilder,
+    ModuleListConfig,
+    MoeBuilder,
+    TextModelBuilder,
+    _act_type_id,
+)
+from ..builders.attention import split_output_gate
+from ..text_model import TextModel
+from .base import INPUT_MODELS
+from .utils import (
+    layer_progress,
+    make_attention_config,
+    make_ffn_config,
+    make_model_weight_config,
+    make_moe_config,
+    read_packed_moe_expert,
+    reorder_rotary_emb,
+)
+
+
+def map_packed_qwen35_experts(name: str) -> str:
+    """Map packed expert names to weight names so parameter.py can classify."""
+    return re.sub(r'(mlp\.experts\.(?:gate_up|down)_proj)$', r'\1.weight', name)
+
+
+@INPUT_MODELS.register_module(name='qwen3_5-moe')
+@INPUT_MODELS.register_module(name='qwen3_5')
+class Qwen3_5Model(TextModel):
+    """Weight model for Qwen3.5 (dense + linear-attn + optional MoE)."""
+
+    _loader_mappings = [map_packed_qwen35_experts]
+    cfg: Qwen3_5TextConfig | Qwen3_5MoeTextConfig
+
+    def __init__(self, cfg: Qwen3_5TextConfig | Qwen3_5MoeTextConfig, *, resolver):
+        super().__init__(cfg, resolver=resolver)
+
+        self._attn_cfg = make_attention_config(cfg)
+        self._attn_cfg.output_gate = True
+
+        self._n_experts = getattr(cfg, 'num_experts', 0)
+
+        # ---- DeltaNet template ----
+        ln_key_heads = cfg.linear_num_key_heads
+        ln_val_heads = cfg.linear_num_value_heads
+        ln_key_dim   = cfg.linear_key_head_dim
+        ln_val_dim   = cfg.linear_value_head_dim
+
+        self._dn_cfg = _tm.DeltaNetConfig()
+        self._dn_cfg.hidden_dim      = self.cfg.hidden_size
+        self._dn_cfg.num_k_heads     = ln_key_heads
+        self._dn_cfg.num_v_heads     = ln_val_heads
+        self._dn_cfg.key_head_dim    = ln_key_dim
+        self._dn_cfg.value_head_dim  = ln_val_dim
+        self._dn_cfg.d_conv          = cfg.linear_conv_kernel_dim or 4
+        q_dim = ln_key_heads * ln_key_dim
+        v_dim = ln_val_heads * ln_val_dim
+        self._linear_qkv_split = (q_dim, q_dim, v_dim)
+
+        # ---- MoE template ----
+        if self._n_experts > 0:
+            self._moe_cfg = make_moe_config(
+                cfg,
+                experts_per_token=cfg.num_experts_per_tok)
+            self._moe_cfg.expert_num = self._n_experts
+            inter_size=cfg.moe_intermediate_size
+        else:
+            inter_size=cfg.intermediate_size
+
+        # ---- FFN template ----
+        self._ffn_cfg = make_ffn_config(
+            cfg,
+            act_type=_act_type_id('silu'), inter_size=inter_size)
+
+    # ------------------------------------------------------------------
+    # model() — same topology as old code
+    # ------------------------------------------------------------------
+
+    def model(self):
+        root_cfg = make_model_weight_config(self.cfg)
+        root = TextModelBuilder(
+            root_cfg, self._ctx,
+            root_handles=self._root_handles,
+            tp=self._model_tp,
+            vocab_size=self.cfg.vocab_size)
+        embed_key = 'model.language_model.embed_tokens.weight'
+        root.add_token_embeds(self._get(embed_key))
+        root.norm = self.norm(1.0 + self._get('model.language_model.norm.weight'))
+        lm_key = embed_key if self.cfg.tie_word_embeddings else 'lm_head.weight'
+        root.add_lm_head(self._linear(lm_key.removesuffix('.weight')))
+        root.layers = self.layers('model.language_model.layers')
+        root.build()
+
+    # ------------------------------------------------------------------
+    # Attention / linear-attention factories
+    # ------------------------------------------------------------------
+
+    def attn(self, pfx):
+        q, k, v, o = [self._linear(f'{pfx}.{x}_proj') for x in 'qkvo']
+
+        cfg = self._attn_cfg.clone()
+        q, gate = split_output_gate(q, head_num=cfg.head_num)
+
+        def reorder(x):
+            return reorder_rotary_emb(x, cfg.head_dim, cfg.rope.dim, resolver=self._resolver)
+
+        q, k = [reorder(x) for x in (q, k)]
+
+        m = AttentionBuilder(cfg, self._ctx, tp=self._attn_tp)
+
+        m.add_qkv_proj(q, k, v, gate=gate)
+        m.add_o_proj(o)
+
+        q_norm, k_norm = [self._get(f'{pfx}.{x}_norm.weight') for x in 'qk']
+
+        m.q_norm = self.norm(reorder(1.0 + q_norm.float()))
+        m.k_norm = self.norm(reorder(1.0 + k_norm.float()))
+
+        return m.build()
+
+    def linear_attn(self, pfx):
+        cfg = self._dn_cfg.clone()
+        builder = DeltaNetBuilder(cfg, self._ctx,
+                                  tp=self._attn_tp)
+
+        builder.add_input_projections(
+            in_proj_qkv=self._linear(f'{pfx}.in_proj_qkv'),
+            in_proj_z=self._linear(f'{pfx}.in_proj_z'),
+            in_proj_b=self._linear(f'{pfx}.in_proj_b'),
+            in_proj_a=self._linear(f'{pfx}.in_proj_a'),
+            out_proj=self._linear(f'{pfx}.out_proj'),
+            qkv_split=self._linear_qkv_split)
+        builder.add_scalar_params(
+            a_log=self._get(f'{pfx}.A_log'),
+            dt_bias=self._get(f'{pfx}.dt_bias'))
+        builder.add_conv1d(
+            self._get(f'{pfx}.conv1d.weight'),
+            qkv_split=self._linear_qkv_split)
+        builder.norm = self.norm(self._get(f'{pfx}.norm.weight'))  # ! not zero-centered
+        return builder.build()
+
+    # ------------------------------------------------------------------
+    # FFN / MoE factories
+    # ------------------------------------------------------------------
+
+    def ffn(self, pfx, inter_size, is_expert=False):
+        try:
+            w1, w3, w2 = [self._linear(f'{pfx}.{x}_proj')
+                          for x in ('gate', 'up', 'down')]
+        except KeyError:
+            return None
+
+        cfg = self._ffn_cfg.clone()
+        cfg.inter_size = inter_size
+        cfg.is_expert  = is_expert
+
+        m = FfnBuilder(cfg, self._ctx, tp=self._mlp_tp)
+        m.add_ffn(w1, w2, w3)
+        return m.build()
+
+    def moe(self, pfx):
+        cfg = self._moe_cfg.clone()
+
+        m = MoeBuilder(cfg, self._ctx)
+
+        m.add_gate('gate', self._linear(f'{pfx}.gate'))
+
+        experts = ModuleListBuilder(ModuleListConfig(), self._ctx)
+        for e in range(self._n_experts):
+            experts[e] = self._moe_expert_ffn(f'{pfx}.experts', e, self.cfg.moe_intermediate_size)
+        m.experts = experts.build()
+
+        m.add_gate('shared_gate', self._linear(f'{pfx}.shared_expert_gate'))
+        shared = self.ffn(f'{pfx}.shared_expert', self.cfg.shared_expert_intermediate_size)
+
+        return m.build(), shared
+
+    def _packed_moe_ffn(self, pfx, expert_idx, inter_size):
+        w1, w2, w3 = read_packed_moe_expert(
+            self.params,
+            f'{pfx}.gate_up_proj',
+            f'{pfx}.down_proj',
+            expert_idx,
+            resolver=self._resolver,
+        )
+        cfg = self._ffn_cfg.clone()
+        cfg.inter_size = inter_size
+        cfg.is_expert  = True
+        m = FfnBuilder(cfg, self._ctx, tp=self._mlp_tp)
+        m.add_ffn(w1, w2, w3)
+        return m.build()
+
+    def _moe_expert_ffn(self, pfx, expert_idx, inter_size):
+        expert_pfx = f'{pfx}.{expert_idx}'
+        inter_size = self.cfg.moe_intermediate_size
+        return (self.ffn(expert_pfx, inter_size, is_expert=True)
+                or self._packed_moe_ffn(pfx, expert_idx, inter_size))
+
+    # ------------------------------------------------------------------
+    # layers() — dispatch by layer type
+    # ------------------------------------------------------------------
+
+    def layers(self, pfx):
+        layers = ModuleListBuilder(ModuleListConfig(), self._ctx)
+        for i in layer_progress(self.cfg.num_hidden_layers):
+            d = DecoderLayerBuilder(DecoderLayerConfig(), self._ctx)
+            if self.cfg.layer_types[i] == 'linear_attention':
+                d.linear_attn = self.linear_attn(f'{pfx}.{i}.linear_attn')
+            else:
+                d.attention = self.attn(f'{pfx}.{i}.self_attn')
+            if self._n_experts > 0:
+                d.moe_ffn, d.feed_forward = self.moe(f'{pfx}.{i}.mlp')
+            else:
+                d.feed_forward = self.ffn(f'{pfx}.{i}.mlp', self.cfg.intermediate_size)
+            d.attention_norm = self.norm(1.0 + self._get(f'{pfx}.{i}.input_layernorm.weight').float())
+            d.ffn_norm = self.norm(1.0 + self._get(f'{pfx}.{i}.post_attention_layernorm.weight').float())
+            layers[i] = d.build()
+        return layers.build()
diff --git a/lmdeploy/turbomind/models/utils.py b/lmdeploy/turbomind/models/utils.py
new file mode 100644
index 0000000000..4a95a0df08
--- /dev/null
+++ b/lmdeploy/turbomind/models/utils.py
@@ -0,0 +1,441 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""Shared utilities for source model input classes."""
+from __future__ import annotations
+
+import math
+from types import SimpleNamespace
+
+import _turbomind as _tm
+import torch
+
+from lmdeploy.archs import get_model_arch
+from lmdeploy.utils import get_logger
+
+from ..builders import _act_type_id
+from ..linear import Linear, _dequant_linear
+
+
+def source_model_config(model_config):
+    """Select the local config object consumed by a TurboMind source model.
+
+    Unwrap text_config for text-only wrappers. Keep llm_config on the outer config so aggregate models, such as
+    InternVL3.5, can validate and delegate explicitly.
+    """
+    if hasattr(model_config, 'text_config'):
+        return model_config.text_config
+    return model_config
+
+
+def load_model_config(model_path: str):
+    """Load the local Transformers config object for a source text model."""
+    _, model_config = get_model_arch(model_path)
+    return source_model_config(model_config)
+
+
+def _optional_attr(cfg, name: str, default=None):
+    if isinstance(cfg, dict):
+        return cfg.get(name, default)
+    return getattr(cfg, name, default)
+
+
+def _param_get(params, name: str, default=None):
+    if params is None:
+        return default
+    if isinstance(params, dict):
+        return params.get(name, default)
+    return getattr(params, name, default)
+
+
+def _param_has(params, name: str) -> bool:
+    if params is None:
+        return False
+    if isinstance(params, dict):
+        return name in params
+    return hasattr(params, name)
+
+
+_ROPE_TYPE_MAP = {
+    'default': 1,
+    'linear': 2,
+    'dynamic': 3,
+    'yarn': 4,
+    'llama3': 5,
+    'mrope': 6,
+}
+
+
+def rope_type_to_int(type_str: str) -> int:
+    return _ROPE_TYPE_MAP[type_str]
+
+
+def _get_mscale(scale, mscale=1):
+    """YaRN mscale helper. Shared by parse_rope_param and MLA softmax_scale."""
+    if scale <= 1:
+        return 1.0
+    return 0.1 * mscale * math.log(scale) + 1.0
+
+
+def parse_rope_param(cfg, head_dim: int) -> tuple[SimpleNamespace, int]:
+    """Parse RoPE configuration from a model config dict or object.
+
+    Returns:
+        rope_param: SimpleNamespace carrying rope fields (type, base, dim,
+            factor, max_position_embeddings, attention_factor, beta_fast,
+            beta_slow, low_freq_factor, high_freq_factor,
+            original_max_position_embeddings, mrope_section)
+        max_position_embeddings: int (0 if not present in config)
+    """
+    rope_parameters = _optional_attr(cfg, 'rope_parameters', None)
+    if rope_parameters is not None:
+        # transformers v5.0.0 aggregates rope settings into rope_parameters
+        rope_scaling = rope_parameters
+        rope_theta = float(_param_get(rope_scaling, 'rope_theta', 10000.0))
+    else:
+        rope_theta = float(_optional_attr(cfg, 'rope_theta', 10000.0))
+        rope_scaling = _optional_attr(cfg, 'rope_scaling', None)
+
+    max_position_embeddings = int(_optional_attr(cfg, 'max_position_embeddings', 0))
+    partial_rotary_factor = _param_get(rope_parameters, 'partial_rotary_factor', None)
+    if partial_rotary_factor is None:
+        partial_rotary_factor = float(_optional_attr(cfg, 'partial_rotary_factor', 1.0))
+    rope_param = SimpleNamespace(
+        type='default',
+        base=rope_theta,
+        dim=int(head_dim * partial_rotary_factor),
+        factor=1.0,
+        max_position_embeddings=None,
+        attention_factor=1.0,
+        beta_fast=32,
+        beta_slow=1,
+        low_freq_factor=None,
+        high_freq_factor=None,
+        original_max_position_embeddings=None,
+        mrope_section=None,
+    )
+
+    if rope_scaling is not None:
+        rope_type = _param_get(rope_scaling, 'rope_type', '') or _param_get(rope_scaling, 'type', '')
+        if _param_get(rope_scaling, 'mrope_section') is not None:
+            rope_type = 'mrope'
+        scaling_factor = _param_get(rope_scaling, 'factor', 0.0)
+
+        if rope_type == 'default':
+            pass
+        elif rope_type == 'dynamic':
+            rope_param.type = 'dynamic'
+            rope_param.factor = scaling_factor
+            rope_param.max_position_embeddings = max_position_embeddings
+        elif rope_type == 'linear':
+            rope_param.type = 'linear'
+            rope_param.factor = scaling_factor
+        elif rope_type == 'llama3':
+            low_freq_factor = _param_get(rope_scaling, 'low_freq_factor', 1.0)
+            high_freq_factor = _param_get(rope_scaling, 'high_freq_factor', 1.0)
+            original_max_position_embeddings = _param_get(rope_scaling, 'original_max_position_embeddings', 0)
+            rope_param.type = 'llama3'
+            rope_param.factor = scaling_factor
+            rope_param.low_freq_factor = low_freq_factor
+            rope_param.high_freq_factor = high_freq_factor
+            rope_param.original_max_position_embeddings = original_max_position_embeddings
+        elif rope_type == 'yarn':
+            attention_factor = _param_get(rope_scaling, 'attention_factor', None)
+            if attention_factor is None:
+                mscale = _param_get(rope_scaling, 'mscale')
+                mscale_all_dim = _param_get(rope_scaling, 'mscale_all_dim')
+                if mscale is not None and mscale_all_dim is not None:
+                    attention_factor = float(
+                        _get_mscale(scaling_factor, mscale) /
+                        _get_mscale(scaling_factor, mscale_all_dim))
+                else:
+                    attention_factor = _get_mscale(scaling_factor)
+            beta_fast = _param_get(rope_scaling, 'beta_fast', 32.0)
+            beta_slow = _param_get(rope_scaling, 'beta_slow', 1.0)
+            rope_param.type = 'yarn'
+            if _param_has(rope_scaling, 'original_max_position_embeddings'):
+                original_max_position_embeddings = _param_get(rope_scaling, 'original_max_position_embeddings')
+                scaling_factor = max_position_embeddings / original_max_position_embeddings
+            else:
+                original_max_position_embeddings = max_position_embeddings
+            rope_param.factor = scaling_factor
+            rope_param.max_position_embeddings = original_max_position_embeddings
+            rope_param.attention_factor = attention_factor
+            rope_param.beta_fast = beta_fast
+            rope_param.beta_slow = beta_slow
+        elif rope_type == 'mrope':
+            mrope_section = _param_get(rope_scaling, 'mrope_section')
+            rope_param.type = 'mrope'
+            rope_param.mrope_section = mrope_section
+        else:
+            raise RuntimeError(f'Unsupported rope type: {rope_type}')
+
+    return rope_param, max_position_embeddings
+
+
+def copy_rope_config(rope_cfg, rope_param, max_position_embeddings: int):
+    """Copy parsed RoPE fields into a TurboMind C++ rope config object."""
+    rope_cfg.type = rope_type_to_int(rope_param.type)
+    rope_cfg.base = rope_param.base
+    rope_cfg.dim = rope_param.dim
+    rope_cfg.factor = rope_param.factor
+    rope_cfg.max_position_embeddings = max_position_embeddings
+    if rope_param.type == 'yarn':
+        rope_cfg.yarn_attention_factor = rope_param.attention_factor
+        rope_cfg.yarn_beta_fast = rope_param.beta_fast
+        rope_cfg.yarn_beta_slow = rope_param.beta_slow
+    elif rope_param.type == 'llama3':
+        rope_cfg.llama3_low_freq_factor = rope_param.low_freq_factor
+        rope_cfg.llama3_high_freq_factor = rope_param.high_freq_factor
+        rope_cfg.llama3_original_max_position_embeddings = rope_param.original_max_position_embeddings
+    elif rope_param.type == 'mrope':
+        rope_cfg.mrope_section = rope_param.mrope_section
+
+
+def make_model_weight_config(cfg):
+    """Build the root ModelWeightConfig from root-module fields."""
+    model_cfg = _tm.ModelWeightConfig()
+    model_cfg.hidden_units = cfg.hidden_size
+    return model_cfg
+
+
+def make_attention_config(cfg, *, head_dim=None):
+    """Build common AttentionConfig fields from attention-module geometry."""
+    hidden_dim = cfg.hidden_size
+    head_num = cfg.num_attention_heads
+    head_dim = head_dim if head_dim is not None else getattr(cfg, 'head_dim', hidden_dim // head_num)
+    kv_head_num = cfg.num_key_value_heads
+    rope, max_position_embeddings = parse_rope_param(cfg, head_dim)
+    attn_cfg = _tm.AttentionConfig()
+    attn_cfg.hidden_dim = hidden_dim
+    attn_cfg.head_dim = head_dim
+    attn_cfg.head_num = head_num
+    attn_cfg.kv_head_num = kv_head_num
+    attn_cfg.window_size = 0
+    attn_cfg.softmax_scale = 0.0
+    copy_rope_config(attn_cfg.rope, rope, max_position_embeddings)
+    return attn_cfg
+
+
+def make_ffn_config(cfg, *, act_type, inter_size=None):
+    """Build common FfnConfig fields from FFN-module shape."""
+    ffn_cfg = _tm.FfnConfig()
+    ffn_cfg.hidden_dim = cfg.hidden_size
+    ffn_cfg.act_type = act_type
+    ffn_cfg.inter_size = inter_size if inter_size is not None else cfg.intermediate_size
+    return ffn_cfg
+
+
+def make_moe_config(cfg, *,
+                    experts_per_token,
+                    act_type=None,
+                    norm_topk_prob=True,
+                    topk_method='greedy',
+                    scoring_func='softmax',
+                    routed_scale=1.0,
+                    topk_group=1,
+                    n_group=1,
+                    router_n_groups=0):
+    """Build a MoeConfig populated from HF config and per-model overrides."""
+    if act_type is None:
+        act_type = _act_type_id('silu')
+
+    moe_cfg = _tm.MoeConfig()
+    moe_cfg.experts_per_token = experts_per_token
+    moe_cfg.norm_topk_prob = norm_topk_prob
+    moe_cfg.routed_scale = routed_scale
+    moe_cfg.topk_group = topk_group
+    moe_cfg.topk_method = topk_method
+    moe_cfg.n_group = n_group
+    moe_cfg.scoring_func = scoring_func
+    moe_cfg.router_n_groups = router_n_groups
+    moe_cfg.act_type = act_type
+    moe_cfg.fuse_silu = True
+    return moe_cfg
+
+
+def make_mla_config(cfg):
+    """Build an AttentionConfig for MLA models.
+
+    Computes MLA geometry, softmax scale (including YaRN mscale_all_dim),
+    and populates all MLA-specific AttentionConfig fields.
+
+    Returns:
+        _tm.AttentionConfig populated with MLA fields.
+    """
+    qk_nope_dim = cfg.qk_nope_head_dim
+    qk_rope_dim = cfg.qk_rope_head_dim
+    kv_lora_rank = cfg.kv_lora_rank
+    q_head_dim = qk_nope_dim + qk_rope_dim
+
+    size_per_head = q_head_dim
+    v_head_dim = cfg.v_head_dim
+    softmax_scale = 0.0
+    if kv_lora_rank and kv_lora_rank != qk_nope_dim:
+        size_per_head = kv_lora_rank + qk_rope_dim
+        v_head_dim = kv_lora_rank
+        softmax_scale = q_head_dim ** (-0.5)
+
+    rope, max_position_embeddings = parse_rope_param(cfg, qk_rope_dim)
+
+    # MLA-specific YaRN mscale_all_dim softmax_scale adjustment
+    rope_params = (getattr(cfg, 'rope_parameters', None)
+                   or getattr(cfg, 'rope_scaling', None))
+    if rope_params:
+        rope_type = (_param_get(rope_params, 'rope_type', '')
+                     or _param_get(rope_params, 'type', ''))
+        if rope_type == 'yarn':
+            mscale_all_dim = _param_get(rope_params, 'mscale_all_dim')
+            if mscale_all_dim:
+                scaling_factor = float(_param_get(rope_params, 'factor', 0.0))
+                mscale = _get_mscale(scaling_factor, mscale_all_dim)
+                softmax_scale = q_head_dim ** (-0.5) * mscale * mscale
+
+    attn_cfg = _tm.AttentionConfig()
+    attn_cfg.hidden_dim = cfg.hidden_size
+    attn_cfg.head_dim = size_per_head
+    attn_cfg.head_num = cfg.num_attention_heads
+    attn_cfg.kv_head_num = 1
+    attn_cfg.kv_lora_rank = kv_lora_rank
+    attn_cfg.q_lora_rank = cfg.q_lora_rank or 0
+    attn_cfg.qk_rope_dim = qk_rope_dim
+    attn_cfg.qk_nope_dim = qk_nope_dim
+    attn_cfg.v_head_dim = v_head_dim
+    copy_rope_config(attn_cfg.rope, rope, max_position_embeddings)
+    attn_cfg.softmax_scale = softmax_scale
+
+    return attn_cfg
+
+
+def _reorder_rotary_emb(x: torch.Tensor, head_dim: int, rope_dim: int):
+    """Reorder rotary embedding layout for TurboMind's RoPE kernel."""
+    if rope_dim < head_dim:
+        output_dims = x.size(-1)
+        head_num = output_dims // head_dim
+        orig_shape = x.shape
+        if x.dim() == 1:
+            x = x.unsqueeze(0)
+        x = x.view(x.size(0), head_num, head_dim)
+        rotary = x[:, :, :rope_dim]
+        passthrough = x[:, :, rope_dim:]
+        rotary = rotary.view(x.size(0), head_num, 2, rope_dim // 2).transpose(2, 3).contiguous()
+        rotary = rotary.view(x.size(0), head_num, rope_dim)
+        x = torch.cat([rotary, passthrough], dim=-1)
+        return x.reshape(orig_shape)
+    else:
+        output_dims = x.size(-1)
+        head_num = output_dims // head_dim
+        return x.view(-1, head_num, 2, head_dim // 2).transpose(2, 3).reshape(x.shape)
+
+
+def reorder_rotary_emb(x, head_dim: int, rope_dim: int, *, resolver=None):
+    """Apply RoPE layout permutation.
+
+    Accepts either a ``Linear`` or a raw ``torch.Tensor``.
+
+    For ``Linear`` inputs the permutation is applied to every tensor in the
+    bundle with quantization awareness (block-alignment check, dequant
+    fallback, block-level shuffling for scales/zeros). ``resolver`` is
+    required and must not be ``None`` — it supplies the compute dtype
+    threaded into ``_dequant_linear``.
+
+    For ``torch.Tensor`` inputs the element-level interleave-transpose is
+    applied directly. ``resolver`` is ignored.
+    """
+    if isinstance(x, Linear):
+        if resolver is None:
+            raise TypeError(
+                'resolver is required when passing a Linear to reorder_rotary_emb'
+            )
+        data_type = resolver.data_type
+        wfmt = x.weight_format
+        block_out = wfmt.block_out or 0
+
+        # If blocks don't align with heads, dequant first
+        if block_out and block_out % head_dim != 0:
+            x = _dequant_linear(x, data_type=data_type)
+            block_out = 0
+
+        new_tensors = {}
+        for kind, tensor in x.tensors.items():
+            if kind in ('scales', 'zeros') and block_out > 0:
+                # Block-level shuffle: reinterpret each block as a "head"
+                # so _reorder_rotary_emb shuffles at block granularity.
+                blocks_per_head = block_out // head_dim
+                if blocks_per_head <= 1:
+                    new_tensors[kind] = tensor
+                else:
+                    rope_dim_blocks = rope_dim * blocks_per_head // head_dim
+                    new_tensors[kind] = _reorder_rotary_emb(tensor, blocks_per_head, rope_dim_blocks)
+            elif tensor.size(-1) % head_dim == 0:
+                new_tensors[kind] = _reorder_rotary_emb(tensor, head_dim, rope_dim)
+            else:
+                new_tensors[kind] = tensor
+
+        return Linear(tensors=new_tensors, weight_format=x.weight_format)
+
+    return _reorder_rotary_emb(x, head_dim, rope_dim)
+
+
+def layer_progress(num_layers: int):
+    """Tqdm iterable for model.layers() per-layer conversion loops.
+
+    Yields the layer indices 0..num_layers-1, displaying a single-line
+    progress bar on stderr. ``leave=False`` clears the bar when the loop
+    completes. Lazy-imports tqdm so importing utils.py stays cheap.
+    """
+    from tqdm import tqdm
+    return tqdm(range(num_layers), desc='Loading', leave=False)
+
+
+def read_packed_moe_expert(
+    params: dict,
+    gate_up_pfx: str,
+    down_pfx: str,
+    expert_idx: int,
+    *,
+    resolver,
+    interleaved: bool = False,
+    trans: bool = False,
+) -> tuple[Linear, Linear, Linear]:
+    """Read one packed MoE expert's fused gate_up + down and split into (w1,
+    w2, w3) Linears in TM layout.
+
+    ``gate_up_pfx`` and ``down_pfx`` are the full prefixes to the two
+    packed tensors (e.g. ``'model.layers.5.mlp.experts.gate_up_proj'``).
+    The caller composes these strings; this helper concatenates nothing.
+
+    Parameters
+    ----------
+    interleaved : bool
+        Split scheme for the fused gate_up output dim.
+        ``False`` -> contiguous ``[..., :half]`` / ``[..., half:]`` (qwen3.5).
+        ``True``  -> stride-2 interleaved ``[..., ::2]`` / ``[..., 1::2]`` (gpt-oss).
+    trans : bool
+        For trivial-format checkpoints that store the packed tensor in
+        ``[n_experts, in, out]`` layout (gpt-oss), transposes the 2D
+        ``weight`` tensor to undo the HF-to-TM transpose applied by
+        ``TrivialFormat.normalize``. Only affects the ``weight`` kind on
+        trivial-format linears; quantized formats use their own normalizers.
+    """
+    gate_up = resolver.resolve(params, gate_up_pfx, index=expert_idx)
+    down    = resolver.resolve(params, down_pfx,    index=expert_idx)
+
+    if trans:
+        for lin in (gate_up, down):
+            if lin.weight_format.name == 'trivial':
+                w = lin.tensors.get('weight')
+                if w is not None and w.dim() == 2:
+                    lin.tensors['weight'] = w.t().contiguous()
+
+    w1_t: dict[str, torch.Tensor] = {}
+    w3_t: dict[str, torch.Tensor] = {}
+    for kind, t in gate_up.tensors.items():
+        if interleaved:
+            w1_t[kind] = t[..., ::2].contiguous()
+            w3_t[kind] = t[..., 1::2].contiguous()
+        else:
+            half = t.shape[-1] // 2
+            w1_t[kind] = t[..., :half].contiguous()
+            w3_t[kind] = t[..., half:].contiguous()
+    w1 = Linear(tensors=w1_t, weight_format=gate_up.weight_format)
+    w3 = Linear(tensors=w3_t, weight_format=gate_up.weight_format)
+    return w1, down, w3
diff --git a/lmdeploy/turbomind/supported_models.py b/lmdeploy/turbomind/supported_models.py
index 732b38c84d..1a08b9cbe1 100644
--- a/lmdeploy/turbomind/supported_models.py
+++ b/lmdeploy/turbomind/supported_models.py
@@ -5,62 +5,25 @@
 logger = get_logger('lmdeploy')
 
 SUPPORTED_ARCHS = dict(
-    # baichuan-7b
-    BaiChuanForCausalLM='baichuan',
-    # baichuan2-7b, baichuan-13b, baichuan2-13b
-    BaichuanForCausalLM='baichuan2',
-    # gpt-oss
-    GptOssForCausalLM='gpt-oss',
-    # internlm
-    InternLMForCausalLM='llama',
-    # internlm2
-    InternLM2ForCausalLM='internlm2',
-    # internlm3
-    InternLM3ForCausalLM='llama',
-    # llama, llama2, alpaca, vicuna, codellama, ultracm, yi,
-    # deepseek-coder, deepseek-llm
-    LlamaForCausalLM='llama',
-    # Qwen 7B-72B, Qwen-VL-7B
-    QWenLMHeadModel='qwen',
-    # Qwen2
+    # Qwen2 / Qwen2-MoE
     Qwen2ForCausalLM='qwen2',
     Qwen2MoeForCausalLM='qwen2-moe',
-    # Qwen2-VL
-    Qwen2VLForConditionalGeneration='qwen2',
-    # Qwen2.5-VL
-    Qwen2_5_VLForConditionalGeneration='qwen2',
     # Qwen3
     Qwen3ForCausalLM='qwen3',
     Qwen3MoeForCausalLM='qwen3-moe',
     # Qwen 3.5
     Qwen3_5ForConditionalGeneration='qwen3_5',
     Qwen3_5MoeForConditionalGeneration='qwen3_5-moe',
-    # mistral
-    MistralForCausalLM='llama',
-    # llava
-    LlavaLlamaForCausalLM='llama',
-    LlavaMistralForCausalLM='llama',
-    LlavaForConditionalGeneration='llava',
-    # xcomposer2
-    InternLMXComposer2ForCausalLM='xcomposer2',
-    # internvl
-    InternVLChatModel='internvl',
-    # internvl3
-    InternVLForConditionalGeneration='internvl',
-    InternS1ForConditionalGeneration='internvl',
-    # deepseek-vl
-    MultiModalityCausalLM='deepseekvl',
-    DeepseekV2ForCausalLM='deepseek2',
-    # MiniCPMV
-    MiniCPMV='minicpmv',
-    # chatglm2/3, glm4
-    ChatGLMModel='glm4',
-    ChatGLMForConditionalGeneration='glm4',
+    # InternVL3.5
+    InternVLChatModel='internvl3_5',
+    # Llama (2, 3, 3.1, 3.2) + InternLM3
+    LlamaForCausalLM='llama',
+    InternLM2ForCausalLM='internlm2',
+    InternLM3ForCausalLM='llama',
     # glm4-moe-lite (e.g. GLM-4.7-Flash)
     Glm4MoeLiteForCausalLM='glm4-moe-lite',
-    # mixtral
-    MixtralForCausalLM='mixtral',
-    MolmoForCausalLM='molmo',
+    # gpt-oss
+    GptOssForCausalLM='gpt-oss',
 )
 
 
@@ -86,53 +49,19 @@ def is_supported(model_path: str):
     """  # noqa: E501
     import os
 
-    def _is_head_dim_supported(cfg):
-        head_dim = cfg.head_dim if hasattr(cfg, 'head_dim') else cfg.hidden_size // cfg.num_attention_heads
-        return head_dim in [128, 64]
-
     support_by_turbomind = False
     triton_model_path = os.path.join(model_path, 'triton_models')
     if os.path.exists(triton_model_path):
         support_by_turbomind = True
     else:
-
         arch, cfg = get_model_arch(model_path)
         quant_method = search_nested_config(cfg.to_dict(), 'quant_method')
         if quant_method and quant_method in ['smooth_quant']:
-            # tm hasn't support quantized models by applying smoothquant
             return False
 
         if arch in SUPPORTED_ARCHS.keys():
             support_by_turbomind = True
-            # special cases
-            if arch == 'BaichuanForCausalLM':
-                num_attn_head = cfg.num_attention_heads
-                if num_attn_head == 40:
-                    # baichuan-13B, baichuan2-13B not supported by turbomind
-                    support_by_turbomind = False
-            elif arch in ['Qwen2ForCausalLM', 'LlamaForCausalLM']:
-                support_by_turbomind = _is_head_dim_supported(cfg)
-            elif arch in ('ChatGLMModel', 'ChatGLMForConditionalGeneration'):
-                # chatglm1/2/3 is not working yet
-                support_by_turbomind = cfg.num_layers == 40
-                if getattr(cfg, 'vision_config', None) is not None:
-                    # glm-4v-9b not supported
-                    support_by_turbomind = False
-            elif arch == 'InternVLChatModel':
-                llm_arch = cfg.llm_config.architectures[0]
-                support_by_turbomind = (llm_arch in SUPPORTED_ARCHS and _is_head_dim_supported(cfg.llm_config))
-            elif arch in ['LlavaForConditionalGeneration', 'InternVLForConditionalGeneration']:
-                llm_arch = cfg.text_config.architectures[0]
-                if llm_arch in ['Qwen2ForCausalLM', 'LlamaForCausalLM']:
-                    support_by_turbomind = _is_head_dim_supported(cfg.text_config)
-            elif arch == 'MolmoForCausalLM':
-                kv_heads = cfg.num_key_value_heads
-                # TM hasn't supported allenai/Molmo-7B-O-0924 yet
-                support_by_turbomind = kv_heads is not None
-            elif arch == 'DeepseekV2ForCausalLM':
-                if getattr(cfg, 'vision_config', None) is not None:
-                    support_by_turbomind = False
-            elif arch == 'Glm4MoeLiteForCausalLM':
+            if arch == 'Glm4MoeLiteForCausalLM':
                 if getattr(cfg, 'vision_config', None) is not None:
                     support_by_turbomind = False
 
diff --git a/lmdeploy/turbomind/text_model.py b/lmdeploy/turbomind/text_model.py
new file mode 100644
index 0000000000..e87415565e
--- /dev/null
+++ b/lmdeploy/turbomind/text_model.py
@@ -0,0 +1,95 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""TextModel — per-architecture model owning HF parsing and C++ configs."""
+from __future__ import annotations
+
+from abc import ABC
+from typing import TYPE_CHECKING
+
+import torch
+
+from .builders import NormBuilder, make_norm_config
+
+if TYPE_CHECKING:
+    from transformers import PretrainedConfig
+
+
+
+class TextModel(ABC):
+    """Text model: HF config -> C++ configs + weight commits.
+
+    Subclass contract:
+      - __init__ takes (cfg, *, resolver), calls super().__init__, then
+        builds per-module C++ config templates as self._attn_cfg /
+        self._ffn_cfg / self._moe_cfg / self._dn_cfg.
+      - Factory method NAMES (attn/ffn/moe/linear_attn/mla/norm/...)
+        are a convention for readability, NOT a protocol. Signatures
+        may differ across subclasses. The base class provides no
+        factory stubs; every subclass implements its own model()
+        that calls root.add_token_embeds / root.add_lm_head on a
+        TextModelBuilder for the root-level commits.
+    """
+
+    _loader_mappings: list = []
+
+
+    # ------------------------------------------------------------------
+    # Construction / parsing
+    # ------------------------------------------------------------------
+
+    def __init__(self, cfg: PretrainedConfig, *, resolver):
+        """Store local config and shared runtime helpers.
+
+        Source-model subclasses own architecture-specific field reads. Shared
+        utilities in ``models.utils`` build common C++ module configs.
+        """
+        self.cfg: PretrainedConfig = cfg
+        self._resolver = resolver
+
+    @property
+    def _vocab_size(self) -> int:
+        return self.cfg.vocab_size
+
+
+    # ------------------------------------------------------------------
+    # Runtime binding (called by ModelLoader after model_comm exists)
+    # ------------------------------------------------------------------
+
+    def bind_runtime(self, *, ctx, root_handles,
+                     attn_tp, mlp_tp, model_tp):
+        self._ctx = ctx
+        self._root_handles = root_handles
+        self._attn_tp = attn_tp
+        self._mlp_tp = mlp_tp
+        self._model_tp = model_tp
+
+    def set_params(self, params: dict):
+        self.params = params
+
+    # ------------------------------------------------------------------
+    # Checkpoint access helpers
+    # ------------------------------------------------------------------
+
+    def _get(self, key: str) -> torch.Tensor | None:
+        return self.params.get(key)
+
+    def _linear(self, pfx: str, *, optional: bool = False):
+        return self._resolver.resolve(self.params, pfx, optional=optional)
+
+
+
+    # ------------------------------------------------------------------
+    # Norm factories (shared across all models)
+    # ------------------------------------------------------------------
+
+    def norm(self, weight, *, dim=None):
+        """Build a NormBuilder for *weight* under this model's contexts.
+
+        ``dim`` defaults to ``weight.shape[-1]``.
+        """
+        cfg = make_norm_config(
+            dim=dim if dim is not None else weight.shape[-1],
+            norm_eps=self.cfg.rms_norm_eps,
+        )
+        m = NormBuilder(cfg, self._ctx)
+        m.set_weight(weight)
+        return m.build()
diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py
index 805bb3653e..13690f22c2 100644
--- a/lmdeploy/turbomind/turbomind.py
+++ b/lmdeploy/turbomind/turbomind.py
@@ -9,7 +9,6 @@
 import sys
 from collections.abc import Sequence
 from concurrent.futures import ThreadPoolExecutor
-from dataclasses import asdict
 from functools import partial
 from multiprocessing.reduction import ForkingPickler
 from queue import Queue
@@ -17,7 +16,6 @@
 
 import pybase64
 import torch
-import yaml
 
 import lmdeploy
 from lmdeploy.messages import EngineOutput, GenerationConfig, ResponseType, ScheduleMetrics, TurbomindEngineConfig
@@ -25,7 +23,6 @@
 from lmdeploy.tokenizer import Tokenizer
 from lmdeploy.utils import get_logger, get_max_batch_size, get_model
 
-from .deploy.config import TurbomindModelConfig
 from .supported_models import is_supported
 
 # TODO: find another way import _turbomind
@@ -172,23 +169,12 @@ def __init__(self,
             self._process_weights()
             self._create_engine()
 
-        self.session_len = self.config.session_len
-
-    def _check_unloaded_tm_params(self):
-        tm_params = self._tm_model.tm_params
-        if len(tm_params) > 0:
-            uninitialized = list(tm_params.keys())
-            logger.warning('the model may not be loaded successfully '
-                           f'with {len(tm_params)} uninitialized params:\n{uninitialized}')
+        self.session_len = _engine_config.session_len
 
     def _load_weights(self):
         """Load weights."""
-        self._get_model_params()
-
         with torch.cuda.device(self.devices[0]):
-            self._tm_model.export()
-
-        self._check_unloaded_tm_params()
+            self._model_loader.export()
 
     def _process_weights(self):
         """Process weight."""
@@ -204,11 +190,18 @@ def _create_engine(self):
         self._engine_created = True
 
     def _create_weight(self, model_comm):
-        """Allocate weight buffer, load params if from_workspace."""
+        """Create per-GPU Context + empty ModelRoot sentinel.
+
+        Runs both C++ init steps sequentially per device, inside a
+        ThreadPoolExecutor so all ranks enter ``create_context``
+        concurrently and hit its ``h_global->Sync()`` barriers together.
+        ``create_root`` itself has no collectives, so it can follow
+        synchronously on each thread.
+        """
 
-        # create weight
         def _create_weight_func(device_id):
-            model_comm.create_weights(device_id)
+            model_comm.create_context(device_id)
+            model_comm.create_root(device_id)
 
         with ThreadPoolExecutor(max_workers=self.gpu_count) as executor:
             futures = []
@@ -217,72 +210,67 @@ def _create_weight_func(device_id):
             for future in futures:
                 future.result()
 
-    def _get_model_params(self):
-        """Get turbomind model params when loading from hf."""
-
-        model_comm = self.model_comm
-        tm_params = self._tm_model.tm_params
-        tm_params.clear()
-
-        def _get_params(device_id, que):
-            out = model_comm.get_weights(device_id)
-            que.put(out)
-
-        que = Queue()
-        with ThreadPoolExecutor(max_workers=self.gpu_count) as executor:
-            futures = []
-            for device_id in range(self.gpu_count):
-                futures.append(executor.submit(_get_params, device_id, que))
-            for future in futures:
-                future.result()
-
-        for _ in range(self.gpu_count):
-            tensor_map = que.get()
-            for k, v in tensor_map.items():
-                if k not in tm_params:
-                    tm_params[k] = [v]
-                else:
-                    tm_params[k].append(v)
-        logger.warning(f'get {len(tm_params)} model params')
-
-    def _postprocess_config(self, tm_config: TurbomindModelConfig, engine_config: TurbomindEngineConfig):
-        """Postprocess turbomind config by."""
-        import copy
-        self.config = copy.deepcopy(tm_config)
-        # Update the attribute values in `self.config` with the valid values
-        # from the corresponding attributes in `engine_config`, such as
-        # `session_len`, `quant_policy`, `rope_scaling_factor`, etc.
-        self.config.update_from_engine_config(engine_config)
-
-        # update some attributes of `engine_config` which depends on
-        # `session_len`
-        self.engine_config = engine_config
-
-        # pack `self.config` and `self.engine_config` into a dict
-        self.config_dict = self.config.to_dict()
-        self.config_dict.update(dict(engine_config=asdict(self.engine_config)))
-        logger.info(f'turbomind model config:\n\n'
-                    f'{json.dumps(self.config_dict, indent=2)}')
-
     def _from_hf(self, model_path: str, engine_config: TurbomindEngineConfig):
         """Load model which is in hf format."""
-        assert is_supported(model_path), (f'turbomind does not support {model_path}. '
-                                          'Plz try pytorch engine instead.')
+        assert is_supported(model_path), (
+            f'turbomind does not support {model_path}. '
+            'Plz try pytorch engine instead.')
 
-        # convert transformers model into turbomind model
-        from .deploy.converter import get_tm_model
-        tm_model = get_tm_model(model_path, self.model_name, self.chat_template_name, engine_config)
+        from .converter import get_tm_config
+        from .model_loader import ModelLoader
 
-        self._postprocess_config(tm_model.tm_config, engine_config)
+        text_model, model_path, data_type = get_tm_config(model_path, engine_config)
 
-        model_comm = _tm.TurboMind.create(model_dir='',
-                                          config=yaml.safe_dump(self.config_dict),
-                                          weight_type=self.config.model_config.weight_type)
+        self._vocab_size = text_model._vocab_size
+        self.engine_config = engine_config
 
-        # create empty weight
+        dtype_map = {
+            'bfloat16': _tm.DataType.TYPE_BF16,
+            'float16': _tm.DataType.TYPE_FP16,
+        }
+        ec = _tm.EngineConfig()
+        ec.data_type = dtype_map[engine_config.dtype]
+        ec.cache_block_seq_len = engine_config.cache_block_seq_len
+        ec.quant_policy = engine_config.quant_policy
+        ec.max_batch_size = engine_config.max_batch_size
+        ec.max_prefill_token_num = engine_config.max_prefill_token_num
+        ec.session_len = engine_config.session_len
+        ec.cache_max_block_count = engine_config.cache_max_entry_count
+        ec.cache_chunk_size = engine_config.cache_chunk_size
+        ec.enable_prefix_caching = engine_config.enable_prefix_caching
+        ec.enable_metrics = engine_config.enable_metrics
+        ec.num_tokens_per_iter = engine_config.num_tokens_per_iter
+        ec.max_prefill_iters = engine_config.max_prefill_iters
+        ec.async_ = engine_config.async_
+        ec.outer_dp_size = engine_config.outer_dp_size
+        ec.attn_dp_size = engine_config.attn_dp_size
+        ec.attn_tp_size = engine_config.attn_tp_size
+        ec.attn_cp_size = engine_config.attn_cp_size
+        ec.mlp_tp_size = engine_config.mlp_tp_size
+        ec.devices = engine_config.devices
+        ec.nnodes = engine_config.nnodes
+        ec.node_rank = engine_config.node_rank
+        ec.communicator = engine_config.communicator
+
+        logger.info(f'turbomind engine config:\n\n'
+                    f'dtype={engine_config.dtype}, session_len={engine_config.session_len}, '
+                    f'max_batch_size={engine_config.max_batch_size}, '
+                    f'devices={engine_config.devices}, '
+                    f'tp={engine_config.attn_tp_size}, '
+                    f'dp={engine_config.attn_dp_size}, '
+                    f'cp={engine_config.attn_cp_size}')
+
+        model_comm = _tm.TurboMind.create(model_dir='', engine_config=ec)
         self._create_weight(model_comm)
-        # output model
-        self._tm_model = tm_model
+
+        self._model_loader = ModelLoader(
+            model=text_model,
+            model_comm=model_comm,
+            gpu_count=self.gpu_count,
+            model_path=model_path,
+            data_type=data_type,
+            engine_config=engine_config,
+        )
         return model_comm
 
     async def sleep(self, level: int = 1):
@@ -319,12 +307,11 @@ def _construct(item):
             return func(*args).clone()
 
         if not hasattr(self, '_export_iter'):
-            self._get_model_params()
             que = Queue()
-            tm_model = self._tm_model
-            tm_model.input_model.model_path = que
+            ml = self._model_loader
+            ml.model_path = que
             self._update_params_que = que
-            self._export_iter = tm_model.export_iter()
+            self._export_iter = ml.export_iter()
 
         with torch.cuda.device(self.devices[0]):
             if isinstance(request.serialized_named_tensors, str):
@@ -336,7 +323,6 @@ def _construct(item):
             next(self._export_iter)
 
         if request.finished:
-            self._check_unloaded_tm_params()
             self._process_weights()
             if self._engine_created is False:
                 self._create_engine()
@@ -374,9 +360,6 @@ def from_pretrained(cls,
                    **kwargs)
 
     def close(self):
-        if hasattr(self, '_tm_model'):
-            # close immediately after init engine with empty_init=True
-            self._tm_model.tm_params.clear()
         if hasattr(self, '_export_iter'):
             del self._export_iter
         if self.model_comm is not None:
@@ -393,7 +376,7 @@ def create_instance(self, cuda_stream_id=0):
         Returns:
             TurboMindInstance: an instance of turbomind
         """
-        return TurboMindInstance(self, self.config, cuda_stream_id)
+        return TurboMindInstance(self, cuda_stream_id)
 
     def get_schedule_metrics(self):
         # TODO: support dp
@@ -525,15 +508,14 @@ class TurboMindInstance:
         cuda_stream_id(int): identity of a cuda stream
     """
 
-    def __init__(self, tm_model: TurboMind, config: TurbomindModelConfig, cuda_stream_id: int = 0):
+    def __init__(self, tm_model: 'TurboMind', cuda_stream_id: int = 0):
         self.tm_model = tm_model
         self.cuda_stream_id = cuda_stream_id
 
         # create model instances
-        lazy_init = self.tm_model.config_dict['engine_config'].get('empty_init', False)
+        lazy_init = self.tm_model.engine_config.empty_init
         self._model_inst = None if lazy_init else self._create_model_instance()
 
-        self.config = config
         self.lock = None
         # error code map from csrc (refer to `struct Request` in src/turbomind/engine/request.h)
         # to lmdeploy.messages.ResponseType
@@ -593,7 +575,7 @@ def prepare_embeddings(self, input_embeddings=None, input_embedding_ranges=None)
         length = sum([x.shape[0] for x in input_embeddings])
 
         _MAP = dict(bfloat16=torch.bfloat16, float16=torch.float16)
-        dtype = _MAP[self.tm_model.config.model_config.data_type]
+        dtype = _MAP[self.tm_model.engine_config.dtype]
 
         values = torch.empty((length, input_embeddings[0].shape[-1]), dtype=dtype, device='cpu')
         ranges = torch.tensor(input_embedding_ranges, dtype=torch.int32, device='cpu')
@@ -695,7 +677,7 @@ async def async_stream_infer(self,
 
         if gen_config.response_format is not None:
             tokenizer = self.tm_model.tokenizer
-            vocab_size = self.tm_model.config.model_config.vocab_size
+            vocab_size = self.tm_model._vocab_size
 
             try:
                 tokenizer_info = TokenizerInfo.from_huggingface(tokenizer.model.model, vocab_size=vocab_size)
diff --git a/lmdeploy/turbomind/weight_format.py b/lmdeploy/turbomind/weight_format.py
new file mode 100644
index 0000000000..499cd216ad
--- /dev/null
+++ b/lmdeploy/turbomind/weight_format.py
@@ -0,0 +1,493 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""Weight format resolution for TurboMind checkpoint loading.
+
+Exports:
+
+- ``WeightFormat`` (ABC) and six concrete subclasses: ``TrivialFormat``,
+  ``AWQFormat``, ``GPTQFormat``, ``CompressedTensorFormat``, ``FP8Format``,
+  ``MXFP4Format``. Each subclass declares its ``name``, ``suffix_map``,
+  ``weight_dtype`` (``_tm.DataType`` or ``None``), ``has_zero_point`` flag,
+  and overrides ``accepts`` + ``normalize``. Optional overrides: ``pack``
+  (identity default), ``synthesize_zeros`` (raises by default), ``dequant``
+  (raises by default; ``TrivialFormat.dequant`` is identity).
+
+- ``WeightFormatResolver``: holds the model compute dtype plus an ordered
+  list of candidate formats. ``resolve(params, prefix, *, index=None,
+  optional=False)`` returns a ``Linear`` bundle in TM layout or raises
+  (``KeyError`` on missing tensors without ``optional``, ``ValueError`` when
+  tensors exist but no candidate matches).
+
+- ``pack_u4_row``: uint8 → int32 row packer used by quantized ``pack``
+  overrides and by downstream callers that pack packed-expert weights
+  after slicing.
+"""
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from typing import ClassVar, NamedTuple
+
+import _turbomind as _tm
+import torch
+from torch import Tensor
+
+from .linear import Linear
+
+
+class PackedTensor(NamedTuple):
+    tensor:      torch.Tensor
+    alloc_shape: list[int] | None       # None = inherit from packed tensor
+    alloc_dtype: _tm.DataType | None  # None = inherit from packed tensor
+
+
+# ---------------------------------------------------------------------------
+# Low-level u4 packing / unpacking helpers (reused across normalize / pack)
+# ---------------------------------------------------------------------------
+
+
+def _get_u4_slices(x: Tensor, dtype: torch.dtype) -> list[Tensor]:
+    MAP = {torch.int32: 8, torch.uint8: 2}
+    xs = []
+    for _ in range(MAP[x.dtype]):
+        xs.append((x & 15).to(dtype))
+        x = x >> 4
+    return xs
+
+
+def _unpack_awq_gemm(x: Tensor) -> Tensor:
+    xs = _get_u4_slices(x, torch.uint8)
+    order = [0, 4, 1, 5, 2, 6, 3, 7]
+    ys = [xs[i] for i in order]
+    return torch.stack(ys, dim=-1).view(*x.shape[:-1], -1)
+
+
+def pack_u4_row(x: torch.Tensor) -> torch.Tensor:
+    """Pack uint8 4-bit values into int32 rows along the last dim.
+
+    Used by every int4 format's ``pack`` override and by callers that
+    re-pack tensors after slicing (e.g. packed-MoE expert split).
+    """
+    assert x.dtype == torch.uint8, f'x.dtype: {x.dtype}'
+    xs = x.view(*x.shape[:-1], -1, 8).split(1, dim=-1)
+    a = torch.zeros(xs[0].shape, dtype=torch.int32, device=x.device)
+    for t in reversed(xs):
+        a = (a << 4) | t
+    return a.squeeze(dim=-1)
+
+
+def _zeros_int4_symmetric(scales: Tensor) -> Tensor:
+    """Synthesize symmetric int4 zero-points (value = 8) matching *scales*
+    shape."""
+    return torch.full(scales.shape, 8, dtype=torch.uint8, device=scales.device)
+
+
+# ---------------------------------------------------------------------------
+# WeightFormat ABC
+# ---------------------------------------------------------------------------
+
+
+class WeightFormat(ABC):
+    """Abstract per-format policy object.
+
+    Class attributes (override in subclasses):
+
+    - ``name``: canonical format name used for string comparisons.
+    - ``suffix_map``: ``{checkpoint_suffix: tm_kind}``. Drives which
+      checkpoint tensors each format ingests at a given prefix.
+    - ``weight_dtype``: ``_tm.DataType`` for the weight storage dtype;
+      ``None`` for trivial (weight dtype equals compute dtype).
+    - ``has_zero_point``: ``True`` when the format uses a zero-point
+      tensor; gates the resolver's ``synthesize_zeros`` call.
+
+    Instance attributes (set by subclass ``__init__``):
+
+    - ``block_in``, ``block_out``: quantization block sizes. ``None`` for
+      dimensions without blocking.
+
+    Methods:
+
+    - ``accepts`` (abstract): classify a checkpoint suffix dict.
+    - ``normalize`` (abstract): raw-checkpoint tensor → TM layout.
+    - ``pack``: optional commit-time packer. Identity default.
+    - ``synthesize_zeros``: fabricate a zeros tensor when the checkpoint
+      omits it. Raises ``NotImplementedError`` by default.
+    - ``dequant``: produce a trivial ``{weight, bias?}`` dict from TM
+      tensors for mixed-format fusion. Raises ``NotImplementedError`` by
+      default. ``TrivialFormat.dequant`` is identity.
+    - ``make_data_format``: build the ``_tm.DataFormat`` descriptor.
+
+    Equality / hashing: two WeightFormats are equal iff they share class
+    and block sizes. This matters for the set-based uniformity checks in
+    ``concat_out_dim``.
+    """
+
+    name:           ClassVar[str]
+    suffix_map:     ClassVar[dict[str, str]]
+    weight_dtype:   ClassVar[_tm.DataType | None]
+    has_zero_point: ClassVar[bool]
+
+    block_in:  int | None
+    block_out: int | None
+
+    def __init__(self, *, block_in: int | None = None,
+                 block_out: int | None = None):
+        self.block_in  = block_in
+        self.block_out = block_out
+
+    @abstractmethod
+    def accepts(self, available: dict[str, Tensor]) -> bool: ...
+
+    @abstractmethod
+    def normalize(self, tensor: Tensor, kind: str) -> Tensor: ...
+
+    def pack(self, tensor: Tensor, kind: str) -> PackedTensor:
+        return PackedTensor(tensor, None, None)
+
+    def synthesize_zeros(self, scales: Tensor) -> Tensor:
+        raise NotImplementedError(
+            f'{type(self).__name__}.synthesize_zeros not implemented')
+
+    def dequant(self, tensors: dict[str, Tensor],
+                data_type) -> dict[str, Tensor]:
+        raise NotImplementedError(
+            f'{type(self).__name__}.dequant not implemented')
+
+    def make_data_format(self, data_type) -> _tm.DataFormat:
+        if self.weight_dtype is None:
+            return _tm.ResolveLinearWeightFormat(data_type, data_type, 1, 1)
+        return _tm.ResolveLinearWeightFormat(
+            data_type, self.weight_dtype,
+            self.block_in  or 1, self.block_out or 1)
+
+    def __eq__(self, other) -> bool:
+        if not isinstance(other, WeightFormat):
+            return NotImplemented
+        return (type(self) is type(other)
+                and self.block_in  == other.block_in
+                and self.block_out == other.block_out)
+
+    def __hash__(self) -> int:
+        return hash((type(self), self.block_in, self.block_out))
+
+
+# ---------------------------------------------------------------------------
+# Concrete subclasses
+# ---------------------------------------------------------------------------
+
+
+class TrivialFormat(WeightFormat):
+    name           = 'trivial'
+    suffix_map     = {'.weight': 'weight', '.bias': 'bias'}
+    weight_dtype   = None
+    has_zero_point = False
+
+    def accepts(self, available: dict[str, Tensor]) -> bool:
+        if not (available.keys() <= {'.weight', '.bias'}):
+            return False
+        w = available.get('.weight')
+        return w is None or w.dtype.is_floating_point
+
+    def normalize(self, x: Tensor, kind: str) -> Tensor:
+        x = x.cuda()
+        if x.dim() >= 2:
+            x = x.t()
+        return x
+
+    def dequant(self, tensors, data_type):
+        # Already trivial — nothing to undo. Identity override for mixed
+        # fusion groups.
+        return tensors
+
+
+class AWQFormat(WeightFormat):
+    name           = 'awq'
+    suffix_map     = {'.qweight': 'weight', '.scales': 'scales',
+                      '.qzeros': 'zeros',   '.bias': 'bias'}
+    weight_dtype   = _tm.DataType.TYPE_UINT4
+    has_zero_point = True
+
+    def __init__(self, *, block_in: int):
+        super().__init__(block_in=block_in, block_out=None)
+
+    def accepts(self, available: dict[str, Tensor]) -> bool:
+        qw = available.get('.qweight')
+        if qw is None or qw.dtype != torch.int32:
+            return False
+        scales = available.get('.scales')
+        if scales is not None and qw.ndim >= 2 and scales.ndim >= 2:
+            return qw.shape[-1] * 8 == scales.shape[-1]
+        return True
+
+    def normalize(self, x: Tensor, kind: str) -> Tensor:
+        # AWQ checkpoints store weights in TM-native layout:
+        #   qweight: [K, N//8] int32 → unpack → [K, N] (TM, no .t())
+        #   scales:  [K//g, N] float16 → already TM
+        #   zeros:   [K//g, N//8] int32 → unpack → [K//g, N]
+        x = x.cuda()
+        if x.dtype == torch.int32:
+            x = _unpack_awq_gemm(x)
+        if kind == 'zeros':
+            x = x.to(torch.float16)
+        return x
+
+    def pack(self, tensor: Tensor, kind: str) -> PackedTensor:
+        if kind == 'weight' and tensor.dtype == torch.uint8:
+            return PackedTensor(pack_u4_row(tensor),
+                                list(tensor.shape), self.weight_dtype)
+        return PackedTensor(tensor, None, None)
+
+    def dequant(self, tensors, data_type):
+        from lmdeploy.pytorch.backends.default.awq_modules import dequantize_gemm
+
+        qweight = tensors['weight']
+        scales  = tensors['scales']
+        qzeros  = tensors['zeros']
+        group_size = qweight.shape[0] // scales.shape[0]
+        w = dequantize_gemm(qweight, qzeros, scales, 4, group_size)
+        result: dict[str, Tensor] = {'weight': w}
+        if 'bias' in tensors:
+            result['bias'] = tensors['bias']
+        return result
+
+
+class GPTQFormat(WeightFormat):
+    name           = 'gptq'
+    suffix_map     = {'.qweight': 'weight', '.scales': 'scales',
+                      '.qzeros': 'zeros',   '.bias': 'bias'}
+    weight_dtype   = _tm.DataType.TYPE_UINT4
+    has_zero_point = True
+
+    def __init__(self, *, block_in: int):
+        super().__init__(block_in=block_in, block_out=None)
+
+    def accepts(self, available: dict[str, Tensor]) -> bool:
+        qw = available.get('.qweight')
+        if qw is None or qw.dtype != torch.int32:
+            return False
+        scales = available.get('.scales')
+        if scales is not None and qw.ndim >= 2 and scales.ndim >= 2:
+            return qw.shape[-1] == scales.shape[-1]
+        return True
+
+    def normalize(self, x: Tensor, kind: str) -> Tensor:
+        # GPTQ checkpoint stores weights in TM-native layout:
+        #   qweight: [K//8, N] int32 → unpack → [K, N]
+        #   scales:  [K//g, N] float16 → already TM
+        #   zeros:   [K//g, N//8] int32 → unpack → [K//g, N] (+1 offset)
+        x = x.cuda()
+        if x.dtype == torch.int32:
+            xs = _get_u4_slices(x, torch.uint8)
+            if kind == 'weight':
+                x = torch.stack(xs, dim=1).view(-1, x.size(-1))
+            else:
+                x = torch.stack(xs, dim=-1).view(x.size(0), -1) + 1
+        if kind == 'zeros':
+            x = x.to(torch.float16)
+        return x
+
+    def pack(self, tensor: Tensor, kind: str) -> PackedTensor:
+        if kind == 'weight' and tensor.dtype == torch.uint8:
+            return PackedTensor(pack_u4_row(tensor),
+                                list(tensor.shape), self.weight_dtype)
+        return PackedTensor(tensor, None, None)
+
+    def synthesize_zeros(self, scales: Tensor) -> Tensor:
+        return _zeros_int4_symmetric(scales)
+
+
+class CompressedTensorFormat(WeightFormat):
+    name           = 'compressed-tensors'
+    suffix_map     = {'.weight_packed':     'weight',
+                      '.weight_scale':      'scales',
+                      '.weight_zero_point': 'zeros',
+                      '.bias':              'bias'}
+    weight_dtype   = _tm.DataType.TYPE_UINT4
+    has_zero_point = True
+
+    def __init__(self, *, block_in: int):
+        super().__init__(block_in=block_in, block_out=None)
+
+    def accepts(self, available: dict[str, Tensor]) -> bool:
+        wp = available.get('.weight_packed')
+        return wp is not None and wp.dtype == torch.int32
+
+    def normalize(self, x: Tensor, kind: str) -> Tensor:
+        x = x.cuda()
+        if x.dtype == torch.int32:
+            xs = _get_u4_slices(x, torch.uint8)
+            if kind == 'weight':
+                x = torch.stack(xs, dim=-1).view(*x.shape[:-1], -1)
+            elif kind == 'zeros':
+                x = torch.stack(xs, dim=1).view(-1, x.size(-1))
+        if kind == 'zeros':
+            x = x.to(torch.float16)
+        if x.dim() >= 2:
+            x = x.t()
+        return x
+
+    def pack(self, tensor: Tensor, kind: str) -> PackedTensor:
+        if kind == 'weight' and tensor.dtype == torch.uint8:
+            return PackedTensor(pack_u4_row(tensor),
+                                list(tensor.shape), self.weight_dtype)
+        return PackedTensor(tensor, None, None)
+
+    def synthesize_zeros(self, scales: Tensor) -> Tensor:
+        return _zeros_int4_symmetric(scales)
+
+
+class FP8Format(WeightFormat):
+    name           = 'fp8'
+    suffix_map     = {'.weight':           'weight',
+                      '.weight_scale_inv': 'scales',
+                      '.bias':             'bias'}
+    weight_dtype   = _tm.DataType.TYPE_FP8_E4M3
+    has_zero_point = False
+
+    def __init__(self):
+        super().__init__(block_in=128, block_out=128)
+
+    def accepts(self, available: dict[str, Tensor]) -> bool:
+        if '.weight_scale_inv' not in available:
+            return False
+        w = available.get('.weight')
+        return w is None or w.dtype in (torch.float8_e4m3fn, torch.uint8)
+
+    def normalize(self, x: Tensor, kind: str) -> Tensor:
+        x = x.cuda()
+        if x.dtype == torch.float8_e4m3fn:
+            x = x.view(dtype=torch.uint8)
+        if x.dim() >= 2:
+            x = x.t()
+        return x
+
+    def dequant(self, tensors, data_type):
+        from .builders._base import _CPP_TO_TORCH
+
+        weight = tensors['weight']
+        scales = tensors['scales']
+        block_size = 128
+        fp8_weight = weight.view(torch.float8_e4m3fn).float()
+        scale = scales.float()
+        scale = scale.repeat_interleave(block_size, dim=0)
+        scale = scale.repeat_interleave(block_size, dim=1)
+        scale = scale[: fp8_weight.shape[0], : fp8_weight.shape[1]]
+        target_dtype = _CPP_TO_TORCH[data_type]
+        result: dict[str, Tensor] = {'weight': (fp8_weight * scale).to(target_dtype)}
+        if 'bias' in tensors:
+            result['bias'] = tensors['bias']
+        return result
+
+    def pack(self, tensor: Tensor, kind: str) -> PackedTensor:
+        if kind == 'weight':
+            return PackedTensor(tensor, list(tensor.shape), self.weight_dtype)
+        return PackedTensor(tensor, None, None)
+
+
+class MXFP4Format(WeightFormat):
+    name           = 'mxfp4'
+    suffix_map     = {'.blocks': 'weight', '.scales': 'scales', '.bias': 'bias'}
+    weight_dtype   = _tm.DataType.TYPE_FP4_E2M1
+    has_zero_point = False
+
+    def __init__(self):
+        super().__init__(block_in=32, block_out=None)
+
+    def accepts(self, available: dict[str, Tensor]) -> bool:
+        if '.scales' not in available:
+            return False
+        w = available.get('.blocks')
+        return w is None or w.dtype == torch.uint8
+
+    def normalize(self, x: Tensor, kind: str) -> Tensor:
+        x = x.cuda()
+        if kind == 'weight':
+            xs = _get_u4_slices(torch.flatten(x, start_dim=-2), torch.uint8)
+            x = torch.flatten(torch.stack(xs, dim=-1), start_dim=-2)
+        if x.dim() >= 2:
+            x = x.t()
+        return x
+
+    def pack(self, tensor: Tensor, kind: str) -> PackedTensor:
+        if kind == 'weight' and tensor.dtype == torch.uint8:
+            return PackedTensor(pack_u4_row(tensor),
+                                list(tensor.shape), self.weight_dtype)
+        return PackedTensor(tensor, None, None)
+
+
+# ---------------------------------------------------------------------------
+# Resolver
+# ---------------------------------------------------------------------------
+
+
+class WeightFormatResolver:
+    """Resolve a checkpoint prefix to a ``Linear`` bundle in TM layout.
+
+    Holds the model compute dtype and an ordered list of candidate
+    formats. ``resolve(params, prefix)`` probes the checkpoint at the
+    given prefix, dispatches to the first candidate whose ``accepts``
+    returns True, and constructs a ``Linear`` with the format's
+    ``make_data_format`` descriptor.
+
+    The suffix probe is scoped to the union of candidate ``suffix_map``
+    keys only — not a global "every format ever" list — so adding a new
+    format elsewhere does not widen the probe.
+
+    Priority is encoded by list order. The converter puts quantized
+    candidates first and ``TrivialFormat()`` last: a prefix that only
+    matches trivial (router, norm-like linears in a quantized model)
+    deterministically falls through.
+
+    Failure modes are loud and distinct:
+
+    - ``optional=False`` (default) + no tensors at prefix → ``KeyError``
+      with candidate suffix list.
+    - Tensors present but no candidate accepts → ``ValueError`` with
+      available keys and candidate names.
+    - Only "no tensors AND optional=True" returns ``None``.
+    """
+
+    def __init__(self, *, data_type: _tm.DataType,
+                 formats: list[WeightFormat]):
+        self._data_type = data_type
+        self._formats   = formats
+        self._suffixes  = frozenset(
+            s for f in formats for s in f.suffix_map)
+
+    @property
+    def data_type(self) -> _tm.DataType:
+        return self._data_type
+
+    def resolve(self, params: dict[str, Tensor], prefix: str, *,
+                index: int | None = None,
+                optional: bool = False) -> Linear | None:
+        available = {s: params[prefix + s]
+                     for s in self._suffixes if (prefix + s) in params}
+        if index is not None:
+            available = {s: t[index] for s, t in available.items()}
+
+        if not available:
+            if optional:
+                return None
+            raise KeyError(
+                f'no checkpoint tensors found at prefix {prefix!r} '
+                f'(candidate suffixes: {sorted(self._suffixes)})')
+
+        for fmt in self._formats:
+            if fmt.accepts(available):
+                return self._build_linear(fmt, available)
+
+        raise ValueError(
+            f'no weight format accepts tensors at {prefix!r}: '
+            f'got {sorted(available)}, '
+            f'tried {[f.name for f in self._formats]}')
+
+    def _build_linear(self, fmt: WeightFormat,
+                      available: dict[str, Tensor]) -> Linear:
+        tensors = {
+            kind: fmt.normalize(available[s], kind)
+            for s, kind in fmt.suffix_map.items()
+            if s in available
+        }
+        if fmt.has_zero_point and 'zeros' not in tensors:
+            tensors['zeros'] = fmt.synthesize_zeros(tensors['scales'])
+        return Linear(tensors=tensors,
+                      weight_format=fmt)
diff --git a/scripts/test_turbomind_model.py b/scripts/test_turbomind_model.py
new file mode 100644
index 0000000000..e35f7774cb
--- /dev/null
+++ b/scripts/test_turbomind_model.py
@@ -0,0 +1,172 @@
+#!/usr/bin/env python3
+"""Smoke-test one TurboMind model for agent/subagent harnesses.
+
+Code flow: parse argv → configure HF/GPU and run one inference → print sections.
+
+Stdout is plain text in short sections, for example:
+
+  --- setup ---
+  model: <path>
+  tp: <tp>
+  gpus: <gpus>
+  TM_DEBUG_LEVEL: DEBUG    (only if --debug was passed)
+
+  --- timing ---
+  pipeline load: <s> s
+  inference: <s> s
+
+  --- tokens ---
+  input: <n>
+  generated: <n>
+
+  --- response begin ---
+  <full decoded text>
+  --- response end ---
+
+Exit code: 0 only if no uncaught exception (pipeline load + inference complete).
+On failure the full traceback is printed to stderr.
+Output quality is not validated.
+
+Usage (from repo root):
+
+  python scripts/test_turbomind_model.py \\
+      [--debug] <model_path> <cache_dir> <tp> <gpus>
+
+Optional --debug sets TM_DEBUG_LEVEL=DEBUG before loading TurboMind so asynchronous
+CUDA errors surface after kernel launch (see TurboMind CUDA helpers).
+
+Example gpus: "0" for tp=1, "0,1" for tp=2.
+"""
+from __future__ import annotations
+
+import os
+import sys
+import time
+import traceback
+from typing import NamedTuple
+
+import huggingface_hub.constants as hf_constants
+
+
+class SmokeResult(NamedTuple):
+    create_s: float
+    infer_s: float
+    text: str
+    input_token_len: int
+    generate_token_len: int
+
+
+def _set_hf_cache(path: str) -> None:
+    hf_constants.HF_HUB_CACHE = path
+    hf_constants.HF_HUB_OFFLINE = 1
+
+
+def parse_args(argv: list[str]) -> tuple[str, str, int, str, bool]:
+    prog = os.path.basename(argv[0]) if argv else 'test_turbomind_model.py'
+    rest = [a for a in argv[1:] if a != '--debug']
+    debug = len(rest) != len(argv) - 1
+
+    if len(rest) != 4:
+        print(
+            f'usage: {prog} [--debug] <model_path> <cache_dir> <tp> <gpus>',
+            file=sys.stderr,
+        )
+        sys.exit(2)
+
+    model_path, cache_dir, tp_s, gpus = rest
+    try:
+        tp = int(tp_s)
+    except ValueError:
+        print(f'invalid tp: {tp_s!r}', file=sys.stderr)
+        sys.exit(2)
+    return model_path, cache_dir, tp, gpus, debug
+
+
+def run_smoke_infer(
+    model_path: str,
+    cache_dir: str,
+    tp: int,
+    gpus: str,
+    *,
+    debug: bool = False,
+) -> SmokeResult:
+    _set_hf_cache(cache_dir)
+    os.environ['CUDA_VISIBLE_DEVICES'] = gpus
+    if debug:
+        os.environ['TM_DEBUG_LEVEL'] = 'DEBUG'
+
+    from lmdeploy import GenerationConfig, TurbomindEngineConfig, pipeline
+
+    engine_config = TurbomindEngineConfig(
+        async_=1,
+        max_batch_size=4,
+        session_len=4096,
+        cache_max_entry_count=0.5,
+        max_prefill_token_num=1024,
+        tp=tp,
+        dp=1,
+        enable_metrics=False,
+        communicator='nccl',
+    )
+    gen_config = GenerationConfig(max_new_tokens=128, do_sample=False)
+    prompt = 'Write a short paragraph about the importance of reading books.'
+
+    t0 = time.perf_counter()
+    with pipeline(model_path, backend_config=engine_config, log_level='WARNING') as pipe:
+        create_s = time.perf_counter() - t0
+        t1 = time.perf_counter()
+        out = pipe([prompt], gen_config=gen_config, do_preprocess=True)
+        infer_s = time.perf_counter() - t1
+
+    res = out[0]
+    text = res.text if hasattr(res, 'text') else str(res)
+    input_token_len = getattr(res, 'input_token_len', -1)
+    generate_token_len = getattr(res, 'generate_token_len', -1)
+    return SmokeResult(create_s, infer_s, text, input_token_len, generate_token_len)
+
+
+def print_report(
+    model_path: str,
+    tp: int,
+    gpus: str,
+    result: SmokeResult,
+    *,
+    debug: bool = False,
+) -> None:
+    if not result.text.strip():
+        print('warning: empty response text', file=sys.stderr)
+
+    print('--- setup ---')
+    print(f'model: {model_path}')
+    print(f'tp: {tp}')
+    print(f'gpus: {gpus}')
+    if debug:
+        print('TM_DEBUG_LEVEL: DEBUG')
+    print()
+    print('--- timing ---')
+    print(f'pipeline load: {result.create_s:.2f} s')
+    print(f'inference: {result.infer_s:.2f} s')
+    print()
+    print('--- tokens ---')
+    print(f'input: {result.input_token_len}')
+    print(f'generated: {result.generate_token_len}')
+    print()
+    print('--- response begin ---')
+    print(result.text, end='')
+    if result.text and not result.text.endswith('\n'):
+        print()
+    print('--- response end ---')
+
+
+def main() -> None:
+    model_path, cache_dir, tp, gpus, debug = parse_args(sys.argv)
+    result = run_smoke_infer(model_path, cache_dir, tp, gpus, debug=debug)
+    print_report(model_path, tp, gpus, result, debug=debug)
+
+
+if __name__ == '__main__':
+    try:
+        main()
+    except Exception:
+        traceback.print_exc()
+        sys.exit(1)
diff --git a/scripts/test_vlm.py b/scripts/test_vlm.py
new file mode 100644
index 0000000000..693daa2067
--- /dev/null
+++ b/scripts/test_vlm.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python3
+"""Smoke-test InternVL3.5 VLM with an image."""
+import os
+import sys
+import time
+
+import huggingface_hub.constants as hf_constants
+
+
+def main():
+    model_path = sys.argv[1] if len(sys.argv) > 1 else 'OpenGVLab/InternVL3_5-8B'
+    cache_dir = sys.argv[2] if len(sys.argv) > 2 else '/nvme2/huggingface_hub/hub'
+    image_path = sys.argv[3] if len(sys.argv) > 3 else '/data/lmdeploy-modeling/resources/batch_memory.png'
+    gpus = sys.argv[4] if len(sys.argv) > 4 else '0'
+
+    hf_constants.HF_HUB_CACHE = cache_dir
+    hf_constants.HF_HUB_OFFLINE = 1
+    os.environ['CUDA_VISIBLE_DEVICES'] = gpus
+
+    from lmdeploy import GenerationConfig, TurbomindEngineConfig, pipeline
+    from lmdeploy.vl import load_image
+
+    engine_config = TurbomindEngineConfig(
+        async_=1,
+        max_batch_size=4,
+        session_len=8192,
+        cache_max_entry_count=0.5,
+        max_prefill_token_num=1024,
+        tp=1,
+        dp=1,
+        enable_metrics=False,
+        communicator='nccl',
+    )
+    gen_config = GenerationConfig(max_new_tokens=256, do_sample=False)
+
+    image = load_image(image_path)
+    prompt = 'Describe this image in detail. What do you see?'
+
+    print('--- setup ---')
+    print(f'model: {model_path}')
+    print(f'image: {image_path}')
+    print(f'gpus: {gpus}')
+    print()
+
+    t0 = time.perf_counter()
+    with pipeline(model_path, backend_config=engine_config, log_level='WARNING') as pipe:
+        load_s = time.perf_counter() - t0
+        print('--- timing ---')
+        print(f'pipeline load: {load_s:.2f} s')
+
+        t1 = time.perf_counter()
+        out = pipe([(prompt, image)], gen_config=gen_config, do_preprocess=True)
+        infer_s = time.perf_counter() - t1
+        print(f'inference: {infer_s:.2f} s')
+        print()
+
+        res = out[0]
+        text = res.text if hasattr(res, 'text') else str(res)
+        input_tokens = getattr(res, 'input_token_len', -1)
+        gen_tokens = getattr(res, 'generate_token_len', -1)
+
+        print('--- tokens ---')
+        print(f'input: {input_tokens}')
+        print(f'generated: {gen_tokens}')
+        print()
+        print('--- response begin ---')
+        print(text)
+        print('--- response end ---')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/src/turbomind/CMakeLists.txt b/src/turbomind/CMakeLists.txt
index 11f7d0ed22..2fec4ba1c6 100644
--- a/src/turbomind/CMakeLists.txt
+++ b/src/turbomind/CMakeLists.txt
@@ -36,4 +36,4 @@ target_link_libraries(turbomind PUBLIC
         nvtx_utils
         CUDA::cublasLt
         CUDA::cudart
-        yaml-cpp::yaml-cpp)
+        )
diff --git a/src/turbomind/core/CMakeLists.txt b/src/turbomind/core/CMakeLists.txt
index 6e8e6bc49d..c32f8f1609 100644
--- a/src/turbomind/core/CMakeLists.txt
+++ b/src/turbomind/core/CMakeLists.txt
@@ -22,9 +22,11 @@ add_library(core STATIC
         context.cc
         buffer.cc
         layout.cc
+        data_format.cc
         tensor.cc
         tensor.cu
         module.cc
+        registry.cc
         copy.cc
         logger.cc)
 
@@ -41,4 +43,7 @@ if (BUILD_TEST)
 
     add_executable(test_logger test_logger.cc)
     target_link_libraries(test_logger PRIVATE core Catch2::Catch2WithMain)
+
+    add_executable(test_data_format test_data_format.cc)
+    target_link_libraries(test_data_format PRIVATE core Catch2::Catch2WithMain)
 endif ()
diff --git a/src/turbomind/core/data_format.cc b/src/turbomind/core/data_format.cc
new file mode 100644
index 0000000000..4255073ca5
--- /dev/null
+++ b/src/turbomind/core/data_format.cc
@@ -0,0 +1,64 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#include "src/turbomind/core/data_format.h"
+#include "src/turbomind/core/check.h"
+
+namespace turbomind {
+
+bool DataFormat::is_quantized() const noexcept
+{
+    if (scales.present() || zeros.present()) {
+        return true;
+    }
+    for (int bs : block_sizes) {
+        if (bs > 1) {
+            return true;
+        }
+    }
+    return false;
+}
+
+DataFormat ResolveLinearWeightFormat(DataType data_type, DataType weight_dtype, int block_in, int block_out)
+{
+    DataFormat fmt;
+    fmt.dtype = weight_dtype;
+
+    if (IsTrivialFloatType(weight_dtype)) {
+        TM_CHECK(block_in == 1 && block_out == 1)
+            << "Trivial float weight requires block_in==1 and block_out==1, got " << block_in << ", " << block_out;
+        fmt.block_sizes = {1, 1};
+        return fmt;
+    }
+
+    if (weight_dtype == kFloat8_e4m3) {
+        TM_CHECK(block_in == 128 && block_out == 128)
+            << "FP8 weight format requires block_in==128 and block_out==128, got " << block_in << ", " << block_out;
+        fmt.block_sizes  = {128, 128};
+        fmt.scales.dtype = kFloat;
+        return fmt;
+    }
+
+    if (weight_dtype == kFloat4_e2m1) {
+        TM_CHECK(block_in > 0 && block_out == 1)
+            << "FP4 weight format requires block_in>0 and block_out==1, got " << block_in << ", " << block_out;
+        fmt.block_sizes  = {block_in, 1};
+        fmt.scales.dtype = kUint8;
+        return fmt;
+    }
+
+    const bool is_qweight = weight_dtype == kUint4 || weight_dtype == kUint8;
+    if (is_qweight) {
+        TM_CHECK(block_in > 0 && block_in <= 256 && block_out == 1)
+            << "Quantized integer weight requires 0 < block_in <= 256 and block_out==1, got " << block_in << ", "
+            << block_out;
+        fmt.block_sizes  = {block_in, 1};
+        fmt.scales.dtype = data_type;
+        fmt.zeros.dtype  = data_type;
+        return fmt;
+    }
+
+    TM_CHECK(0) << "Unsupported weight format: " << to_string(weight_dtype);
+    return fmt;
+}
+
+}  // namespace turbomind
diff --git a/src/turbomind/core/data_format.h b/src/turbomind/core/data_format.h
new file mode 100644
index 0000000000..70605bd68e
--- /dev/null
+++ b/src/turbomind/core/data_format.h
@@ -0,0 +1,50 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+#pragma once
+
+#include "src/turbomind/core/data_type.h"
+#include <vector>
+
+namespace turbomind {
+
+/// True for trivial (non-quantized) float dtypes: FP32, FP16, BF16.
+inline bool IsTrivialFloatType(DataType t) noexcept
+{
+    return t == kFloat || t == kHalf || t == kBfloat16;
+}
+
+/// Descriptor for a single quantization parameter (scales or zeros).
+struct QuantParamDesc {
+    DataType dtype{};       // kNull means "not present"
+    bool     transposed{};  // stored transposed w.r.t. data tensor
+
+    bool present() const noexcept
+    {
+        return dtype != kNull;
+    }
+};
+
+/// Universal descriptor for the storage format of a (possibly quantized) tensor.
+struct DataFormat {
+    DataType         dtype{};      // element type of the data tensor
+    std::vector<int> block_sizes;  // per-dimension block sizes (1 = no quantization)
+    QuantParamDesc   scales{};
+    QuantParamDesc   zeros{};
+
+    /// True if any quantization parameter is present or any block_size > 1.
+    bool is_quantized() const noexcept;
+
+    /// Number of dimensions described by this format.
+    int rank() const noexcept
+    {
+        return static_cast<int>(block_sizes.size());
+    }
+};
+
+/// Construct the DataFormat for a linear weight tensor in TM [in, out] layout.
+/// block_sizes stored in tensor-shape order: {block_in, block_out}, so
+/// block_sizes[0] is the K-axis group size and block_sizes[1] is the N-axis.
+/// Scales / zeros dtypes are derived from (data_type, weight_dtype) per the
+/// format's GEMM convention. Validates that the combination is supported.
+DataFormat ResolveLinearWeightFormat(DataType data_type, DataType weight_dtype, int block_in, int block_out);
+
+}  // namespace turbomind
diff --git a/src/turbomind/core/module.cc b/src/turbomind/core/module.cc
index bfdbd8a1e4..0cbf111b2e 100644
--- a/src/turbomind/core/module.cc
+++ b/src/turbomind/core/module.cc
@@ -1,78 +1,194 @@
+// Copyright (c) OpenMMLab. All rights reserved.
 
 #include "src/turbomind/core/module.h"
+
 #include "src/turbomind/core/check.h"
-#include <optional>
+#include "src/turbomind/core/registry.h"
+
+#include <sstream>
 
 namespace turbomind::core {
 
-Module::Module(): parent_{} {}
+// ======================================================================
+// Module
+// ======================================================================
+
+Module::Module() = default;
+
+Module::~Module() = default;
 
-Module::~Module()
+// ----- Type info -----
+
+const char* Module::type() const
 {
-    if (parent_) {
-        parent_->remove_module(*this);
-        parent_ = {};
-    }
+    return "Module";
 }
 
-void Module::register_module(std::string name, Module& module, std::optional<int> index)
+// ----- Hierarchy (default implementations) -----
+
+Module* Module::add_child(std::string /*name*/, std::unique_ptr<Module> /*child*/)
 {
-    module.parent_ = this;
-    if (index) {
-        name += ".";
-        name += std::to_string(*index);
+    return nullptr;
+}
+
+Module* Module::child(const std::string& /*name*/) const
+{
+    return nullptr;
+}
+
+void Module::for_each_child(std::function<void(const char*, Module*)> /*visitor*/) const
+{
+    // default: no-op
+}
+
+// ----- Parameters (default implementations) -----
+
+Param Module::param(const std::string& /*name*/)
+{
+    return {};
+}
+
+void Module::for_each_param(std::function<void(const char*, Tensor&)> /*visitor*/)
+{
+    // default: no-op
+}
+
+// ----- Lifecycle -----
+
+void Module::prepare()
+{
+    for_each_child([](const char* /*name*/, Module* child) {
+        if (child)
+            child->prepare();
+    });
+}
+
+// ----- Registry-driven child creation -----
+
+std::unique_ptr<Module> Module::create(const ModuleConfig& config)
+{
+    return ModuleRegistry::instance().create(std::string(config.module_type), config);
+}
+
+Module* Module::create_child(const std::string& name, const ModuleConfig& config)
+{
+    auto mod = create(config);
+    if (!mod) {
+        return nullptr;
     }
-    // std::cout << "register Module " << name << " " << &module << ", parent " << this << "\n";
-    modules_.emplace_back(std::move(name), &module);
+    return add_child(name, std::move(mod));
 }
 
-void Module::register_parameter(std::string name, Tensor& param)
+// ----- Lookup -----
+
+Module* Module::get(const std::string& segment)
 {
-    // std::cout << "register Parameter " << name << " " << &param << " " << param.layout() << "\n";
-    params_.emplace_back(std::move(name), &param);
+    auto* c = child(segment);
+    TM_CHECK(c != nullptr) << "child '" << segment << "' not found in " << type();
+    return c;
 }
 
-void Module::remove_module(Module& module)
+// ----- Verification -----
+
+bool Module::verify(std::vector<std::string>& missing)
 {
-    for (auto it = modules_.begin(); it != modules_.end(); ++it) {
-        if (it->second == &module) {
-            // std::cout << "erase " << it->first << " " << &module << " from " << this << "\n";
-            modules_.erase(it);
-            return;
+    // Recurse into children
+    for_each_child([&](const char* /*name*/, Module* child) {
+        if (child)
+            child->verify(missing);
+    });
+
+    // Check parameters are initialized
+    for_each_param([&](const char* name, Tensor& tensor) {
+        if (!tensor) {
+            missing.push_back(full_path() + "." + name);
         }
+    });
+
+    return missing.empty();
+}
+
+// ----- Utilities -----
+
+std::string Module::full_path() const
+{
+    if (!parent_) {
+        return name_;
+    }
+    std::string pp = parent_->full_path();
+    if (pp.empty()) {
+        return name_;
     }
-    TM_CHECK(0) << "module " << &module << " not found";
+    return pp + "." + name_;
 }
 
-void Module::remove_parameter(Tensor& param)
+// ======================================================================
+// ModuleList
+// ======================================================================
+
+Module* ModuleList::add_child(std::string name, std::unique_ptr<Module> child)
 {
-    for (auto it = params_.begin(); it != params_.end(); ++it) {
-        if (it->second == &param) {
-            params_.erase(it);
-            return;
+    TM_CHECK(child != nullptr);
+    TM_CHECK(child->parent_ == nullptr) << "module already has a parent";
+
+    // Parse index before moving name.
+    int index = -1;
+    {
+        std::istringstream iss(name);
+        iss >> index;
+        if (!iss.eof()) {
+            index = -1;
+        }
+    }
+
+    child->parent_ = this;
+    child->name_   = name;
+
+    Module* raw = child.get();
+    items_.emplace_back(std::move(name), std::move(child));
+
+    if (index >= 0) {
+        if (index >= static_cast<int>(indexed_.size())) {
+            indexed_.resize(index + 1, nullptr);
         }
+        indexed_[index] = raw;
     }
-    TM_CHECK(0) << "param " << &param << " not found";
+
+    return raw;
 }
 
-std::unordered_map<std::string, Tensor*> Module::get_parameters() const
+Module* ModuleList::child(const std::string& name) const
 {
-    std::unordered_map<std::string, Tensor*> m;
-    get_parameters_impl({}, m);
-    return m;
+    for (auto& [n, c] : items_) {
+        if (n == name) {
+            return c.get();
+        }
+    }
+    return nullptr;
 }
 
-void Module::get_parameters_impl(std::string prefix, std::unordered_map<std::string, Tensor*>& m) const
+void ModuleList::for_each_child(std::function<void(const char*, Module*)> visitor) const
 {
-    if (!prefix.empty()) {
-        prefix += ".";
+    for (auto& [name, c] : items_) {
+        visitor(name.c_str(), c.get());
     }
-    for (const auto& [k, v] : params_) {
-        m.emplace(prefix + k, v);
-    }
-    for (const auto& [k, v] : modules_) {
-        v->get_parameters_impl(prefix + k, m);
+}
+
+int ModuleList::size() const
+{
+    int n = 0;
+    for (auto* p : indexed_) {
+        if (p) {
+            ++n;
+        }
     }
+    return n;
 }
 
+// ======================================================================
+// ModuleList registry
+// ======================================================================
+
+TM_MODULE_REGISTER(ModuleList, ModuleListConfig);
+
 }  // namespace turbomind::core
diff --git a/src/turbomind/core/module.h b/src/turbomind/core/module.h
index 147a3d6593..e33c0344ea 100644
--- a/src/turbomind/core/module.h
+++ b/src/turbomind/core/module.h
@@ -2,11 +2,205 @@
 #ifndef TURBOMIND_CORE_MODULE_H
 #define TURBOMIND_CORE_MODULE_H
 
+#include <functional>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "src/turbomind/core/allocator.h"
+#include "src/turbomind/core/check.h"
+#include "src/turbomind/core/data_type.h"
+#include "src/turbomind/core/registry.h"
 #include "src/turbomind/core/tensor.h"
 
 namespace turbomind::core {
 
+// ======================================================================
+// X-macro config field infrastructure
+// ======================================================================
+
+#define TM_MEMBER(Type, name, ...) Type name{__VA_ARGS__};
+#define TM_PTR(Type, name, ...) visitor(#name, &Config::name);
+#define TM_FOR_EACH(ClassName, field_list)                                                                             \
+    template<typename Visitor>                                                                                         \
+    static void for_each(Visitor&& visitor)                                                                            \
+    {                                                                                                                  \
+        using Config = ClassName;                                                                                      \
+        field_list(TM_PTR)                                                                                             \
+    }
+
+// ======================================================================
+// ModuleConfig — plain base for typed config structs
+// ======================================================================
+
+struct ModuleConfig {
+    std::string_view module_type;
+};
+
+struct ModuleListConfig: ModuleConfig {
+    ModuleListConfig(): ModuleConfig{"ModuleList"} {}
+    template<typename Visitor>
+    static void for_each(Visitor&&)
+    {
+    }
+};
+
+// ======================================================================
+// X-macro expansion macros for Module-derived classes
+//
+// Usage in a derived class header:
+//
+//   #define MY_CHILDREN(X) \
+//       X(LinearWeight, w1) \
+//       X(NormWeight,   norm)
+//
+//   #define MY_PARAMS(X) \
+//       X(weight) \
+//       X(bias)
+//
+//   class MyWeight: public Module {
+//   public:
+//       MY_CHILDREN(TM_CHILD_MEMBER)
+//       MY_PARAMS(TM_PARAM_MEMBER)
+//
+//       // Optional: override virtuals using the CASE macros
+//       Module* add_child(std::string name, std::unique_ptr<Module> child) override;
+//       Module* child(const std::string& name) const override;
+//       Param param(const std::string& name) override;
+//       void for_each_child(std::function<void(const char*, Module*)> visitor) const override;
+//       void for_each_param(std::function<void(const char*, Tensor&)> visitor) override;
+//   };
+//
+//   // In the .cc file:
+//   Module* MyWeight::add_child(std::string name, std::unique_ptr<Module> child) {
+//       MY_CHILDREN(TM_ADD_CHILD_CASE)
+//       return nullptr;
+//   }
+//   // ... etc.
+// ======================================================================
+
+/// Declares a unique_ptr<Type> member named `name`.
+#define TM_CHILD_MEMBER(Type, name) std::unique_ptr<Type> name;
+
+/// Declares a Tensor member named `name`.
+#define TM_PARAM_MEMBER(name) core::Tensor name{};
+
+/// Fragment for add_child() override body: matches name and stores child.
+/// Assumes member `std::unique_ptr<Type> name` and local `std::string name_str`.
+#define TM_ADD_CHILD_CASE(Type, name)                                                                                  \
+    if (name_str == #name) {                                                                                           \
+        TM_CHECK_EQ(child->type(), Type().type());                                                                     \
+        name.reset(static_cast<Type*>(child.release()));                                                               \
+        attach_child_(name.get(), this, std::move(name_str));                                                          \
+        return name.get();                                                                                             \
+    }
+
+/// Fragment for child() override body: matches name and returns pointer.
+#define TM_CHILD_CASE(Type, name)                                                                                      \
+    if (name_str == #name) {                                                                                           \
+        return name.get();                                                                                             \
+    }
+
+/// Fragment for param() override body: matches name and returns Param handle.
+#define TM_PARAM_CASE(name)                                                                                            \
+    if (name_str == #name) {                                                                                           \
+        return core::Param{&name};                                                                                     \
+    }
+
+/// Fragment for for_each_child() override body: visits child.
+#define TM_VISIT_CHILD(Type, name) visitor(#name, name.get());
+
+/// Fragment for for_each_param() override body: visits param.
+#define TM_VISIT_PARAM(name) visitor(#name, name);
+
+/// Declares data members (children + params) and virtual method overrides.
+/// Used in the public section of a derived class.
+#define TM_MODULE_DECLARE(Class, ChildrenX, ParamsX)                                                                   \
+    ChildrenX(TM_CHILD_MEMBER) ParamsX(TM_PARAM_MEMBER) core::Module* add_child(                                       \
+        std::string name, std::unique_ptr<Module> child) override;                                                     \
+    core::Module* child(const std::string& name) const override;                                                       \
+    core::Param   param(const std::string& name) override;                                                             \
+    void          for_each_child(std::function<void(const char*, Module*)> visitor) const override;                    \
+    void          for_each_param(std::function<void(const char*, core::Tensor&)> visitor) override;
+
+/// Defines all X-macro generated method bodies for a derived module class.
+/// Used in the .cc file.  ChildrenX/ParamsX may be empty macros.
+#define TM_MODULE_METHODS(Class, ChildrenX, ParamsX)                                                                   \
+    core::Module* Class::add_child(std::string name, std::unique_ptr<core::Module> child)                              \
+    {                                                                                                                  \
+        std::string name_str = std::move(name);                                                                        \
+        ChildrenX(TM_ADD_CHILD_CASE) return nullptr;                                                                   \
+    }                                                                                                                  \
+    core::Module* Class::child(const std::string& name_str) const                                                      \
+    {                                                                                                                  \
+        ChildrenX(TM_CHILD_CASE) return nullptr;                                                                       \
+    }                                                                                                                  \
+    core::Param Class::param(const std::string& name_str)                                                              \
+    {                                                                                                                  \
+        ParamsX(TM_PARAM_CASE) return {};                                                                              \
+    }                                                                                                                  \
+    void Class::for_each_child(std::function<void(const char*, core::Module*)> visitor) const                          \
+    {                                                                                                                  \
+        ChildrenX(TM_VISIT_CHILD)                                                                                      \
+    }                                                                                                                  \
+    void Class::for_each_param(std::function<void(const char*, core::Tensor&)> visitor)                                \
+    {                                                                                                                  \
+        ParamsX(TM_VISIT_PARAM)                                                                                        \
+    }
+
+// ======================================================================
+// Param — lightweight handle to a Module parameter slot
+// ======================================================================
+
+/// Lightweight handle to a Tensor slot within a Module.
+/// Returned by Module::param(name). Used for per-param allocation.
+class Param {
+    Tensor* slot_;
+
+public:
+    Param(Tensor* slot = nullptr): slot_(slot) {}
+
+    /// Allocate the tensor with explicit shape/dtype. Returns the tensor for data copy.
+    Tensor alloc(const std::vector<size_t>& shape, DataType dtype)
+    {
+        TM_CHECK(slot_ != nullptr);
+        auto layout = Layout{std::vector<ssize_t>(shape.begin(), shape.end())};
+        *slot_      = Tensor{std::move(layout), dtype, kDEVICE};
+        return *slot_;
+    }
+
+    /// Get current tensor (empty if not yet allocated).
+    Tensor get() const
+    {
+        return slot_ ? *slot_ : Tensor{};
+    }
+
+    explicit operator bool() const
+    {
+        return slot_ && static_cast<bool>(*slot_);
+    }
+};
+
+// ======================================================================
+// Module — type-erased hierarchical module with virtual lifecycle
+// ======================================================================
+
+/// Type-erased hierarchical module with virtual lifecycle.
+///
+/// The module tree is built explicitly via ``create_child()`` from the Python
+/// loading pipeline. Children are looked up by name; no lazy creation.
+///   - ``prepare()`` runs post-load processing (format conversion, fusion).
+///   - ``verify()`` walks the tree and collects uninitialized params/modules.
+///
+/// Derived classes use X-macro hooks (TM_CHILD_MEMBER, TM_PARAM_MEMBER, etc.)
+/// to declare children and parameters as direct members, overriding the
+/// virtual lookup methods to match by name.
 class Module {
+    friend class ModuleList;
+
 public:
     virtual ~Module();
 
@@ -14,26 +208,133 @@ class Module {
 
     Module(const Module&) = delete;
     Module& operator=(const Module&) = delete;
+    Module(Module&&)                 = delete;
+    Module& operator=(Module&&) = delete;
 
-    Module(Module&&) noexcept = delete;
-    Module& operator=(Module&&) noexcept = delete;
+    // ----- Type info -----
 
-    void register_module(std::string name, Module& module, std::optional<int> index = {});
-    void register_parameter(std::string name, Tensor& param);
+    /// Returns a static string identifying the module type (e.g., "LinearWeight", "NormWeight").
+    virtual const char* type() const;
 
-    void remove_module(Module& module);
-    void remove_parameter(Tensor& param);
+    // ----- Hierarchy (virtual, overridden by derived classes) -----
 
-    std::unordered_map<std::string, Tensor*> get_parameters() const;
+    /// Owns child; registers it under the given local name.
+    /// Returns raw pointer to the added child, or nullptr if name not recognized.
+    /// Default: returns nullptr.
+    virtual Module* add_child(std::string name, std::unique_ptr<Module> child);
 
-private:
-    void get_parameters_impl(std::string prefix, std::unordered_map<std::string, Tensor*>& m) const;
+    /// Find a direct child by name. Default: returns nullptr.
+    virtual Module* child(const std::string& name) const;
+
+    /// Iterate over all children. Default: no-op.
+    virtual void for_each_child(std::function<void(const char*, Module*)> visitor) const;
+
+    // ----- Parameters (virtual, overridden by derived classes) -----
+
+    /// Find a parameter by name within this module. Default: returns empty Param.
+    virtual Param param(const std::string& name);
+
+    /// Iterate over all parameters. Default: no-op.
+    virtual void for_each_param(std::function<void(const char*, Tensor&)> visitor);
+
+    // ----- Lifecycle (virtual, default = recurse / no-op) -----
+
+    /// Post-load processing: weight format conversion, fusion.
+    /// Default recurses into children via for_each_child.
+    virtual void prepare();
+
+    // ----- Registry-driven child creation -----
+
+    /// Create a standalone module using the type registry (no parent binding).
+    static std::unique_ptr<Module> create(const ModuleConfig& config);
+
+    /// Create a child module using the type registry and attach it.
+    /// Uses config.module_type to look up the factory.
+    /// Returns pointer to the created child, or nullptr on failure.
+    Module* create_child(const std::string& name, const ModuleConfig& config = {});
+
+    /// Typed child accessor. Aborts if child not found.
+    template<typename T>
+    T* get(const std::string& name) const
+    {
+        auto* c = child(name);
+        TM_CHECK(c != nullptr) << "child '" << name << "' not found in " << type();
+        return static_cast<T*>(c);
+    }
+
+    /// Find a child by single segment name. Aborts on null.
+    Module* get(const std::string& segment);
+
+    // ----- Verification -----
+
+    /// Walk subtree, collect paths of uninitialized params/modules into ``missing``.
+    /// Composite modules override to also check required children exist.
+    /// Returns true if everything is OK.
+    virtual bool verify(std::vector<std::string>& missing);
+
+    // ----- Utilities -----
+
+    /// Build the fully-qualified path by walking up the parent chain.
+    std::string full_path() const;
+
+    /// Access the parent module (nullptr for root).
+    Module* parent() const noexcept
+    {
+        return parent_;
+    }
+
+    /// Access the local name of this module within its parent.
+    const std::string& name() const noexcept
+    {
+        return name_;
+    }
 
 protected:
-    Module* parent_;
+    Module*     parent_ = nullptr;
+    std::string name_;
+
+    /// Helper for add_child() overrides: sets parent and name on a child module.
+    /// This is needed because derived classes cannot access protected members
+    /// of other Module instances through the C++ protected access rules.
+    static void attach_child_(Module* child, Module* parent, std::string name)
+    {
+        child->parent_ = parent;
+        child->name_   = std::move(name);
+    }
+};
+
+// ======================================================================
+// ModuleList — indexed container for layer/expert sequences
+// ======================================================================
+
+/// A systematic container for indexed module sequences (layers, experts).
+/// Children are added explicitly via ``add_child`` or ``create_child``.
+class ModuleList: public Module {
+public:
+    const char* type() const override
+    {
+        return "ModuleList";
+    }
 
-    std::vector<std::pair<std::string, Module*>> modules_;
-    std::vector<std::pair<std::string, Tensor*>> params_;
+    ModuleList() = default;
+
+    explicit ModuleList(const core::ModuleListConfig&) {}  // empty config, no-op
+
+    /// Override to also track the child in the indexed_ vector.
+    Module* add_child(std::string name, std::unique_ptr<Module> child) override;
+
+    /// Find child by name.
+    Module* child(const std::string& name) const override;
+
+    /// Iterate over children.
+    void for_each_child(std::function<void(const char*, Module*)> visitor) const override;
+
+    /// Number of children created so far.
+    int size() const;
+
+private:
+    std::vector<std::pair<std::string, std::unique_ptr<Module>>> items_;
+    std::vector<Module*>                                         indexed_;
 };
 
 }  // namespace turbomind::core
diff --git a/src/turbomind/core/registry.cc b/src/turbomind/core/registry.cc
new file mode 100644
index 0000000000..5bae97e953
--- /dev/null
+++ b/src/turbomind/core/registry.cc
@@ -0,0 +1,35 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#include "src/turbomind/core/registry.h"
+
+#include "src/turbomind/core/check.h"
+#include "src/turbomind/core/module.h"
+
+namespace turbomind::core {
+
+ModuleRegistry& ModuleRegistry::instance()
+{
+    static ModuleRegistry reg;
+    return reg;
+}
+
+void ModuleRegistry::register_type(const std::string& name, Factory factory)
+{
+    factories_[name] = std::move(factory);
+}
+
+std::unique_ptr<Module> ModuleRegistry::create(const std::string& type, const ModuleConfig& config) const
+{
+    auto it = factories_.find(type);
+    if (it == factories_.end()) {
+        return nullptr;
+    }
+    return it->second(config);
+}
+
+bool ModuleRegistry::has_type(const std::string& name) const
+{
+    return factories_.count(name) > 0;
+}
+
+}  // namespace turbomind::core
diff --git a/src/turbomind/core/registry.h b/src/turbomind/core/registry.h
new file mode 100644
index 0000000000..8401dbcb81
--- /dev/null
+++ b/src/turbomind/core/registry.h
@@ -0,0 +1,57 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+#pragma once
+
+#include <functional>
+#include <map>
+#include <memory>
+#include <string>
+
+namespace turbomind::core {
+
+// Forward declarations — full definitions in module.h.
+struct ModuleConfig;
+class Module;
+
+/// Module type registry. Maps type name strings to factory functions.
+class ModuleRegistry {
+public:
+    using Factory = std::function<std::unique_ptr<Module>(const ModuleConfig&)>;
+
+    static ModuleRegistry& instance();
+
+    /// Register a factory under the given type name.
+    /// Duplicate names overwrite silently.
+    void register_type(const std::string& name, Factory factory);
+
+    /// Convenience overload: derive the factory lambda from the concrete types.
+    /// `CfgT` defaults to `ModuleConfig` so callers that accept the base config
+    /// need not specify it explicitly.
+    template<typename T, typename CfgT = ModuleConfig>
+    void register_type(const std::string& name)
+    {
+        register_type(name, [](const ModuleConfig& cfg) -> std::unique_ptr<Module> {
+            return std::make_unique<T>(static_cast<const CfgT&>(cfg));
+        });
+    }
+
+    /// Create a module instance by type name and typed config.
+    /// Returns nullptr if type name is not registered.
+    std::unique_ptr<Module> create(const std::string& type, const ModuleConfig& config) const;
+
+    /// Check if a type name is registered.
+    bool has_type(const std::string& name) const;
+
+private:
+    ModuleRegistry() = default;
+    std::map<std::string, Factory> factories_;
+};
+
+}  // namespace turbomind::core
+
+#define TM_MODULE_REGISTER(ModuleClass, ConfigType)                                                                    \
+    namespace {                                                                                                        \
+    static const bool _tm_module_registered_ = [] {                                                                    \
+        ::turbomind::core::ModuleRegistry::instance().register_type<ModuleClass, ConfigType>(#ModuleClass);            \
+        return true;                                                                                                   \
+    }();                                                                                                               \
+    }
diff --git a/src/turbomind/core/test_data_format.cc b/src/turbomind/core/test_data_format.cc
new file mode 100644
index 0000000000..fcb4997927
--- /dev/null
+++ b/src/turbomind/core/test_data_format.cc
@@ -0,0 +1,76 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#include "src/turbomind/core/data_format.h"
+
+#include "catch2/catch_test_macros.hpp"
+
+using namespace turbomind;
+
+TEST_CASE("DataFormat default is not quantized", "[data_format]")
+{
+    DataFormat fmt;
+    REQUIRE(!fmt.is_quantized());
+    REQUIRE(fmt.rank() == 0);
+    REQUIRE(!fmt.scales.present());
+    REQUIRE(!fmt.zeros.present());
+}
+
+TEST_CASE("DataFormat trivial is not quantized", "[data_format]")
+{
+    DataFormat fmt = ResolveLinearWeightFormat(kHalf, kHalf, 1, 1);
+    REQUIRE(!fmt.is_quantized());
+    REQUIRE(fmt.rank() == 2);
+    REQUIRE(fmt.block_sizes == std::vector<int>{1, 1});
+    REQUIRE(!fmt.scales.present());
+    REQUIRE(!fmt.zeros.present());
+}
+
+TEST_CASE("DataFormat FP8 blocked", "[data_format]")
+{
+    DataFormat fmt = ResolveLinearWeightFormat(kHalf, kFloat8_e4m3, 128, 128);
+    REQUIRE(fmt.is_quantized());
+    REQUIRE(fmt.dtype == kFloat8_e4m3);
+    REQUIRE(fmt.block_sizes == std::vector<int>{128, 128});
+    REQUIRE(fmt.scales.present());
+    REQUIRE(fmt.scales.dtype == kFloat);
+    REQUIRE(!fmt.zeros.present());
+}
+
+TEST_CASE("DataFormat FP4", "[data_format]")
+{
+    DataFormat fmt = ResolveLinearWeightFormat(kHalf, kFloat4_e2m1, 128, 1);
+    REQUIRE(fmt.is_quantized());
+    REQUIRE(fmt.dtype == kFloat4_e2m1);
+    REQUIRE(fmt.block_sizes == std::vector<int>{128, 1});
+    REQUIRE(fmt.scales.present());
+    REQUIRE(fmt.scales.dtype == kUint8);
+    REQUIRE(!fmt.zeros.present());
+}
+
+TEST_CASE("DataFormat AWQ uint4", "[data_format]")
+{
+    DataFormat fmt = ResolveLinearWeightFormat(kHalf, kUint4, 128, 1);
+    REQUIRE(fmt.is_quantized());
+    REQUIRE(fmt.dtype == kUint4);
+    REQUIRE(fmt.block_sizes == std::vector<int>{128, 1});
+    REQUIRE(fmt.scales.present());
+    REQUIRE(fmt.scales.dtype == kHalf);
+    REQUIRE(fmt.zeros.present());
+    REQUIRE(fmt.zeros.dtype == kHalf);
+}
+
+TEST_CASE("DataFormat uint8 quantized", "[data_format]")
+{
+    DataFormat fmt = ResolveLinearWeightFormat(kBfloat16, kUint8, 64, 1);
+    REQUIRE(fmt.is_quantized());
+    REQUIRE(fmt.block_sizes == std::vector<int>{64, 1});
+    REQUIRE(fmt.scales.dtype == kBfloat16);
+    REQUIRE(fmt.zeros.dtype == kBfloat16);
+}
+
+TEST_CASE("DataFormat trivial BF16", "[data_format]")
+{
+    DataFormat fmt = ResolveLinearWeightFormat(kBfloat16, kBfloat16, 1, 1);
+    REQUIRE(!fmt.is_quantized());
+    REQUIRE(fmt.dtype == kBfloat16);
+}
diff --git a/src/turbomind/engine/engine.cc b/src/turbomind/engine/engine.cc
index 391a034dae..02bd5a9bc8 100644
--- a/src/turbomind/engine/engine.cc
+++ b/src/turbomind/engine/engine.cc
@@ -18,9 +18,12 @@
 
 #include "src/turbomind/core/copy.h"
 #include "src/turbomind/core/logger.h"
+#include "src/turbomind/models/decoder_layer_weight.h"
+#include "src/turbomind/models/delta_net_weight.h"
 #include "src/turbomind/models/language_model.h"
 #include "src/turbomind/models/llama/SequenceManager.h"
 #include "src/turbomind/models/llama/llama_params.h"
+#include "src/turbomind/models/model_weight.h"
 #include "src/turbomind/utils/metrics.h"
 
 // #include "dbg.h"
@@ -53,14 +56,14 @@ struct Engine::Impl {
     using Requests = vector<shared_ptr<Request>>;
     using Signal   = std::function<void()>;
 
-    Impl(DataType      dtype,
-         EngineParam   param,
-         LanguageModel model,
-         Context&      ctx,
-         Gateway&      gateway,
-         int           device_id,
-         int           queue_id,
-         int           phases);
+    Impl(EngineParam        param,
+         LanguageModel      model,
+         const ModelWeight& weights,
+         Context&           ctx,
+         Gateway&           gateway,
+         int                device_id,
+         int                queue_id,
+         int                phases);
 
     void CreateSequenceManager();
 
@@ -102,7 +105,6 @@ struct Engine::Impl {
 
     ~Impl();
 
-    const DataType    dtype_;
     const EngineParam param_;
 
     Gateway& gateway_;
@@ -126,8 +128,9 @@ struct Engine::Impl {
     Queue<unique_ptr<BatchData>> inbound_;
     Queue<unique_ptr<BatchData>> outbound_;
 
-    LanguageModel model_;
-    ModelExecutor executor_;
+    LanguageModel      model_;
+    const ModelWeight& weights_;
+    ModelExecutor      executor_;
 
     std::thread internal_thread_;
 
@@ -172,15 +175,14 @@ Engine::Impl::~Impl()
     executor_ = {};
 }
 
-Engine::Impl::Impl(DataType      dtype,
-                   EngineParam   param,
-                   LanguageModel model,
-                   Context&      ctx,
-                   Gateway&      gateway,
-                   int           device_id,
-                   int           queue_id,
-                   int           phases):
-    dtype_{dtype},
+Engine::Impl::Impl(EngineParam        param,
+                   LanguageModel      model,
+                   const ModelWeight& weights,
+                   Context&           ctx,
+                   Gateway&           gateway,
+                   int                device_id,
+                   int                queue_id,
+                   int                phases):
     param_{param},
     gateway_{gateway},
     tp_group_{ctx.comm.h_tp_group},
@@ -192,7 +194,8 @@ Engine::Impl::Impl(DataType      dtype,
     queue_id_{queue_id},
     async_{phases > 1},
     is_warm_up_{*ctx.is_warm_up},
-    model_{std::move(model)}
+    model_{std::move(model)},
+    weights_{weights}
 {
     states_.emplace_back();
 
@@ -204,26 +207,53 @@ Engine::Impl::Impl(DataType      dtype,
 
     CreateSequenceManager();  // initializes `session_len_trunc_`
 
-    const ssize_t max_batch_block_num =
-        param.max_batch_size * cdiv(session_len_trunc_, model_.attn_param().cache_block_seq_len);
-    block_ptrs_buf_         = {max_batch_block_num, kCPUpinned};
-    block_ptrs_offsets_buf_ = {param.max_batch_size + 1, kCPUpinned};
+    const ssize_t max_batch_block_num = param.max_batch_size * cdiv(session_len_trunc_, param_.cache_block_seq_len);
+    block_ptrs_buf_                   = {max_batch_block_num, kCPUpinned};
+    block_ptrs_offsets_buf_           = {param.max_batch_size + 1, kCPUpinned};
 }
 
 void Engine::Impl::CreateSequenceManager()
 {
-    const auto cache_block_seq_len = model_.attn_param().cache_block_seq_len;
+    const auto cache_block_seq_len = param_.cache_block_seq_len;
+
+    // Derive DeltaNet fields if linear attention exists
+    bool has_linear_attention = false;
+    int  linear_key_head_dim = 0, linear_value_head_dim = 0;
+    int  linear_conv_kernel_dim = 0, linear_num_key_heads = 0, linear_num_value_heads = 0;
+    for (int i = 0; i < weights_.num_layer; ++i) {
+        if (auto* dn = weights_.layer(i)->linear_attn.get()) {
+            has_linear_attention   = true;
+            linear_key_head_dim    = dn->key_head_dim;
+            linear_value_head_dim  = dn->value_head_dim;
+            linear_conv_kernel_dim = dn->d_conv;
+            linear_num_key_heads   = dn->num_k_heads * param_.attn_tp_size;
+            linear_num_value_heads = dn->num_v_heads * param_.attn_tp_size;
+            break;
+        }
+    }
 
-    const auto& model_param = model_.model_param();
+    if (has_linear_attention && param_.enable_prefix_caching) {
+        TM_CHECK(0) << "Prefix caching is unsupported when linear attention is present";
+    }
 
-    const auto get_free_size = [&] {  //
+    const auto get_free_size = [&] {
         size_t free{}, total{};
         check_cuda_error(cudaMemGetInfo(&free, &total));
         return AllReduce(tp_group_, free, comm::RedOp::kMin);
     };
 
-    seq_mgr_ = std::make_unique<SequenceManager>(model_param,
-                                                 dtype_,
+    seq_mgr_ = std::make_unique<SequenceManager>(weights_.head_dim,
+                                                 weights_.kv_head_num / param_.attn_tp_size,
+                                                 weights_.num_layer,
+                                                 weights_.layer_types,
+                                                 param_.quant_policy,
+                                                 weights_.data_type,
+                                                 weights_.data_type,  // runtime_dtype = data_type
+                                                 linear_key_head_dim,
+                                                 linear_value_head_dim,
+                                                 linear_conv_kernel_dim,
+                                                 linear_num_key_heads,
+                                                 linear_num_value_heads,
                                                  cache_block_seq_len,
                                                  param_.attn_tp_size,
                                                  param_.max_batch_size,
@@ -248,7 +278,13 @@ void Engine::Impl::Validate(Requests& infer_reqs, Requests& kill_reqs)
     std::pmr::monotonic_buffer_resource    mbr;
     std::pmr::unordered_map<uint64_t, int> occur(&mbr);
 
-    const bool has_linear_attention = HasLinearAttention(model_.model_param());
+    bool has_linear_attention = false;
+    for (auto t : weights_.layer_types) {
+        if (t == 1) {
+            has_linear_attention = true;
+            break;
+        }
+    }
 
     auto count = [&occur](const auto& reqs) {
         for (const auto& r : reqs) {
@@ -874,15 +910,15 @@ Engine::Engine()                  = default;
 Engine::Engine(Engine&&) noexcept = default;
 Engine& Engine::operator=(Engine&&) noexcept = default;
 
-Engine::Engine(DataType      dtype,
-               EngineParam   param,
-               LanguageModel model,
-               Context&      ctx,
-               Gateway&      gateway,
-               int           device_id,
-               int           dp_rank,
-               int           phases):
-    impl_{std::make_unique<Impl>(dtype, param, std::move(model), ctx, gateway, device_id, dp_rank, phases)}
+Engine::Engine(EngineParam        param,
+               LanguageModel      model,
+               const ModelWeight& weights,
+               Context&           ctx,
+               Gateway&           gateway,
+               int                device_id,
+               int                dp_rank,
+               int                phases):
+    impl_{std::make_unique<Impl>(param, std::move(model), weights, ctx, gateway, device_id, dp_rank, phases)}
 {
 }
 
diff --git a/src/turbomind/engine/engine.h b/src/turbomind/engine/engine.h
index ea26d196a7..b7e75d268c 100644
--- a/src/turbomind/engine/engine.h
+++ b/src/turbomind/engine/engine.h
@@ -26,14 +26,14 @@ class Engine {
         return static_cast<bool>(impl_);
     }
 
-    Engine(DataType      dtype,
-           EngineParam   param,
-           LanguageModel model,
-           Context&      ctx,
-           Gateway&      gateway,
-           int           device_id,
-           int           queue_id,
-           int           phases);
+    Engine(EngineParam        param,
+           LanguageModel      model,
+           const ModelWeight& weights,
+           Context&           ctx,
+           Gateway&           gateway,
+           int                device_id,
+           int                queue_id,
+           int                phases);
 
     void Start();
 
diff --git a/src/turbomind/engine/engine_config.h b/src/turbomind/engine/engine_config.h
new file mode 100644
index 0000000000..2c0381e9c4
--- /dev/null
+++ b/src/turbomind/engine/engine_config.h
@@ -0,0 +1,46 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "src/turbomind/core/data_type.h"
+#include "src/turbomind/core/module.h"
+
+namespace turbomind {
+
+struct EngineConfig {
+#define ENGINE_FIELDS(X)                                                                                               \
+    X(DataType, data_type)                                                                                             \
+    X(int, cache_block_seq_len, 0)                                                                                     \
+    X(int, quant_policy, 0)                                                                                            \
+    X(int, tune_layer_num, 1)                                                                                          \
+    X(int, max_batch_size, 0)                                                                                          \
+    X(int, max_prefill_token_num, 0)                                                                                   \
+    X(int, max_context_token_num, 0)                                                                                   \
+    X(int, session_len, 0)                                                                                             \
+    X(float, cache_max_block_count, 0)                                                                                 \
+    X(int, cache_chunk_size, 0)                                                                                        \
+    X(bool, enable_prefix_caching, false)                                                                              \
+    X(bool, enable_metrics, false)                                                                                     \
+    X(int, num_tokens_per_iter, 0)                                                                                     \
+    X(int, max_prefill_iters, 1)                                                                                       \
+    X(int, async_, 0)                                                                                                  \
+    X(int, outer_dp_size)                                                                                              \
+    X(int, attn_dp_size)                                                                                               \
+    X(int, attn_tp_size)                                                                                               \
+    X(int, attn_cp_size)                                                                                               \
+    X(int, mlp_tp_size)                                                                                                \
+    X(std::vector<int>, devices)                                                                                       \
+    X(int, nnodes)                                                                                                     \
+    X(int, node_rank)                                                                                                  \
+    X(std::string, communicator)
+
+    ENGINE_FIELDS(TM_MEMBER)
+    TM_FOR_EACH(EngineConfig, ENGINE_FIELDS)
+
+#undef ENGINE_FIELDS
+};
+
+}  // namespace turbomind
diff --git a/src/turbomind/kernels/gemm/CMakeLists.txt b/src/turbomind/kernels/gemm/CMakeLists.txt
index 0cc5ba8d37..fe666fa2ac 100644
--- a/src/turbomind/kernels/gemm/CMakeLists.txt
+++ b/src/turbomind/kernels/gemm/CMakeLists.txt
@@ -51,15 +51,14 @@ set_property(TARGET gemm2 PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
 
 
 if (BUILD_TEST)
-        add_executable(test_gemm_v2
-                test/test_gemm_v2.cc
-                ../../models/llama/LlamaLinear.cu
-                ../../models/llama/LlamaDenseWeight.cc
-                test/reference.cu)
-        target_link_libraries(test_gemm_v2 PRIVATE gemm2 core cublas quantization_kernels gpt_kernels)
+        # add_executable(test_gemm_v2
+        #         test/test_gemm_v2.cc
+        #         ../../models/llama/LlamaLinear.cu
+        #         test/reference.cu)
+        # target_link_libraries(test_gemm_v2 PRIVATE gemm2 core cublas quantization_kernels gpt_kernels)
 
-        add_executable(test_moe_utils test/test_moe_utils.cu test/test_utils.cu)
-        target_link_libraries(test_moe_utils PRIVATE gemm2 core cublas)
+        # add_executable(test_moe_utils test/test_moe_utils.cu test/test_utils.cu)
+        # target_link_libraries(test_moe_utils PRIVATE gemm2 core cublas)
 
         # if (NOT MSVC)
         #         FetchContent_Declare(
diff --git a/src/turbomind/kernels/gemm/convert_v3.cu b/src/turbomind/kernels/gemm/convert_v3.cu
index 39fef1a858..36dabc301e 100644
--- a/src/turbomind/kernels/gemm/convert_v3.cu
+++ b/src/turbomind/kernels/gemm/convert_v3.cu
@@ -112,7 +112,7 @@ std::array<const LayoutConverter*, 2> GetConverters(DataType data_type,
             // clang-format on
         }
         else {
-            return {};  //  trivial case: dense floating point
+            return {};  //  trivial case: no quantization
         }
     }
 
diff --git a/src/turbomind/kernels/gemm/kernel_impl_sm90.h b/src/turbomind/kernels/gemm/kernel_impl_sm90.h
index 14ebd5d78b..e787d2d701 100644
--- a/src/turbomind/kernels/gemm/kernel_impl_sm90.h
+++ b/src/turbomind/kernels/gemm/kernel_impl_sm90.h
@@ -173,6 +173,8 @@ class KernelImplSm90: public Kernel {
         [[maybe_unused]] const int n = Ddesc.cols;
         [[maybe_unused]] const int k = Adesc.cols;
 
+        TM_CHECK_GE(cdiv(k, TILE_K), 2) << "The kernel requires at least 2 k-tiles to work";
+
         // std::cout << "M: " << m << ", N: " << n << ", K: " << k << "\n";
 
         auto transpose = [](MatrixLayout x) {
diff --git a/src/turbomind/kernels/gemm/test/testbed_v3.h b/src/turbomind/kernels/gemm/test/testbed_v3.h
index f1df7456d5..677bd7ce60 100644
--- a/src/turbomind/kernels/gemm/test/testbed_v3.h
+++ b/src/turbomind/kernels/gemm/test/testbed_v3.h
@@ -11,7 +11,7 @@
 #include "src/turbomind/kernels/gemm/types.h"
 #include "src/turbomind/kernels/quantization.h"
 
-#include "src/turbomind/models/llama/LlamaDenseWeight.h"
+#include "src/turbomind/models/linear_weight.h"
 #include "src/turbomind/models/llama/LlamaLinear.h"
 
 #include "src/turbomind/kernels/gpt_kernels.h"
@@ -21,8 +21,7 @@ namespace turbomind {
 using std::vector;
 using std::unique_ptr;
 
-using DenseWeight = LlamaDenseWeight;
-using Linear      = LlamaLinear;
+using Linear = LlamaLinear;
 
 using namespace gemm;
 
@@ -77,6 +76,55 @@ static Tensor CopyTransposed(const Tensor& src, Tensor out = {})
     return out;
 }
 
+/// Link individual expert weights into a batched block view for fused MoE.
+static void LinkExperts(std::function<LinearWeight*(int)> experts, int n, LinearWeight& d)
+{
+    const auto& e0 = *experts(0);
+
+    e0.copy_metadata_to(d);
+
+    d.k_desc.num = d.q_desc.num = n;
+
+    if (e0.bias()) {
+        d.bias() = Tensor{{n, e0.output_dim}, e0.bias().dtype(), kDEVICE};
+    }
+
+    std::vector<std::pair<void*, int>> weights;
+    std::vector<std::pair<void*, int>> scales;
+
+    for (int i = 0; i < n; ++i) {
+        auto& e = *experts(i);
+        weights.emplace_back(e.weight().raw_data(), e.k_desc.ld);
+        if (e.scales()) {
+            scales.emplace_back(e.scales().raw_data(), e.q_desc.ld);
+        }
+        if (e.bias()) {
+            Copy(e.bias(), d.bias().slice(i, 1).squeeze(0));
+        }
+    }
+
+    auto stream = core::Context::stream().handle();
+
+    if (d.weight_format.dtype == kFloat8_e4m3 && d.input_dtype() == kFloat8_e4m3) {
+        auto make_blocked_ptr = [&](const auto& ptrs) {
+            return std::shared_ptr<void>{gemm::MakeBlockedPtrs(ptrs, stream), [](auto p) { cudaFree(p); }};
+        };
+        d.weight()       = Tensor{make_blocked_ptr(weights), {n}, e0.weight().dtype(), kDEVICE};
+        d.scales()       = Tensor{make_blocked_ptr(scales), {n}, e0.scales().dtype(), kDEVICE};
+        d.k_desc.offsets = d.q_desc.offsets = (int*)1;
+    }
+    else {
+        auto make_strided_ptr = [&](const auto& ptrs) {
+            return std::shared_ptr<void>{gemm::MakeStridedPtrs(ptrs, stream), [](auto p) { cudaFree(p); }};
+        };
+        d.weight() = Tensor{make_strided_ptr(weights), {n}, d.weight_format.dtype, kDEVICE};
+        if (e0.scales()) {
+            d.scales() = Tensor{make_strided_ptr(scales), {n}, e0.scales().dtype(), kDEVICE};
+        }
+        d.k_desc.ld = d.q_desc.ld = 0;
+    }
+}
+
 struct Testbed_v3: Parameter {
 
     Testbed_v3(const Parameter& param): Parameter{param}, stream_{core::Context::stream().handle()}, linear_{}
@@ -100,14 +148,14 @@ struct Testbed_v3: Parameter {
 
         cudaGetDeviceProperties(&prop_, 0);
 
-        w_original_ = std::make_unique<DenseWeight>();
-        w_quant_    = std::make_unique<DenseWeight>();
-        w_dequant_  = std::make_unique<DenseWeight>();
+        w_original_ = std::make_unique<LinearWeight>();
+        w_quant_    = std::make_unique<LinearWeight>();
+        w_dequant_  = std::make_unique<LinearWeight>();
 
         for (int i = 0; i < expert_num; ++i) {
-            e_original_.push_back(std::make_unique<DenseWeight>());
-            e_quant_.push_back(std::make_unique<DenseWeight>());
-            e_dequant_.push_back(std::make_unique<DenseWeight>());
+            e_original_.push_back(std::make_unique<LinearWeight>());
+            e_quant_.push_back(std::make_unique<LinearWeight>());
+            e_dequant_.push_back(std::make_unique<LinearWeight>());
         }
 
         GenerateWeight();
@@ -237,44 +285,57 @@ struct Testbed_v3: Parameter {
 
     // - quantize weight
     // - dequantize weight
-    void GenerateWeight(DenseWeight& original, DenseWeight& quant, DenseWeight& dequant)
+    void GenerateWeight(LinearWeight& original, LinearWeight& quant, LinearWeight& dequant)
     {
-        original.emplace(input_dim, output_dim, data_type, false, data_type, group_size);
-        rng_.NormalFloat(original.weight, 1., .1);
-
-        quant.emplace(input_dim, output_dim, data_type, false, weight_type, group_size);
-        dequant.emplace(input_dim, output_dim, data_type, false, data_type, group_size);
+        auto make_cfg = [&](DataType wt) -> core::LinearConfig {
+            core::LinearConfig cfg;
+            cfg.input_dim  = input_dim;
+            cfg.output_dim = output_dim;
+            cfg.data_type  = data_type;
+            cfg.format     = ResolveLinearWeightFormat(data_type, wt, group_size, 1);
+            cfg.has_bias   = false;
+            return cfg;
+        };
+
+        new (&original) LinearWeight(make_cfg(data_type));
+        original.param("weight").alloc({(size_t)input_dim, (size_t)output_dim}, data_type);
+        rng_.NormalFloat(original.weight(), 1., .1);
+
+        new (&quant) LinearWeight(make_cfg(weight_type));
+        quant.param("weight").alloc({(size_t)input_dim, (size_t)output_dim}, weight_type);
+        new (&dequant) LinearWeight(make_cfg(data_type));
+        dequant.param("weight").alloc({(size_t)input_dim, (size_t)output_dim}, data_type);
 
         Buffer_<unsigned> rbits;
-        // rbits = {original.weight.size(), kDEVICE};
+        // rbits = {original.weight().size(), kDEVICE};
         // rng_.RandomBytes(Tensor{rbits});
 
         /// Weights are allocated in MN-major, but some quantization requires K-major tensor
 
         if (weight_type == data_type) {
-            Copy(original.weight, quant.weight);
-            Copy(original.weight, dequant.weight);
+            Copy(original.weight(), quant.weight());
+            Copy(original.weight(), dequant.weight());
         }
         else if (weight_type == kFloat8_e4m3) {
-            QuantizeSymmBlock(quant.weight, quant.scales, original.weight, stream_);
-            DequantizeSymmBlock(dequant.weight, quant.weight, quant.scales, stream_);
+            QuantizeSymmBlock(quant.weight(), quant.scales(), original.weight(), stream_);
+            DequantizeSymmBlock(dequant.weight(), quant.weight(), quant.scales(), stream_);
         }
         else if (weight_type == kUint4) {
             /// Weights are allocated in (M,N), quantization needs K-major tensor
-            QuantizeGroupwise(quant.weight.t(),
-                              quant.scales.t(),
-                              quant.zeros.t(),
-                              dequant.weight.t(),
-                              original.weight.t(),
+            QuantizeGroupwise(quant.weight().t(),
+                              quant.scales().t(),
+                              quant.zeros().t(),
+                              dequant.weight().t(),
+                              original.weight().t(),
                               {},
                               group_size);
         }
         else if (weight_type == kFloat4_e2m1) {
-            QuantizeGroupwise(quant.weight.t(),  //
-                              quant.scales.t(),
+            QuantizeGroupwise(quant.weight().t(),  //
+                              quant.scales().t(),
                               {},
-                              dequant.weight.t(),
-                              original.weight.t(),
+                              dequant.weight().t(),
+                              original.weight().t(),
                               rbits,
                               group_size);
         }
@@ -282,9 +343,9 @@ struct Testbed_v3: Parameter {
             TM_CHECK(0);
         }
 
-        original.prepare(0);
-        quant.prepare(expert_num > 0);
-        dequant.prepare(0);
+        original.prepare();
+        quant.prepare();
+        dequant.prepare();
     }
 
     void GetReference()
@@ -299,7 +360,7 @@ struct Testbed_v3: Parameter {
         }
     }
 
-    void GetReference(const Tensor& x, const unique_ptr<DenseWeight>& dense, Ref<Tensor> d_)
+    void GetReference(const Tensor& x, const unique_ptr<LinearWeight>& dense, Ref<Tensor> d_)
     {
         auto& d = d_.get();
         if (!d) {
@@ -311,7 +372,7 @@ struct Testbed_v3: Parameter {
         ref_.gemm(x.raw_data(), desc_A, dense->weight.raw_data(), dense->k_desc, d.raw_data(), desc_D);
     }
 
-    void GetReference(const Tensor& x, const vector<unique_ptr<DenseWeight>>& experts, Ref<Tensor> d_)
+    void GetReference(const Tensor& x, const vector<unique_ptr<LinearWeight>>& experts, Ref<Tensor> d_)
     {
         Tensor xe{{x.shape(0) * experts_per_token, input_dim}, data_type, kDEVICE};
         Tensor de{{x.shape(0) * experts_per_token, output_dim}, data_type, kDEVICE};
@@ -376,7 +437,7 @@ struct Testbed_v3: Parameter {
         }
     }
 
-    void Run(const Tensor& x, const vector<unique_ptr<DenseWeight>>& experts) {}
+    void Run(const Tensor& x, const vector<unique_ptr<LinearWeight>>& experts) {}
 
     void Compare()
     {
@@ -421,9 +482,9 @@ struct Testbed_v3: Parameter {
     Linear linear_;
 
     // ! weights are non-movable
-    unique_ptr<DenseWeight> w_original_;
-    unique_ptr<DenseWeight> w_quant_;
-    unique_ptr<DenseWeight> w_dequant_;
+    unique_ptr<LinearWeight> w_original_;
+    unique_ptr<LinearWeight> w_quant_;
+    unique_ptr<LinearWeight> w_dequant_;
 
     Tensor x_original_;
     Tensor x_quant_, x_scale_;
@@ -433,9 +494,9 @@ struct Testbed_v3: Parameter {
     Tensor d_quant_;     // x_original * w_quant, quant for X done by `Linear`
     Tensor d_dequant_;   // x_dequant  * w_dequant
 
-    vector<unique_ptr<DenseWeight>> e_original_;
-    vector<unique_ptr<DenseWeight>> e_quant_;
-    vector<unique_ptr<DenseWeight>> e_dequant_;
+    vector<unique_ptr<LinearWeight>> e_original_;
+    vector<unique_ptr<LinearWeight>> e_quant_;
+    vector<unique_ptr<LinearWeight>> e_dequant_;
 
     Buffer_<int> f2n_;
     Buffer_<int> en2f_;
diff --git a/src/turbomind/kernels/quantization.cu b/src/turbomind/kernels/quantization.cu
index 7899226f33..8dc07b85ed 100644
--- a/src/turbomind/kernels/quantization.cu
+++ b/src/turbomind/kernels/quantization.cu
@@ -66,9 +66,9 @@ void QuantizeSymm(Tensor& out, Tensor& scale, const Tensor& src, cudaStream_t st
     TM_CHECK_EQ(src.ndim(), 2);
     TM_CHECK_EQ(src.stride(1), 1);  // row-major
 
-    const auto [num, dim] = src.shapes(0, 1);
+    const auto num = src.shape(0);
+    const auto dim = src.shape(1);
 
-    using T      = bfloat16_t;
     using Tout   = fp8_e4m3_t;
     using Tscale = float;
 
@@ -99,15 +99,20 @@ void QuantizeSymm(Tensor& out, Tensor& scale, const Tensor& src, cudaStream_t st
 
     constexpr int block_dim = 512;
 
-    quant_symm_row<vec_size, group_size><<<num, block_dim, 0, st>>>(out.data<Tout>(),  //
-                                                                    out.stride(0),
-                                                                    scale.data<Tscale>(),
-                                                                    scale.stride(0),
-                                                                    src.data<T>(),
-                                                                    src.stride(0),
-                                                                    num,
-                                                                    dim,
-                                                                    448.f);
+    auto invoke = [&](auto t) {
+        using T = decltype(t);
+        quant_symm_row<vec_size, group_size><<<num, block_dim, 0, st>>>(out.data<Tout>(),  //
+                                                                        out.stride(0),
+                                                                        scale.data<Tscale>(),
+                                                                        scale.stride(0),
+                                                                        src.data<T>(),
+                                                                        src.stride(0),
+                                                                        num,
+                                                                        dim,
+                                                                        448.f);
+    };
+
+    TM_DISPATCH_PRIMARY_DTYPES(src.dtype(), invoke);
 }
 
 template<int vec_size, int group_size, class Tout, class Tscale, class T>
diff --git a/src/turbomind/models/CMakeLists.txt b/src/turbomind/models/CMakeLists.txt
index 10b8fe9d00..58a5700f98 100644
--- a/src/turbomind/models/CMakeLists.txt
+++ b/src/turbomind/models/CMakeLists.txt
@@ -4,13 +4,19 @@ add_library(models STATIC
         language_model.cc
         input_processor.cc
         output_processor.cc
+        linear_weight.cc
+        norm_weight.cc
+        attention_weight.cc
+        ffn_weight.cc
+        moe_weight.cc
+        delta_net_weight.cc
+        decoder_layer_weight.cc
+        model_weight.cc
+        model_root.cc
         llama/LlamaLinear.cu
         llama/BlockManager.cc
         llama/BlockTrie.cc
         llama/SequenceManager.cc
-        llama/LlamaWeight.cc
-        llama/LlamaDenseWeight.cc
-        llama/LlamaDecoderLayerWeight.cc
         llama/LlamaFfnLayer.cc
         llama/moe_ffn_layer.cc
         llama/unified_decoder.cc
@@ -18,7 +24,6 @@ add_library(models STATIC
         llama/llama_kernels.cu
         llama/llama_utils.cu
         llama/mla_utils.cu
-        llama/GatedDeltaNetWeight.cc
         llama/GatedDeltaNetLayer.cc
         llama/gated_delta_net_kernels.cu)
 set_property(TARGET models PROPERTY POSITION_INDEPENDENT_CODE ON)
diff --git a/src/turbomind/models/attention_weight.cc b/src/turbomind/models/attention_weight.cc
new file mode 100644
index 0000000000..f3cb16ec99
--- /dev/null
+++ b/src/turbomind/models/attention_weight.cc
@@ -0,0 +1,94 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#include "src/turbomind/models/attention_weight.h"
+
+#include "src/turbomind/core/registry.h"
+#include "src/turbomind/kernels/core/math.h"
+#include "src/turbomind/models/llama/llama_rope.h"
+
+namespace turbomind {
+
+AttentionWeight::AttentionWeight(const core::AttentionConfig& cfg):
+    hidden_dim(cfg.hidden_dim),
+    head_dim(cfg.head_dim),
+    head_num(cfg.head_num),
+    kv_head_num(cfg.kv_head_num),
+    kv_lora_rank(cfg.kv_lora_rank),
+    q_lora_rank(cfg.q_lora_rank),
+    qk_rope_dim(cfg.qk_rope_dim),
+    v_head_dim(cfg.v_head_dim),
+    tp_size(cfg.tp_size),
+    tp_rank(cfg.tp_rank),
+    data_type(cfg.data_type),
+    window_size(cfg.window_size),
+    output_gate(cfg.output_gate),
+    softmax_scale(cfg.softmax_scale),
+    use_logn_attn(cfg.use_logn_attn),
+    rope(cfg.rope)
+{
+}
+
+void AttentionWeight::prepare()
+{
+    Module::prepare();
+}
+
+void init_rope_kernel_param(const core::RopeConfig& rope, RopeKernelParam& rope_kernel)
+{
+    auto rope_type = static_cast<RopeType>(rope.type);
+
+    rope_kernel.type         = rope_type;
+    rope_kernel.dim          = rope.dim;
+    rope_kernel.scale_factor = -std::log2(rope.base) / rope.dim;
+    if (rope_type == RopeType::kDynamic) {
+        rope_kernel.inv_factor = 1.f;
+    }
+    else {
+        rope_kernel.inv_factor = (rope.factor != 0.f) ? 1.0 / rope.factor : 1.f;
+    }
+
+    if (rope_type == RopeType::kYarn) {
+        auto&        dst = rope_kernel.yarn;
+        const double PI  = 3.14159265358979323846;
+
+        auto find_correction_dim = [&](float num_rotations) {
+            return (rope.dim * std::log(rope.max_position_embeddings / (num_rotations * 2 * PI)))
+                   / (2 * std::log(rope.base));
+        };
+
+        auto find_correction_range = [&](float low_rot, float high_rot, float& low, float& high) {
+            low  = std::floor(find_correction_dim(low_rot));
+            high = std::ceil(find_correction_dim(high_rot));
+            low  = std::max(low, 0.f);
+            high = std::min(high, rope.dim - 1.f);
+        };
+
+        float low, high;
+        find_correction_range(rope.yarn_beta_fast, rope.yarn_beta_slow, low, high);
+        if (low == high) {
+            high += 0.001f;
+        }
+        dst.ramp_inv_factor_div_2   = 1.0 / (high - low) / 2.0;
+        dst.ramp_inv_factor_mul_min = 1.0 / (high - low) * low;
+        dst.attention_factor        = rope.yarn_attention_factor;
+    }
+    else if (rope_type == RopeType::kLlama3) {
+        auto& dst = rope_kernel.llama3;
+
+        float inv_diff_freq_factor = 1.0 / (rope.llama3_high_freq_factor - rope.llama3_low_freq_factor);
+        dst.alpha = rope.llama3_original_max_position_embeddings / (2 * 3.14159265358979323846) * inv_diff_freq_factor;
+        dst.beta  = rope.llama3_low_freq_factor * inv_diff_freq_factor;
+    }
+    else if (rope_type == RopeType::kMrope) {
+        auto& dst     = rope_kernel.mrope;
+        dst.section.x = rope.mrope_section[0] * 2;
+        dst.section.y = rope.mrope_section[1] * 2 + dst.section.x;
+        dst.section.z = rope.mrope_section[2] * 2 + dst.section.y;
+    }
+}
+
+TM_MODULE_REGISTER(AttentionWeight, core::AttentionConfig);
+
+TM_MODULE_METHODS(AttentionWeight, ATTENTION_WEIGHT_CHILDREN, ATTENTION_WEIGHT_PARAMS)
+
+}  // namespace turbomind
diff --git a/src/turbomind/models/attention_weight.h b/src/turbomind/models/attention_weight.h
new file mode 100644
index 0000000000..878c528dba
--- /dev/null
+++ b/src/turbomind/models/attention_weight.h
@@ -0,0 +1,126 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+#pragma once
+
+#include <array>
+
+#include "src/turbomind/core/core.h"
+#include "src/turbomind/core/module.h"
+#include "src/turbomind/models/linear_weight.h"
+#include "src/turbomind/models/norm_weight.h"
+
+namespace turbomind::core {
+
+using MropeSection = std::array<int, 3>;
+
+struct RopeConfig {
+#define ROPE_FIELDS(X)                                                                                                 \
+    X(int, type, 0)                                                                                                    \
+    X(float, base, 10000.f)                                                                                            \
+    X(int, dim, 0)                                                                                                     \
+    X(float, factor, 1.f)                                                                                              \
+    X(int, max_position_embeddings, 0)                                                                                 \
+    X(float, yarn_attention_factor, 1.f)                                                                               \
+    X(float, yarn_beta_fast, 32.f)                                                                                     \
+    X(float, yarn_beta_slow, 1.f)                                                                                      \
+    X(float, llama3_low_freq_factor, 1.f)                                                                              \
+    X(float, llama3_high_freq_factor, 4.f)                                                                             \
+    X(int, llama3_original_max_position_embeddings, 0)                                                                 \
+    X(MropeSection, mrope_section, {})
+
+    ROPE_FIELDS(TM_MEMBER)
+    TM_FOR_EACH(RopeConfig, ROPE_FIELDS)
+
+#undef ROPE_FIELDS
+};
+
+struct AttentionConfig: ModuleConfig {
+    AttentionConfig(): ModuleConfig{"AttentionWeight"} {}
+
+#define ATTENTION_FIELDS(X)                                                                                            \
+    X(int, hidden_dim)                                                                                                 \
+    X(int, head_dim)                                                                                                   \
+    X(int, head_num)                                                                                                   \
+    X(int, kv_head_num)                                                                                                \
+    X(int, kv_lora_rank)                                                                                               \
+    X(int, q_lora_rank)                                                                                                \
+    X(int, qk_rope_dim)                                                                                                \
+    X(int, v_head_dim)                                                                                                 \
+    X(int, tp_size)                                                                                                    \
+    X(int, tp_rank)                                                                                                    \
+    X(DataType, data_type)                                                                                             \
+    X(int, window_size, 0)                                                                                             \
+    X(bool, output_gate, 0)                                                                                            \
+    X(RopeConfig, rope, {})                                                                                            \
+    X(int, qk_nope_dim)                                                                                                \
+    X(float, softmax_scale, 0.f)                                                                                       \
+    X(bool, use_logn_attn, false)
+
+    ATTENTION_FIELDS(TM_MEMBER)
+    TM_FOR_EACH(AttentionConfig, ATTENTION_FIELDS)
+
+#undef ATTENTION_FIELDS
+};
+
+}  // namespace turbomind::core
+
+namespace turbomind {
+
+struct RopeKernelParam;
+void init_rope_kernel_param(const core::RopeConfig& rope, RopeKernelParam& rope_kernel);
+
+class AttentionWeight: public core::Module {
+public:
+    const char* type() const override
+    {
+        return "AttentionWeight";
+    }
+
+    AttentionWeight() = default;
+
+    AttentionWeight(const core::AttentionConfig& cfg);
+
+    void prepare() override;
+
+    // --- X-macro field lists ---
+#define ATTENTION_WEIGHT_CHILDREN(X)                                                                                   \
+    X(LinearWeight, w_qkv)                                                                                             \
+    X(LinearWeight, wo)                                                                                                \
+    X(LinearWeight, q_proj)                                                                                            \
+    X(LinearWeight, q_a_proj)                                                                                          \
+    X(LinearWeight, q_b_proj)                                                                                          \
+    X(LinearWeight, kv_a_proj)                                                                                         \
+    X(NormWeight, q_norm)                                                                                              \
+    X(NormWeight, k_norm)                                                                                              \
+    X(NormWeight, q_a_layernorm)                                                                                       \
+    X(NormWeight, kv_a_layernorm)
+
+#define ATTENTION_WEIGHT_PARAMS(X) X(sinks)
+
+    TM_MODULE_DECLARE(AttentionWeight, ATTENTION_WEIGHT_CHILDREN, ATTENTION_WEIGHT_PARAMS)
+
+    bool is_mla() const
+    {
+        return kv_lora_rank > 0;
+    }
+
+    // --- Config fields (public for runtime access) ---
+    int      hidden_dim{};
+    int      head_dim{};
+    int      head_num{};
+    int      kv_head_num{};
+    int      kv_lora_rank{};
+    int      q_lora_rank{};
+    int      qk_rope_dim{};
+    int      v_head_dim{};
+    int      tp_size{};
+    int      tp_rank{};
+    DataType data_type{};
+    int      window_size{};
+    bool     output_gate{};
+    float    softmax_scale{};
+    bool     use_logn_attn{};
+
+    core::RopeConfig rope{};
+};
+
+}  // namespace turbomind
diff --git a/src/turbomind/models/decoder_layer_weight.cc b/src/turbomind/models/decoder_layer_weight.cc
new file mode 100644
index 0000000000..35c5b14ba9
--- /dev/null
+++ b/src/turbomind/models/decoder_layer_weight.cc
@@ -0,0 +1,40 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#include "src/turbomind/models/decoder_layer_weight.h"
+#include "src/turbomind/models/attention_weight.h"
+#include "src/turbomind/models/delta_net_weight.h"
+#include "src/turbomind/models/ffn_weight.h"
+#include "src/turbomind/models/moe_weight.h"
+#include "src/turbomind/models/norm_weight.h"
+
+#include "src/turbomind/core/registry.h"
+
+namespace turbomind {
+
+DecoderLayerWeight::DecoderLayerWeight(const core::ModuleConfig&) {}
+
+DecoderLayerWeight::~DecoderLayerWeight() = default;
+
+bool DecoderLayerWeight::verify(std::vector<std::string>& missing)
+{
+    Module::verify(missing);
+    // At least one of attention or linear_attn must exist
+    if (!attention && !linear_attn) {
+        missing.push_back(full_path() + ": missing attention or linear_attn");
+    }
+    // At least one of feed_forward or moe_ffn must exist
+    if (!feed_forward && !moe_ffn) {
+        missing.push_back(full_path() + ": missing feed_forward or moe_ffn");
+    }
+    // attention_norm must exist
+    if (!attention_norm) {
+        missing.push_back(full_path() + ": missing attention_norm");
+    }
+    return missing.empty();
+}
+
+TM_MODULE_REGISTER(DecoderLayerWeight, core::ModuleConfig);
+
+TM_MODULE_METHODS(DecoderLayerWeight, DECODER_LAYER_WEIGHT_CHILDREN, DECODER_LAYER_WEIGHT_PARAMS)
+
+}  // namespace turbomind
diff --git a/src/turbomind/models/decoder_layer_weight.h b/src/turbomind/models/decoder_layer_weight.h
new file mode 100644
index 0000000000..1e7b089863
--- /dev/null
+++ b/src/turbomind/models/decoder_layer_weight.h
@@ -0,0 +1,55 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+#pragma once
+
+#include "src/turbomind/core/module.h"
+
+namespace turbomind::core {
+
+struct DecoderLayerConfig: ModuleConfig {
+    DecoderLayerConfig(): ModuleConfig{"DecoderLayerWeight"} {}
+    template<typename Visitor>
+    static void for_each(Visitor&&)
+    {
+    }
+};
+
+}  // namespace turbomind::core
+
+namespace turbomind {
+
+class AttentionWeight;
+class DeltaNetWeight;
+class FfnWeight;
+class MoeWeight;
+class NormWeight;
+
+/// Architecture-independent decoder layer weight composite.
+class DecoderLayerWeight: public core::Module {
+public:
+    const char* type() const override
+    {
+        return "DecoderLayerWeight";
+    }
+
+    DecoderLayerWeight() = default;
+    DecoderLayerWeight(const core::ModuleConfig&);
+
+    ~DecoderLayerWeight() override;  // defined in .cc where child types are complete
+
+    bool verify(std::vector<std::string>& missing) override;
+
+    // --- X-macro field lists ---
+#define DECODER_LAYER_WEIGHT_CHILDREN(X)                                                                               \
+    X(AttentionWeight, attention)                                                                                      \
+    X(DeltaNetWeight, linear_attn)                                                                                     \
+    X(FfnWeight, feed_forward)                                                                                         \
+    X(MoeWeight, moe_ffn)                                                                                              \
+    X(NormWeight, attention_norm)                                                                                      \
+    X(NormWeight, ffn_norm)
+
+#define DECODER_LAYER_WEIGHT_PARAMS(X)
+
+    TM_MODULE_DECLARE(DecoderLayerWeight, DECODER_LAYER_WEIGHT_CHILDREN, DECODER_LAYER_WEIGHT_PARAMS)
+};
+
+}  // namespace turbomind
diff --git a/src/turbomind/models/delta_net_weight.cc b/src/turbomind/models/delta_net_weight.cc
new file mode 100644
index 0000000000..d365fd2a7d
--- /dev/null
+++ b/src/turbomind/models/delta_net_weight.cc
@@ -0,0 +1,36 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#include "src/turbomind/models/delta_net_weight.h"
+
+#include "src/turbomind/core/registry.h"
+#include "src/turbomind/utils/memory_utils.h"
+
+namespace turbomind {
+
+DeltaNetWeight::DeltaNetWeight(const core::DeltaNetConfig& cfg):
+    hidden_dim(cfg.hidden_dim),
+    num_k_heads(cfg.num_k_heads),
+    num_v_heads(cfg.num_v_heads),
+    key_head_dim(cfg.key_head_dim),
+    value_head_dim(cfg.value_head_dim),
+    d_conv(cfg.d_conv),
+    data_type(cfg.data_type),
+    tp_size(cfg.tp_size),
+    tp_rank(cfg.tp_rank)
+{
+}
+
+void DeltaNetWeight::prepare()
+{
+    Module::prepare();
+
+    EnsureFloatDtype(A_log, data_type);
+    EnsureFloatDtype(dt_bias, data_type);
+    EnsureFloatDtype(conv1d, data_type);
+}
+
+TM_MODULE_REGISTER(DeltaNetWeight, core::DeltaNetConfig);
+
+TM_MODULE_METHODS(DeltaNetWeight, DELTA_NET_WEIGHT_CHILDREN, DELTA_NET_WEIGHT_PARAMS)
+
+}  // namespace turbomind
diff --git a/src/turbomind/models/delta_net_weight.h b/src/turbomind/models/delta_net_weight.h
new file mode 100644
index 0000000000..92fa9bbe72
--- /dev/null
+++ b/src/turbomind/models/delta_net_weight.h
@@ -0,0 +1,74 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+#pragma once
+
+#include "src/turbomind/core/core.h"
+#include "src/turbomind/core/module.h"
+#include "src/turbomind/models/linear_weight.h"
+#include "src/turbomind/models/norm_weight.h"
+
+namespace turbomind::core {
+
+struct DeltaNetConfig: ModuleConfig {
+    DeltaNetConfig(): ModuleConfig{"DeltaNetWeight"} {}
+
+#define DELTANET_FIELDS(X)                                                                                             \
+    X(int, hidden_dim)                                                                                                 \
+    X(int, num_k_heads)                                                                                                \
+    X(int, num_v_heads)                                                                                                \
+    X(int, key_head_dim)                                                                                               \
+    X(int, value_head_dim)                                                                                             \
+    X(int, d_conv, 4)                                                                                                  \
+    X(DataType, data_type)                                                                                             \
+    X(int, tp_size)                                                                                                    \
+    X(int, tp_rank)
+
+    DELTANET_FIELDS(TM_MEMBER)
+    TM_FOR_EACH(DeltaNetConfig, DELTANET_FIELDS)
+
+#undef DELTANET_FIELDS
+};
+
+}  // namespace turbomind::core
+
+namespace turbomind {
+
+/// Weight module for Gated DeltaNet (linear attention) layers.
+class DeltaNetWeight: public core::Module {
+public:
+    const char* type() const override
+    {
+        return "DeltaNetWeight";
+    }
+
+    DeltaNetWeight() = default;
+
+    DeltaNetWeight(const core::DeltaNetConfig& cfg);
+
+    void prepare() override;
+
+    // --- X-macro field lists ---
+#define DELTA_NET_WEIGHT_CHILDREN(X)                                                                                   \
+    X(LinearWeight, in_proj_all)                                                                                       \
+    X(LinearWeight, out_proj)                                                                                          \
+    X(NormWeight, norm)
+
+#define DELTA_NET_WEIGHT_PARAMS(X)                                                                                     \
+    X(conv1d)                                                                                                          \
+    X(A_log)                                                                                                           \
+    X(dt_bias)
+
+    TM_MODULE_DECLARE(DeltaNetWeight, DELTA_NET_WEIGHT_CHILDREN, DELTA_NET_WEIGHT_PARAMS)
+
+    // --- Config fields (public for runtime access) ---
+    int      hidden_dim{};
+    int      num_k_heads{};
+    int      num_v_heads{};
+    int      key_head_dim{};
+    int      value_head_dim{};
+    int      d_conv{};
+    DataType data_type{};
+    int      tp_size{};
+    int      tp_rank{};
+};
+
+}  // namespace turbomind
diff --git a/src/turbomind/models/ffn_weight.cc b/src/turbomind/models/ffn_weight.cc
new file mode 100644
index 0000000000..9ecb66923d
--- /dev/null
+++ b/src/turbomind/models/ffn_weight.cc
@@ -0,0 +1,48 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#include "src/turbomind/models/ffn_weight.h"
+
+#include "src/turbomind/core/registry.h"
+#include "src/turbomind/kernels/gemm/types.h"
+
+namespace turbomind {
+
+FfnWeight::FfnWeight(const core::FfnConfig& cfg):
+    hidden_dim{cfg.hidden_dim},
+    inter_size{cfg.inter_size / cfg.tp_size},
+    act_type{static_cast<ActivationType>(cfg.act_type)},
+    is_fused_silu{cfg.fuse_silu && act_type == ActivationType::kSilu},
+    is_expert_{cfg.is_expert},
+    data_type_{cfg.data_type},
+    tp_size{cfg.tp_size},
+    tp_rank{cfg.tp_rank}
+{
+}
+
+void FfnWeight::prepare()
+{
+    // Set epilogue on existing w1w3 child if fused silu is active.
+    if (w1w3) {
+        auto* fused = static_cast<LinearWeight*>(w1w3.get());
+        if (is_fused_silu) {
+            fused->epilogue = gemm::Epilogue::kGatedSilu;
+        }
+    }
+
+    // Propagate grouped-GEMM flag for MoE expert weights
+    if (is_expert_) {
+        for_each_child([](const char*, Module* m) {
+            if (auto* linear = dynamic_cast<LinearWeight*>(m)) {
+                linear->set_grouped(true);
+            }
+        });
+    }
+
+    Module::prepare();  // recurse into children
+}
+
+TM_MODULE_REGISTER(FfnWeight, core::FfnConfig);
+
+TM_MODULE_METHODS(FfnWeight, FFN_WEIGHT_CHILDREN, FFN_WEIGHT_PARAMS)
+
+}  // namespace turbomind
diff --git a/src/turbomind/models/ffn_weight.h b/src/turbomind/models/ffn_weight.h
new file mode 100644
index 0000000000..604d91bd35
--- /dev/null
+++ b/src/turbomind/models/ffn_weight.h
@@ -0,0 +1,70 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+#pragma once
+
+#include "src/turbomind/core/core.h"
+#include "src/turbomind/core/module.h"
+#include "src/turbomind/kernels/activation.h"
+#include "src/turbomind/models/linear_weight.h"
+
+namespace turbomind::core {
+
+struct FfnConfig: ModuleConfig {
+    FfnConfig(): ModuleConfig{"FfnWeight"} {}
+
+#define FFN_FIELDS(X)                                                                                                  \
+    X(int, hidden_dim)                                                                                                 \
+    X(int, inter_size)                                                                                                 \
+    X(int, act_type)                                                                                                   \
+    X(bool, fuse_silu)                                                                                                 \
+    X(bool, is_expert)                                                                                                 \
+    X(DataType, data_type)                                                                                             \
+    X(int, tp_size)                                                                                                    \
+    X(int, tp_rank)
+
+    FFN_FIELDS(TM_MEMBER)
+    TM_FOR_EACH(FfnConfig, FFN_FIELDS)
+
+#undef FFN_FIELDS
+};
+
+}  // namespace turbomind::core
+
+namespace turbomind {
+
+class FfnWeight: public core::Module {
+public:
+    const char* type() const override
+    {
+        return "FfnWeight";
+    }
+
+    FfnWeight() = default;
+
+    FfnWeight(const core::FfnConfig& cfg);
+
+    void prepare() override;
+
+    // --- X-macro child members ---
+#define FFN_WEIGHT_CHILDREN(X)                                                                                         \
+    X(LinearWeight, w1)                                                                                                \
+    X(LinearWeight, w3)                                                                                                \
+    X(LinearWeight, w2)                                                                                                \
+    X(LinearWeight, w1w3)
+
+#define FFN_WEIGHT_PARAMS(X)
+
+    TM_MODULE_DECLARE(FfnWeight, FFN_WEIGHT_CHILDREN, FFN_WEIGHT_PARAMS)
+
+    int            hidden_dim{};
+    int            inter_size{};
+    ActivationType act_type{};
+    bool           is_fused_silu{};
+    int            tp_size{};
+    int            tp_rank{};
+
+private:
+    bool     is_expert_{};
+    DataType data_type_{};
+};
+
+}  // namespace turbomind
diff --git a/src/turbomind/models/input_processor.cc b/src/turbomind/models/input_processor.cc
index 43f70578e7..a1148a6c2e 100644
--- a/src/turbomind/models/input_processor.cc
+++ b/src/turbomind/models/input_processor.cc
@@ -14,7 +14,7 @@ using std::vector;
 
 struct InputProcessor::Impl {
 public:
-    Impl(const EngineParam& engine, const ModelParam& model, int phases):
+    Impl(const EngineParam& engine, int hidden_units, DataType data_type, int phases):
         max_batch_size_{engine.max_batch_size}, max_forward_token_num_{engine.max_forward_token_num}
     {
         input_ids_buf_         = {max_forward_token_num_, kCPUpinned};
@@ -31,7 +31,7 @@ struct InputProcessor::Impl {
             d.autoreg_ids_pos = {max_batch_size_, kCPU};  // ! CPU buffer
 
             /// TODO: initialize only when required
-            d.input_embeds_buf = {{max_forward_token_num_, (int)model.hidden_units}, model.data_type, kCPUpinned};
+            d.input_embeds_buf = {{max_forward_token_num_, hidden_units}, data_type, kCPUpinned};
         }
     }
 
@@ -241,8 +241,8 @@ struct InputProcessor::Impl {
 
 InputProcessor::~InputProcessor() = default;
 
-InputProcessor::InputProcessor(const EngineParam& engine, const ModelParam& model, int phases):
-    impl_{std::make_unique<Impl>(engine, model, phases)}
+InputProcessor::InputProcessor(const EngineParam& engine, int hidden_units, DataType data_type, int phases):
+    impl_{std::make_unique<Impl>(engine, hidden_units, data_type, phases)}
 {
 }
 
diff --git a/src/turbomind/models/input_processor.h b/src/turbomind/models/input_processor.h
index be7502a9ef..52050543ac 100644
--- a/src/turbomind/models/input_processor.h
+++ b/src/turbomind/models/input_processor.h
@@ -9,7 +9,7 @@ class InputProcessor {
 public:
     ~InputProcessor();
 
-    InputProcessor(const EngineParam& engine, const ModelParam& model, int phases);
+    InputProcessor(const EngineParam& engine, int hidden_units, DataType data_type, int phases);
 
     void Run(BatchOp op, int phase, TensorMap& env);
 
diff --git a/src/turbomind/models/language_model.cc b/src/turbomind/models/language_model.cc
index dd2ff5756c..1b5a966c34 100644
--- a/src/turbomind/models/language_model.cc
+++ b/src/turbomind/models/language_model.cc
@@ -15,11 +15,11 @@
 #include "src/turbomind/generation/generation.h"
 #include "src/turbomind/kernels/gpt_kernels.h"
 #include "src/turbomind/models/input_processor.h"
-#include "src/turbomind/models/llama/LlamaWeight.h"
 #include "src/turbomind/models/llama/llama_kernels.h"
 #include "src/turbomind/models/llama/llama_params.h"
 #include "src/turbomind/models/llama/llama_utils.h"
 #include "src/turbomind/models/llama/unified_decoder.h"
+#include "src/turbomind/models/model_weight.h"
 #include "src/turbomind/models/output_processor.h"
 #include "src/turbomind/utils/anomaly_handler.h"
 #include "src/turbomind/utils/cuda_utils.h"
@@ -33,11 +33,8 @@ using std::unique_ptr;
 using std::shared_ptr;
 
 struct LanguageModel::Impl {
-    const DataType       dtype_;
-    const ModelParam     param_;
-    const AttentionParam attn_param_;
     const Communicators& comm_;
-    const LlamaWeight&   weights_;
+    const ModelWeight&   weights_;
     LlamaLinear&         linear_;
 
     const int  tp_size_;
@@ -102,14 +99,7 @@ struct LanguageModel::Impl {
         }
     }
 
-    Impl(DataType              dtype,
-         const ModelParam&     model,
-         const EngineParam&    engine,
-         const AttentionParam& attn,
-         const MoeParam&       moe,
-         const Context&        ctx,
-         const LlamaWeight&    weights,
-         int                   phases);
+    Impl(const EngineParam& engine, const Context& ctx, const ModelWeight& weights, int phases);
 
     Tensor LookupEmbedding(const Buffer_<int>& input_ids, Buffer symm_buf);
     Tensor PostEmbedding(const Tensor& features, Buffer symm_buf);
@@ -121,17 +111,7 @@ struct LanguageModel::Impl {
     void Fetch(int phase, TensorMap& env);
 };
 
-LanguageModel::Impl::Impl(DataType              dtype,
-                          const ModelParam&     model,
-                          const EngineParam&    engine,
-                          const AttentionParam& attn,
-                          const MoeParam&       moe,
-                          const Context&        ctx,
-                          const LlamaWeight&    weights,
-                          int                   phases):
-    dtype_{dtype},
-    param_{model},
-    attn_param_{attn},
+LanguageModel::Impl::Impl(const EngineParam& engine, const Context& ctx, const ModelWeight& weights, int phases):
     comm_{ctx.comm},
     weights_{weights},
     linear_{*ctx.linear},
@@ -161,19 +141,15 @@ LanguageModel::Impl::Impl(DataType              dtype,
         d.generating      = {engine.max_batch_size, kCPU};
     }
 
-    input_processor_.emplace(engine, param_, phases);
+    input_processor_.emplace(engine, weights_.hidden_units, weights_.data_type, phases);
 
-    unified_decoder_ = std::make_unique<UnifiedDecoder>(model, engine, attn, moe, ctx, phases);
+    unified_decoder_ = std::make_unique<UnifiedDecoder>(engine, ctx, phases, weights_);
 
-    generation_ = std::make_unique<Generation>(kFloat32,
-                                               engine.max_batch_size,
-                                               engine.session_len,
-                                               model.vocab_size,
-                                               weights.post_decoder_embedding.output_dim * tp_size_,
-                                               comm_.h_tp_group,
-                                               phases);
+    const int vocab_size = weights_.output->output_dim * tp_size_;
+
+    generation_ = std::make_unique<Generation>(
+        kFloat32, engine.max_batch_size, engine.session_len, weights_.vocab_size, vocab_size, comm_.h_tp_group, phases);
 
-    const int     vocab_size     = weights_.post_decoder_embedding.output_dim * tp_size_;
     const ssize_t max_fwd_tokens = engine.max_forward_token_num;
 
     if (ctx.comm.d_comm) {
@@ -182,18 +158,19 @@ LanguageModel::Impl::Impl(DataType              dtype,
         TM_CHECK(engine.max_forward_token_num % tp_size_ == 0);
 
         ssize_t bytes{};
-        bytes = std::max(bytes, byte_size(dtype_, max_fwd_tokens * engine.attn_dp_size * model.hidden_units));
-        bytes = std::max(bytes, byte_size(dtype_, engine.max_batch_size * vocab_size));
+        bytes = std::max(bytes,
+                         byte_size(weights_.data_type, max_fwd_tokens * engine.attn_dp_size * weights_.hidden_units));
+        bytes = std::max(bytes, byte_size(weights_.data_type, engine.max_batch_size * vocab_size));
 
         symm_buf_ = {bytes, symm_alloc};
         // Compute max logits length based on symm buffer size
-        max_logits_len_ = symm_buf_.view(dtype_).size() / vocab_size;
+        max_logits_len_ = symm_buf_.view(weights_.data_type).size() / vocab_size;
     }
     else {
-        max_logits_len_ = std::max<int>(max_fwd_tokens * model.hidden_units / vocab_size, engine.max_batch_size);
+        max_logits_len_ = std::max<int>(max_fwd_tokens * weights_.hidden_units / vocab_size, engine.max_batch_size);
     }
 
-    output_processor_.emplace(param_, max_logits_len_, tp_rank_, phases, [this](const Tensor& hstate) {
+    output_processor_.emplace(weights_.vocab_size, max_logits_len_, tp_rank_, phases, [this](const Tensor& hstate) {
         return PostEmbedding(hstate, symm_buf_);
     });
 }
@@ -202,14 +179,14 @@ Tensor LanguageModel::Impl::LookupEmbedding(const Buffer_<int>& input_ids, Buffe
 {
     const auto st = core::Context::stream().handle();
 
-    const int hidden_units = param_.hidden_units;
+    const int hidden_units = weights_.hidden_units;
 
-    const auto& embedding_table = weights_.pre_decoder_embedding.weight;
+    const auto& embedding_table = weights_.tok_embeddings;
     TM_CHECK_EQ(embedding_table.shape(1) * tp_size_, hidden_units);
 
     const int token_num = input_ids.size();
 
-    Tensor input_embeds{{token_num, hidden_units}, dtype_, kDEVICE};
+    Tensor input_embeds{{token_num, hidden_units}, weights_.data_type, kDEVICE};
 
     if (token_num == 0) {
         return input_embeds;
@@ -222,7 +199,7 @@ Tensor LanguageModel::Impl::LookupEmbedding(const Buffer_<int>& input_ids, Buffe
     else if (use_ag2d_) {
         const auto local_hidden_units = embedding_table.shape(1);
 
-        Tensor temp{symm_buf.view(dtype_), {token_num, tp_size_, local_hidden_units}};
+        Tensor temp{symm_buf.view(weights_.data_type), {token_num, tp_size_, local_hidden_units}};
         Tensor local{temp.slice({0, tp_rank_, 0}, {-1, 1, -1}).squeeze(1)};
 
         invokeEmbeddingLookup(local, input_ids, embedding_table, st);
@@ -245,13 +222,14 @@ Tensor LanguageModel::Impl::LookupEmbedding(const Buffer_<int>& input_ids, Buffe
     else {
         const auto local_hidden_units = embedding_table.shape(1);
 
-        Tensor temp{symm_buf.view(dtype_), {tp_size_, token_num, local_hidden_units}};
+        Tensor temp{symm_buf.view(weights_.data_type), {tp_size_, token_num, local_hidden_units}};
         Tensor local{temp.slice(tp_rank_).squeeze(0)};
 
         invokeEmbeddingLookup(local, input_ids, embedding_table, st);
         sync_check_cuda_error();
 
-        comm_.d_comm->AllGather(local.raw_data(), temp.raw_data(), local.size(), dtype_, comm_.d_tp_group, st);
+        comm_.d_comm->AllGather(
+            local.raw_data(), temp.raw_data(), local.size(), weights_.data_type, comm_.d_tp_group, st);
         sync_check_cuda_error();
 
         invokeInPlaceTranspose102((uint16_t*)input_embeds.raw_data(),
@@ -274,24 +252,24 @@ Tensor LanguageModel::Impl::PostEmbedding(const Tensor& features, Buffer symm_bu
     const auto st = core::Context::stream().handle();
 
     const int bsz              = features.shape(0);
-    const int local_vocab_size = weights_.post_decoder_embedding.output_dim;
+    const int local_vocab_size = weights_.output->output_dim;
     const int vocab_size       = local_vocab_size * tp_size_;
 
     if (bsz == 0) {
-        return Tensor{{0, vocab_size}, dtype_, kDEVICE};
+        return Tensor{{0, vocab_size}, weights_.data_type, kDEVICE};
     }
 
     if (tp_size_ == 1) {
-        Tensor logits{{bsz, vocab_size}, dtype_, kDEVICE};
-        linear_.Forward(features, weights_.post_decoder_embedding, logits);
+        Tensor logits{{bsz, vocab_size}, weights_.data_type, kDEVICE};
+        linear_.Forward(features, *weights_.output, logits);
         sync_check_cuda_error();
         TM_DEBUG_TENSOR(logits, "logits", 1);
         return logits;
     }
     else if (use_ag2d_) {
-        Tensor logits{symm_buf.view(dtype_), {bsz, tp_size_, local_vocab_size}};
+        Tensor logits{symm_buf.view(weights_.data_type), {bsz, tp_size_, local_vocab_size}};
         Tensor local = logits.slice({0, tp_rank_, 0}, {-1, 1, -1});
-        linear_.Forward(features, weights_.post_decoder_embedding, local.squeeze(1));
+        linear_.Forward(features, *weights_.output, local.squeeze(1));
         sync_check_cuda_error();
         comm_.d_comm->AllGather2D(local.raw_data(),
                                   logits.raw_data(),
@@ -307,9 +285,9 @@ Tensor LanguageModel::Impl::PostEmbedding(const Tensor& features, Buffer symm_bu
         return logits.view({bsz, -1});
     }
     else {
-        Tensor logits{symm_buf.view(dtype_), {tp_size_, bsz, local_vocab_size}};
+        Tensor logits{symm_buf.view(weights_.data_type), {tp_size_, bsz, local_vocab_size}};
         Tensor local = logits.slice({tp_rank_, 0, 0}, {1, -1, -1});
-        linear_.Forward(features, weights_.post_decoder_embedding, local.squeeze(0));
+        linear_.Forward(features, *weights_.output, local.squeeze(0));
         sync_check_cuda_error();
         comm_.d_comm->AllGather(local.raw_data(), logits.raw_data(), local.size(), local.dtype(), comm_.d_tp_group, st);
         sync_check_cuda_error();
@@ -439,9 +417,9 @@ void LanguageModel::Impl::Forward(int phase, TensorMap& env)
         env.produce("symm_buf", symm_buf_);
     }
 
-    env.produce("output_norm_weight", weights_.output_norm_weight);
+    env.produce("output_norm_weight", weights_.norm->weight);
 
-    unified_decoder_->Forward(phase, env, weights_.decoder_layer_weights);
+    unified_decoder_->Forward(phase, env, weights_.layers_list());
 
     // env.at("batch").data<BatchData*>()[0]->Notify();
 
@@ -491,16 +469,9 @@ LanguageModel::~LanguageModel() = default;
 
 LanguageModel::LanguageModel(LanguageModel&&) noexcept = default;
 
-LanguageModel::LanguageModel(DataType              dtype,
-                             const ModelParam&     model,
-                             const EngineParam&    engine,
-                             const AttentionParam& attn,
-                             const MoeParam&       moe,
-                             const Context&        ctx,
-                             const LlamaWeight&    weights,
-                             int                   phases)
+LanguageModel::LanguageModel(const EngineParam& engine, const Context& ctx, const ModelWeight& weights, int phases)
 {
-    impl_ = std::make_unique<Impl>(dtype, model, engine, attn, moe, ctx, weights, phases);
+    impl_ = std::make_unique<Impl>(engine, ctx, weights, phases);
 }
 
 void LanguageModel::Run(BatchOp op, int phase, TensorMap& env)
@@ -508,14 +479,4 @@ void LanguageModel::Run(BatchOp op, int phase, TensorMap& env)
     return TM_CHECK_NOTNULL(impl_)->Run(op, phase, env);
 }
 
-const ModelParam& LanguageModel::model_param() const noexcept
-{
-    return TM_CHECK_NOTNULL(impl_)->param_;
-}
-
-const AttentionParam& LanguageModel::attn_param() const noexcept
-{
-    return TM_CHECK_NOTNULL(impl_)->attn_param_;
-}
-
 }  // namespace turbomind
diff --git a/src/turbomind/models/language_model.h b/src/turbomind/models/language_model.h
index a883f56d5c..5699a9e37f 100644
--- a/src/turbomind/models/language_model.h
+++ b/src/turbomind/models/language_model.h
@@ -9,7 +9,7 @@
 
 namespace turbomind {
 
-class LlamaWeight;
+class ModelWeight;
 
 class LanguageModel {
 public:
@@ -24,20 +24,10 @@ class LanguageModel {
         return static_cast<bool>(impl_);
     }
 
-    LanguageModel(DataType              dtype,
-                  const ModelParam&     model,
-                  const EngineParam&    engine,
-                  const AttentionParam& attn,
-                  const MoeParam&       moe,
-                  const Context&        ctx,
-                  const LlamaWeight&    weights,
-                  int                   phases);
+    LanguageModel(const EngineParam& engine, const Context& ctx, const ModelWeight& weights, int phases);
 
     void Run(BatchOp op, int phase, TensorMap& env);
 
-    const ModelParam&     model_param() const noexcept;
-    const AttentionParam& attn_param() const noexcept;
-
 private:
     struct Impl;
     std::unique_ptr<Impl> impl_;
diff --git a/src/turbomind/models/linear_weight.cc b/src/turbomind/models/linear_weight.cc
new file mode 100644
index 0000000000..be9ffbd3fe
--- /dev/null
+++ b/src/turbomind/models/linear_weight.cc
@@ -0,0 +1,280 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#include "src/turbomind/models/linear_weight.h"
+
+#include "src/turbomind/core/allocator.h"
+#include "src/turbomind/core/data_type.h"
+#include "src/turbomind/core/registry.h"
+#include "src/turbomind/kernels/gemm/cast.h"
+#include "src/turbomind/kernels/gemm/convert.h"
+#include "src/turbomind/kernels/gemm/types.h"
+#include "src/turbomind/kernels/gemm/utils.h"
+#include "src/turbomind/kernels/gpt_kernels.h"
+#include "src/turbomind/utils/cuda_utils.h"
+#include "src/turbomind/utils/memory_utils.h"
+
+namespace turbomind {
+
+LinearWeight::LinearWeight(const core::LinearConfig& cfg):
+    input_dim(cfg.input_dim),
+    output_dim(cfg.output_dim),
+    data_type(cfg.data_type),
+    weight_format(cfg.format),
+    has_bias_(cfg.has_bias)
+{
+    std::tie(input_format, output_format) = DeriveActivationFormats(weight_format, data_type, getSMVersion());
+}
+
+std::pair<DataFormat, DataFormat> DeriveActivationFormats(const DataFormat& weight_format, DataType data_type, int sm)
+{
+    DataFormat in_fmt;
+    DataFormat out_fmt;
+    in_fmt.dtype        = data_type;
+    in_fmt.block_sizes  = {1, 1};
+    out_fmt.dtype       = data_type;
+    out_fmt.block_sizes = {1, 1};
+
+    // Empty weight_format (from LinearBuilder.set_weight path for embeddings /
+    // lm_head): treat as trivial. No quantization on I/O.
+    if (weight_format.dtype == DataType{}) {
+        return {in_fmt, out_fmt};
+    }
+
+    if (!weight_format.is_quantized()) {
+        return {in_fmt, out_fmt};
+    }
+
+    if (weight_format.dtype == kFloat8_e4m3) {
+        if (sm == 90) {
+            int gs              = weight_format.block_sizes[0];  // K-axis, tensor-shape order
+            in_fmt.dtype        = kFloat8_e4m3;
+            in_fmt.block_sizes  = {gs, 1};
+            in_fmt.scales.dtype = kFloat;
+        }
+        return {in_fmt, out_fmt};
+    }
+
+    // FP4 / U4 / U8: input stays in model activation dtype — the GEMM
+    // upcasts / dequants on the fly. output_format is also activation dtype.
+    return {in_fmt, out_fmt};
+}
+
+gemm::QuantDesc MakeQuantDesc(const DataFormat& fmt)
+{
+    if (!fmt.is_quantized()) {
+        return {gemm::QuantType::kNone, 0};
+    }
+    int gs = (fmt.block_sizes.size() > 0) ? fmt.block_sizes[0] : 1;
+
+    if (fmt.dtype == kFloat8_e4m3) {
+        // Weight format has bidirectional blocking {128, 128} → B-type.
+        // Activation format has K-axis-only blocking {gs, 1} → K-type.
+        if (fmt.block_sizes.size() > 1 && fmt.block_sizes[1] > 1) {
+            return {gemm::QuantType::kB, gs};
+        }
+        return {gemm::QuantType::kK, gs};
+    }
+    // FP4 / U4 / U8: K-grouped quantization
+    return {gemm::QuantType::kK, gs};
+}
+
+void LinearWeight::copy_metadata_to(LinearWeight& dst) const
+{
+    dst.input_dim     = input_dim;
+    dst.output_dim    = output_dim;
+    dst.data_type     = data_type;
+    dst.weight_format = weight_format;
+    dst.input_format  = input_format;
+    dst.output_format = output_format;
+    dst.epilogue      = epilogue;
+    dst.has_bias_     = has_bias_;
+    dst.is_grouped_   = is_grouped_;
+    dst.k_desc        = k_desc;
+    dst.q_desc        = q_desc;
+}
+
+// ======================================================================
+// prepare (weight format conversion)
+// ======================================================================
+
+void LinearWeight::prepare()
+{
+    if (!weight) {
+        return;
+    }
+
+    // Set up GEMM descriptor (was previously in do_allocate)
+    k_desc.type  = weight.dtype();
+    k_desc.order = gemm::kRowMajor;
+    k_desc.rows  = input_dim;
+    k_desc.cols  = output_dim;
+    k_desc.ld    = output_dim;
+
+    // No format conversion needed if weight_spec was never set (trivial weights
+    // loaded via commit_tensor, e.g. tok_embeddings, output head).
+    if (weight_format.dtype == DataType{}) {
+        EnsureFloatDtype(weight, data_type);
+        if (weight.dtype() == data_type) {
+            k_desc.type = data_type;
+        }
+        return;
+    }
+
+    auto stream = core::Context::stream().handle();
+
+    if (weight_format.dtype == kFloat8_e4m3 && input_dtype() == kFloat8_e4m3) {
+        // FP8 native path: transpose weight and scales for native kernels.
+        auto process = [&](Tensor& x, MatrixLayout& d, auto dtype) {
+            using T = decltype(dtype);
+            Tensor trans{{x.shape(1), x.shape(0)}, x.dtype(), kDEVICE};
+            invokeTransposeAxis01((T*)trans.raw_data(), (T*)x.raw_data(), x.shape(0), x.shape(1), 1, stream);
+            x = std::move(trans);
+            d = MatrixLayout{x.dtype(), gemm::kColMajor, (int)x.shape(1), (int)x.shape(0), (int)x.stride(0)};
+        };
+
+        TM_CHECK_EQ(weight.dtype(), kFloat8_e4m3);
+        process(weight, k_desc, uint8_t{});
+
+        // FP8 native path requires f32 scales; cast if loaded as bf16/fp16.
+        EnsureFloatDtype(scales, kFloat);
+
+        TM_CHECK_EQ(scales.dtype(), kFloat);
+        process(scales, q_desc, float{});
+    }
+    else if (weight_format.dtype == kFloat8_e4m3) {
+        // FP8 non-native path (non-SM90)
+    }
+    else {
+        // General quantization format conversion path.
+        using namespace gemm;
+
+        auto [conv_w, conv_s] =
+            GetConverters(data_type, weight_format.dtype, input_dtype(), is_grouped_, getSMVersion());
+
+        if (conv_w) {
+            const auto order_w = conv_w->order;
+            const bool is_A    = get_operand_tag(conv_w->pack) == OPERAND_A;
+            const bool is_B    = !is_A;
+
+            const int bits = byte_size(weight_format.dtype, 8);
+
+            Tensor_<uint16_t> tmp{{input_dim, output_dim}, kDEVICE};
+
+            if (bits == 4) {
+                extend_to_u16(tmp.data(), (const uint4_t*)weight.raw_data(), tmp.size(), stream);
+                sync_check_cuda_error();
+            }
+            else if (bits == 8) {
+                extend_to_u16(tmp.data(), (const uint8_t*)weight.raw_data(), tmp.size(), stream);
+                sync_check_cuda_error();
+            }
+            else if (bits == 16) {
+                check_cuda_error(
+                    cudaMemcpyAsync(tmp.raw_data(), weight.raw_data(), weight.byte_size(), cudaMemcpyDefault, stream));
+            }
+
+            if (order_w == kRowMajor) {
+                Tensor_<uint16_t> trans{{output_dim, input_dim}, kDEVICE};
+                invokeTransposeAxis01(trans.data(), tmp.data(), input_dim, output_dim, 1, stream);
+                tmp = trans;
+            }
+
+            MatrixLayout w_desc{
+                data_type,
+                order_w,
+                (int)output_dim,
+                (int)input_dim,
+                order_w == kRowMajor ? (int)input_dim : (int)output_dim,
+            };
+
+            if (is_B) {
+                std::swap(w_desc.rows, w_desc.cols);
+                w_desc.order = ~w_desc.order;
+            }
+
+            MatrixLayout kd = w_desc;
+            kd.type         = weight_format.dtype;
+            if (bits == 4) {
+                kd.type = data_type_v<uint4_t>;
+            }
+            else if (bits == 8) {
+                kd.type = data_type_v<uint8_t>;
+            }
+            kd.pack = conv_w->pack;
+
+            check_cuda_error(cudaMemsetAsync(weight.raw_data(), 0, weight.byte_size(), stream));
+            TM_CHECK(conv_w->Convert(tmp.data(), w_desc, weight.raw_data(), kd, stream) == 0);
+            sync_check_cuda_error();
+
+            kd.type = weight_format.dtype;
+            if (is_A) {
+                kd = transpose(kd);
+            }
+            k_desc = kd;
+        }
+
+        if (conv_s) {
+            const auto order_s = conv_s->order;
+            const auto pack_s  = conv_s->pack;
+            const bool is_A    = get_operand_tag(conv_s->pack) == OPERAND_U;
+
+            Tensor   tmp_q;
+            DataType scale_type;
+
+            if (zeros) {
+                tmp_q = {{scales.size(), 2}, kHalf, kDEVICE};
+                fuse_scales_and_zeros(
+                    tmp_q.data<half>(), scales.data<half>(), zeros.data<half>(), scales.size(), stream);
+                scale_type = kUint32;
+                zeros      = {};
+                scales     = empty_like(tmp_q);
+            }
+            else if (weight_format.dtype == kFloat8_e4m3) {
+                tmp_q = empty_like(scales);
+                Copy(scales, tmp_q);
+                scale_type = kUint16;
+            }
+            else {
+                tmp_q = empty_like(scales);
+                Copy(scales, tmp_q);
+                scale_type = kUint8;
+            }
+
+            if (data_type == kHalf && weight_format.dtype == kFloat4_e2m1) {
+                AdjustUe8m0ScaleForHalf(tmp_q.data<uint8_t>(), tmp_q.size(), stream);
+                sync_check_cuda_error();
+            }
+
+            int          gs = weight_format.block_sizes[0];  // K-axis, tensor-shape order
+            MatrixLayout s_desc{
+                scale_type,
+                order_s,
+                (int)output_dim,
+                (int)input_dim / gs,
+                (int)output_dim,
+            };
+
+            if (!is_A) {
+                std::swap(s_desc.rows, s_desc.cols);
+                s_desc.order = ~s_desc.order;
+            }
+
+            MatrixLayout qd = s_desc;
+            qd.pack         = pack_s;
+
+            TM_CHECK(conv_s->Convert(tmp_q.raw_data(), s_desc, scales.raw_data(), qd, stream) == 0);
+            sync_check_cuda_error();
+
+            if (is_A) {
+                qd = transpose(qd);
+            }
+            q_desc = qd;
+        }
+    }
+}
+
+TM_MODULE_REGISTER(LinearWeight, core::LinearConfig);
+
+TM_MODULE_METHODS(LinearWeight, LINEAR_WEIGHT_CHILDREN, LINEAR_WEIGHT_PARAMS)
+
+}  // namespace turbomind
diff --git a/src/turbomind/models/linear_weight.h b/src/turbomind/models/linear_weight.h
new file mode 100644
index 0000000000..dd4680d061
--- /dev/null
+++ b/src/turbomind/models/linear_weight.h
@@ -0,0 +1,105 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+#pragma once
+
+#include "src/turbomind/core/core.h"
+#include "src/turbomind/core/data_format.h"
+#include "src/turbomind/core/module.h"
+#include "src/turbomind/kernels/gemm/types.h"
+
+namespace turbomind::core {
+
+struct LinearConfig: ModuleConfig {
+    LinearConfig(): ModuleConfig{"LinearWeight"} {}
+
+#define LINEAR_FIELDS(X)                                                                                               \
+    X(int, input_dim)                                                                                                  \
+    X(int, output_dim)                                                                                                 \
+    X(DataType, data_type)                                                                                             \
+    X(DataFormat, format)                                                                                              \
+    X(bool, has_bias)
+
+    LINEAR_FIELDS(TM_MEMBER)
+    TM_FOR_EACH(LinearConfig, LINEAR_FIELDS)
+
+#undef LINEAR_FIELDS
+};
+
+}  // namespace turbomind::core
+
+namespace turbomind {
+
+using gemm::Epilogue;
+using gemm::MatrixLayout;
+
+/// Derive (input_format, output_format) for a GEMM whose weight uses
+/// `weight_format`, given the model's activation dtype and hardware SM.
+std::pair<DataFormat, DataFormat> DeriveActivationFormats(const DataFormat& weight_format, DataType data_type, int sm);
+
+/// Derive GEMM QuantDesc for an operand described by DataFormat.
+/// For unquantized formats, returns {QuantType::kNone, 0}.
+gemm::QuantDesc MakeQuantDesc(const DataFormat& fmt);
+
+class LinearWeight: public core::Module {
+public:
+    const char* type() const override
+    {
+        return "LinearWeight";
+    }
+
+    LinearWeight() = default;
+    LinearWeight(const core::LinearConfig& cfg);
+
+    void prepare() override;
+    void copy_metadata_to(LinearWeight& dst) const;
+
+    /// Set grouped-GEMM mode (for MoE expert weights that need row-major layout).
+    void set_grouped(bool grouped)
+    {
+        is_grouped_ = grouped;
+    }
+
+    explicit operator bool() const noexcept
+    {
+        return static_cast<bool>(weight);
+    }
+
+    // --- three DataFormats fully describe the GEMM ---
+    DataFormat weight_format{};  // from cfg.format
+    DataFormat input_format{};   // derived in ctor
+    DataFormat output_format{};  // derived in ctor
+
+    DataType input_dtype() const
+    {
+        return input_format.dtype;
+    }
+    DataType output_dtype() const
+    {
+        return output_format.dtype;
+    }
+
+    // --- dimensions + model activation dtype ---
+    int      input_dim  = 0;
+    int      output_dim = 0;
+    DataType data_type{};  // model activation dtype, copied from cfg.data_type
+
+    // --- GEMM knobs ---
+    Epilogue     epilogue{};
+    MatrixLayout k_desc{};
+    MatrixLayout q_desc{};
+
+#define LINEAR_WEIGHT_CHILDREN(X)
+
+#define LINEAR_WEIGHT_PARAMS(X)                                                                                        \
+    X(weight)                                                                                                          \
+    X(bias)                                                                                                            \
+    X(scales)                                                                                                          \
+    X(zeros)
+
+    TM_MODULE_DECLARE(LinearWeight, LINEAR_WEIGHT_CHILDREN, LINEAR_WEIGHT_PARAMS)
+
+private:
+    bool has_bias_   = false;
+    bool is_grouped_ = false;
+};
+
+}  // namespace turbomind
diff --git a/src/turbomind/models/llama/CMakeLists.txt b/src/turbomind/models/llama/CMakeLists.txt
index 121b80f09a..361c01ca82 100644
--- a/src/turbomind/models/llama/CMakeLists.txt
+++ b/src/turbomind/models/llama/CMakeLists.txt
@@ -13,7 +13,6 @@ add_library(Llama STATIC
         BlockTrie.cc
         SequenceManager.cc
         LlamaWeight.cc
-        LlamaDenseWeight.cc
         LlamaDecoderLayerWeight.cc
         LlamaFfnLayer.cc
         moe_ffn_layer.cc
diff --git a/src/turbomind/models/llama/GatedDeltaNetLayer.cc b/src/turbomind/models/llama/GatedDeltaNetLayer.cc
index e5a7459143..accba2833a 100644
--- a/src/turbomind/models/llama/GatedDeltaNetLayer.cc
+++ b/src/turbomind/models/llama/GatedDeltaNetLayer.cc
@@ -9,43 +9,19 @@
 
 namespace turbomind {
 
-GatedDeltaNetLayer::GatedDeltaNetLayer(const ModelParam&     model,
-                                       const AttentionParam& attn,
-                                       const EngineParam&    engine,
-                                       int                   tp_size,
-                                       const Context&        ctx,
-                                       int                   phases):
-    hidden_units_(model.hidden_units),
-    num_k_heads_(model.linear_num_key_heads / tp_size),
-    num_v_heads_(model.linear_num_value_heads / tp_size),
-    key_head_dim_(model.linear_key_head_dim > 0 ? model.linear_key_head_dim : model.head_dim),
-    value_head_dim_(model.linear_value_head_dim > 0 ? model.linear_value_head_dim : model.head_dim),
-    d_conv_(model.linear_conv_kernel_dim > 0 ? model.linear_conv_kernel_dim : 4),
-    key_dim_(num_k_heads_ * key_head_dim_),
-    value_dim_(num_v_heads_ * value_head_dim_),
-    conv_dim_(key_dim_ * 2 + value_dim_),
-    norm_eps_(model.norm_eps),
-    dtype_(model.data_type),
-    state_dtype_(model.linear_state_dtype),
-    linear_(*ctx.linear)
+GatedDeltaNetLayer::GatedDeltaNetLayer(DataType                state_dtype,
+                                       const std::vector<int>& layer_types,
+                                       const EngineParam&      engine,
+                                       const Context&          ctx,
+                                       int                     phases):
+    tp_size_(engine.attn_tp_size), num_linear_layers_(0), state_dtype_(state_dtype), linear_(*ctx.linear)
 {
-    layer_types_       = model.layer_types;
-    num_linear_layers_ = 0;
+    layer_types_ = layer_types;
     for (auto t : layer_types_) {
         if (t == 1)
             ++num_linear_layers_;
     }
 
-    TM_LOG_INFO("GatedDeltaNetLayer: num_k={} num_v={} k_dim={} v_dim={} "
-                "conv_dim={} d_conv={} num_linear_layers={}",
-                num_k_heads_,
-                num_v_heads_,
-                key_dim_,
-                value_dim_,
-                conv_dim_,
-                d_conv_,
-                num_linear_layers_);
-
     if (num_linear_layers_ > 0) {
         conv_state_ptrs_buf_      = {engine.max_batch_size, kCPUpinned};
         recurrent_state_ptrs_buf_ = {engine.max_batch_size, kCPUpinned};
@@ -79,8 +55,7 @@ GatedDeltaNetLayer::~GatedDeltaNetLayer()
 void GatedDeltaNetLayer::Run(BatchOp op, int phase, TensorMap& env)
 {
     if (op == BatchOp::kAdd) {
-        Buffer_<RequestCache*> rc    = env.at("requests").buffer();
-        const auto             dtype = dtype_;
+        Buffer_<RequestCache*> rc = env.at("requests").buffer();
         for (int i = 0; i < rc.size(); ++i) {}
     }
     else if (op == BatchOp::kSetup) {
@@ -161,21 +136,31 @@ void GatedDeltaNetLayer::Forward(ForwardParam p)
     auto dispatch = [&](auto t) {
         using T = decltype(t);
 
+        const auto& w              = *p.weights;
+        const int   num_k_heads    = w.num_k_heads / tp_size_;
+        const int   num_v_heads    = w.num_v_heads / tp_size_;
+        const int   key_head_dim   = w.key_head_dim;
+        const int   value_head_dim = w.value_head_dim;
+        const int   d_conv         = w.d_conv;
+        const int   key_dim        = num_k_heads * key_head_dim;
+        const int   value_dim      = num_v_heads * value_head_dim;
+        const int   conv_dim       = key_dim * 2 + value_dim;
+
         // =================================================================
         // 1. Single fused input projection: reads p.input once from HBM.
         //    Output columns are ordered: [qkv | z | b | a]
-        //    where the split dims are: conv_dim_, value_dim_, v_heads_tp_, v_heads_tp_
+        //    where the split dims are: conv_dim, value_dim, v_heads_tp, v_heads_tp
         // =================================================================
-        const int v_heads_tp = num_v_heads_;  // already TP-sharded
-        Tensor    all_proj   = linear_.Forward(p.input, weights.in_proj_all);
+        const int v_heads_tp = num_v_heads;  // already TP-sharded
+        Tensor    all_proj   = linear_.Forward(p.input, *weights.in_proj_all);
         sync_check_cuda_error();
 
         // Column offsets per token (all_proj is token-major, row-major):
-        //   [0, conv_dim_)           -> mixed_qkv
-        //   [conv_dim_, +value_dim_) -> z
-        //   [conv_dim_+value_dim_, +v_heads_tp) -> b (beta logit)
-        //   [conv_dim_+value_dim_+v_heads_tp, +v_heads_tp) -> a (alpha/dt)
-        const int all_col = conv_dim_ + value_dim_ + v_heads_tp * 2;
+        //   [0, conv_dim)           -> mixed_qkv
+        //   [conv_dim, +value_dim) -> z
+        //   [conv_dim+value_dim, +v_heads_tp) -> b (beta logit)
+        //   [conv_dim+value_dim+v_heads_tp, +v_heads_tp) -> a (alpha/dt)
+        const int all_col = conv_dim + value_dim + v_heads_tp * 2;
         // const T* sub-pointers are derived per-request below; stride = all_col.
 
         // =================================================================
@@ -183,13 +168,13 @@ void GatedDeltaNetLayer::Forward(ForwardParam p)
         //    b_raw and a_raw are sliced from the fused projection output.
         //    Stride between tokens is all_col elements.
         // =================================================================
-        const int bg_total = token_num * num_v_heads_;
+        const int bg_total = token_num * num_v_heads;
 
-        const int b_offset = conv_dim_ + value_dim_;  // column offset to b logits
-        const int a_offset = b_offset + v_heads_tp;   // column offset to a logits
+        const int b_offset = conv_dim + value_dim;   // column offset to b logits
+        const int a_offset = b_offset + v_heads_tp;  // column offset to a logits
 
-        Tensor beta{{token_num, num_v_heads_}, dtype, device};
-        Tensor g{{token_num, num_v_heads_}, dtype, device};
+        Tensor beta{{token_num, num_v_heads}, dtype, device};
+        Tensor g{{token_num, num_v_heads}, dtype, device};
 
         auto b = all_proj.slice({0, b_offset}, {-1, v_heads_tp});
         auto a = all_proj.slice({0, a_offset}, {-1, v_heads_tp});
@@ -199,12 +184,12 @@ void GatedDeltaNetLayer::Forward(ForwardParam p)
         // =================================================================
         // 3. Process all requests at once via batched kernel launches
         // =================================================================
-        Tensor attn_out{{token_num, value_dim_}, dtype, device};
-        Tensor conv_out{{token_num, conv_dim_}, dtype, device};
+        Tensor attn_out{{token_num, value_dim}, dtype, device};
+        Tensor conv_out{{token_num, conv_dim}, dtype, device};
 
         const int state_layer_idx              = linear_layer_index(p.layer_id, layer_types_);
-        const int conv_state_layer_offset      = state_layer_idx * (conv_dim_ * d_conv_);
-        const int recurrent_state_layer_offset = state_layer_idx * (num_v_heads_ * key_head_dim_ * value_head_dim_);
+        const int conv_state_layer_offset      = state_layer_idx * (conv_dim * d_conv);
+        const int recurrent_state_layer_offset = state_layer_idx * (num_v_heads * key_head_dim * value_head_dim);
 
         // ----- 3a. Fused Causal Conv1d + SiLU (all requests) -----
         // all_proj carries the non-contiguous qkv slice (stride = all_col);
@@ -252,7 +237,7 @@ void GatedDeltaNetLayer::Forward(ForwardParam p)
                                                dc_state,
                                                dc_q,
                                                decode_count,
-                                               num_k_heads_,
+                                               num_k_heads,
                                                recurrent_state_layer_offset,
                                                state_dtype_,
                                                sm_count_,
@@ -269,7 +254,7 @@ void GatedDeltaNetLayer::Forward(ForwardParam p)
                                                    pf_state,
                                                    pf_q,
                                                    prefill_count,
-                                                   num_k_heads_,
+                                                   num_k_heads,
                                                    recurrent_state_layer_offset,
                                                    state_dtype_,
                                                    sm_count_,
@@ -290,7 +275,7 @@ void GatedDeltaNetLayer::Forward(ForwardParam p)
                                                state_slice,
                                                q_slice,
                                                decode_count,
-                                               num_k_heads_,
+                                               num_k_heads,
                                                recurrent_state_layer_offset,
                                                state_dtype_,
                                                sm_count_,
@@ -307,7 +292,7 @@ void GatedDeltaNetLayer::Forward(ForwardParam p)
                                                    state_slice,
                                                    q_slice,
                                                    prefill_count,
-                                                   num_k_heads_,
+                                                   num_k_heads,
                                                    recurrent_state_layer_offset,
                                                    state_dtype_,
                                                    sm_count_,
@@ -319,16 +304,16 @@ void GatedDeltaNetLayer::Forward(ForwardParam p)
         sync_check_cuda_error();
 
         // ----- 3c. RMSNormGated (all tokens at once) -----
-        // Gate (z) lives at column conv_dim_ of all_proj with row-stride all_col.
-        Tensor gate        = all_proj.slice({0, conv_dim_}, {-1, value_dim_});
-        Tensor hidden_view = attn_out.view({token_num * num_v_heads_, value_head_dim_});
-        invokeRMSNormGated(hidden_view, gate, weights.norm, norm_eps_, stream);
+        // Gate (z) lives at column conv_dim of all_proj with row-stride all_col.
+        Tensor gate        = all_proj.slice({0, conv_dim}, {-1, value_dim});
+        Tensor hidden_view = attn_out.view({token_num * num_v_heads, value_head_dim});
+        invokeRMSNormGated(hidden_view, gate, weights.norm->weight, weights.norm->norm_eps_, stream);
         sync_check_cuda_error();
 
         // =================================================================
         // 4. Output projection (all tokens at once)
         // =================================================================
-        (void)linear_.Forward(attn_out, weights.out_proj, p.output);
+        (void)linear_.Forward(attn_out, *weights.out_proj, p.output);
         sync_check_cuda_error();
     };
 
diff --git a/src/turbomind/models/llama/GatedDeltaNetLayer.h b/src/turbomind/models/llama/GatedDeltaNetLayer.h
index 67e240c891..bb6b1e2c9a 100644
--- a/src/turbomind/models/llama/GatedDeltaNetLayer.h
+++ b/src/turbomind/models/llama/GatedDeltaNetLayer.h
@@ -2,7 +2,7 @@
 
 #include "src/turbomind/core/tensor.h"
 #include "src/turbomind/engine/batch.h"
-#include "src/turbomind/models/llama/GatedDeltaNetWeight.h"
+#include "src/turbomind/models/delta_net_weight.h"
 #include "src/turbomind/models/llama/LlamaLinear.h"
 #include "src/turbomind/models/llama/context.h"
 #include "src/turbomind/models/llama/llama_params.h"
@@ -12,19 +12,18 @@ namespace turbomind {
 class GatedDeltaNetLayer {
 public:
     struct ForwardParam {
-        int                        phase;
-        Tensor                     input;
-        Tensor                     output;
-        const GatedDeltaNetWeight* weights;
-        int                        layer_id;
+        int                   phase;
+        Tensor                input;
+        Tensor                output;
+        const DeltaNetWeight* weights;
+        int                   layer_id;
     };
 
-    GatedDeltaNetLayer(const ModelParam&     model,
-                       const AttentionParam& attn,
-                       const EngineParam&    engine,
-                       int                   tp_size,
-                       const Context&        ctx,
-                       int                   phases);
+    GatedDeltaNetLayer(DataType                state_dtype,
+                       const std::vector<int>& layer_types,
+                       const EngineParam&      engine,
+                       const Context&          ctx,
+                       int                     phases);
 
     ~GatedDeltaNetLayer();
 
@@ -35,32 +34,21 @@ class GatedDeltaNetLayer {
 private:
     void Setup(int phase, TensorMap& env);
 
-    // Model dimensions
-    int              hidden_units_;
-    int              num_k_heads_;
-    int              num_v_heads_;
-    int              key_head_dim_;
-    int              value_head_dim_;
-    int              d_conv_;
-    int              key_dim_;            // num_k_heads * key_head_dim
-    int              value_dim_;          // num_v_heads * value_head_dim
-    int              conv_dim_;           // key_dim * 2 + value_dim
-    int              num_linear_layers_;  // count of linear attention layers for state sizing
-    std::vector<int> layer_types_;        // model layer types for index mapping
-
-    float    norm_eps_;
-    DataType dtype_;
-    DataType state_dtype_;  // recurrent state dtype (may differ from dtype_ for float32 state)
+    // Config passed at construction
+    int              tp_size_;
+    int              num_linear_layers_;
+    std::vector<int> layer_types_;
+    DataType         state_dtype_;
 
     LlamaLinear& linear_;
 
     // Per-phase batch data (mirrors UnifiedAttentionLayer pattern)
     struct Data {
-        std::vector<RequestCache*> rc;          // borrowed batch RequestCache pointers
-        std::vector<int>           input_lens;  // snapshot of input_len per request (captured at Setup time)
+        std::vector<RequestCache*> rc;
+        std::vector<int>           input_lens;
         int                        batch_size = 0;
-        Buffer_<int>               q_offsets;  // cumulative input-token offsets, device buffer
-        Buffer_<int>               k_offsets;  // cumulative key (history+input) offsets, device buffer
+        Buffer_<int>               q_offsets;
+        Buffer_<int>               k_offsets;
         std::vector<Tensor>        conv_states;
         std::vector<Tensor>        recurrent_states;
         Buffer_<void*>             conv_state_ptrs;
@@ -72,14 +60,12 @@ class GatedDeltaNetLayer {
     Buffer_<void*> conv_state_ptrs_buf_;
     Buffer_<void*> recurrent_state_ptrs_buf_;
 
-    // Queried once at construction; passed to all three kernel launchers.
     int          sm_count_{1};
-    Buffer_<int> work_counter_;  // 1-element device int for v3 atomic claiming
+    Buffer_<int> work_counter_;
 
-    // Dual-stream dispatch: prefill on high-priority aux stream, decode on main
     cudaStream_t aux_stream_{};
-    cudaEvent_t  ev_before_{};  // main→aux: prior work done
-    cudaEvent_t  ev_after_{};   // aux→main: prefill done
+    cudaEvent_t  ev_before_{};
+    cudaEvent_t  ev_after_{};
 };
 
 }  // namespace turbomind
diff --git a/src/turbomind/models/llama/GatedDeltaNetWeight.cc b/src/turbomind/models/llama/GatedDeltaNetWeight.cc
deleted file mode 100644
index c31ab7c0f2..0000000000
--- a/src/turbomind/models/llama/GatedDeltaNetWeight.cc
+++ /dev/null
@@ -1,176 +0,0 @@
-#include "src/turbomind/models/llama/GatedDeltaNetWeight.h"
-#include "src/turbomind/kernels/gpt_kernels.h"
-#include "src/turbomind/utils/cuda_utils.h"
-
-namespace turbomind {
-
-GatedDeltaNetWeight::GatedDeltaNetWeight(int      hidden_dim,
-                                         int      num_k_heads,
-                                         int      num_v_heads,
-                                         int      key_head_dim,
-                                         int      value_head_dim,
-                                         int      d_conv,
-                                         bool     bias,
-                                         int      tp_size,
-                                         int      tp_rank,
-                                         DataType data_type,
-                                         DataType weight_type,
-                                         int      group_size):
-    tp_rank_(tp_rank), tp_size_(tp_size)
-{
-    const int key_dim    = num_k_heads * key_head_dim / tp_size;
-    const int value_dim  = num_v_heads * value_head_dim / tp_size;
-    const int v_heads_tp = num_v_heads / tp_size;
-    const int conv_dim   = key_dim * 2 + value_dim;
-
-    // GatedDeltaNet projections are stored as plain dense weights in the checkpoint
-    // (dense_wtype = data_type avoids quantization path for these projections).
-    const DataType dense_wtype = data_type;
-    const int      dense_gsz   = 0;
-
-    // Individual projections registered for checkpoint loading
-    in_proj_qkv.emplace(hidden_dim, conv_dim, data_type, bias, dense_wtype, dense_gsz);
-    in_proj_z.emplace(hidden_dim, value_dim, data_type, bias, dense_wtype, dense_gsz);
-    in_proj_b.emplace(hidden_dim, v_heads_tp, data_type, bias, dense_wtype, dense_gsz);
-    in_proj_a.emplace(hidden_dim, v_heads_tp, data_type, bias, dense_wtype, dense_gsz);
-    out_proj.emplace(value_dim, hidden_dim, data_type, bias, dense_wtype, dense_gsz);
-
-    register_module("in_proj_qkv", in_proj_qkv, tp_rank_);
-    register_module("in_proj_z", in_proj_z, tp_rank_);
-    register_module("in_proj_b", in_proj_b, tp_rank_);
-    register_module("in_proj_a", in_proj_a, tp_rank_);
-    register_module("out_proj", out_proj, tp_rank_);
-
-    // conv1d: depthwise weights, shape (conv_dim, d_conv)
-    conv1d = Tensor{{conv_dim, d_conv}, data_type, kDEVICE};
-    register_parameter("conv1d." + std::to_string(tp_rank_) + ".weight", conv1d);
-
-    // A_log: log-space decay per head, shape (num_v_heads/tp,)
-    A_log = Tensor{{v_heads_tp}, data_type, kDEVICE};
-    register_parameter("A_log." + std::to_string(tp_rank_) + ".weight", A_log);
-
-    // dt_bias: per head, shape (num_v_heads/tp,)
-    dt_bias = Tensor{{v_heads_tp}, data_type, kDEVICE};
-    register_parameter("dt_bias." + std::to_string(tp_rank_) + ".weight", dt_bias);
-
-    // norm: RMSNormGated weight, shape (value_head_dim,)
-    norm = Tensor{{value_head_dim}, data_type, kDEVICE};
-    register_parameter("norm.weight", norm);
-}
-
-// ---------------------------------------------------------------------------
-// Row-wise concatenation of 4 weight matrices into a single pre-allocated
-// destination tensor.
-//
-// Each source weight has shape (input_dim, out_dim_i) in row-major storage.
-// The destination has shape (input_dim, sum_i out_dim_i) and rows are filled
-// by concatenating the corresponding source rows in order.
-//
-// Implemented with cudaMemcpy2DAsync so that no extra temporary is needed:
-// each source "column block" is scattered into the correct column range of
-// the destination in one pass per source.
-// ---------------------------------------------------------------------------
-static void
-concat_weights_4(const Tensor& a, const Tensor& b, const Tensor& c, const Tensor& d, Tensor& dst, cudaStream_t st)
-{
-    // Tensors are (K=input_dim, M=output_dim) in row-major order.
-    // Each row of `dst` is [a_row | b_row | c_row | d_row].
-    const int K       = dst.shape(0);
-    const int M_a     = a.shape(1);
-    const int M_b     = b.shape(1);
-    const int M_c     = c.shape(1);
-    const int M_d     = d.shape(1);
-    const int M_dst   = dst.shape(1);  // M_a + M_b + M_c + M_d
-    const int elem_sz = byte_size(dst.dtype(), 1);
-
-    // Pitch of the destination row in bytes
-    const size_t dst_pitch   = (size_t)M_dst * elem_sz;
-    const size_t src_pitch_a = (size_t)M_a * elem_sz;
-    const size_t src_pitch_b = (size_t)M_b * elem_sz;
-    const size_t src_pitch_c = (size_t)M_c * elem_sz;
-    const size_t src_pitch_d = (size_t)M_d * elem_sz;
-
-    char* dst_ptr = reinterpret_cast<char*>(dst.raw_data());
-
-    // Columns [0, M_a)
-    check_cuda_error(
-        cudaMemcpy2DAsync(dst_ptr, dst_pitch, a.raw_data(), src_pitch_a, src_pitch_a, K, cudaMemcpyDefault, st));
-
-    // Columns [M_a, M_a+M_b)
-    check_cuda_error(cudaMemcpy2DAsync(
-        dst_ptr + src_pitch_a, dst_pitch, b.raw_data(), src_pitch_b, src_pitch_b, K, cudaMemcpyDefault, st));
-
-    // Columns [M_a+M_b, M_a+M_b+M_c)
-    check_cuda_error(cudaMemcpy2DAsync(dst_ptr + src_pitch_a + src_pitch_b,
-                                       dst_pitch,
-                                       c.raw_data(),
-                                       src_pitch_c,
-                                       src_pitch_c,
-                                       K,
-                                       cudaMemcpyDefault,
-                                       st));
-
-    // Columns [M_a+M_b+M_c, M_dst)
-    check_cuda_error(cudaMemcpy2DAsync(dst_ptr + src_pitch_a + src_pitch_b + src_pitch_c,
-                                       dst_pitch,
-                                       d.raw_data(),
-                                       src_pitch_d,
-                                       src_pitch_d,
-                                       K,
-                                       cudaMemcpyDefault,
-                                       st));
-    sync_check_cuda_error();
-}
-
-void GatedDeltaNetWeight::prepare()
-{
-    auto stream = core::Context::stream().handle();
-
-    // Preprocess individual weights (converts blockscale FP8, etc.)
-    in_proj_qkv.preprocess();
-    in_proj_z.preprocess();
-    in_proj_b.preprocess();
-    in_proj_a.preprocess();
-    out_proj.preprocess();
-    out_proj.prepare();
-
-    // Build the fused input projection weight:
-    //   shape (hidden_dim,  conv_dim + value_dim + 2*v_heads_tp)
-    //   = [in_proj_qkv | in_proj_z | in_proj_b | in_proj_a]  (column-wise)
-    const int out_all = in_proj_qkv.output_dim  //
-                        + in_proj_z.output_dim  //
-                        + in_proj_b.output_dim  //
-                        + in_proj_a.output_dim;
-
-    in_proj_all.emplace(in_proj_qkv.input_dim,
-                        out_all,
-                        in_proj_qkv.data_type,
-                        /*bias=*/false,
-                        in_proj_qkv.weight_type,
-                        in_proj_qkv.group_size);
-
-    concat_weights_4(
-        in_proj_qkv.weight, in_proj_z.weight, in_proj_b.weight, in_proj_a.weight, in_proj_all.weight, stream);
-
-    // Prepare (convert/repack) the fused weight for GEMM
-    in_proj_all.prepare();
-
-    // Release the now-redundant individual weight tensors to free HBM
-    in_proj_qkv = {};
-    in_proj_z   = {};
-    in_proj_b   = {};
-    in_proj_a   = {};
-
-    // Transpose conv1d from checkpoint layout [conv_dim, d_conv] to kernel layout [d_conv, conv_dim]
-    {
-        const int rows = conv1d.shape(0);  // conv_dim
-        const int cols = conv1d.shape(1);  // d_conv
-
-        Tensor conv1d_t{{cols, rows}, conv1d.dtype(), kDEVICE};
-        invokeTransposeAxis01((uint16_t*)conv1d_t.raw_data(), (uint16_t*)conv1d.raw_data(), rows, cols, 1, stream);
-        sync_check_cuda_error();
-        conv1d = std::move(conv1d_t);
-    }
-}
-
-}  // namespace turbomind
diff --git a/src/turbomind/models/llama/GatedDeltaNetWeight.h b/src/turbomind/models/llama/GatedDeltaNetWeight.h
deleted file mode 100644
index 6683584cfd..0000000000
--- a/src/turbomind/models/llama/GatedDeltaNetWeight.h
+++ /dev/null
@@ -1,52 +0,0 @@
-#pragma once
-
-#include "src/turbomind/core/core.h"
-#include "src/turbomind/core/module.h"
-#include "src/turbomind/models/llama/LlamaDenseWeight.h"
-
-namespace turbomind {
-
-struct GatedDeltaNetWeight: public core::Module {
-
-    GatedDeltaNetWeight() = default;
-
-    GatedDeltaNetWeight(int      hidden_dim,
-                        int      num_k_heads,
-                        int      num_v_heads,
-                        int      key_head_dim,
-                        int      value_head_dim,
-                        int      d_conv,
-                        bool     bias,
-                        int      tp_size,
-                        int      tp_rank,
-                        DataType data_type,
-                        DataType weight_type,
-                        int      group_size);
-
-    void prepare();
-
-    // Individual projections – populated at load time from the checkpoint.
-    // After prepare() completes they are released (null-ed) to free HBM.
-    LlamaDenseWeight in_proj_qkv;  // hidden -> key_dim*2 + value_dim
-    LlamaDenseWeight in_proj_z;    // hidden -> value_dim (output gate)
-    LlamaDenseWeight in_proj_b;    // hidden -> num_v_heads (beta, per-head scalar)
-    LlamaDenseWeight in_proj_a;    // hidden -> num_v_heads (alpha/dt, per-head scalar)
-
-    // Fused projection: hidden -> (conv_dim + value_dim + 2*v_heads_tp).
-    // Built from the four above in prepare(); used for all inference GEMMs.
-    // Reduces p.input HBM reads from 4× to 1× per forward pass.
-    LlamaDenseWeight in_proj_all;
-
-    LlamaDenseWeight out_proj;  // value_dim -> hidden
-
-    // Non-dense parameters
-    Tensor conv1d;   // depthwise conv weights: (d_conv, conv_dim)
-    Tensor A_log;    // log-space decay: (num_v_heads,)
-    Tensor dt_bias;  // dt bias: (num_v_heads,)
-    Tensor norm;     // RMSNormGated weight: (value_head_dim,)
-
-    int tp_rank_;
-    int tp_size_;
-};
-
-}  // namespace turbomind
diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc
deleted file mode 100644
index ca7fc25e97..0000000000
--- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc
+++ /dev/null
@@ -1,179 +0,0 @@
-/*
- * Copyright (c) OpenMMLab. All rights reserved.
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.  All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Modified from
-// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptDecoderLayerWeight.cc
-
-#include <cstdlib>
-
-#include <cuda_bf16.h>
-#include <cuda_runtime.h>
-
-#include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h"
-
-#include "src/turbomind/core/data_type.h"
-#include "src/turbomind/core/logger.h"
-#include "src/turbomind/models/llama/LlamaDenseWeight.h"
-#include "src/turbomind/models/llama/llama_params.h"
-#include "src/turbomind/utils/cuda_utils.h"
-
-namespace turbomind {
-
-static bool is_fuse_silu_act()
-{
-    static const bool value = [] {
-        const auto str = std::getenv("TM_FUSE_SILU_ACT");
-        if (str) {
-            try {
-                auto v = std::stoi(str) != 0;
-                TM_LOG_INFO("TM_FUSE_SILU_ACT={}", (int)v);
-                return v;
-            }
-            catch (...) {
-            }
-        }
-        // TM_LOG_INFO("TM_FUSE_SILU_ACT=1");
-        return true;
-    }();
-    return value;
-}
-
-LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(
-    DataType data_type, int layer_id, const ModelParam& model, const EngineParam& engine, const MoeParam& moe_param):
-    head_num_(model.head_num),
-    kv_head_num_(model.kv_head_num),
-    size_per_head_(model.head_dim),
-    hidden_units_(model.hidden_units),
-    inter_size_(model.inter_size.at(layer_id)),
-    data_type_{data_type},
-    weight_type_(model.weight_type),
-    expert_weight_type_(model.expert_weight_type),
-    attn_bias_(model.attn_bias),
-    attn_tp_size_(engine.attn_tp_size),
-    attn_tp_rank_(engine.attn_tp_rank),
-    mlp_tp_size_(engine.mlp_tp_size),
-    mlp_tp_rank_(engine.mlp_tp_rank)
-{
-    bool is_linear_attention = false;
-    if (layer_id < (int)model.layer_types.size() && model.layer_types[layer_id] == 1) {
-        is_linear_attention = true;
-    }
-
-    if (is_linear_attention) {
-        linear_attn_weights.reset(
-            new GatedDeltaNetWeight{hidden_units_,
-                                    model.linear_num_key_heads,
-                                    model.linear_num_value_heads,
-                                    model.linear_key_head_dim,
-                                    model.linear_value_head_dim,
-                                    model.linear_conv_kernel_dim > 0 ? model.linear_conv_kernel_dim : 4,
-                                    attn_bias_,
-                                    attn_tp_size_,
-                                    attn_tp_rank_,
-                                    data_type_,
-                                    weight_type_,
-                                    model.group_size});
-        register_module("linear_attn", *linear_attn_weights);
-    }
-    else {
-        // Attention uses weight_type (fp16 in mixed quant scenarios)
-        self_attn_weights.reset(new LlamaAttentionWeight{hidden_units_,
-                                                         size_per_head_,
-                                                         head_num_,
-                                                         kv_head_num_,
-                                                         model.mla,
-                                                         attn_bias_,
-                                                         model.qk_norm,
-                                                         attn_tp_size_,
-                                                         attn_tp_rank_,
-                                                         data_type_,
-                                                         weight_type_,
-                                                         model.group_size,
-                                                         model.window_size.empty() ? 0 : model.window_size.at(layer_id),
-                                                         model.attn_sink,
-                                                         model.attn_output_gate});
-        register_module("attention", *self_attn_weights);
-    }
-
-    // FFN uses ffn_weight_type, except for layers fully excluded from
-    // quantization (e.g. 'model.layers.0.' in modules_to_not_convert)
-    // where all weights—including FFN—are in data_type (fp16).
-    if (inter_size_) {
-        const DataType ffn_wtype = model.unquantized_expert_layers.count(layer_id) ? data_type_ : model.ffn_weight_type;
-        const bool     is_cublas_gemm = byte_size(ffn_wtype, 8) == 16;
-        ffn_weights.reset(new LlamaFfnWeight{
-            hidden_units_,
-            inter_size_,
-            model.mlp_bias,
-            mlp_tp_size_,
-            mlp_tp_rank_,
-            data_type_,
-            ffn_wtype,
-            model.group_size,
-            model.act_type,
-            is_fuse_silu_act() && !is_cublas_gemm,
-        });
-        register_module("feed_forward", *ffn_weights);
-    }
-
-    // MoE routed experts use expert_weight_type (int4 for AWQ, e2m1 for mxfp4)
-    // unless the layer is in unquantized_expert_layers (e.g. layer 0 excluded
-    // from quantization via modules_to_not_convert).
-    if (layer_id < moe_param.expert_num.size() && moe_param.expert_num[layer_id]) {
-        const DataType moe_wtype = model.unquantized_expert_layers.count(layer_id) ? data_type_ : expert_weight_type_;
-        moe_weights.reset(new MoeFfnWeight{layer_id,
-                                           moe_param,
-                                           hidden_units_,
-                                           model.mlp_bias,
-                                           data_type_,
-                                           moe_wtype,
-                                           model.group_size,
-                                           mlp_tp_size_,
-                                           mlp_tp_rank_,
-                                           model.act_type,
-                                           is_fuse_silu_act()});
-        register_module("moe_ffn", *moe_weights);
-    }
-
-    self_attn_norm = Tensor{{hidden_units_}, data_type_, kDEVICE};
-    ffn_norm       = Tensor{{hidden_units_}, data_type_, kDEVICE};
-    register_parameter("attention_norm.weight", self_attn_norm);
-    register_parameter("ffn_norm.weight", ffn_norm);
-}
-
-LlamaDecoderLayerWeight::~LlamaDecoderLayerWeight() = default;
-
-void LlamaDecoderLayerWeight::prepare(const cudaDeviceProp& prop, cudaStream_t st)
-{
-    if (self_attn_weights) {
-        self_attn_weights->prepare();
-    }
-
-    if (linear_attn_weights) {
-        linear_attn_weights->prepare();
-    }
-
-    if (ffn_weights) {
-        ffn_weights->prepare(false);
-    }
-
-    if (moe_weights) {
-        moe_weights->prepare();
-    }
-}
-
-}  // namespace turbomind
diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.h b/src/turbomind/models/llama/LlamaDecoderLayerWeight.h
deleted file mode 100644
index 6ac387ab12..0000000000
--- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.h
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Copyright (c) OpenMMLab. All rights reserved.
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.  All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Modified from
-// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptDecoderLayerWeight.h
-
-#pragma once
-
-#include "src/turbomind/core/core.h"
-
-#include "src/turbomind/models/llama/GatedDeltaNetWeight.h"
-#include "src/turbomind/models/llama/LlamaDenseWeight.h"
-#include "src/turbomind/models/llama/llama_params.h"
-
-namespace turbomind {
-
-struct LlamaDecoderLayerWeight: core::Module {
-public:
-    LlamaDecoderLayerWeight() = delete;
-
-    LlamaDecoderLayerWeight(DataType           data_type,
-                            int                layer_id,
-                            const ModelParam&  model,
-                            const EngineParam& engine,
-                            const MoeParam&    moe_param);
-
-    ~LlamaDecoderLayerWeight();
-    LlamaDecoderLayerWeight(const LlamaDecoderLayerWeight&) = delete;
-    LlamaDecoderLayerWeight& operator=(const LlamaDecoderLayerWeight&) = delete;
-
-    void prepare(const cudaDeviceProp& prop, cudaStream_t st);
-
-    Tensor self_attn_norm;
-    Tensor ffn_norm;
-
-    std::unique_ptr<LlamaAttentionWeight> self_attn_weights;
-    std::unique_ptr<GatedDeltaNetWeight>  linear_attn_weights;
-
-    std::unique_ptr<LlamaFfnWeight> ffn_weights;
-    std::unique_ptr<MoeFfnWeight>   moe_weights;
-
-private:
-    int head_num_;
-    int kv_head_num_;
-    int size_per_head_;
-    int hidden_units_;
-    int inter_size_;
-
-    DataType data_type_;
-    DataType weight_type_;
-    DataType expert_weight_type_;
-
-    int  bit_size_;
-    bool attn_bias_;
-    int  attn_tp_size_;
-    int  attn_tp_rank_;
-    int  mlp_tp_size_;
-    int  mlp_tp_rank_;
-};
-
-}  // namespace turbomind
diff --git a/src/turbomind/models/llama/LlamaDenseWeight.cc b/src/turbomind/models/llama/LlamaDenseWeight.cc
deleted file mode 100644
index 1764d3622a..0000000000
--- a/src/turbomind/models/llama/LlamaDenseWeight.cc
+++ /dev/null
@@ -1,690 +0,0 @@
-// Copyright (c) OpenMMLab. All rights reserved.
-
-#include <utility>
-
-#include "src/turbomind/models/llama/LlamaDenseWeight.h"
-
-#include "src/turbomind/core/allocator.h"
-#include "src/turbomind/core/data_type.h"
-
-#include "src/turbomind/kernels/activation.h"
-#include "src/turbomind/kernels/gemm/cast.h"
-#include "src/turbomind/kernels/gemm/convert.h"
-#include "src/turbomind/kernels/gemm/gemm.h"
-#include "src/turbomind/kernels/gemm/types.h"
-#include "src/turbomind/kernels/gemm/utils.h"
-#include "src/turbomind/kernels/gpt_kernels.h"
-#include "src/turbomind/utils/cuda_utils.h"
-
-namespace turbomind {
-
-void LlamaDenseWeight::emplace(
-    int input_dim, int output_dim, DataType data_type, bool bias, DataType weight_type, int group_size)
-{
-    this->data_type   = data_type;
-    this->input_type  = data_type;
-    this->weight_type = weight_type;
-    this->input_dim   = input_dim;
-    this->output_dim  = output_dim;
-    this->group_size  = group_size;
-
-    const bool is_qweight = weight_type == kUint4 || weight_type == kUint8;
-
-    weight = Tensor({input_dim, output_dim}, weight_type, kDEVICE);
-    register_parameter(is_qweight ? "qweight" : "weight", weight);
-
-    if (bias) {
-        this->bias = Tensor{{output_dim}, data_type, kDEVICE};
-        register_parameter("bias", this->bias);
-    }
-
-    if (weight_type == kFloat8_e4m3) {
-        TM_CHECK_EQ(group_size, 128);
-        scales       = Tensor{{cdiv(input_dim, group_size), cdiv(output_dim, group_size)}, kFloat, kDEVICE};
-        weight_quant = QuantDesc{gemm::QuantType::kB, group_size};
-        if (getSMVersion() == 90) {
-            input_type  = kFloat8_e4m3;
-            input_quant = QuantDesc{gemm::QuantType::kK, group_size};
-        }
-        register_parameter("scales", scales);
-    }
-    else if (weight_type == kFloat4_e2m1) {
-        scales       = Tensor{{cdiv(input_dim, group_size), output_dim}, kUint8, kDEVICE};
-        input_type   = data_type;
-        weight_quant = QuantDesc{gemm::QuantType::kK, group_size};
-        register_parameter("scales", scales);
-    }
-    else if (is_qweight) {
-        TM_CHECK(input_dim % group_size == 0) << input_dim << " " << group_size;
-        scales       = Tensor{{input_dim / group_size, output_dim}, data_type, kDEVICE};
-        zeros        = Tensor{{input_dim / group_size, output_dim}, data_type, kDEVICE};
-        weight_quant = QuantDesc{gemm::QuantType::kK, group_size};
-        register_parameter("scales", scales);
-        register_parameter("zeros", zeros);
-    }
-
-    k_desc = {};
-    q_desc = {};
-
-    // default case: floating point, N-major
-    k_desc.type  = weight.dtype();
-    k_desc.order = gemm::kRowMajor;
-    k_desc.rows  = input_dim;
-    k_desc.cols  = output_dim;
-    k_desc.ld    = output_dim;
-}
-
-void LlamaDenseWeight::preprocess()
-{
-    if (!weight) {
-        return;
-    }
-    if (weight_quant.type == gemm::QuantType::kB && input_quant.type == gemm::QuantType::kNone) {
-        // Convert blockwise scales to groupwise scales
-        weight_quant.type = gemm::QuantType::kK;
-        scales            = BlockscaleToGroupscale(scales, data_type, weight_quant.group_size);
-    }
-}
-
-static void Convert(LlamaDenseWeight& dense, bool is_grouped, cudaStream_t st)
-{
-    using namespace gemm;
-
-    auto [conv_w, conv_s] =
-        GetConverters(dense.data_type, dense.weight_type, dense.input_type, is_grouped, getSMVersion());
-
-    if (conv_w) {
-        const auto order_w = conv_w->order;
-        const bool is_A    = get_operand_tag(conv_w->pack) == OPERAND_A;
-        const bool is_B    = !is_A;
-
-        const int bits = byte_size(dense.weight_type, 8);
-
-        Tensor_<uint16_t> tmp{{dense.input_dim, dense.output_dim}, kDEVICE};
-
-        if (bits == 4) {  // u4 -> u16
-            extend_to_u16(tmp.data(), (const uint4_t*)dense.weight.raw_data(), tmp.size(), st);
-            sync_check_cuda_error();
-        }
-        else if (bits == 8) {  // u8 -> u16
-            extend_to_u16(tmp.data(), (const uint8_t*)dense.weight.raw_data(), tmp.size(), st);
-            sync_check_cuda_error();
-        }
-        else if (bits == 16) {
-            check_cuda_error(
-                cudaMemcpyAsync(tmp.raw_data(), dense.weight.raw_data(), tmp.byte_size(), cudaMemcpyDefault, st));
-        }
-
-        if (order_w == kRowMajor) {  // (k,m) -> (m,k)
-            Tensor_<uint16_t> trans{{dense.output_dim, dense.input_dim}, kDEVICE};
-            invokeTransposeAxis01(trans.data(), tmp.data(), dense.input_dim, dense.output_dim, 1, st);
-            tmp = trans;
-        }
-
-        MatrixLayout w_desc{
-            dense.data_type,
-            order_w,
-            (int)dense.output_dim,  // M
-            (int)dense.input_dim,   // K
-            order_w == kRowMajor ? (int)dense.input_dim : (int)dense.output_dim,
-        };
-
-        if (is_B) {
-            std::swap(w_desc.rows, w_desc.cols);
-            w_desc.order = ~w_desc.order;
-        }
-
-        MatrixLayout k_desc = w_desc;
-        k_desc.type         = dense.weight_type;
-        // Converter does not recognize e2m1 / e4m3
-        if (bits == 4) {
-            k_desc.type = data_type_v<uint4_t>;
-        }
-        else if (bits == 8) {
-            k_desc.type = data_type_v<uint8_t>;
-        }
-        k_desc.pack = conv_w->pack;
-
-        check_cuda_error(cudaMemsetAsync(dense.weight.raw_data(), 0, dense.weight.byte_size(), st));
-
-        TM_CHECK(conv_w->Convert(tmp.data(), w_desc, dense.weight.raw_data(), k_desc, st) == 0);
-
-        sync_check_cuda_error();
-
-        k_desc.type = dense.weight_type;
-        if (is_A) {
-            k_desc = transpose(k_desc);
-        }
-        dense.k_desc = k_desc;
-    }
-
-    if (conv_s) {
-        const auto order_s = conv_s->order;
-        const auto pack_s  = conv_s->pack;
-        const bool is_A    = get_operand_tag(conv_s->pack) == OPERAND_U;
-        const bool is_B    = !is_A;
-
-        Tensor   tmp_q;
-        DataType scale_type;
-
-        if (dense.zeros) {  // AWQ/GPTQ fuse scales and zeros
-            tmp_q = {{dense.scales.size(), 2}, kHalf, kDEVICE};
-            fuse_scales_and_zeros(
-                tmp_q.data<half>(), dense.scales.data<half>(), dense.zeros.data<half>(), dense.scales.size(), st);
-            scale_type   = kUint32;  // half2
-            dense.zeros  = {};
-            dense.scales = empty_like(tmp_q);
-        }
-        else if (dense.weight_type == kFloat8_e4m3) {  // e4m3
-            tmp_q = empty_like(dense.scales);
-            Copy(dense.scales, tmp_q);
-            scale_type = kUint16;  // bf16
-        }
-        else {  // mxfp4
-            tmp_q = empty_like(dense.scales);
-            Copy(dense.scales, tmp_q);
-            scale_type = kUint8;  // ue8m0
-        }
-
-        if (dense.data_type == kHalf && dense.weight_type == kFloat4_e2m1) {  // mxfp4
-            AdjustUe8m0ScaleForHalf(tmp_q.data<uint8_t>(), tmp_q.size(), st);
-            sync_check_cuda_error();
-        }
-
-        MatrixLayout s_desc{
-            scale_type,
-            order_s,
-            (int)dense.output_dim,                    // M
-            (int)dense.input_dim / dense.group_size,  // K
-            (int)dense.output_dim,                    // always MN-major
-        };
-
-        if (is_B) {
-            std::swap(s_desc.rows, s_desc.cols);
-            s_desc.order = ~s_desc.order;
-        }
-
-        MatrixLayout q_desc = s_desc;
-        q_desc.pack         = pack_s;
-
-        TM_CHECK(conv_s->Convert(tmp_q.raw_data(), s_desc, dense.scales.raw_data(), q_desc, st) == 0);
-        sync_check_cuda_error();
-
-        // weight is placed at B in `Linear`
-        if (is_A) {
-            q_desc = transpose(q_desc);
-        }
-        dense.q_desc = q_desc;
-    }
-}
-
-static void ConvertBlockscaleFP8Native(LlamaDenseWeight& dense, cudaStream_t stream)
-{
-    using namespace gemm;
-
-    TM_CHECK_GE(getSMVersion(), 90);
-    TM_CHECK_EQ(dense.data_type, data_type_v<bfloat16_t>);
-
-    auto process = [&](Tensor& x, MatrixLayout& d, auto dtype) {
-        using T = decltype(dtype);
-        Tensor trans{{x.shape(1), x.shape(0)}, x.dtype(), kDEVICE};
-        invokeTransposeAxis01((T*)trans.raw_data(), (T*)x.raw_data(), x.shape(0), x.shape(1), 1, stream);
-        x = std::move(trans);
-        d = MatrixLayout{x.dtype(),  //
-                         kColMajor,
-                         (int)x.shape(1),
-                         (int)x.shape(0),
-                         (int)x.stride(0)};
-    };
-
-    TM_CHECK_EQ(dense.weight.dtype(), kFloat8_e4m3);
-    process(dense.weight, dense.k_desc, uint8_t{});
-
-    TM_CHECK_EQ(dense.scales.dtype(), kFloat);
-    process(dense.scales, dense.q_desc, float{});
-}
-
-void LlamaDenseWeight::prepare(bool fused_moe)
-{
-    if (!weight) {
-        return;
-    }
-
-    auto stream = core::Context::stream().handle();
-
-    if (weight_type == kFloat8_e4m3 && input_type == kFloat8_e4m3) {
-        ConvertBlockscaleFP8Native(*this, stream);
-    }
-    else {
-        Convert(*this, fused_moe, stream);
-    }
-}
-
-LlamaAttentionWeight::LlamaAttentionWeight(int      hidden_dim,
-                                           int      head_dim,
-                                           int      head_num,
-                                           int      kv_head_num,
-                                           MLAParam mla,
-                                           bool     bias,
-                                           bool     qk_norm,
-                                           int      tp_size,
-                                           int      tp_rank,
-                                           DataType data_type,
-                                           DataType weight_type,
-                                           int      group_size,
-                                           int      window_size,
-                                           bool     sink,
-                                           bool     attn_output_gate)
-{
-    this->window_size = window_size;
-
-    // attn_output_gate doubles Q dimension (extra gate projection fused into Q)
-    const int q_factor = attn_output_gate ? 2 : 1;
-
-    if (mla.kv_lora_rank == 0) {
-        qkv.emplace(hidden_dim,
-                    (head_num * q_factor + 2 * kv_head_num) * head_dim / tp_size,
-                    data_type,
-                    bias,
-                    weight_type,
-                    group_size);
-        register_module("w_qkv", qkv, tp_rank);
-        if (qk_norm) {
-            q_a_layernorm  = Tensor{{head_dim}, data_type, kDEVICE};
-            kv_a_layernorm = Tensor{{head_dim}, data_type, kDEVICE};
-            register_parameter("q_norm", q_a_layernorm);
-            register_parameter("k_norm", kv_a_layernorm);
-        }
-    }
-    else {
-        const int qk_nope_dim = head_dim - mla.qk_rope_dim;
-        if (mla.q_lora_rank) {
-            q_a_proj.emplace(hidden_dim, mla.q_lora_rank, data_type, false, weight_type, group_size);
-            q_b_proj.emplace(mla.q_lora_rank, head_num * head_dim / tp_size, data_type, false, weight_type, group_size);
-            q_a_layernorm = Tensor{{q_b_proj.input_dim}, data_type, kDEVICE};
-            register_module("q_a_proj", q_a_proj);
-            register_module("q_b_proj", q_b_proj, tp_rank);
-            register_parameter("q_a_layernorm", q_a_layernorm);
-        }
-        else {
-            q_proj.emplace(hidden_dim, head_num * head_dim / tp_size, data_type, false, weight_type, group_size);
-            register_module("q_proj", q_proj, tp_rank);
-        }
-        kv_a_proj.emplace(hidden_dim, mla.kv_lora_rank + mla.qk_rope_dim, data_type, false, weight_type, group_size);
-        // kv_b_proj.emplace(mla.kv_lora_rank,
-        //                   head_num * (qk_nope_dim + mla.v_head_dim) / tp_size,
-        //                   data_type,
-        //                   false,
-        //                   weight_type,
-        //                   group_size);
-
-        kv_a_layernorm = Tensor{{mla.kv_lora_rank}, data_type, kDEVICE};
-        register_module("kv_a_proj", kv_a_proj);
-        // register_module("kv_b_proj", kv_b_proj, tp_rank);
-        register_parameter("kv_a_layernorm", kv_a_layernorm);
-    }
-    output.emplace((head_num * head_dim) / tp_size, hidden_dim, data_type, bias, weight_type, group_size);
-    register_module("wo", output, tp_rank);
-
-    if (sink) {
-        sinks = Tensor{{head_num / tp_size}, data_type, kDEVICE};
-        register_parameter(std::to_string(tp_rank) + ".sinks", sinks);
-    }
-}
-
-void LlamaAttentionWeight::prepare()
-{
-    std::vector weights{
-        &qkv, &output, &q_a_proj, &q_a_proj, &q_b_proj, &kv_a_proj  // &kv_b_proj,
-    };
-    for (auto& w : weights) {
-        w->preprocess();
-        w->prepare();
-    }
-}
-
-LlamaFfnWeight::LlamaFfnWeight(int            hidden_dim,
-                               int            inter_size,
-                               bool           bias,
-                               int            tp_size,
-                               int            tp_rank,
-                               DataType       data_type,
-                               DataType       weight_type,
-                               int            group_size,
-                               ActivationType act_type,
-                               bool           fuse_silu_act)
-{
-    TM_CHECK(inter_size % tp_size == 0) << inter_size << " " << tp_size;
-
-    inter_size /= tp_size;
-
-    this->inter_size    = inter_size;
-    this->tp_rank       = tp_rank;
-    this->act_type      = act_type;
-    this->is_fused_silu = fuse_silu_act && this->act_type == ActivationType::kSilu;
-
-    gating.emplace(hidden_dim, inter_size, data_type, bias, weight_type, group_size);
-
-    intermediate.emplace(hidden_dim, inter_size, data_type, bias, weight_type, group_size);
-
-    output.emplace(inter_size, hidden_dim, data_type, bias, weight_type, group_size);
-
-    if (gating.input_type == kFloat8_e4m3) {  // SM90 FP8*FP8 GEMM, can't fuse
-        this->is_fused_silu = false;
-    }
-
-    register_module("w1", gating, tp_rank);
-    register_module("w3", intermediate, tp_rank);
-    register_module("w2", output, tp_rank);
-}
-
-static void Interleave(const Tensor& a, const Tensor& b, Tensor& c, cudaStream_t st)
-{
-    TM_CHECK(a.layout() == b.layout());
-    int M, K;
-    if (a.ndim() == 2) {
-        std::tie(K, M) = a.shapes(0, 1);
-    }
-    else {
-        M = a.shape(0);
-        K = 1;
-    }
-    auto a_ = a.raw_data();
-    auto b_ = b.raw_data();
-    auto c_ = c.raw_data();
-
-    const int bits = byte_size(a.dtype(), 8);
-    if (bits == 4) {
-        Buffer_<uint8_t> ta{a.size(), kDEVICE};
-        Buffer_<uint8_t> tb{b.size(), kDEVICE};
-        Buffer_<uint8_t> tc{c.size(), kDEVICE};
-        extend_to_u8(ta.data(), (uint4_t*)a_, a.size(), st);
-        extend_to_u8(tb.data(), (uint4_t*)b_, b.size(), st);
-        interleave_output_dims(tc.data(), ta.data(), tb.data(), M, K, st);
-        compact_to_u4((uint4_t*)c_, tc.data(), c.size(), st);
-    }
-    else if (bits == 8) {
-        interleave_output_dims((uint8_t*)c_, (uint8_t*)a_, (uint8_t*)b_, M, K, st);
-    }
-    else if (bits == 16) {
-        interleave_output_dims((uint16_t*)c_, (uint16_t*)a_, (uint16_t*)b_, M, K, st);
-    }
-    else if (bits == 32) {
-        interleave_output_dims((uint32_t*)c_, (uint32_t*)a_, (uint32_t*)b_, M, K, st);
-    }
-    else {
-        TM_CHECK(0);
-    }
-}
-
-void interleave(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight& b, DataType data_type, cudaStream_t st)
-{
-    TM_CHECK_EQ(c.input_dim, a.input_dim);
-    TM_CHECK_EQ(c.input_dim, b.input_dim);
-    TM_CHECK_EQ(c.output_dim, a.output_dim * 2);
-    TM_CHECK_EQ(c.output_dim, b.output_dim * 2);
-    TM_CHECK_EQ(c.group_size, a.group_size);
-    TM_CHECK_EQ(c.group_size, b.group_size);
-
-    Interleave(a.weight, b.weight, c.weight, st);
-    sync_check_cuda_error();
-
-    if (a.scales) {
-        Interleave(a.scales, b.scales, c.scales, st);
-        sync_check_cuda_error();
-    }
-    if (a.zeros) {
-        Interleave(a.zeros, b.zeros, c.zeros, st);
-        sync_check_cuda_error();
-    }
-    if (a.bias) {
-        Interleave(a.bias, b.bias, c.bias, st);
-        sync_check_cuda_error();
-    }
-}
-
-static void Chunk(const Tensor& a, const Tensor& b, Tensor& c, cudaStream_t st)
-{
-    TM_CHECK(a.layout() == b.layout());
-    int M, K, spitch, dpitch;
-    if (a.ndim() == 2) {
-        std::tie(K, M) = a.shapes(0, 1);
-        spitch         = byte_size(a.dtype(), a.stride(0));
-        dpitch         = byte_size(c.dtype(), c.stride(0));
-    }
-    else {
-        M      = a.shape(0);
-        K      = 1;
-        spitch = byte_size(a.dtype(), M);
-        dpitch = byte_size(c.dtype(), c.shape(0));
-    }
-    int height = K;
-    int width  = byte_size(a.dtype(), M);
-    check_cuda_error(cudaMemcpy2DAsync((char*)c.raw_data(),  //
-                                       dpitch,
-                                       (const char*)a.raw_data(),
-                                       spitch,
-                                       width,
-                                       height,
-                                       cudaMemcpyDefault,
-                                       st));
-    check_cuda_error(cudaMemcpy2DAsync((char*)c.raw_data() + width,  //
-                                       dpitch,
-                                       (const char*)b.raw_data(),
-                                       spitch,
-                                       width,
-                                       height,
-                                       cudaMemcpyDefault,
-                                       st));
-}
-
-void chunk(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight& b, DataType data_type, cudaStream_t st)
-{
-    TM_CHECK_EQ(c.input_dim, a.input_dim);
-    TM_CHECK_EQ(c.input_dim, b.input_dim);
-    TM_CHECK_EQ(c.output_dim, a.output_dim * 2);
-    TM_CHECK_EQ(c.output_dim, b.output_dim * 2);
-    TM_CHECK_EQ(c.group_size, a.group_size);
-    TM_CHECK_EQ(c.group_size, b.group_size);
-
-    Chunk(a.weight, b.weight, c.weight, st);
-    sync_check_cuda_error();
-
-    if (a.scales) {
-        Chunk(a.scales, b.scales, c.scales, st);
-        sync_check_cuda_error();
-    }
-    if (a.zeros) {
-        Chunk(a.zeros, b.zeros, c.zeros, st);
-        sync_check_cuda_error();
-    }
-    if (a.bias) {
-        Chunk(a.bias, b.bias, c.bias, st);
-        sync_check_cuda_error();
-    }
-}
-
-void LlamaFfnWeight::prepare(bool fused_moe)
-{
-    const auto data_type = gating.data_type;
-
-    auto stream = core::Context().stream().handle();
-
-    gating.preprocess();
-    intermediate.preprocess();
-
-    if (fuse_up_and_gate) {
-        auto& gate_and_up = fused_gating_intermediate;
-
-        gate_and_up.emplace(gating.input_dim,  //
-                            gating.output_dim * 2,
-                            gating.data_type,
-                            (bool)gating.bias,
-                            gating.weight_type,
-                            gating.group_size);
-        gate_and_up.preprocess();
-        register_module("w1w3", gate_and_up, this->tp_rank);
-
-        if (is_fused_silu) {
-            interleave(gate_and_up, gating, intermediate, data_type, stream);
-            gate_and_up.epilogue = gemm::Epilogue::kGatedSilu;
-        }
-        else {
-            chunk(gate_and_up, gating, intermediate, data_type, stream);
-        }
-
-        fused_gating_intermediate.prepare(fused_moe);
-
-        gating       = {};
-        intermediate = {};
-    }
-    else {
-        gating.prepare(fused_moe);
-        intermediate.prepare(fused_moe);
-    }
-
-    output.preprocess();
-    output.prepare(fused_moe);
-}
-
-MoeFfnWeight::MoeFfnWeight(int             layer_id,
-                           const MoeParam& param,
-                           int             hidden_dim,
-                           bool            mlp_bias,
-                           DataType        data_type,
-                           DataType        weight_type,
-                           int             group_size,
-                           int             tp_size,
-                           int             tp_rank,
-                           ActivationType  act_type,
-                           bool            fuse_silu_act)
-{
-    if ((int)param.expert_num.size() <= layer_id) {
-        return;
-    }
-
-    const int expert_num = param.expert_num[layer_id];
-
-    if (expert_num == 0) {
-        return;
-    }
-
-    gate.emplace(hidden_dim, expert_num, data_type, param.router_bias, data_type, 1);
-    register_module("gate", gate);
-
-    if (param.topk_method == "noaux_tc") {
-        score_correction_bias = Tensor{{expert_num}, kFloat, kDEVICE};
-        register_parameter("gate.score_correction_bias", score_correction_bias);
-    }
-
-    method = param.method;
-
-    const bool is_cublas_gemm = method == MoeParam::kNaive && byte_size(weight_type, 8) == 16;
-    if (is_cublas_gemm || mlp_bias) {
-        fuse_silu_act = false;
-    }
-
-    experts.reserve(expert_num);
-    for (int i = 0; i < expert_num; ++i) {
-        experts.emplace_back(new LlamaFfnWeight{hidden_dim,
-                                                param.inter_size,
-                                                mlp_bias,
-                                                tp_size,
-                                                tp_rank,
-                                                data_type,
-                                                weight_type,
-                                                group_size,
-                                                act_type,
-                                                fuse_silu_act});
-        register_module("experts", *experts.back(), i);
-    }
-
-    if (param.shared_gate) {
-        shared_gate.emplace(hidden_dim, 1, data_type, false, data_type, 1);
-        register_module("shared_gate", shared_gate);
-    }
-}
-
-void MoeFfnWeight::prepare()
-{
-    const auto fused_moe = method == MoeParam::kFused;
-
-    gate.prepare();
-    shared_gate.prepare();
-
-    for (auto& e : experts) {
-        e->prepare(fused_moe);
-    }
-
-    const int n = experts.size();
-    LinkExperts([&](int i) { return &experts[i]->fused_gating_intermediate; }, n, block.fused_gating_intermediate);
-    LinkExperts([&](int i) { return &experts[i]->output; }, n, block.output);
-
-    auto& e = *experts.at(0);
-    // Copy MLP properties
-    block.inter_size    = e.inter_size;
-    block.is_fused_silu = e.is_fused_silu;
-    block.act_type      = e.act_type;
-}
-
-void LinkExperts(std::function<LlamaDenseWeight*(int)> experts, int n, LlamaDenseWeight& d)
-{
-    const auto& e = *experts(0);
-
-    d.input_dim    = e.input_dim;
-    d.output_dim   = e.output_dim;
-    d.group_size   = e.group_size;
-    d.data_type    = e.data_type;
-    d.input_type   = e.input_type;
-    d.weight_type  = e.weight_type;
-    d.input_quant  = e.input_quant;
-    d.weight_quant = e.weight_quant;
-    d.k_desc       = e.k_desc;
-    d.q_desc       = e.q_desc;
-    d.epilogue     = e.epilogue;
-
-    d.k_desc.num = d.q_desc.num = n;
-
-    if (e.bias) {
-        d.bias = Tensor{{n, e.output_dim}, e.bias.dtype(), kDEVICE};
-    }
-
-    std::vector<std::pair<void*, int>> weights;
-    std::vector<std::pair<void*, int>> scales;
-
-    for (int i = 0; i < n; ++i) {
-        auto& e = *experts(i);
-        weights.emplace_back(e.weight.raw_data(), e.k_desc.ld);
-        if (e.scales) {
-            scales.emplace_back(e.scales.raw_data(), e.q_desc.ld);
-        }
-        if (e.bias) {
-            Copy(e.bias, d.bias.slice(i, 1).squeeze(0));
-        }
-    }
-
-    auto stream = core::Context::stream().handle();
-
-    if (d.weight_type == kFloat8_e4m3 && d.input_type == kFloat8_e4m3) {
-        auto make_blocked_ptr = [&](const auto& ptrs) {
-            return std::shared_ptr<void>{gemm::MakeBlockedPtrs(ptrs, stream), [](auto p) { cudaFree(p); }};
-        };
-        d.weight = Tensor{make_blocked_ptr(weights), {n}, e.weight.dtype(), kDEVICE};
-        d.scales = Tensor{make_blocked_ptr(scales), {n}, e.scales.dtype(), kDEVICE};
-        // This is needed to be recognized as blocked striding mode
-        d.k_desc.offsets = d.q_desc.offsets = (int*)1;
-    }
-    else {
-        auto make_strided_ptr = [&](const auto& ptrs) {
-            return std::shared_ptr<void>{gemm::MakeStridedPtrs(ptrs, stream), [](auto p) { cudaFree(p); }};
-        };
-        d.weight = Tensor{make_strided_ptr(weights), {n}, d.weight_type, kDEVICE};
-        if (e.scales) {
-            d.scales = Tensor{make_strided_ptr(scales), {n}, e.scales.dtype(), kDEVICE};
-        }
-        // pre-sm90 grouped GEMM need `ld == 0 to resolve strided_ptr
-        d.k_desc.ld = d.q_desc.ld = 0;
-    }
-}
-
-}  // namespace turbomind
diff --git a/src/turbomind/models/llama/LlamaDenseWeight.h b/src/turbomind/models/llama/LlamaDenseWeight.h
deleted file mode 100644
index 7aa8673586..0000000000
--- a/src/turbomind/models/llama/LlamaDenseWeight.h
+++ /dev/null
@@ -1,190 +0,0 @@
-/*
- * Copyright (c) OpenMMLab. All rights reserved.
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.  All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/DenseWeight.h
-
-#pragma once
-
-#include "src/turbomind/core/core.h"
-#include "src/turbomind/core/module.h"
-
-#include "src/turbomind/kernels/activation.h"
-#include "src/turbomind/kernels/gemm/types.h"
-
-#include "src/turbomind/models/llama/llama_params.h"
-
-namespace turbomind {
-
-using gemm::QuantDesc;
-using gemm::MatrixLayout;
-using gemm::Epilogue;
-
-struct LlamaDenseWeight: public core::Module {
-
-    LlamaDenseWeight():
-        data_type{}, weight_type{}, input_type{}, weight_quant{}, input_quant{}, epilogue{}, k_desc{}, q_desc{}
-    {
-    }
-
-    void emplace(int input_dim, int output_dim, DataType data_type, bool bias, DataType weight_type, int group_size);
-
-    void preprocess();
-
-    void prepare(bool fused_moe = 0);
-
-    LlamaDenseWeight& operator=(std::nullptr_t)
-    {
-        this->~LlamaDenseWeight();
-        new (this) LlamaDenseWeight{};
-        return *this;
-    }
-
-    operator bool() const noexcept
-    {
-        return static_cast<bool>(weight);
-    }
-
-    int input_dim  = 0;
-    int output_dim = 0;
-    int group_size = 1;
-
-    Tensor weight;
-    Tensor bias;
-
-    Tensor scales;
-    Tensor zeros;
-
-    DataType data_type;
-
-    DataType weight_type;
-    DataType input_type;
-
-    QuantDesc weight_quant;
-    QuantDesc input_quant;
-
-    Epilogue epilogue;
-
-    MatrixLayout k_desc;
-    MatrixLayout q_desc;
-};
-
-struct LlamaAttentionWeight: public core::Module {
-
-    LlamaAttentionWeight() = default;
-
-    LlamaAttentionWeight(int      hidden_dim,
-                         int      head_dim,
-                         int      head_num,
-                         int      kv_head_num,
-                         MLAParam mla,
-                         bool     bias,
-                         bool     qk_norm,
-                         int      tp_size,
-                         int      tp_rank,
-                         DataType data_type,
-                         DataType weight_type,
-                         int      group_size,
-                         int      window_size,
-                         bool     sink,
-                         bool     attn_output_gate = false);
-
-    void prepare();
-
-    LlamaDenseWeight qkv;
-    LlamaDenseWeight output;
-
-    Tensor sinks;
-
-    LlamaDenseWeight q_proj;
-    LlamaDenseWeight q_a_proj;
-    LlamaDenseWeight q_b_proj;
-    LlamaDenseWeight kv_a_proj;
-    // LlamaDenseWeight kv_b_proj;
-
-    Tensor q_a_layernorm;
-    Tensor kv_a_layernorm;
-
-    int window_size{};
-};
-
-struct LlamaFfnWeight: core::Module {
-
-    LlamaFfnWeight() = default;
-
-    LlamaFfnWeight(int            hidden_dim,
-                   int            inter_size,
-                   bool           bias,
-                   int            tp_size,
-                   int            tp_rank,
-                   DataType       data_type,
-                   DataType       weight_type,
-                   int            group_size,
-                   ActivationType act_type,
-                   bool           fuse_silu_act);
-
-    static constexpr bool fuse_up_and_gate = true;
-
-    void prepare(bool fused_moe);
-
-    LlamaDenseWeight gating;
-    LlamaDenseWeight intermediate;
-    LlamaDenseWeight output;
-    LlamaDenseWeight fused_gating_intermediate;
-
-    ActivationType act_type;
-
-    int  inter_size{};
-    bool is_fused_silu{};
-
-    int tp_rank{};
-};
-
-struct MoeFfnWeight: core::Module {
-
-    MoeFfnWeight() = default;
-
-    MoeFfnWeight(int             layer_id,
-                 const MoeParam& param,
-                 int             hidden_dim,
-                 bool            mlp_bias,
-                 DataType        data_type,
-                 DataType        weight_type,
-                 int             group_size,
-                 int             tp_size,
-                 int             tp_rank,
-                 ActivationType  act_type,
-                 bool            fuse_silu_act);
-
-    void prepare();
-
-    LlamaDenseWeight gate;
-    LlamaDenseWeight shared_gate;
-
-    /// Per-expert score correction bias for noaux_tc routing (optional; used when topk_method == "noaux_tc")
-    Tensor score_correction_bias;
-
-    std::vector<std::unique_ptr<LlamaFfnWeight>> experts;
-
-    // reference into `experts`
-    LlamaFfnWeight block;
-
-    MoeParam::Method method{};
-};
-
-void LinkExperts(std::function<LlamaDenseWeight*(int)> experts, int n, LlamaDenseWeight& d);
-
-}  // namespace turbomind
diff --git a/src/turbomind/models/llama/LlamaFfnLayer.cc b/src/turbomind/models/llama/LlamaFfnLayer.cc
index d9b91bf929..a989987007 100644
--- a/src/turbomind/models/llama/LlamaFfnLayer.cc
+++ b/src/turbomind/models/llama/LlamaFfnLayer.cc
@@ -39,8 +39,11 @@ void LlamaFfnLayer::forward(ForwardParam param)
     Tensor gating;
     Tensor inter;
 
-    if (mlp.fused_gating_intermediate.weight) {
-        auto mix = linear_.Forward(param.input, mlp.fused_gating_intermediate);
+    auto* fused     = mlp.w1w3.get();
+    bool  use_fused = fused && fused->weight;
+
+    if (use_fused) {
+        auto mix = linear_.Forward(param.input, *fused);
         sync_check_cuda_error();
 
         gating = mix.slice({0, 0}, {(int)token_num, inter_size});
@@ -49,16 +52,18 @@ void LlamaFfnLayer::forward(ForwardParam param)
         }
     }
     else {
-        gating = linear_.Forward(param.input, mlp.gating);
+        gating = linear_.Forward(param.input, *mlp.w1);
         sync_check_cuda_error();
         TM_DEBUG_TENSOR(gating, Concat("w1", layer_id), 3);
 
-        inter = linear_.Forward(param.input, mlp.intermediate);
+        inter = linear_.Forward(param.input, *mlp.w3);
         sync_check_cuda_error();
         TM_DEBUG_TENSOR(inter, Concat("w3", layer_id), 3);
     }
 
-    if (!mlp.is_fused_silu) {
+    // When using the fused kernel (w1w3 + fused silu), activation is already applied.
+    // Otherwise (separate w1/w3 or non-fused), apply activation explicitly.
+    if (!use_fused || !mlp.is_fused_silu) {
         // gate' = silu(gate) * up
         Activation(gating, inter, mlp.act_type, stream);
         sync_check_cuda_error();
@@ -67,7 +72,7 @@ void LlamaFfnLayer::forward(ForwardParam param)
 
     {  // w2(x)
         NvtxScope scope("w2");
-        linear_.Forward(gating, mlp.output, param.output);
+        linear_.Forward(gating, *mlp.w2, param.output);
         sync_check_cuda_error();
     }
 }
diff --git a/src/turbomind/models/llama/LlamaFfnLayer.h b/src/turbomind/models/llama/LlamaFfnLayer.h
index ea5bee7987..059ab6b010 100644
--- a/src/turbomind/models/llama/LlamaFfnLayer.h
+++ b/src/turbomind/models/llama/LlamaFfnLayer.h
@@ -20,30 +20,26 @@
 #pragma once
 
 #include "src/turbomind/core/core.h"
-#include "src/turbomind/models/llama/LlamaDenseWeight.h"
+#include "src/turbomind/models/ffn_weight.h"
 #include "src/turbomind/models/llama/LlamaLinear.h"
 #include "src/turbomind/models/llama/context.h"
-#include "src/turbomind/models/llama/llama_params.h"
 
 namespace turbomind {
 
 class LlamaFfnLayer {
 public:
-    LlamaFfnLayer(const ModelParam& model, const Context& ctx): hidden_units_(model.hidden_units), linear_(*ctx.linear)
-    {
-    }
+    LlamaFfnLayer(const Context& ctx): linear_(*ctx.linear) {}
 
     struct ForwardParam {
-        Tensor                input;
-        Tensor                output;
-        const LlamaFfnWeight* weights;
-        int                   layer_id;
+        Tensor           input;
+        Tensor           output;
+        const FfnWeight* weights;
+        int              layer_id;
     };
 
     void forward(ForwardParam param);
 
 private:
-    const size_t hidden_units_;
     LlamaLinear& linear_;
 };
 
diff --git a/src/turbomind/models/llama/LlamaLinear.cu b/src/turbomind/models/llama/LlamaLinear.cu
index 8578f0144e..b2a0386b60 100644
--- a/src/turbomind/models/llama/LlamaLinear.cu
+++ b/src/turbomind/models/llama/LlamaLinear.cu
@@ -12,7 +12,7 @@
 
 #include "src/turbomind/kernels/quantization.h"
 
-#include "src/turbomind/models/llama/LlamaDenseWeight.h"
+#include "src/turbomind/models/linear_weight.h"
 #include "src/turbomind/models/llama/LlamaLinear.h"
 
 #include "src/turbomind/utils/cuda_utils.h"
@@ -53,17 +53,17 @@ struct LlamaLinear::Impl {
         workspace_ = {};
     }
 
-    std::tuple<Tensor, MatrixLayout, Tensor, MatrixLayout> GetOperandB(const LlamaDenseWeight& dense)
+    std::tuple<Tensor, MatrixLayout, Tensor, MatrixLayout> GetOperandB(const LinearWeight& weight)
     {
-        const Tensor& B      = dense.weight;
-        const Tensor& V      = dense.scales;
-        MatrixLayout  desc_B = dense.k_desc;
-        MatrixLayout  desc_V = dense.q_desc;
+        const Tensor& B      = weight.weight;
+        const Tensor& V      = weight.scales;
+        MatrixLayout  desc_B = weight.k_desc;
+        MatrixLayout  desc_V = weight.q_desc;
         return {B, desc_B, V, desc_V};
     }
 
     std::tuple<Tensor, MatrixLayout, Tensor, MatrixLayout>
-    GetOperandA(const LlamaDenseWeight& dense, const Tensor& input, Buffer_<int> indices, const Buffer_<int>& offsets)
+    GetOperandA(const LinearWeight& weight, const Tensor& input, Buffer_<int> indices, const Buffer_<int>& offsets)
     {
         auto st = core::Context::stream().handle();
 
@@ -73,7 +73,7 @@ struct LlamaLinear::Impl {
         const int m = indices ? indices.size() : input.shape(0);
 
         // Currently, FP8 only; INT8 may be added later
-        if (input.dtype() != dense.input_type) {
+        if (input.dtype() != weight.input_dtype()) {
             QuantizeSymm(A, U, input, st);
             sync_check_cuda_error();
         }
@@ -101,7 +101,7 @@ struct LlamaLinear::Impl {
             desc_U = {U.dtype(), kColMajor, (int)U.shape(1), (int)U.shape(0), (int)U.stride(0)};
         }
         if (offsets) {
-            desc_A.num = desc_U.num = dense.k_desc.num;
+            desc_A.num = desc_U.num = weight.k_desc.num;
             desc_A.offsets = desc_U.offsets = const_cast<int*>(offsets.data());
         }
         if (indices) {
@@ -111,28 +111,28 @@ struct LlamaLinear::Impl {
         return {A, desc_A, U, desc_U};
     }
 
-    void Forward(Tensor&                 output,
-                 const Tensor&           input,  //
-                 const LlamaDenseWeight& dense,
-                 const Buffer_<int>&     indices,
-                 const Buffer_<int>&     offsets)
+    void Forward(Tensor&             output,
+                 const Tensor&       input,  //
+                 const LinearWeight& weight,
+                 const Buffer_<int>& indices,
+                 const Buffer_<int>& offsets)
     {
         using namespace gemm;
 
         Operation op{};
         op.dispatch  = dispatch_policy_;
-        op.epilogue  = dense.epilogue;
-        op.quant_a   = dense.input_quant;
-        op.quant_b   = dense.weight_quant;
+        op.epilogue  = weight.epilogue;
+        op.quant_a   = MakeQuantDesc(weight.input_format);
+        op.quant_b   = MakeQuantDesc(weight.weight_format);
         op.batch_dim = 0;
 
-        auto&& [A, desc_A, U, desc_U] = GetOperandA(dense, input, indices, offsets);
-        auto&& [B, desc_B, V, desc_V] = GetOperandB(dense);
+        auto&& [A, desc_A, U, desc_U] = GetOperandA(weight, input, indices, offsets);
+        auto&& [B, desc_B, V, desc_V] = GetOperandB(weight);
 
         Tensor& D = output;
         if (!D) {
-            int dim = dense.epilogue == Epilogue::kGatedSilu ? dense.output_dim / 2 : dense.output_dim;
-            D       = Tensor{{desc_A.rows, dim}, dense.data_type, kDEVICE};
+            int dim = weight.epilogue == Epilogue::kGatedSilu ? weight.output_dim / 2 : weight.output_dim;
+            D       = Tensor{{desc_A.rows, dim}, weight.output_dtype(), kDEVICE};
         }
 
         // std::cout << "D: " << D << " " << desc_B.num << "\n";
@@ -141,7 +141,7 @@ struct LlamaLinear::Impl {
             output.dtype(),
             kRowMajor,
             (int)output.shape(0),
-            dense.output_dim,
+            weight.output_dim,
             (int)output.stride(0),
         };
 
@@ -181,18 +181,18 @@ struct LlamaLinear::Impl {
 
 LlamaLinear::LlamaLinear(): impl_{std::make_shared<Impl>()} {}
 
-Tensor LlamaLinear::Forward(const Tensor&           input,  //
-                            const LlamaDenseWeight& weight,
-                            std::optional<Tensor>   output)
+Tensor LlamaLinear::Forward(const Tensor&         input,  //
+                            const LinearWeight&   weight,
+                            std::optional<Tensor> output)
 {
     return Forward(input, weight, {}, {}, output);
 }
 
-Tensor LlamaLinear::Forward(const Tensor&           input,  //
-                            const LlamaDenseWeight& weight,
-                            const Buffer_<int>&     indices,
-                            const Buffer_<int>&     offsets,
-                            std::optional<Tensor>   output)
+Tensor LlamaLinear::Forward(const Tensor&         input,  //
+                            const LinearWeight&   weight,
+                            const Buffer_<int>&   indices,
+                            const Buffer_<int>&   offsets,
+                            std::optional<Tensor> output)
 {
     Tensor in = input.view({-1, input.shape(-1)});
     Tensor out;
diff --git a/src/turbomind/models/llama/LlamaLinear.h b/src/turbomind/models/llama/LlamaLinear.h
index 8c4037b48e..d2c9354204 100644
--- a/src/turbomind/models/llama/LlamaLinear.h
+++ b/src/turbomind/models/llama/LlamaLinear.h
@@ -6,7 +6,7 @@
 #include <ostream>
 
 #include "src/turbomind/core/core.h"
-#include "src/turbomind/models/llama/LlamaDenseWeight.h"
+#include "src/turbomind/models/linear_weight.h"
 
 namespace turbomind {
 
@@ -14,15 +14,15 @@ class LlamaLinear {
 public:
     explicit LlamaLinear();
 
-    Tensor Forward(const Tensor&           input,  //
-                   const LlamaDenseWeight& weight,
-                   std::optional<Tensor>   output = {});
+    Tensor Forward(const Tensor&         input,  //
+                   const LinearWeight&   weight,
+                   std::optional<Tensor> output = {});
 
-    Tensor Forward(const Tensor&           input,
-                   const LlamaDenseWeight& weight,
-                   const Buffer_<int>&     indices,
-                   const Buffer_<int>&     offsets,
-                   std::optional<Tensor>   output = {});
+    Tensor Forward(const Tensor&         input,
+                   const LinearWeight&   weight,
+                   const Buffer_<int>&   indices,
+                   const Buffer_<int>&   offsets,
+                   std::optional<Tensor> output = {});
 
     void set_measure(bool measure);
 
diff --git a/src/turbomind/models/llama/LlamaWeight.cc b/src/turbomind/models/llama/LlamaWeight.cc
deleted file mode 100644
index 26ba9ca198..0000000000
--- a/src/turbomind/models/llama/LlamaWeight.cc
+++ /dev/null
@@ -1,187 +0,0 @@
-/*
- * Copyright (c) OpenMMLab. All rights reserved.
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.  All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Modified from
-// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptWeight.cc
-
-#include <cuda_runtime.h>
-
-#include "src/turbomind/core/allocator.h"
-#include "src/turbomind/core/context.h"
-#include "src/turbomind/models/llama/LlamaDenseWeight.h"
-#include "src/turbomind/models/llama/LlamaWeight.h"
-#include "src/turbomind/models/llama/llama_params.h"
-#include "src/turbomind/utils/cuda_utils.h"
-
-namespace turbomind {
-
-LlamaWeight::LlamaWeight(DataType           data_type,
-                         const ModelParam&  model,
-                         const EngineParam& engine_param,
-                         const MoeParam&    moe_param):
-    model_param_{model},
-    engine_param_{engine_param},
-    moe_param_{moe_param},
-    hidden_units_(model.hidden_units),
-    inter_size_(model.inter_size),
-    vocab_size_(model.vocab_size),
-    vocab_size_padded_(model.vocab_size),
-    embedding_size_(model.embedding_size),
-    num_layer_(model.layer_num),
-    data_type_{data_type},
-    weight_type_{model.weight_type},
-    tp_size_(engine_param.attn_tp_size * engine_param.attn_cp_size),
-    tp_rank_(engine_param.attn_tp_rank * engine_param.attn_cp_size + engine_param.attn_cp_rank)
-{
-    if (vocab_size_padded_ % tp_size_ != 0) {
-        vocab_size_padded_ = (vocab_size_ + tp_size_ - 1) / tp_size_ * tp_size_;
-        TM_LOG_WARN("pad vocab size from {} to {}", vocab_size_, vocab_size_padded_);
-    }
-    if (embedding_size_ % tp_size_ != 0) {
-        embedding_size_ = (embedding_size_ + tp_size_ - 1) / tp_size_ * tp_size_;
-        TM_LOG_WARN("pad embed size from {} to {}", embedding_size_, embedding_size_);
-    }
-    FT_CHECK(hidden_units_ % tp_size_ == 0);
-    TM_CHECK_EQ(vocab_size_padded_ % tp_size_, 0);
-    TM_CHECK_EQ(hidden_units_ % tp_size_, 0);
-
-    stream_ = core::Stream::create();
-    alloca_ = core::Allocator{stream_, false};
-
-    initialize();
-}
-
-LlamaWeight::~LlamaWeight()
-{
-    release();
-}
-
-bool LlamaWeight::is_initialized() const
-{
-    return initialized_;
-}
-
-void LlamaWeight::initialize()
-{
-    core::ContextGuard guard = context();
-
-    pre_decoder_embedding.emplace(embedding_size_, hidden_units_ / tp_size_, data_type_, false, data_type_, 1);
-    post_decoder_embedding.emplace(hidden_units_, vocab_size_padded_ / tp_size_, data_type_, false, data_type_, 1);
-    register_module("tok_embeddings", pre_decoder_embedding, tp_rank_);
-    register_module("output", post_decoder_embedding, tp_rank_);
-
-    /// Lower VRAM pressure on consumer grade GPUs
-    /// TODO: Support token embeds on pinned host memory
-    pre_decoder_embedding.weight  = empty_like(pre_decoder_embedding.weight, kCPU);
-    post_decoder_embedding.weight = empty_like(post_decoder_embedding.weight, kCPU);
-
-    decoder_layer_weights.reserve(num_layer_);
-    for (int i = 0; i < num_layer_; ++i) {
-        decoder_layer_weights.emplace_back(
-            new LlamaDecoderLayerWeight(data_type_, i, model_param_, engine_param_, moe_param_));
-        register_module("layers", *decoder_layer_weights.back(), i);
-    }
-
-    output_norm_weight = Tensor{{hidden_units_}, data_type_, kDEVICE};
-    register_parameter("norm.weight", output_norm_weight);
-    initialized_ = true;
-}
-
-void LlamaWeight::release()
-{
-    core::ContextGuard guard = context();
-
-    pre_decoder_embedding  = {};
-    post_decoder_embedding = {};
-    output_norm_weight     = {};
-
-    for (auto& p : decoder_layer_weights) {
-        delete p;
-    }
-
-    decoder_layer_weights.clear();
-    pinned_weights_.clear();
-
-    // Wait for deallocations
-    core::Context::stream().Sync();
-
-    // release memory back to os
-    core::Context::device_alloc()->trim(0);
-    initialized_ = false;
-}
-
-void LlamaWeight::to_device(const core::Device& device)
-{
-    TM_CHECK(device.type == kCPU || device.type == kDEVICE);
-    core::ContextGuard guard{stream_, alloca_, Allocator{kCPUpinned}};
-
-    auto tensor_ptr_map = get_parameters();
-    for (auto& [name, tensor_ptr] : tensor_ptr_map) {
-        if (device.type == kCPU) {
-            if (pinned_weights_.find(name) == pinned_weights_.end()) {
-                pinned_weights_[name] = empty_like(*tensor_ptr, kCPUpinned);
-                Copy(*tensor_ptr, pinned_weights_[name]);
-            }
-            *tensor_ptr = {};
-        }
-        else {
-            TM_CHECK(pinned_weights_.find(name) != pinned_weights_.end());
-            *tensor_ptr = empty_like(pinned_weights_[name], kDEVICE);
-            Copy(pinned_weights_[name], *tensor_ptr);
-        }
-    }
-    core::Context::stream().Sync();
-    if (device.type == kCPU) {
-        core::Context::device_alloc()->trim(0);
-    }
-}
-
-core::ContextGuard LlamaWeight::context() const
-{
-    return core::ContextGuard{stream_, alloca_};
-}
-
-void LlamaWeight::prepare(const cudaDeviceProp& prop)
-{
-    core::ContextGuard guard = context();
-
-    // Wait for the weights to be filled externally
-    check_cuda_error(cudaDeviceSynchronize());
-
-    auto stream = core::Context::stream().handle();
-
-    for (auto& layer : decoder_layer_weights) {
-        layer->prepare(prop, stream);
-    }
-
-    auto to_device = [](Tensor& x) {
-        auto tmp = std::exchange(x, empty_like(x, kDEVICE));
-        Copy(tmp, x);
-        return tmp;
-    };
-
-    // Keep the host tensor until stream synchronization
-    auto tmp_token_embeds = to_device(pre_decoder_embedding.weight);
-    auto tmp_lm_head      = to_device(post_decoder_embedding.weight);
-
-    post_decoder_embedding.prepare();
-
-    // Block until processing is done
-    check_cuda_error(cudaStreamSynchronize(stream));
-}
-
-}  // namespace turbomind
diff --git a/src/turbomind/models/llama/LlamaWeight.h b/src/turbomind/models/llama/LlamaWeight.h
deleted file mode 100644
index 5b018ab4ab..0000000000
--- a/src/turbomind/models/llama/LlamaWeight.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) OpenMMLab. All rights reserved.
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.  All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Modified from
-// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptWeight.h
-
-#pragma once
-
-#include <unordered_map>
-
-#include "src/turbomind/core/context.h"
-#include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h"
-#include "src/turbomind/models/llama/LlamaDenseWeight.h"
-#include "src/turbomind/models/llama/llama_params.h"
-
-namespace turbomind {
-
-struct LlamaWeight: core::Module {
-    LlamaWeight() = default;
-
-    LlamaWeight(DataType           data_type,
-                const ModelParam&  model_param,
-                const EngineParam& engine_param,
-                const MoeParam&    moe_param);
-
-    ~LlamaWeight();
-
-    LlamaWeight(const LlamaWeight&) = delete;
-    LlamaWeight& operator=(const LlamaWeight&) = delete;
-
-    void prepare(const cudaDeviceProp& prop);
-
-    bool is_initialized() const;
-
-    void initialize();
-
-    void release();
-
-    void to_device(const core::Device& device);
-
-    core::ContextGuard context() const;
-
-    std::vector<LlamaDecoderLayerWeight*> decoder_layer_weights;
-
-    LlamaDenseWeight pre_decoder_embedding;
-    LlamaDenseWeight post_decoder_embedding;
-
-    Tensor output_norm_weight;
-
-private:
-    const ModelParam  model_param_;
-    const EngineParam engine_param_;
-    const MoeParam    moe_param_;
-
-    int hidden_units_;
-    int vocab_size_;
-    int vocab_size_padded_;
-    int embedding_size_;
-    int num_layer_;
-
-    DataType data_type_;
-    DataType weight_type_;
-
-    std::unordered_map<std::string, Tensor> pinned_weights_;
-
-    int tp_size_;  // this will follow attn tp param
-    int tp_rank_;
-
-    std::vector<int> inter_size_;
-
-    core::Stream    stream_;
-    core::Allocator alloca_;
-    bool            initialized_{false};
-};
-
-}  // namespace turbomind
diff --git a/src/turbomind/models/llama/SequenceManager.cc b/src/turbomind/models/llama/SequenceManager.cc
index ea2817727d..09d516df49 100644
--- a/src/turbomind/models/llama/SequenceManager.cc
+++ b/src/turbomind/models/llama/SequenceManager.cc
@@ -31,26 +31,36 @@ std::string vector2string(const std::vector<T>& data)
     return ss.str();
 }
 
-SequenceManager::SequenceManager(const ModelParam& model_param,
-                                 DataType          runtime_dtype,
-                                 int               cache_block_seq_len,
-                                 int               attn_tp_size,
-                                 int               max_batch_size,
-                                 double            block_count,
-                                 int               chunk_size,
-                                 bool              enable_prefix_caching,
-                                 int               rank,
-                                 int               attn_cp_size,
-                                 core::Allocator   allocator,
-                                 GetFreeMemSize    get_free_size):
+SequenceManager::SequenceManager(int                     head_dim,
+                                 int                     kv_head_num,
+                                 int                     num_layer,
+                                 const std::vector<int>& layer_types,
+                                 int                     quant_policy,
+                                 DataType                data_type,
+                                 DataType                runtime_dtype,
+                                 int                     linear_key_head_dim,
+                                 int                     linear_value_head_dim,
+                                 int                     linear_conv_kernel_dim,
+                                 int                     linear_num_key_heads,
+                                 int                     linear_num_value_heads,
+                                 int                     cache_block_seq_len,
+                                 int                     attn_tp_size,
+                                 int                     max_batch_size,
+                                 double                  block_count,
+                                 int                     chunk_size,
+                                 bool                    enable_prefix_caching,
+                                 int                     rank,
+                                 int                     attn_cp_size,
+                                 core::Allocator         allocator,
+                                 GetFreeMemSize          get_free_size):
     block_seq_len_(cache_block_seq_len), rank_(rank), attn_cp_size_(attn_cp_size)
 {
     TM_CHECK_GT(attn_tp_size, 0);
     TM_CHECK_GT(cache_block_seq_len, 0);
 
-    int cache_layer_num   = model_param.layer_num;
+    int cache_layer_num   = num_layer;
     int num_linear_layers = 0;
-    for (const auto& type : model_param.layer_types) {
+    for (const auto& type : layer_types) {
         if (type == 1) {
             --cache_layer_num;
             ++num_linear_layers;
@@ -61,22 +71,19 @@ SequenceManager::SequenceManager(const ModelParam& model_param,
 
     if (num_linear_layers > 0) {
 
-        const int key_head_dim =
-            model_param.linear_key_head_dim > 0 ? model_param.linear_key_head_dim : model_param.head_dim;
-        const int value_head_dim =
-            model_param.linear_value_head_dim > 0 ? model_param.linear_value_head_dim : model_param.head_dim;
-        const int d_conv      = model_param.linear_conv_kernel_dim > 0 ? model_param.linear_conv_kernel_dim : 4;
-        const int num_k_heads = model_param.linear_num_key_heads / attn_tp_size;
-        const int num_v_heads = model_param.linear_num_value_heads / attn_tp_size;
-        const int key_dim     = num_k_heads * key_head_dim;
-        const int value_dim   = num_v_heads * value_head_dim;
-        const int conv_dim    = key_dim * 2 + value_dim;
+        const int key_head_dim   = linear_key_head_dim > 0 ? linear_key_head_dim : head_dim;
+        const int value_head_dim = linear_value_head_dim > 0 ? linear_value_head_dim : head_dim;
+        const int d_conv         = linear_conv_kernel_dim > 0 ? linear_conv_kernel_dim : 4;
+        const int num_k_heads    = linear_num_key_heads / attn_tp_size;
+        const int num_v_heads    = linear_num_value_heads / attn_tp_size;
+        const int key_dim        = num_k_heads * key_head_dim;
+        const int value_dim      = num_v_heads * value_head_dim;
+        const int conv_dim       = key_dim * 2 + value_dim;
 
         TM_CHECK_GT(max_batch_size, 0);
-        pooled_conv_states_ = {{max_batch_size, num_linear_layers, d_conv, conv_dim}, model_param.data_type, kDEVICE};
-        pooled_recurrent_states_ = {{max_batch_size, num_linear_layers, num_v_heads, key_head_dim, value_head_dim},
-                                    model_param.linear_state_dtype,
-                                    kDEVICE};
+        pooled_conv_states_      = {{max_batch_size, num_linear_layers, d_conv, conv_dim}, data_type, kDEVICE};
+        pooled_recurrent_states_ = {
+            {max_batch_size, num_linear_layers, num_v_heads, key_head_dim, value_head_dim}, data_type, kDEVICE};
 
         free_linear_state_slots_.reserve(max_batch_size);
         for (int slot = max_batch_size - 1; slot >= 0; --slot) {
@@ -94,17 +101,16 @@ SequenceManager::SequenceManager(const ModelParam& model_param,
                     (pooled_conv_states_.byte_size() + pooled_recurrent_states_.byte_size()) * mb);
     }
 
-    const int  dbits        = byte_size(runtime_dtype, 8);
-    const auto quant_policy = model_param.quant_policy;
-    const int  elem_bits    = quant_policy ? quant_policy : dbits;
+    const int dbits     = byte_size(runtime_dtype, 8);
+    const int elem_bits = quant_policy ? quant_policy : dbits;
 
     BlockConfig block_config{
-        (int)model_param.head_dim,
-        (int)model_param.kv_head_num / attn_tp_size,
+        head_dim,
+        kv_head_num,
         cache_block_seq_len,
         elem_bits == dbits ? 0 : dbits,
         elem_bits,
-        model_param.head_dim == 576,  // share kv
+        head_dim == 576,  // share kv
     };
 
     block::Layout layout{block_config};
diff --git a/src/turbomind/models/llama/SequenceManager.h b/src/turbomind/models/llama/SequenceManager.h
index fff2706379..1b728cd825 100644
--- a/src/turbomind/models/llama/SequenceManager.h
+++ b/src/turbomind/models/llama/SequenceManager.h
@@ -11,7 +11,6 @@
 
 #include "src/turbomind/models/llama/BlockManager.h"
 #include "src/turbomind/models/llama/BlockTrie.h"
-#include "src/turbomind/models/llama/llama_params.h"
 
 namespace turbomind {
 
@@ -90,18 +89,28 @@ class SequenceManager {
     };
     // clang-format on
 
-    explicit SequenceManager(const ModelParam& model_param,
-                             DataType          runtime_dtype,
-                             int               cache_block_seq_len,
-                             int               attn_tp_size,
-                             int               max_batch_size,
-                             double            block_count,
-                             int               chunk_size,
-                             bool              enable_prefix_caching,
-                             int               rank,
-                             int               attn_cp_size,
-                             core::Allocator   allocator,
-                             GetFreeMemSize    get_free_size);
+    explicit SequenceManager(int                     head_dim,
+                             int                     kv_head_num,
+                             int                     num_layer,
+                             const std::vector<int>& layer_types,
+                             int                     quant_policy,
+                             DataType                data_type,
+                             DataType                runtime_dtype,
+                             int                     linear_key_head_dim,
+                             int                     linear_value_head_dim,
+                             int                     linear_conv_kernel_dim,
+                             int                     linear_num_key_heads,
+                             int                     linear_num_value_heads,
+                             int                     cache_block_seq_len,
+                             int                     attn_tp_size,
+                             int                     max_batch_size,
+                             double                  block_count,
+                             int                     chunk_size,
+                             bool                    enable_prefix_caching,
+                             int                     rank,
+                             int                     attn_cp_size,
+                             core::Allocator         allocator,
+                             GetFreeMemSize          get_free_size);
 
     SequenceManager(const SequenceManager&)     = delete;
     SequenceManager(SequenceManager&&) noexcept = default;
diff --git a/src/turbomind/models/llama/llama_params.h b/src/turbomind/models/llama/llama_params.h
index b61e5b1fe8..4ce0a586fa 100644
--- a/src/turbomind/models/llama/llama_params.h
+++ b/src/turbomind/models/llama/llama_params.h
@@ -3,161 +3,22 @@
 #pragma once
 
 #include <cstddef>
-#include <map>
-#include <regex>
-#include <set>
-#include <string>
 
-#include "src/turbomind/core/data_type.h"
-#include "src/turbomind/kernels/activation.h"
-#include "src/turbomind/models/llama/llama_rope.h"
+#include "src/turbomind/engine/engine_config.h"
 
 namespace turbomind {
 
-struct MLAParam {
-    int q_lora_rank;
-    int kv_lora_rank;
-    int qk_rope_dim;
-    int v_head_dim;
-};
-
-struct ModelParam {
-    size_t   head_num;
-    size_t   head_dim;
-    size_t   kv_head_num;
-    size_t   hidden_units;
-    size_t   layer_num;
-    size_t   vocab_size;
-    size_t   embedding_size;
-    float    norm_eps;
-    int      quant_policy;
-    bool     attn_bias;
-    bool     attn_sink;
-    bool     mlp_bias;
-    DataType data_type;
-
-    // Weight types for mixed quantization support.
-    // Models like mixed AWQ (e.g. QuantTrio GLM-4.7-Flash) quantize FFN/expert
-    // weights to int4 but keep attention weights as fp16. GptOss mxfp4 quantizes
-    // only MoE experts to e2m1 while keeping attention and shared experts as fp16.
-    //
-    //                  weight_type   ffn_weight_type   expert_weight_type
-    //  Pure fp16       float16       float16           float16
-    //  Full AWQ        int4          int4              int4
-    //  Mixed AWQ       float16       int4              int4
-    //  GptOss mxfp4    bfloat16      bfloat16          e2m1
-    DataType weight_type;         // attention weights
-    DataType expert_weight_type;  // MoE routed expert weights
-    DataType ffn_weight_type;     // dense FFN / shared expert weights
-
-    int      group_size;
-    MLAParam mla;
-    bool     qk_norm;
-    int      tune_layer_num;
-
-    ActivationType act_type;
-
-    std::vector<int> window_size;
-    std::vector<int> inter_size;
-    std::vector<int> layer_types;
-
-    // Qwen3.5 Gated DeltaNet linear attention params
-    int linear_key_head_dim    = 0;
-    int linear_value_head_dim  = 0;
-    int linear_conv_kernel_dim = 0;
-    int linear_num_key_heads   = 0;
-    int linear_num_value_heads = 0;
-
-    DataType linear_state_dtype = {};
-
-    bool attn_output_gate = false;  // Qwen3.5: doubles Q projection in full-attention layers
-
-    // Layer indices whose MoE experts use data_type (fp16) instead of
-    // expert_weight_type (e.g. int4).  Populated from modules_to_not_convert
-    // patterns like 'model.layers.0.'.
-    std::set<int> unquantized_expert_layers;
-};
-
-inline bool HasLinearAttention(const ModelParam& model_param)
-{
-    for (int type : model_param.layer_types) {
-        if (type == 1) {
-            return true;
-        }
-    }
-    return false;
-}
-
-/// TODO: rename all `gate` in the context of MoE router to `router`
-struct MoeParam {
-    enum Method
-    {
-        kNaive,
-        kFused
-    } method;
-
-    int   experts_per_token;
-    int   inter_size;
-    bool  norm_topk_prob;
-    bool  shared_gate;
-    float routed_scale;
-
-    bool router_bias;
-
-    int         topk_group;
-    std::string topk_method;
-    int         n_group;
-    std::string scoring_func;
-    int         router_n_groups;
-
-    std::vector<int> expert_num;
-};
-
-struct AttentionParam {
-    float softmax_scale;
-    int   cache_block_seq_len;
-    // logn attention
-    bool use_logn_attn;
-    int  max_position_embeddings;
-    // rotary embedding
-    RopeParam rope;
-};
-
-struct EngineParam {
-    // batch params
-    int max_batch_size;
-    int session_len;
-    int step_length;
-
-    // cache params
-    float cache_max_block_count;
-    int   cache_chunk_size;
-    bool  enable_prefix_caching;
-    bool  enable_metrics;
-
-    // chunking params
-    int max_forward_token_num;
-    int max_context_token_num;
-    int num_tokens_per_iter;
-    int max_prefill_iters;
-
-    // parallel params
-    int outer_dp_size;
-    int outer_dp_rank;
-    int attn_dp_size;
-    int attn_dp_rank;
-    int attn_tp_size;
-    int attn_tp_rank;
-    int attn_cp_size;
-    int attn_cp_rank;
-    int mlp_tp_size;
-    int mlp_tp_rank;
-
-    // multi-node
-    int nnodes;
-    int node_rank;
-
-    std::vector<int> devices;
+struct EngineParam: EngineConfig {
+    // Runtime-derived fields (set in CreateContext)
+    int outer_dp_rank = 0;
+    int attn_dp_rank  = 0;
+    int attn_tp_rank  = 0;
+    int attn_cp_rank  = 0;
+    int mlp_tp_rank   = 0;
+    int model_tp_rank = 0;  // rank(d_tp_group), in [0, attn_tp_size × attn_cp_size)
+
+    // Derived field (set in Impl ctor)
+    int max_forward_token_num = 0;
 };
 
 }  // namespace turbomind
diff --git a/src/turbomind/models/llama/llama_rope.h b/src/turbomind/models/llama/llama_rope.h
index 83ffeeff5c..83732cb1d3 100644
--- a/src/turbomind/models/llama/llama_rope.h
+++ b/src/turbomind/models/llama/llama_rope.h
@@ -2,10 +2,6 @@
 
 #pragma once
 
-#include <cmath>
-#include <map>
-#include <string>
-
 #include <cuda_runtime.h>
 
 namespace turbomind {
@@ -21,48 +17,6 @@ enum class RopeType
     kMrope,
 };
 
-inline RopeType GetRoPEType(const std::string& type)
-{
-    std::map<std::string, RopeType> lookup = {{"default", RopeType::kDefault},
-                                              {"linear", RopeType::kLinear},
-                                              {"dynamic", RopeType::kDynamic},
-                                              {"yarn", RopeType::kYarn},
-                                              {"llama3", RopeType::kLlama3},
-                                              {"mrope", RopeType::kMrope}};
-    return lookup.at(type);
-}
-
-struct YarnRopeParam {
-    float attention_factor;
-    float beta_fast;
-    float beta_slow;
-};
-
-struct Llama3RopeParam {
-    float low_freq_factor;
-    float high_freq_factor;
-    int   original_max_position_embeddings;
-};
-
-struct MropeRopeParam {
-    int3 section;
-};
-
-struct RopeParam {
-    RopeType type;
-    // common
-    float base;
-    int   dim;
-    float factor;
-    int   max_position_embeddings;
-    // unique
-    union {
-        YarnRopeParam   yarn;
-        Llama3RopeParam llama3;
-        MropeRopeParam  mrope;
-    };
-};
-
 struct YarnRopeKernelParam {
     float scale_factor;
     float attention_factor;
@@ -98,62 +52,4 @@ struct RopeKernelParam {
     MropeRopeKernelParam  mrope;
 };
 
-inline void init_rope_kernel_param(const RopeParam& rope, RopeKernelParam& rope_kernel)
-{
-    rope_kernel.type         = rope.type;
-    rope_kernel.dim          = rope.dim;
-    rope_kernel.scale_factor = -std::log2(rope.base) / rope.dim;
-    if (rope.type == RopeType::kDynamic) {
-        rope_kernel.inv_factor = 1.f;
-    }
-    else {
-        rope_kernel.inv_factor = (rope.factor != 0.f) ? 1.0 / rope.factor : 1.f;
-    }
-
-    if (rope.type == RopeType::kYarn) {
-        auto&        src = rope.yarn;
-        auto&        dst = rope_kernel.yarn;
-        const double PI  = 3.14159265358979323846;
-
-        auto find_correction_dim = [&](float num_rotations) {
-            return (rope.dim * std::log(rope.max_position_embeddings / (num_rotations * 2 * PI)))
-                   / (2 * std::log(rope.base));
-        };
-
-        auto find_correction_range = [&](float low_rot, float high_rot, float& low, float& high) {
-            low  = std::floor(find_correction_dim(low_rot));
-            high = std::ceil(find_correction_dim(high_rot));
-            low  = std::max(low, 0.f);
-            high = std::min(high, rope.dim - 1.f);
-        };
-
-        float low, high;
-        find_correction_range(src.beta_fast, src.beta_slow, low, high);
-        // https://github.com/huggingface/transformers/blob/6c3f168b36882f0beebaa9121eafa1928ba29633/src/transformers/modeling_rope_utils.py#L216
-        if (low == high) {
-            high += 0.001f;
-        }
-        dst.ramp_inv_factor_div_2   = 1.0 / (high - low) / 2.0;
-        dst.ramp_inv_factor_mul_min = 1.0 / (high - low) * low;
-        dst.attention_factor        = src.attention_factor;
-    }
-    else if (rope.type == RopeType::kLlama3) {
-        auto& src = rope.llama3;
-        auto& dst = rope_kernel.llama3;
-
-        const double PI                   = 3.14159265358979323846;
-        float        inv_diff_freq_factor = 1.0 / (src.high_freq_factor - src.low_freq_factor);
-        dst.alpha                         = src.original_max_position_embeddings / (2 * PI) * inv_diff_freq_factor;
-        dst.beta                          = src.low_freq_factor * inv_diff_freq_factor;
-    }
-
-    else if (rope.type == RopeType::kMrope) {
-        auto& src     = rope.mrope;
-        auto& dst     = rope_kernel.mrope;
-        dst.section.x = src.section.x * 2;
-        dst.section.y = src.section.y * 2 + dst.section.x;
-        dst.section.z = src.section.z * 2 + dst.section.y;
-    }
-}
-
 }  // namespace turbomind
diff --git a/src/turbomind/models/llama/moe_ffn_layer.cc b/src/turbomind/models/llama/moe_ffn_layer.cc
index f1a16f5a68..d44c1c4932 100644
--- a/src/turbomind/models/llama/moe_ffn_layer.cc
+++ b/src/turbomind/models/llama/moe_ffn_layer.cc
@@ -6,11 +6,10 @@
 #include "src/turbomind/kernels/activation.h"
 #include "src/turbomind/kernels/norm/rms_norm.h"
 
-#include "src/turbomind/models/llama/LlamaDenseWeight.h"
 #include "src/turbomind/models/llama/LlamaLinear.h"
-#include "src/turbomind/models/llama/llama_params.h"
 #include "src/turbomind/models/llama/llama_utils.h"
 #include "src/turbomind/models/llama/moe_ffn_layer.h"
+#include "src/turbomind/models/moe_weight.h"
 
 #include "src/turbomind/utils/anomaly_handler.h"
 #include "src/turbomind/utils/cuda_utils.h"
@@ -19,54 +18,40 @@
 
 namespace turbomind {
 
-MoeFfnLayer::MoeFfnLayer(const ModelParam& model, const MoeParam& param, const EngineParam& engine, const Context& ctx):
-    inter_size_(param.inter_size / engine.mlp_tp_size),
-    hidden_dim_(model.hidden_units),
+MoeFfnLayer::MoeFfnLayer(const EngineParam& engine, const Context& ctx):
     tp_size_(engine.mlp_tp_size),
-    param_(param),
-    is_warm_up_{*ctx.is_warm_up},
-    linear_(*ctx.linear)
+    max_token_num_(engine.max_forward_token_num * engine.attn_dp_size),
+    is_warm_up_(*ctx.is_warm_up),
+    linear_(*ctx.linear),
+    expert_ffn_(std::make_unique<LlamaFfnLayer>(ctx))
 {
-    TM_CHECK(!param.expert_num.empty());
+}
 
-    const int max_expert_num = *std::max_element(param.expert_num.begin(), param.expert_num.end());
+void MoeFfnLayer::Init(ForwardParam& p)
+{
+    const int expert_num        = p.weights->num_experts();
+    const int experts_per_token = p.weights->experts_per_token;
 
-    if (param_.method == MoeParam::kFused) {
-        // pass
-    }
-    else {
-        expert_ffn_ = std::make_unique<LlamaFfnLayer>(model, ctx);
-    }
+    h_offsets_ = {expert_num + 1, kCPU};
+
+    const int pad_token_num = (max_token_num_ + kMoeGateVecSize - 1) / kMoeGateVecSize * kMoeGateVecSize;
 
-    h_offsets_ = {max_expert_num + 1, kCPUpinned};
-
-    const int max_token_num = engine.max_forward_token_num * engine.attn_dp_size;
-    const int pad_token_num = (max_token_num + kMoeGateVecSize - 1) / kMoeGateVecSize * kMoeGateVecSize;
-
-    // dbg(inter_size_,
-    //     hidden_dim_,
-    //     tp_size_,
-    //     param_.method,
-    //     param.expert_num,
-    //     max_expert_num,
-    //     max_token_num,
-    //     pad_token_num,
-    //     param_.experts_per_token);
-
-    masks_   = {max_expert_num * pad_token_num, kDEVICE};
-    f2n_     = {param_.experts_per_token * max_token_num, kDEVICE};
-    f2E_     = {param_.experts_per_token * max_token_num, kDEVICE};
-    en2f_    = {param_.experts_per_token * max_token_num, kDEVICE};
-    scales_  = {param_.experts_per_token * max_token_num, kDEVICE};
-    offsets_ = {max_expert_num + 1, kDEVICE};
-    accum_   = {max_expert_num * kMoeGateMaxTiles, kDEVICE};
+    masks_   = {expert_num * pad_token_num, kDEVICE};
+    f2n_     = {experts_per_token * max_token_num_, kDEVICE};
+    f2E_     = {experts_per_token * max_token_num_, kDEVICE};
+    en2f_    = {experts_per_token * max_token_num_, kDEVICE};
+    scales_  = {experts_per_token * max_token_num_, kDEVICE};
+    offsets_ = {expert_num + 1, kDEVICE};
+    accum_   = {expert_num * kMoeGateMaxTiles, kDEVICE};
+
+    initialized_ = true;
 }
 
-Tensor_<float> MoeFfnLayer::Gate(const Tensor& input, const LlamaDenseWeight& gate)
+Tensor_<float> MoeFfnLayer::Gate(const Tensor& input, const LinearWeight& gate)
 {
-    auto& weight = gate.weight;
-    TM_CHECK_EQ(input.shape(1), weight.shape(0));
-    Tensor_<float> logits{{input.shape(0), weight.shape(1)}, kDEVICE};
+    auto& w = gate.weight;
+    TM_CHECK_EQ(input.shape(1), w.shape(0));
+    Tensor_<float> logits{{input.shape(0), w.shape(1)}, kDEVICE};
     linear_.Forward(input, gate, logits);
     sync_check_cuda_error();
     ApplyBias(logits, gate.bias, core::Context::stream().handle());
@@ -76,28 +61,37 @@ Tensor_<float> MoeFfnLayer::Gate(const Tensor& input, const LlamaDenseWeight& ga
 
 void MoeFfnLayer::Forward(ForwardParam& p)
 {
+    if (!initialized_) {
+        Init(p);
+    }
+
     const int   tokens = p.input.shape(0);
     const auto& moe    = *p.weights;
 
+    const auto& block = *TM_CHECK_NOTNULL(moe.block());
+
+    const int hidden_dim = block.hidden_dim;
+    const int inter_size = block.inter_size;
+
     const size_t padded     = (tokens + kMoeGateVecSize - 1) / kMoeGateVecSize * kMoeGateVecSize;
-    const int    expert_num = moe.experts.size();
+    const int    expert_num = moe.num_experts();
 
     FT_CHECK(expert_num);
 
-    auto logits = Gate(p.input, moe.gate);
+    auto logits = Gate(p.input, *moe.gate.get());
 
     TM_DEBUG_TENSOR(logits, "logits", 2);
 
     const auto st = core::Context::stream().handle();
 
-    // dump_logits(tokens, layer_id);
-
-    if (param_.topk_method == "noaux_tc") {
+    if (p.weights->topk_method == "noaux_tc") {
         // invokeMoeGate_NoAuxTC clears accum and masks internally
-        TM_CHECK_EQ(param_.n_group, 1);
-        TM_CHECK_EQ(param_.topk_group, 1);
-        const float* correction_bias =
-            (moe.score_correction_bias.size() > 0) ? moe.score_correction_bias.data<float>() : nullptr;
+        TM_CHECK_EQ(p.weights->n_group, 1);
+        TM_CHECK_EQ(p.weights->topk_group, 1);
+        const float* correction_bias = nullptr;
+        if (moe.score_correction_bias) {
+            correction_bias = moe.score_correction_bias.size() > 0 ? moe.score_correction_bias.data<float>() : nullptr;
+        }
         invokeMoeGate_NoAuxTC(f2n_.data(),
                               f2E_.data(),
                               en2f_.data(),
@@ -110,10 +104,10 @@ void MoeFfnLayer::Forward(ForwardParam& p)
                               tokens,
                               padded,
                               expert_num,
-                              param_.experts_per_token,
-                              param_.norm_topk_prob,
-                              param_.routed_scale,
-                              param_.scoring_func == "sigmoid",
+                              p.weights->experts_per_token,
+                              p.weights->norm_topk_prob,
+                              p.weights->routed_scale,
+                              p.weights->scoring_func == "sigmoid",
                               st);
     }
     else {
@@ -121,9 +115,9 @@ void MoeFfnLayer::Forward(ForwardParam& p)
         check_cuda_error(cudaMemsetAsync(accum_.data(), 0, sizeof(int) * expert_num * kMoeGateMaxTiles, st));
 
         bool softmax = true;
-        if (param_.topk_method == "group_limited_greedy") {
+        if (p.weights->topk_method == "group_limited_greedy") {
             invokeMoeSoftmaxMaskTopKGroups(
-                logits.data(), tokens, expert_num, expert_num / param_.n_group, param_.topk_group, st);
+                logits.data(), tokens, expert_num, expert_num / p.weights->n_group, p.weights->topk_group, st);
             sync_check_cuda_error();
             softmax = false;
         }
@@ -140,17 +134,17 @@ void MoeFfnLayer::Forward(ForwardParam& p)
                          tokens,
                          padded,
                          expert_num,
-                         param_.experts_per_token,
+                         p.weights->experts_per_token,
                          softmax,
-                         param_.norm_topk_prob,
-                         param_.routed_scale,
+                         p.weights->norm_topk_prob,
+                         p.weights->routed_scale,
                          st);
     }
     sync_check_cuda_error();
 
     if (is_warm_up_) {
         std::mt19937     g;
-        const auto       expert_ids = SampleUniform(tokens, expert_num, param_.experts_per_token, g);
+        const auto       expert_ids = SampleUniform(tokens, expert_num, p.weights->experts_per_token, g);
         std::vector<int> cnt(expert_num);
         for (const auto& x : expert_ids) {
             ++cnt[x];
@@ -163,48 +157,41 @@ void MoeFfnLayer::Forward(ForwardParam& p)
             cudaMemcpyAsync(offsets_.data(), h_offsets_.data(), sizeof(int) * (expert_num + 1), cudaMemcpyDefault, st));
     }
 
-    temp_ = Tensor{{param_.experts_per_token * tokens, hidden_dim_}, p.input.dtype(), p.input.device()};
+    temp_ = Tensor{{p.weights->experts_per_token * tokens, hidden_dim}, p.input.dtype(), p.input.device()};
 
-    if (param_.method == MoeParam::kNaive) {
+    auto indices = f2n_.slice(0, tokens * p.weights->experts_per_token);
+    auto offsets = offsets_.slice(0, expert_num + 1);
 
-        invokeMoeDispatch(temp_, p.input, f2n_.data(), param_.experts_per_token, st);
+    if (block.w1w3) {
+        // Fused w1w3 path
+        Tensor inter = linear_.Forward(p.input, *block.w1w3, indices, offsets_);
         sync_check_cuda_error();
 
-        check_cuda_error(
-            cudaMemcpyAsync(h_offsets_.data(), offsets_.data(), sizeof(int) * (expert_num + 1), cudaMemcpyDefault, st));
-
-        check_cuda_error(cudaStreamSynchronize(st));
-
-        TM_CHECK_EQ(h_offsets_[expert_num], tokens * param_.experts_per_token);
-
-        for (int i = 0; i < expert_num; ++i) {
-            if (int count = h_offsets_[i + 1] - h_offsets_[i]) {
-                auto io = temp_.slice({h_offsets_[i], 0}, {count, -1});
-                expert_ffn_->forward({io, io, moe.experts.at(i).get(), p.layer_id});
-            }
+        if (!block.is_fused_silu) {
+            Activation(inter, block.w1w3->bias, f2E_, block.act_type, st);
+            sync_check_cuda_error();
         }
+
+        linear_.Forward(inter.slice({0, 0}, {-1, inter_size}), *block.w2, {}, offsets, temp_);
+        sync_check_cuda_error();
     }
     else {
+        // Separate w1/w3 path
+        Tensor gating = linear_.Forward(p.input, *block.w1, indices, offsets_);
+        sync_check_cuda_error();
 
-        auto& block = moe.block;
-
-        auto indices = f2n_.slice(0, tokens * param_.experts_per_token);
-        auto offsets = offsets_.slice(0, expert_num + 1);
-
-        Tensor inter = linear_.Forward(p.input, block.fused_gating_intermediate, indices, offsets_);
+        Tensor up = linear_.Forward(p.input, *block.w3, indices, offsets_);
         sync_check_cuda_error();
 
-        if (!block.is_fused_silu) {
-            Activation(inter, block.fused_gating_intermediate.bias, f2E_, moe.block.act_type, st);
-            sync_check_cuda_error();
-        }
+        Activation(gating, up, block.act_type, st);
+        sync_check_cuda_error();
 
-        linear_.Forward(inter.slice({0, 0}, {-1, inter_size_}), block.output, {}, offsets, temp_);
+        linear_.Forward(gating, *block.w2, {}, offsets, temp_);
         sync_check_cuda_error();
     }
 
-    if (moe.shared_gate.weight) {
-        shared_scales_ = Gate(p.input, moe.shared_gate);
+    if (moe.shared_gate) {
+        shared_scales_ = Gate(p.input, *moe.shared_gate);
     }
 }
 
@@ -214,12 +201,12 @@ void MoeFfnLayer::Combine(ForwardParam& p)
 
     invokeMoeCombine(p.output,
                      temp_,
-                     p.weights->block.output.bias,
+                     moe.block()->w2->bias,
                      scales_.data(),
                      en2f_.data(),
                      f2E_.data(),
                      shared_scales_.data_or((float*)nullptr),
-                     param_.experts_per_token,
+                     p.weights->experts_per_token,
                      1.f / tp_size_,
                      p.scale,
                      core::Context::stream().handle());
diff --git a/src/turbomind/models/llama/moe_ffn_layer.h b/src/turbomind/models/llama/moe_ffn_layer.h
index 939cd9c60e..d50bc4869b 100644
--- a/src/turbomind/models/llama/moe_ffn_layer.h
+++ b/src/turbomind/models/llama/moe_ffn_layer.h
@@ -4,22 +4,22 @@
 
 #include "src/turbomind/kernels/gemm/context.h"
 #include "src/turbomind/kernels/gemm/moe_utils_v2.h"
-#include "src/turbomind/models/llama/LlamaDenseWeight.h"
 #include "src/turbomind/models/llama/LlamaFfnLayer.h"
 #include "src/turbomind/models/llama/llama_params.h"
+#include "src/turbomind/models/moe_weight.h"
 
 namespace turbomind {
 
 class MoeFfnLayer {
 public:
-    MoeFfnLayer(const ModelParam& model, const MoeParam& param, const EngineParam& engine, const Context& ctx);
+    MoeFfnLayer(const EngineParam& engine, const Context& ctx);
 
     struct ForwardParam {
-        Tensor              input;
-        Tensor              output;
-        const MoeFfnWeight* weights;
-        float               scale;
-        int                 layer_id;
+        Tensor           input;
+        Tensor           output;
+        const MoeWeight* weights;
+        float            scale;
+        int              layer_id;
     };
 
     void Forward(ForwardParam& p);
@@ -27,22 +27,22 @@ class MoeFfnLayer {
     void Combine(ForwardParam& p);
 
 private:
-    Tensor_<float> Gate(const Tensor& input, const LlamaDenseWeight& gate);
+    void Init(ForwardParam& p);
+
+    Tensor_<float> Gate(const Tensor& input, const LinearWeight& gate);
 
     void dump_logits(int token_num, int layer_id, int expert_num);
 
-    const int inter_size_;
-    const int hidden_dim_;
     const int tp_size_;
-
-    const MoeParam param_;
-
-    int& is_warm_up_;
+    const int max_token_num_;
+    int&      is_warm_up_;
 
     LlamaLinear& linear_;
 
     std::unique_ptr<LlamaFfnLayer> expert_ffn_;
 
+    bool initialized_ = false;
+
     ///////////////////////////////////////////////////////
     /// runtime states
     Buffer_<int> h_offsets_;
diff --git a/src/turbomind/models/llama/unified_attention_layer.cc b/src/turbomind/models/llama/unified_attention_layer.cc
index 390ea72926..e1db9f4710 100644
--- a/src/turbomind/models/llama/unified_attention_layer.cc
+++ b/src/turbomind/models/llama/unified_attention_layer.cc
@@ -91,67 +91,44 @@ UnifiedAttentionLayer::~UnifiedAttentionLayer()
     aux_stream_             = {};
 }
 
-UnifiedAttentionLayer::UnifiedAttentionLayer(const ModelParam&     model,
-                                             const AttentionParam& attn,
-                                             const EngineParam&    engine,
-                                             int                   tp_size,
-                                             const Context&        ctx,
-                                             int                   phases,
-                                             bool                  init):
-    head_num_(model.head_num),
-    kv_head_num_(model.kv_head_num),
-    size_per_head_(model.head_dim),
-    hidden_units_(model.hidden_units),
-    local_head_num_(head_num_ / tp_size),
-    local_kv_head_num_(model.kv_head_num / tp_size),
-    param_(attn),
-    model_param_(model),
-    engine_param_(engine),
-    cp_fn_ctx_(ctx.comm.d_comm, ctx.comm.d_cp_group),
+UnifiedAttentionLayer::UnifiedAttentionLayer(int                           quant_policy,
+                                             const std::vector<int>&       layer_types,
+                                             int                           layer_num,
+                                             std::vector<AttentionWeight*> attn_weights,
+                                             const EngineParam&            engine,
+                                             const Context&                ctx,
+                                             int                           phases,
+                                             bool                          init):
+    quant_policy_{quant_policy},
+    rope_{attn_weights[0]->rope},
+    engine_param_{engine},
+    cp_fn_ctx_{ctx.comm.d_comm, ctx.comm.d_cp_group},
     is_warm_up_{*ctx.is_warm_up},
-    context_(ctx),
+    context_{ctx},
+    init_{init},
     linear_(*ctx.linear),
-    arch_(getSMVersion())
+    arch_{getSMVersion()}
 {
-    TM_CHECK_EQ(head_num_ % tp_size, 0) << head_num_ << " " << tp_size;
-    TM_CHECK_EQ(head_num_ % kv_head_num_, 0) << head_num_ << " " << kv_head_num_;
+    TM_CHECK(!attn_weights.empty()) << "attn_weights must not be empty";
+    TM_CHECK(attn_weights[0]) << "attn_weights[0] must not be null";
 
     check_cuda_error(cudaStreamCreateWithFlags(&aux_stream_, cudaStreamNonBlocking));
     check_cuda_error(cudaEventCreateWithFlags(&qkv_event_, cudaEventDisableTiming));
     check_cuda_error(cudaEventCreateWithFlags(&aux_event_, cudaEventDisableTiming));
 
-    init_rope_kernel_param(param_.rope, rope_param_);
+    init_rope_kernel_param(rope_, rope_param_);
 
     // Skip other attention layer types
-    std::vector<int> layer_types = model_param_.layer_types;
-    layer_types.resize(model_param_.layer_num);
-    cache_layer_ids_.resize(layer_types.size(), -1);
+    std::vector<int> types = layer_types;
+    types.resize(layer_num);
+    cache_layer_ids_.resize(types.size(), -1);
     int next_cache_id = 0;
-    for (size_t i = 0; i < layer_types.size(); ++i) {
-        if (layer_types[i] == 0) {
+    for (size_t i = 0; i < types.size(); ++i) {
+        if (types[i] == 0) {
             cache_layer_ids_[i] = next_cache_id++;
         }
     }
 
-    Allocator alloc            = core::Context::device_alloc();
-    ssize_t   workspace_tokens = kMaxWorkspaceTokens;
-    if (engine_param_.attn_cp_size > 1) {
-        alloc = GetSymmAllocator(ctx.comm.d_comm);
-        workspace_tokens += engine_param_.max_forward_token_num;
-    }
-    // partial_O layout:
-    //   w/  cp, decode(q, h, k, 2) + prefill(q, h, 1, 2)
-    //   w/o cp, decode(q, h, k, 2)
-    partial_O_  = Tensor_<float>({workspace_tokens, local_head_num_, size_per_head_}, kDEVICE);
-    partial_ML_ = Tensor_<float>({engine_param_.attn_cp_size, workspace_tokens, local_head_num_, 2}, alloc);
-    split_cnt_  = Tensor_<int>({workspace_tokens}, kDEVICE);
-    if (init) {
-        const int dim = (int)local_head_num_ * (int)size_per_head_;
-        tmp_attn_     = Tensor{{engine_param_.max_forward_token_num, dim}, model.data_type, kDEVICE};
-    }
-
-    Clear(split_cnt_.buffer());
-
     const int bsz = engine.max_batch_size;
 
     if (rope_param_.type == RopeType::kDynamic) {
@@ -162,7 +139,7 @@ UnifiedAttentionLayer::UnifiedAttentionLayer(const ModelParam&     model,
         mrope_position_delta_buf_ = {bsz, kCPUpinned};
         mrope_length_buf_         = {bsz, kCPUpinned};
     }
-    const int max_blocks = bsz * cdiv(engine.session_len, param_.cache_block_seq_len);
+    const int max_blocks = bsz * cdiv(engine.session_len, engine_param_.cache_block_seq_len);
     for (int i = 0; i < phases; ++i) {
         auto& d               = data_.emplace_back(std::make_shared<AttentionData>());
         d->block_ptrs         = {max_blocks + 16, kDEVICE};
@@ -178,9 +155,37 @@ UnifiedAttentionLayer::UnifiedAttentionLayer(const ModelParam&     model,
             rope_param_.mrope.stride = d->mrope_position_ids.stride(0);
         }
     }
+
+    // Eagerly initialize workspace buffers (was previously lazy in Init())
+    {
+        const auto& w              = *attn_weights[0];
+        const int   tp_size        = w.tp_size;
+        const int   local_head_num = w.head_num / tp_size;
+        const int   size_per_head  = w.head_dim;
+
+        TM_CHECK_EQ(w.head_num % tp_size, 0) << w.head_num << " " << tp_size;
+        TM_CHECK_EQ(w.head_num % w.kv_head_num, 0) << w.head_num << " " << w.kv_head_num;
+
+        ssize_t   workspace_tokens = kMaxWorkspaceTokens;
+        Allocator alloc            = core::Context::device_alloc();
+        if (engine_param_.attn_cp_size > 1) {
+            alloc = GetSymmAllocator(context_.comm.d_comm);
+            workspace_tokens += engine_param_.max_forward_token_num;
+        }
+
+        partial_O_  = Tensor_<float>({workspace_tokens, local_head_num, size_per_head}, kDEVICE);
+        partial_ML_ = Tensor_<float>({engine_param_.attn_cp_size, workspace_tokens, local_head_num, 2}, alloc);
+        split_cnt_  = Tensor_<int>({workspace_tokens}, kDEVICE);
+        if (init_) {
+            const int dim = local_head_num * size_per_head;
+            tmp_attn_     = Tensor{{engine_param_.max_forward_token_num, dim}, w.data_type, kDEVICE};
+        }
+
+        Clear(split_cnt_.buffer());
+    }
 }
 
-static void init_dynamic_ntk(RequestCache& cache, const RopeParam& rope)
+static void init_dynamic_ntk(RequestCache& cache, const core::RopeConfig& rope)
 {
     cache.rope_base = rope.base;
     if (auto scaling_factor = rope.factor; scaling_factor > 1.f) {
@@ -203,7 +208,7 @@ void UnifiedAttentionLayer::Run(BatchOp op, int phase, TensorMap& env)
         Buffer_<RequestCache*> rc = env.at("requests").buffer();
         if (rope_param_.type == RopeType::kDynamic) {
             for (int i = 0; i < rc.size(); ++i) {
-                init_dynamic_ntk(*rc[i], param_.rope);
+                init_dynamic_ntk(*rc[i], rope_);
             }
         }
     }
@@ -313,6 +318,8 @@ void UnifiedAttentionLayer::Forward(ForwardParam p)
 
     const auto& weights = *p.weights;
 
+    TM_LOG_DEBUG("layer=%d, token_num=%d", layer_id, token_num);
+
     Tensor qkv;
 
     auto& d = *data_.at(p.phase);
@@ -321,14 +328,12 @@ void UnifiedAttentionLayer::Forward(ForwardParam p)
     //     DebugTensor(p.input.slice(d.dbg_offset, d.dbg_size), Concat("attn_in", p.layer_id), 0);
     // }
 
-    if (weights.qkv.output_dim) {
+    if (weights.w_qkv && weights.w_qkv->output_dim) {
         // [token_num, hidden_dim] -> [token_num, local_q_kv_head_num, head_dim]
-        qkv = linear_.Forward(p.input, weights.qkv);
+        qkv = linear_.Forward(p.input, *weights.w_qkv);
         sync_check_cuda_error();
 
-        if (model_param_.qk_norm) {
-            qk_norm(qkv, weights);
-        }
+        qk_norm(qkv, weights);
     }
     else {
         qkv = forward_mla(p.input, weights);
@@ -345,12 +350,16 @@ void UnifiedAttentionLayer::Forward(ForwardParam p)
 
     // Apply sigmoid gating: attn *= sigmoid(gate)
     // Gate is stored at the end of each token's QKV: [Q|K|V|Gate]
-    if (model_param_.attn_output_gate) {
-        const int  q_count     = qkv.shape(0);
-        const int  attn_dim    = local_head_num_ * size_per_head_;
-        const int  gate_offset = (local_head_num_ + 2 * local_kv_head_num_) * size_per_head_;
-        const int  qkv_stride  = (2 * local_head_num_ + 2 * local_kv_head_num_) * size_per_head_;
-        const auto stream      = core::Context::stream().handle();
+    if (weights.output_gate) {
+        const int  tp_size           = weights.tp_size;
+        const int  local_head_num    = weights.head_num / tp_size;
+        const int  local_kv_head_num = weights.kv_head_num / tp_size;
+        const int  size_per_head     = weights.head_dim;
+        const int  q_count           = qkv.shape(0);
+        const int  attn_dim          = local_head_num * size_per_head;
+        const int  gate_offset       = (local_head_num + 2 * local_kv_head_num) * size_per_head;
+        const int  qkv_stride        = (2 * local_head_num + 2 * local_kv_head_num) * size_per_head;
+        const auto stream            = core::Context::stream().handle();
         invokeSigmoidGateMultiply(attn.raw_data(),
                                   (const char*)qkv.raw_data() + gate_offset * byte_size(qkv.dtype(), 1),
                                   attn_dim,
@@ -369,13 +378,18 @@ void UnifiedAttentionLayer::Forward(ForwardParam p)
 
     //////////////////////////////////////////////
     /// output gemm <Bs,HD> -> <Bs,HD>
-    (void)linear_.Forward(attn, weights.output, p.output);
+    (void)linear_.Forward(attn, *weights.wo, p.output);
     sync_check_cuda_error();
 }
 
 template<class T>
 Tensor UnifiedAttentionLayer::core_attention(Tensor& qkv, const ForwardParam& p, const WeightType& weights)
 {
+    const int tp_size           = weights.tp_size;
+    const int local_head_num    = weights.head_num / tp_size;
+    const int local_kv_head_num = weights.kv_head_num / tp_size;
+    const int size_per_head     = weights.head_dim;
+
     const auto device = qkv.device();
     const auto dtype  = qkv.dtype();
 
@@ -386,20 +400,19 @@ Tensor UnifiedAttentionLayer::core_attention(Tensor& qkv, const ForwardParam& p,
 
     TM_CHECK_EQ(d.prefill.q_sum + d.decode.n, q_count);
 
-    const int local_q_kv_head_num = local_head_num_ + 2 * local_kv_head_num_;
+    const int local_q_kv_head_num = local_head_num + 2 * local_kv_head_num;
 
     Tensor attn;
     if (tmp_attn_) {
         attn = tmp_attn_.slice(0, q_count);
     }
     else {
-        attn = {{q_count, (int)local_head_num_ * (int)size_per_head_}, dtype, device};
+        attn = {{q_count, local_head_num * size_per_head}, dtype, device};
     }
 
-    const bool is_mla = model_param_.mla.kv_lora_rank > 0;
+    const bool is_mla = weights.is_mla();
 
-    Tensor tmp_kv{
-        {(int)local_kv_head_num_, is_mla ? 1 : 2, d.prefill.k_sum + MAX_CTA_S, (int)size_per_head_}, dtype, device};
+    Tensor tmp_kv{{local_kv_head_num, is_mla ? 1 : 2, d.prefill.k_sum + MAX_CTA_S, size_per_head}, dtype, device};
 
     const int cache_layer_id = cache_layer_ids_[p.layer_id];
 
@@ -410,27 +423,27 @@ Tensor UnifiedAttentionLayer::core_attention(Tensor& qkv, const ForwardParam& p,
         params.out = (T*)attn.raw_data();
 
         params.q = (T*)qkv.raw_data();
-        params.k = params.q + local_head_num_ * size_per_head_;
+        params.k = params.q + local_head_num * size_per_head;
         if (is_mla) {
             params.v      = params.k;
-            params.stride = (local_head_num_ + 1 * local_kv_head_num_) * size_per_head_;
+            params.stride = (local_head_num + 1 * local_kv_head_num) * size_per_head;
         }
         else {
-            params.v = params.k + local_kv_head_num_ * size_per_head_;
+            params.v = params.k + local_kv_head_num * size_per_head;
             // When attn_output_gate, QKV layout is [Q|K|V|Gate] per token
             // stride must account for the extra gate portion at the end
-            if (model_param_.attn_output_gate) {
-                params.stride = (2 * local_head_num_ + 2 * local_kv_head_num_) * size_per_head_;
+            if (weights.output_gate) {
+                params.stride = (2 * local_head_num + 2 * local_kv_head_num) * size_per_head;
             }
             else {
-                params.stride = (local_head_num_ + 2 * local_kv_head_num_) * size_per_head_;
+                params.stride = (local_head_num + 2 * local_kv_head_num) * size_per_head;
             }
         }
 
-        if (weights.qkv.bias) {
-            params.q_bias = (T*)weights.qkv.bias.data_or<T>(nullptr);
-            params.k_bias = params.q_bias + local_head_num_ * size_per_head_;
-            params.v_bias = params.k_bias + local_kv_head_num_ * size_per_head_;
+        if (!is_mla && weights.w_qkv && weights.w_qkv->bias) {
+            params.q_bias = (T*)weights.w_qkv->bias.data_or<T>(nullptr);
+            params.k_bias = params.q_bias + local_head_num * size_per_head;
+            params.v_bias = params.k_bias + local_kv_head_num * size_per_head;
         }
 
         params.batch_size = stat.n;
@@ -443,21 +456,21 @@ Tensor UnifiedAttentionLayer::core_attention(Tensor& qkv, const ForwardParam& p,
         params.block_iter_params = BlockIteratorParams{(char**)d.block_ptrs.data(),  //
                                                        d.block_ptrs_offsets.data() + offset,
                                                        cache_layer_id,
-                                                       (int)param_.cache_block_seq_len};
+                                                       engine_param_.cache_block_seq_len};
 
         // prefill only
         if (is_mla) {
             params.linear_iter_params = LinearIteratorParams{
-                tmp_kv.raw_data(),            // flattened KV
-                stat.k_sum * size_per_head_,  // stride to next head
-                0                             // stride from K to V
+                tmp_kv.raw_data(),           // flattened KV
+                stat.k_sum * size_per_head,  // stride to next head
+                0                            // stride from K to V
             };
         }
         else {
             params.linear_iter_params = LinearIteratorParams{
-                tmp_kv.raw_data(),                // flattened KV
-                stat.k_sum * size_per_head_ * 2,  // stride to next head
-                stat.k_sum * size_per_head_       // stride from K to V
+                tmp_kv.raw_data(),               // flattened KV
+                stat.k_sum * size_per_head * 2,  // stride to next head
+                stat.k_sum * size_per_head       // stride from K to V
             };
         }
 
@@ -465,21 +478,21 @@ Tensor UnifiedAttentionLayer::core_attention(Tensor& qkv, const ForwardParam& p,
         params.cu_q_len = d.q_offsets.data() + offset;
         params.cu_k_len = d.k_offsets.data() + offset;
 
-        params.num_heads     = local_head_num_;
-        params.num_kv_heads  = local_kv_head_num_;
-        params.size_per_head = size_per_head_;
+        params.num_heads     = local_head_num;
+        params.num_kv_heads  = local_kv_head_num;
+        params.size_per_head = size_per_head;
         params.layer_id      = cache_layer_id;
 
         double scaling = 1.;
-        if (param_.softmax_scale) {  // model predefined softmax scale
-            scaling *= param_.softmax_scale;
+        if (weights.softmax_scale) {  // model predefined softmax scale
+            scaling *= weights.softmax_scale;
         }
         else {  // default value
             scaling /= std::sqrt((float)params.size_per_head);
         }
         params.inv_sqrt_dh = scaling * std::log2(std::exp(1.));
 
-        params.sinks       = weights.sinks.data_or((T*)nullptr);
+        params.sinks       = weights.sinks ? weights.sinks.data_or((T*)nullptr) : (T*)nullptr;
         params.scale_sinks = scaling;
 
         params.window_size = weights.window_size;
@@ -498,8 +511,8 @@ Tensor UnifiedAttentionLayer::core_attention(Tensor& qkv, const ForwardParam& p,
         }
 
         // logn attn
-        params.use_logn_attn           = param_.use_logn_attn;
-        params.max_position_embeddings = param_.max_position_embeddings;
+        params.use_logn_attn           = weights.use_logn_attn;
+        params.max_position_embeddings = weights.rope.max_position_embeddings;
 
         // Decoding use only for now
         params.split_cnt   = split_cnt_.data();
@@ -515,9 +528,9 @@ Tensor UnifiedAttentionLayer::core_attention(Tensor& qkv, const ForwardParam& p,
 
             // update ML,O offset if both prefill and decode present
             const int offset_ML_stage =
-                engine_param_.attn_cp_size * (offset ? kMaxWorkspaceTokens * local_head_num_ * 2 : 0);
-            const int offset_ML_rank = params.cp_rank * params.token_num * local_head_num_ * params.max_split_k * 2;
-            const int offset_O       = offset ? kMaxWorkspaceTokens * local_head_num_ * size_per_head_ : 0;
+                engine_param_.attn_cp_size * (offset ? kMaxWorkspaceTokens * local_head_num * 2 : 0);
+            const int offset_ML_rank = params.cp_rank * params.token_num * local_head_num * params.max_split_k * 2;
+            const int offset_O       = offset ? kMaxWorkspaceTokens * local_head_num * size_per_head : 0;
 
             params.partial_ML = partial_ML_.data() + offset_ML_stage + offset_ML_rank;
             params.partial_O  = partial_O_.data() + offset_O;
@@ -527,7 +540,7 @@ Tensor UnifiedAttentionLayer::core_attention(Tensor& qkv, const ForwardParam& p,
             params.cp_fn          = CpPost;
             params.cp_fn_ctx      = (void*)&cp_fn_ctx_;
             cp_fn_ctx_.cp_rank    = params.cp_rank;
-            cp_fn_ctx_.count      = params.token_num * local_head_num_ * params.max_split_k * 2;
+            cp_fn_ctx_.count      = params.token_num * local_head_num * params.max_split_k * 2;
             cp_fn_ctx_.partial_ML = partial_ML_.data() + offset_ML_stage;
             cp_fn_ctx_.stream     = stream;
         }
@@ -535,7 +548,7 @@ Tensor UnifiedAttentionLayer::core_attention(Tensor& qkv, const ForwardParam& p,
         params.arch   = arch_;
         params.stream = stream;
 
-        params.quant_policy = model_param_.quant_policy;
+        params.quant_policy = quant_policy_;
         return params;
     };
 
@@ -592,48 +605,53 @@ Tensor UnifiedAttentionLayer::core_attention(Tensor& qkv, const ForwardParam& p,
 Tensor UnifiedAttentionLayer::forward_mla(const Tensor& hidden_state, const WeightType& w)
 {
 
+    const int tp_size           = w.tp_size;
+    const int local_head_num    = w.head_num / tp_size;
+    const int local_kv_head_num = w.kv_head_num / tp_size;
+    const int size_per_head     = w.head_dim;
+
     const auto token_num = hidden_state.shape(0);
     const auto dtype     = hidden_state.dtype();
 
-    const int q_lora_rank  = w.q_a_proj.output_dim;
-    const int kv_lora_rank = w.kv_a_layernorm.size();
-    const int qk_rope_dim  = w.kv_a_proj.output_dim - kv_lora_rank;
+    const int q_lora_rank  = w.q_a_proj->output_dim;
+    const int kv_lora_rank = w.kv_a_layernorm->weight.size();
+    const int qk_rope_dim  = w.kv_a_proj->output_dim - kv_lora_rank;
 
     Tensor q;
 
     const auto stream = core::Context::stream().handle();
 
-    if (w.q_proj.weight) {
-        q = linear_.Forward(hidden_state, w.q_proj);
+    if (w.q_proj && w.q_proj->weight) {
+        q = linear_.Forward(hidden_state, *w.q_proj);
         sync_check_cuda_error();
     }
     else {
-        Tensor q_a = linear_.Forward(hidden_state, w.q_a_proj);
+        Tensor q_a = linear_.Forward(hidden_state, *w.q_a_proj);
         sync_check_cuda_error();
 
-        invokeRMSNorm(q_a, q_a, w.q_a_layernorm, model_param_.norm_eps, stream);
+        invokeRMSNorm(q_a, q_a, w.q_a_layernorm->weight, w.q_a_layernorm->norm_eps_, stream);
         sync_check_cuda_error();
 
-        q = linear_.Forward(q_a, w.q_b_proj);
+        q = linear_.Forward(q_a, *w.q_b_proj);
         sync_check_cuda_error();
     }
 
-    Tensor kv_a_k_pe = linear_.Forward(hidden_state, w.kv_a_proj);
+    Tensor kv_a_k_pe = linear_.Forward(hidden_state, *w.kv_a_proj);
     sync_check_cuda_error();
 
     auto kv_a = kv_a_k_pe.slice({0, 0}, {-1, kv_lora_rank});
-    invokeRMSNorm(kv_a, kv_a, w.kv_a_layernorm, model_param_.norm_eps, stream);
+    invokeRMSNorm(kv_a, kv_a, w.kv_a_layernorm->weight, w.kv_a_layernorm->norm_eps_, stream);
     sync_check_cuda_error();
 
-    const int local_q_kv_head_num = local_head_num_ + 1 * local_kv_head_num_;
+    const int local_q_kv_head_num = local_head_num + 1 * local_kv_head_num;
 
-    Tensor qkv{{token_num, local_q_kv_head_num, size_per_head_}, dtype, hidden_state.device()};
+    Tensor qkv{{token_num, local_q_kv_head_num, size_per_head}, dtype, hidden_state.device()};
     MLACopyQKV(dtype,
                qkv.raw_data(),
                q.raw_data(),
                kv_a_k_pe.raw_data(),
                token_num,
-               local_head_num_,
+               local_head_num,
                kv_lora_rank,
                qk_rope_dim,
                stream);
@@ -644,23 +662,32 @@ Tensor UnifiedAttentionLayer::forward_mla(const Tensor& hidden_state, const Weig
 
 void UnifiedAttentionLayer::qk_norm(Tensor& qkv, const WeightType& weights)
 {
+    if (!(weights.q_norm || weights.k_norm)) {
+        return;
+    }
+
+    TM_CHECK(weights.q_norm && weights.k_norm);
+
+    const int tp_size           = weights.tp_size;
+    const int local_head_num    = weights.head_num / tp_size;
+    const int local_kv_head_num = weights.kv_head_num / tp_size;
+    const int size_per_head     = weights.head_dim;
+
     const auto stream = core::Context::stream().handle();
 
     check_cuda_error(cudaEventRecord(qkv_event_, stream));
     check_cuda_error(cudaStreamWaitEvent(aux_stream_, qkv_event_));
 
-    TM_CHECK(model_param_.attn_bias == false) << "not implemented";
-
     const auto token_num = qkv.shape(0);
 
-    auto qkv3 = qkv.view({token_num, -1, (int)size_per_head_});
+    auto qkv3 = qkv.view({token_num, -1, size_per_head});
 
-    auto q = qkv3.slice({0, 0, 0}, {-1, (int)local_head_num_, -1});
-    invokeRMSNormQK(q, weights.q_a_layernorm, model_param_.norm_eps, stream);
+    auto q = qkv3.slice({0, 0, 0}, {-1, local_head_num, -1});
+    invokeRMSNormQK(q, weights.q_norm->weight, weights.q_norm->norm_eps_, stream);
     sync_check_cuda_error();
 
-    auto k = qkv3.slice({0, (int)local_head_num_, 0}, {-1, (int)local_kv_head_num_, -1});
-    invokeRMSNormQK(k, weights.kv_a_layernorm, model_param_.norm_eps, aux_stream_);
+    auto k = qkv3.slice({0, local_head_num, 0}, {-1, local_kv_head_num, -1});
+    invokeRMSNormQK(k, weights.k_norm->weight, weights.k_norm->norm_eps_, aux_stream_);
     sync_check_cuda_error();
 
     check_cuda_error(cudaEventRecord(aux_event_, aux_stream_));
diff --git a/src/turbomind/models/llama/unified_attention_layer.h b/src/turbomind/models/llama/unified_attention_layer.h
index 457029ca5c..79c20d3115 100644
--- a/src/turbomind/models/llama/unified_attention_layer.h
+++ b/src/turbomind/models/llama/unified_attention_layer.h
@@ -22,15 +22,17 @@
 #pragma once
 
 #include <cuda_runtime.h>
+#include <vector>
 
 #include "src/turbomind/core/core.h"
 #include "src/turbomind/engine/batch.h"
 #include "src/turbomind/kernels/attention/cp_utils.h"
 #include "src/turbomind/kernels/gemm/test/test_utils.h"
-#include "src/turbomind/models/llama/LlamaDenseWeight.h"
+#include "src/turbomind/models/attention_weight.h"
 #include "src/turbomind/models/llama/LlamaLinear.h"
 #include "src/turbomind/models/llama/context.h"
 #include "src/turbomind/models/llama/llama_params.h"
+#include "src/turbomind/models/llama/llama_rope.h"
 
 namespace turbomind {
 
@@ -38,7 +40,7 @@ struct AttentionData;
 
 class UnifiedAttentionLayer {
 public:
-    using WeightType = LlamaAttentionWeight;
+    using WeightType = AttentionWeight;
 
     static constexpr int kMaxKVSplits        = 128;
     static constexpr int kMaxWorkspaceTokens = 4096;
@@ -53,13 +55,14 @@ class UnifiedAttentionLayer {
 
     ~UnifiedAttentionLayer();
 
-    UnifiedAttentionLayer(const ModelParam&     model,
-                          const AttentionParam& attn,
-                          const EngineParam&    engine,
-                          int                   tp_size,
-                          const Context&        context,
-                          int                   phases,
-                          bool                  init);
+    UnifiedAttentionLayer(int                           quant_policy,
+                          const std::vector<int>&       layer_types,
+                          int                           layer_num,
+                          std::vector<AttentionWeight*> attn_weights,
+                          const EngineParam&            engine,
+                          const Context&                context,
+                          int                           phases,
+                          bool                          init);
 
     void Run(BatchOp op, int phase, TensorMap& env);
 
@@ -77,19 +80,12 @@ class UnifiedAttentionLayer {
     void qk_norm(Tensor& qkv, const WeightType& weights);
 
 private:
-    const int head_num_;
-    const int kv_head_num_;
-    const int size_per_head_;
-    const int hidden_units_;
-    const int local_head_num_;
-    const int local_kv_head_num_;
-
-    const AttentionParam param_;
-    const EngineParam    engine_param_;
-    const ModelParam     model_param_;
-    const Context&       context_;
-
-    int& is_warm_up_;
+    const int              quant_policy_;
+    const core::RopeConfig rope_;
+    const EngineParam      engine_param_;
+    const Context&         context_;
+    int&                   is_warm_up_;
+    const bool             init_;
 
     LlamaLinear& linear_;
     const int    arch_{};
@@ -107,7 +103,7 @@ class UnifiedAttentionLayer {
     std::vector<int> cache_layer_ids_;
 
     ///////////////////////////////////////////////////////
-    /// temp runtime buffers
+    /// temp runtime buffers (allocated in constructor)
     Tensor_<float> partial_O_;
     Tensor_<float> partial_ML_;
     Tensor_<int>   split_cnt_;
diff --git a/src/turbomind/models/llama/unified_decoder.cc b/src/turbomind/models/llama/unified_decoder.cc
index 0a8d7508cd..20112b03c0 100644
--- a/src/turbomind/models/llama/unified_decoder.cc
+++ b/src/turbomind/models/llama/unified_decoder.cc
@@ -8,11 +8,13 @@
 #include "src/turbomind/core/allocator.h"
 #include "src/turbomind/kernels/core/math.h"
 #include "src/turbomind/kernels/norm/rms_norm.h"
+#include "src/turbomind/models/decoder_layer_weight.h"
 #include "src/turbomind/models/llama/llama_kernels.h"
 #include "src/turbomind/models/llama/llama_utils.h"
 #include "src/turbomind/models/llama/moe_ffn_layer.h"
 #include "src/turbomind/models/llama/unified_attention_layer.h"
 #include "src/turbomind/models/llama/unified_decoder.h"
+#include "src/turbomind/models/model_weight.h"
 #include "src/turbomind/utils/anomaly_handler.h"
 #include "src/turbomind/utils/cuda_utils.h"
 
@@ -30,37 +32,70 @@ void UnifiedDecoder::Run(BatchOp op, int phase, TensorMap& env)
     }
 }
 
-UnifiedDecoder::UnifiedDecoder(const ModelParam&     model,
-                               const EngineParam&    engine,
-                               const AttentionParam& attn,
-                               const MoeParam&       moe,
-                               const Context&        ctx,
-                               int                   phases):
-    layer_num_(model.layer_num),
-    hidden_units_(model.hidden_units),
+UnifiedDecoder::UnifiedDecoder(const EngineParam& engine,
+                               const Context&     ctx,
+                               int                phases,
+                               const ModelWeight& model_weight):
+    layer_num_(model_weight.num_layer),
+    hidden_units_(model_weight.hidden_units),
     attn_tp_size_(engine.attn_tp_size),
     attn_dp_size_(engine.attn_dp_size),
     attn_dp_rank_(engine.attn_dp_rank),
     mlp_tp_size_(engine.mlp_tp_size),
     attn_tp_group_(ctx.comm.d_tp_group),
-    rmsnorm_eps_(model.norm_eps),
     d_comm_(ctx.comm.d_comm),
-    tune_layer_num_(model.tune_layer_num),
+    tune_layer_num_(engine.tune_layer_num),
     is_warm_up_{*ctx.is_warm_up}
 {
-    if (std::accumulate(moe.expert_num.begin(), moe.expert_num.end(), 0LL)) {
-        moe_ffn_layer_ = std::make_unique<MoeFfnLayer>(model, moe, engine, ctx);
+    bool has_moe = false;
+    for (int i = 0; i < model_weight.num_layer; ++i) {
+        if (model_weight.layer(i)->moe_ffn) {
+            has_moe = true;
+            break;
+        }
+    }
+    if (has_moe) {
+        moe_ffn_layer_ = std::make_unique<MoeFfnLayer>(engine, ctx);
     }
 
-    attn_layer_ =
-        std::make_unique<UnifiedAttentionLayer>(model, attn, engine, attn_tp_size_, ctx, phases, (bool)moe_ffn_layer_);
+    std::vector<AttentionWeight*> attn_weights;
+    attn_weights.reserve(model_weight.num_layer);
+    for (int i = 0; i < model_weight.num_layer; ++i) {
+        if (auto* attn = model_weight.layer(i)->attention.get()) {
+            attn_weights.push_back(attn);
+        }
+    }
 
-    if (std::find(model.layer_types.begin(), model.layer_types.end(), 1) != model.layer_types.end()) {
-        linear_attn_layer_ = std::make_unique<GatedDeltaNetLayer>(model, attn, engine, attn_tp_size_, ctx, phases);
+    attn_layer_ = std::make_unique<UnifiedAttentionLayer>(engine.quant_policy,
+                                                          model_weight.layer_types,
+                                                          model_weight.num_layer,
+                                                          attn_weights,
+                                                          engine,
+                                                          ctx,
+                                                          phases,
+                                                          (bool)moe_ffn_layer_);
+
+    bool has_linear_attn = false;
+    for (auto t : model_weight.layer_types) {
+        if (t == 1) {
+            has_linear_attn = true;
+            break;
+        }
+    }
+    if (has_linear_attn) {
+        linear_attn_layer_ =
+            std::make_unique<GatedDeltaNetLayer>(model_weight.data_type, model_weight.layer_types, engine, ctx, phases);
     }
 
-    if (std::accumulate(model.inter_size.begin(), model.inter_size.end(), 0LL)) {
-        ffn_layer_ = std::make_unique<LlamaFfnLayer>(model, ctx);
+    bool has_ffn = false;
+    for (int i = 0; i < model_weight.num_layer; ++i) {
+        if (model_weight.layer(i)->feed_forward) {
+            has_ffn = true;
+            break;
+        }
+    }
+    if (has_ffn) {
+        ffn_layer_ = std::make_unique<LlamaFfnLayer>(ctx);
     }
 }
 
@@ -68,6 +103,7 @@ void UnifiedDecoder::AllreduceResidualRMSnorm(Tensor&       hidden_states,
                                               Tensor&       residual,
                                               const Tensor& bias,
                                               const Tensor& weight,
+                                              float         eps,
                                               int           token_num,
                                               int           group0,
                                               int           group1,
@@ -83,7 +119,7 @@ void UnifiedDecoder::AllreduceResidualRMSnorm(Tensor&       hidden_states,
                                                 residual.data_or((void*)nullptr),
                                                 bias.data_or((void*)nullptr),
                                                 weight.raw_data(),
-                                                rmsnorm_eps_,
+                                                eps,
                                                 hidden_units_,
                                                 dtype,
                                                 group0,
@@ -97,7 +133,7 @@ void UnifiedDecoder::AllreduceResidualRMSnorm(Tensor&       hidden_states,
                                               residual.data_or((void*)nullptr),
                                               bias.data_or((void*)nullptr),
                                               weight.raw_data(),
-                                              rmsnorm_eps_,
+                                              eps,
                                               hidden_units_,
                                               token_num,
                                               dtype,
@@ -113,7 +149,7 @@ void UnifiedDecoder::AllreduceResidualRMSnorm(Tensor&       hidden_states,
                                   dtype,
                                   hidden_units_,
                                   token_num,
-                                  rmsnorm_eps_,
+                                  eps,
                                   stream);
         sync_check_cuda_error();
     }
@@ -174,12 +210,18 @@ void UnifiedDecoder::Forward(int phase, TensorMap& args, const std::vector<Weigh
         local_hidden_states = global_hidden_states;
     }
 
+    TM_LOG_DEBUG("local_token_num=%d, global_token_num=%d", (int)local_token_num, (int)global_token_num);
+
     TM_DEBUG_TENSOR(local_residual, "res", 1);
-    TM_DEBUG_TENSOR(weights.at(0)->self_attn_norm, "norm_weight", 2);
 
     const auto stream = core::Context::stream().handle();
 
-    invokeRMSNorm(local_hidden_states, local_residual, weights.at(0)->self_attn_norm, rmsnorm_eps_, stream);
+    invokeRMSNorm(local_hidden_states,
+                  local_residual,
+                  weights.at(0)->attention_norm->weight,
+                  weights.at(0)->attention_norm->norm_eps_,
+                  stream);
+
     sync_check_cuda_error();
 
     TM_DEBUG_TENSOR(local_hidden_states, Concat("norm0", 0), 2);
@@ -201,13 +243,13 @@ void UnifiedDecoder::Forward(int phase, TensorMap& args, const std::vector<Weigh
 
         /////////////////////////////////////////////
         /// self-attention or linear-attention
-        if (weights.at(layer)->linear_attn_weights) {
+        if (weights.at(layer)->linear_attn) {
             linear_attn_layer_->Forward(
-                {phase, local_hidden_states, local_hidden_states, weights.at(layer)->linear_attn_weights.get(), layer});
+                {phase, local_hidden_states, local_hidden_states, weights.at(layer)->linear_attn.get(), layer});
         }
         else {
-            attn_layer_->Forward(
-                {phase, local_hidden_states, local_hidden_states, weights.at(layer)->self_attn_weights.get(), layer});
+            auto* attn = weights.at(layer)->attention.get();
+            attn_layer_->Forward({phase, local_hidden_states, local_hidden_states, attn, layer});
         }
 
         TM_DEBUG_TENSOR(local_hidden_states, Concat("attn_block", layer), 2);
@@ -215,17 +257,18 @@ void UnifiedDecoder::Forward(int phase, TensorMap& args, const std::vector<Weigh
         // For gated delta networks, we may need a different output.bias name or it doesn't have it.
         // We will just use `output.bias` from either layer.
         Tensor out_bias;
-        if (weights.at(layer)->linear_attn_weights) {
-            out_bias = weights.at(layer)->linear_attn_weights->out_proj.bias;
+        if (weights.at(layer)->linear_attn) {
+            out_bias = weights.at(layer)->linear_attn->out_proj->bias;
         }
         else {
-            out_bias = weights.at(layer)->self_attn_weights->output.bias;
+            out_bias = weights.at(layer)->attention->wo->bias;
         }
 
         AllreduceResidualRMSnorm(global_hidden_states,
                                  local_residual,
                                  out_bias,
-                                 weights.at(layer)->ffn_norm,
+                                 weights.at(layer)->ffn_norm->weight,
+                                 weights.at(layer)->ffn_norm->norm_eps_,
                                  local_token_num,
                                  attn_tp_group_,
                                  0,
@@ -239,18 +282,18 @@ void UnifiedDecoder::Forward(int phase, TensorMap& args, const std::vector<Weigh
 
         std::optional<MoeFfnLayer::ForwardParam> moe_fwd_param;
 
-        if (weights.at(layer)->moe_weights) {
+        if (weights.at(layer)->moe_ffn) {
             moe_fwd_param = MoeFfnLayer::ForwardParam{global_hidden_states,
                                                       global_hidden_states,
-                                                      weights.at(layer)->moe_weights.get(),
+                                                      weights.at(layer)->moe_ffn.get(),
                                                       ffn_layer_ ? 1.f : 0.f,
                                                       layer};
             moe_ffn_layer_->Forward(*moe_fwd_param);
         }
 
-        if (weights.at(layer)->ffn_weights) {
+        if (ffn_layer_ && weights.at(layer)->feed_forward) {
             ffn_layer_->forward(
-                {global_hidden_states, global_hidden_states, weights.at(layer)->ffn_weights.get(), (int)layer});
+                {global_hidden_states, global_hidden_states, weights.at(layer)->feed_forward.get(), (int)layer});
         }
 
         if (moe_fwd_param) {
@@ -261,12 +304,13 @@ void UnifiedDecoder::Forward(int phase, TensorMap& args, const std::vector<Weigh
 
         const bool last = layer == layer_num_ - 1;
 
-        auto& scale_weight = !last ? weights.at(layer + 1)->self_attn_norm : args.at("output_norm_weight");
+        auto& scale_weight = !last ? weights.at(layer + 1)->attention_norm->weight : args.at("output_norm_weight");
 
         AllreduceResidualRMSnorm(global_hidden_states,
                                  local_residual,
                                  {},
                                  scale_weight,
+                                 weights.at(layer)->ffn_norm->norm_eps_,
                                  local_token_num,
                                  0,
                                  attn_tp_group_,
diff --git a/src/turbomind/models/llama/unified_decoder.h b/src/turbomind/models/llama/unified_decoder.h
index 05e9ea73a4..7c1f36af65 100644
--- a/src/turbomind/models/llama/unified_decoder.h
+++ b/src/turbomind/models/llama/unified_decoder.h
@@ -2,7 +2,6 @@
 
 #include "src/turbomind/comm/device_comm.h"
 #include "src/turbomind/models/llama/GatedDeltaNetLayer.h"
-#include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h"
 #include "src/turbomind/models/llama/LlamaFfnLayer.h"
 #include "src/turbomind/models/llama/context.h"
 #include "src/turbomind/models/llama/llama_params.h"
@@ -11,16 +10,14 @@
 
 namespace turbomind {
 
+class ModelWeight;
+class DecoderLayerWeight;
+
 class UnifiedDecoder {
 public:
-    using WeightType = LlamaDecoderLayerWeight;
+    using WeightType = DecoderLayerWeight;
 
-    UnifiedDecoder(const ModelParam&     model,
-                   const EngineParam&    engine,
-                   const AttentionParam& attn,
-                   const MoeParam&       moe,
-                   const Context&        ctx,
-                   int                   phases);
+    UnifiedDecoder(const EngineParam& engine, const Context& ctx, int phases, const ModelWeight& model_weight);
 
     void Run(BatchOp op, int phase, TensorMap& env);
 
@@ -37,8 +34,6 @@ class UnifiedDecoder {
 
     const int attn_tp_group_;
 
-    const float rmsnorm_eps_;
-
     comm::DeviceCommImpl* const d_comm_;
 
     const int tune_layer_num_;
@@ -54,6 +49,7 @@ class UnifiedDecoder {
                                   Tensor&       residual,
                                   const Tensor& bias,
                                   const Tensor& weight,
+                                  float         eps,
                                   int           token_num,
                                   int           t0,
                                   int           t1,
diff --git a/src/turbomind/models/model_root.cc b/src/turbomind/models/model_root.cc
new file mode 100644
index 0000000000..5b2092a7ec
--- /dev/null
+++ b/src/turbomind/models/model_root.cc
@@ -0,0 +1,26 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#include "src/turbomind/models/model_root.h"
+#include "src/turbomind/core/check.h"
+
+namespace turbomind {
+
+ModelRoot::ModelRoot()
+{
+    // CUDA device is already set by CudaDeviceGuard in TurboMind::CreateRoot.
+    stream_ = core::Stream::create();
+    alloca_ = core::Allocator{stream_, /*use_default_pool=*/true};
+}
+
+ModelRoot::~ModelRoot() = default;
+
+void ModelRoot::prepare()
+{
+    TM_CHECK(text_model) << "ModelRoot::prepare: text_model not attached; did the spec "
+                            "forget root.build()?";
+    Module::prepare();
+}
+
+TM_MODULE_METHODS(ModelRoot, MODEL_ROOT_CHILDREN, MODEL_ROOT_PARAMS)
+
+}  // namespace turbomind
diff --git a/src/turbomind/models/model_root.h b/src/turbomind/models/model_root.h
new file mode 100644
index 0000000000..0ea0b0ad72
--- /dev/null
+++ b/src/turbomind/models/model_root.h
@@ -0,0 +1,58 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+#pragma once
+
+#include "src/turbomind/core/core.h"
+#include "src/turbomind/core/module.h"
+#include "src/turbomind/models/model_weight.h"
+
+namespace turbomind {
+
+/// Sentinel root for the weight tree.  Lives in TurboMind::Impl::weights_
+/// and owns the CUDA stream + pool-backed allocator used during weight
+/// loading.  Python creates a ModelWeight via _tm.create_module and
+/// attaches it as the `text_model` child via add_child_raw.
+class ModelRoot: public core::Module {
+public:
+    const char* type() const override
+    {
+        return "ModelRoot";
+    }
+
+    ModelRoot();
+    ~ModelRoot() override;
+
+    void prepare() override;
+
+    core::ContextGuard context() const
+    {
+        return core::ContextGuard{stream_, alloca_};
+    }
+
+    const core::Stream& stream() const
+    {
+        return stream_;
+    }
+    const core::Allocator& allocator() const
+    {
+        return alloca_;
+    }
+
+    /// Convenience accessor.  Nullptr before Python attaches via
+    /// `add_child_raw('text_model', ...)`.
+    ModelWeight* text_model_ptr() const
+    {
+        return text_model.get();
+    }
+
+#define MODEL_ROOT_CHILDREN(X) X(ModelWeight, text_model)
+
+#define MODEL_ROOT_PARAMS(X)
+
+    TM_MODULE_DECLARE(ModelRoot, MODEL_ROOT_CHILDREN, MODEL_ROOT_PARAMS)
+
+private:
+    core::Stream    stream_{};
+    core::Allocator alloca_{};
+};
+
+}  // namespace turbomind
diff --git a/src/turbomind/models/model_weight.cc b/src/turbomind/models/model_weight.cc
new file mode 100644
index 0000000000..21f954e2cd
--- /dev/null
+++ b/src/turbomind/models/model_weight.cc
@@ -0,0 +1,88 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#include "src/turbomind/models/model_weight.h"
+#include "src/turbomind/core/registry.h"
+#include "src/turbomind/models/attention_weight.h"
+#include "src/turbomind/models/decoder_layer_weight.h"
+
+namespace turbomind {
+
+ModelWeight::ModelWeight(const core::ModelWeightConfig& cfg):
+    tp_size(cfg.tp_size), tp_rank(cfg.tp_rank), data_type(cfg.data_type), hidden_units(cfg.hidden_units)
+{
+}
+
+void ModelWeight::prepare()
+{
+    for_each_child([](const char* /*name*/, Module* child) {
+        if (child)
+            child->prepare();
+    });
+
+    auto* l0 = layer(0);
+    TM_CHECK(l0);
+    // Find first full-attention layer (linear-attn layers have no attention child)
+    DecoderLayerWeight* attn_layer = nullptr;
+    for (int i = 0; i < (int)layers->size(); ++i) {
+        if (layer(i)->attention) {
+            attn_layer = layer(i);
+            break;
+        }
+    }
+    TM_CHECK(attn_layer) << "No full-attention layer found";
+    head_dim    = attn_layer->attention->head_dim;
+    kv_head_num = attn_layer->attention->kv_head_num;
+
+    vocab_size        = tok_embeddings.shape(0);
+    embedding_size    = vocab_size;
+    num_layer         = layers->size();
+    vocab_size_padded = TM_CHECK_NOTNULL(output)->output_dim * tp_size;
+
+    layer_types.resize(num_layer);
+    for (int i = 0; i < num_layer; ++i) {
+        layer_types[i] = layer(i)->linear_attn ? 1 : 0;
+    }
+
+    EnsureFloatDtype(tok_embeddings, data_type);
+}
+
+DecoderLayerWeight* ModelWeight::layer(int i) const
+{
+    if (!layers) {
+        return nullptr;
+    }
+    return static_cast<DecoderLayerWeight*>(layers->child(std::to_string(i)));
+}
+
+std::vector<DecoderLayerWeight*> ModelWeight::layers_list() const
+{
+    if (!layers_cache_.empty()) {
+        return layers_cache_;
+    }
+    if (!layers) {
+        return {};
+    }
+    layers_cache_.resize(layers->size());
+    for (int i = 0; i < layers->size(); ++i) {
+        layers_cache_[i] = static_cast<DecoderLayerWeight*>(layers->child(std::to_string(i)));
+    }
+    return layers_cache_;
+}
+
+bool ModelWeight::verify(std::vector<std::string>& missing)
+{
+    Module::verify(missing);
+    if (!tok_embeddings) {
+        missing.push_back(full_path() + ": missing tok_embeddings");
+    }
+    if (!norm) {
+        missing.push_back(full_path() + ": missing norm");
+    }
+    return missing.empty();
+}
+
+TM_MODULE_REGISTER(ModelWeight, core::ModelWeightConfig);
+
+TM_MODULE_METHODS(ModelWeight, MODEL_WEIGHT_CHILDREN, MODEL_WEIGHT_PARAMS)
+
+}  // namespace turbomind
diff --git a/src/turbomind/models/model_weight.h b/src/turbomind/models/model_weight.h
new file mode 100644
index 0000000000..b4b0f17864
--- /dev/null
+++ b/src/turbomind/models/model_weight.h
@@ -0,0 +1,83 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+#pragma once
+
+#include "src/turbomind/core/core.h"
+#include "src/turbomind/core/module.h"
+#include "src/turbomind/models/linear_weight.h"
+#include "src/turbomind/models/norm_weight.h"
+#include "src/turbomind/utils/memory_utils.h"
+
+#include <vector>
+
+namespace turbomind::core {
+
+struct ModelWeightConfig: ModuleConfig {
+    ModelWeightConfig(): ModuleConfig{"ModelWeight"} {}
+
+#define MODEL_WEIGHT_FIELDS(X)                                                                                         \
+    X(int, tp_size)                                                                                                    \
+    X(int, tp_rank)                                                                                                    \
+    X(DataType, data_type)                                                                                             \
+    X(int, hidden_units)
+
+    MODEL_WEIGHT_FIELDS(TM_MEMBER)
+    TM_FOR_EACH(ModelWeightConfig, MODEL_WEIGHT_FIELDS)
+
+#undef MODEL_WEIGHT_FIELDS
+};
+
+}  // namespace turbomind::core
+
+namespace turbomind {
+
+class DecoderLayerWeight;
+
+/// Root weight module for a model. Owns the full weight tree.
+class ModelWeight: public core::Module {
+public:
+    const char* type() const override
+    {
+        return "ModelWeight";
+    }
+
+    ModelWeight() = default;
+
+    explicit ModelWeight(const core::ModelWeightConfig& cfg);
+
+    void prepare() override;
+    bool verify(std::vector<std::string>& missing) override;
+
+    // --- X-macro field lists ---
+#define MODEL_WEIGHT_CHILDREN(X)                                                                                       \
+    X(LinearWeight, output)                                                                                            \
+    X(NormWeight, norm)                                                                                                \
+    X(core::ModuleList, layers)
+
+#define MODEL_WEIGHT_PARAMS(X) X(tok_embeddings)
+
+    TM_MODULE_DECLARE(ModelWeight, MODEL_WEIGHT_CHILDREN, MODEL_WEIGHT_PARAMS)
+
+    // --- Accessors ---
+    DecoderLayerWeight*              layer(int i) const;
+    std::vector<DecoderLayerWeight*> layers_list() const;
+
+    // --- Derived in prepare() from children -- public for direct access ---
+    DataType         data_type{};
+    int              hidden_units{};
+    int              vocab_size{};
+    int              vocab_size_padded{};
+    int              embedding_size{};
+    int              num_layer{};
+    int              head_dim{};
+    int              kv_head_num{};
+    std::vector<int> layer_types;
+
+    // --- From ModelWeightConfig at construction ---
+    int tp_size{};
+    int tp_rank{};
+
+private:
+    mutable std::vector<DecoderLayerWeight*> layers_cache_;
+};
+
+}  // namespace turbomind
diff --git a/src/turbomind/models/moe_weight.cc b/src/turbomind/models/moe_weight.cc
new file mode 100644
index 0000000000..23353d8fd5
--- /dev/null
+++ b/src/turbomind/models/moe_weight.cc
@@ -0,0 +1,156 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#include "src/turbomind/models/moe_weight.h"
+
+#include "src/turbomind/core/check.h"
+#include "src/turbomind/core/registry.h"
+#include "src/turbomind/kernels/gemm/convert.h"
+#include "src/turbomind/utils/cuda_utils.h"
+
+namespace turbomind {
+
+MoeWeight::MoeWeight(const core::MoeConfig& cfg)
+{
+    experts_per_token = cfg.experts_per_token;
+    norm_topk_prob    = cfg.norm_topk_prob;
+    routed_scale      = static_cast<float>(cfg.routed_scale);
+    topk_group        = cfg.topk_group;
+    topk_method       = cfg.topk_method;
+    n_group           = cfg.n_group;
+    scoring_func      = cfg.scoring_func;
+    router_n_groups   = cfg.router_n_groups;
+    data_type_        = cfg.data_type;
+    act_type_         = static_cast<ActivationType>(cfg.act_type);
+    fuse_silu_act_    = cfg.fuse_silu;
+    expert_num        = cfg.expert_num;
+}
+
+// Adapted from LinkExperts for LinearWeight
+static void LinkLinearExperts(std::function<LinearWeight*(int)> experts, int n, LinearWeight& d)
+{
+    const auto& e0 = *experts(0);
+
+    e0.copy_metadata_to(d);
+
+    d.k_desc.num = d.q_desc.num = n;
+
+    if (e0.bias) {
+        d.bias = Tensor{{n, e0.output_dim}, e0.bias.dtype(), kDEVICE};
+    }
+
+    std::vector<std::pair<void*, int>> weights;
+    std::vector<std::pair<void*, int>> scales;
+
+    for (int i = 0; i < n; ++i) {
+        auto& e = *experts(i);
+        weights.emplace_back(e.weight.raw_data(), e.k_desc.ld);
+        if (e.scales) {
+            scales.emplace_back(e.scales.raw_data(), e.q_desc.ld);
+        }
+        if (e.bias) {
+            Copy(e.bias, d.bias.slice(i, 1).squeeze(0));
+        }
+    }
+
+    auto stream = core::Context::stream().handle();
+
+    if (d.weight_format.dtype == kFloat8_e4m3 && d.input_dtype() == kFloat8_e4m3) {
+        auto make_blocked_ptr = [&](const auto& ptrs) {
+            return std::shared_ptr<void>{gemm::MakeBlockedPtrs(ptrs, stream), [](auto p) { cudaFree(p); }};
+        };
+        d.weight         = Tensor{make_blocked_ptr(weights), {n}, e0.weight.dtype(), kDEVICE};
+        d.scales         = Tensor{make_blocked_ptr(scales), {n}, e0.scales.dtype(), kDEVICE};
+        d.k_desc.offsets = d.q_desc.offsets = (int*)1;
+    }
+    else {
+        auto make_strided_ptr = [&](const auto& ptrs) {
+            return std::shared_ptr<void>{gemm::MakeStridedPtrs(ptrs, stream), [](auto p) { cudaFree(p); }};
+        };
+        d.weight = Tensor{make_strided_ptr(weights), {n}, d.weight_format.dtype, kDEVICE};
+        if (e0.scales) {
+            d.scales = Tensor{make_strided_ptr(scales), {n}, e0.scales.dtype(), kDEVICE};
+        }
+        d.k_desc.ld = d.q_desc.ld = 0;
+    }
+}
+
+FfnWeight* MoeWeight::expert(int i) const
+{
+    if (!experts) {
+        return nullptr;
+    }
+    return static_cast<FfnWeight*>(experts->child(std::to_string(i)));
+}
+
+void MoeWeight::prepare()
+{
+    // First prepare all children (experts, gate, etc.)
+    Module::prepare();
+
+    // Create batched block view for fused MoE path
+    auto e0 = TM_CHECK_NOTNULL(expert(0));  // exemplar expert
+
+    core::FfnConfig block_cfg;
+    block_cfg.hidden_dim = e0->hidden_dim;
+    block_cfg.inter_size = e0->inter_size;
+    // tp_size=1: expert weights are already TP-sharded — the block
+    // just batches them and must not divide inter_size a second time.
+    block_cfg.tp_size    = 1;
+    block_cfg.tp_rank    = 0;
+    block_cfg.data_type  = data_type_;
+    block_cfg.act_type   = static_cast<int>(act_type_);
+    block_cfg.fuse_silu  = fuse_silu_act_;
+    block_               = std::make_unique<FfnWeight>(block_cfg);
+
+    // Link each linear in the block to the corresponding expert linears
+    auto get_expert_w1w3 = [this](int i) -> LinearWeight* {
+        auto* exp = expert(i);
+        return exp ? exp->w1w3.get() : nullptr;
+    };
+    auto get_expert_w1 = [this](int i) -> LinearWeight* {
+        auto* exp = expert(i);
+        return exp ? exp->w1.get() : nullptr;
+    };
+    auto get_expert_w3 = [this](int i) -> LinearWeight* {
+        auto* exp = expert(i);
+        return exp ? exp->w3.get() : nullptr;
+    };
+    auto get_expert_w2 = [this](int i) -> LinearWeight* {
+        auto* exp = expert(i);
+        return exp ? exp->w2.get() : nullptr;
+    };
+
+    if (get_expert_w1w3(0)) {
+        // Fused w1w3 path: experts have a single fused gate+up projection
+        block_->add_child("w1w3", std::make_unique<LinearWeight>());
+        LinkLinearExperts(get_expert_w1w3, expert_num, *block_->w1w3);
+    }
+    else {
+        // Separate w1/w3 path: link individually
+        block_->add_child("w1", std::make_unique<LinearWeight>());
+        block_->add_child("w3", std::make_unique<LinearWeight>());
+        if (get_expert_w1(0)) {
+            LinkLinearExperts(get_expert_w1, expert_num, *block_->w1);
+        }
+        if (get_expert_w3(0)) {
+            LinkLinearExperts(get_expert_w3, expert_num, *block_->w3);
+        }
+    }
+
+    block_->add_child("w2", std::make_unique<LinearWeight>());
+    if (get_expert_w2(0)) {
+        LinkLinearExperts(get_expert_w2, expert_num, *block_->w2);
+    }
+
+    // Propagate the actual fused-silu state from the first expert to
+    // the block.  Each expert's prepare() has already run above, so
+    // is_fused_silu() now reflects whether the GEMM epilogue applies
+    // SiLU.
+    block_->is_fused_silu = e0->is_fused_silu;
+}
+
+TM_MODULE_REGISTER(MoeWeight, core::MoeConfig);
+
+TM_MODULE_METHODS(MoeWeight, MOE_WEIGHT_CHILDREN, MOE_WEIGHT_PARAMS)
+
+}  // namespace turbomind
diff --git a/src/turbomind/models/moe_weight.h b/src/turbomind/models/moe_weight.h
new file mode 100644
index 0000000000..4b767e6710
--- /dev/null
+++ b/src/turbomind/models/moe_weight.h
@@ -0,0 +1,95 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+#pragma once
+
+#include "src/turbomind/core/core.h"
+#include "src/turbomind/core/module.h"
+#include "src/turbomind/models/ffn_weight.h"
+
+namespace turbomind {
+
+}  // namespace turbomind
+
+namespace turbomind::core {
+
+struct MoeConfig: ModuleConfig {
+    MoeConfig(): ModuleConfig{"MoeWeight"} {}
+
+#define MOE_FIELDS(X)                                                                                                  \
+    X(int, expert_num)                                                                                                 \
+    X(int, experts_per_token)                                                                                          \
+    X(int, act_type)                                                                                                   \
+    X(bool, fuse_silu)                                                                                                 \
+    X(bool, norm_topk_prob)                                                                                            \
+    X(std::string, topk_method)                                                                                        \
+    X(std::string, scoring_func)                                                                                       \
+    X(int, topk_group)                                                                                                 \
+    X(int, n_group)                                                                                                    \
+    X(int, router_n_groups)                                                                                            \
+    X(double, routed_scale)                                                                                            \
+    X(DataType, data_type)
+
+    MOE_FIELDS(TM_MEMBER)
+    TM_FOR_EACH(MoeConfig, MOE_FIELDS)
+
+#undef MOE_FIELDS
+};
+
+}  // namespace turbomind::core
+
+namespace turbomind {
+
+class MoeWeight: public core::Module {
+public:
+    const char* type() const override
+    {
+        return "MoeWeight";
+    }
+
+    MoeWeight() = default;
+
+    MoeWeight(const core::MoeConfig& cfg);
+
+    void prepare() override;
+    int  num_experts() const
+    {
+        return expert_num;
+    }
+
+    // --- X-macro child members ---
+#define MOE_WEIGHT_CHILDREN(X)                                                                                         \
+    X(LinearWeight, gate)                                                                                              \
+    X(LinearWeight, shared_gate)                                                                                       \
+    X(core::ModuleList, experts)
+
+#define MOE_WEIGHT_PARAMS(X) X(score_correction_bias)
+
+    TM_MODULE_DECLARE(MoeWeight, MOE_WEIGHT_CHILDREN, MOE_WEIGHT_PARAMS)
+
+    // --- Typed accessors ---
+    FfnWeight* expert(int i) const;
+    FfnWeight* block() const
+    {
+        return block_.get();
+    }
+
+    // --- Config fields (public for runtime access) ---
+    int         expert_num{};
+    int         experts_per_token{};
+    bool        norm_topk_prob{};
+    float       routed_scale{};
+    int         topk_group{};
+    std::string topk_method;
+    int         n_group{};
+    std::string scoring_func;
+    int         router_n_groups{};
+
+private:
+    ActivationType act_type_{};
+    bool           fuse_silu_act_{};
+
+    DataType data_type_{};
+
+    std::unique_ptr<FfnWeight> block_;
+};
+
+}  // namespace turbomind
diff --git a/src/turbomind/models/norm_weight.cc b/src/turbomind/models/norm_weight.cc
new file mode 100644
index 0000000000..a1f234f50a
--- /dev/null
+++ b/src/turbomind/models/norm_weight.cc
@@ -0,0 +1,21 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#include "src/turbomind/models/norm_weight.h"
+
+#include "src/turbomind/core/registry.h"
+#include "src/turbomind/utils/memory_utils.h"
+
+namespace turbomind {
+
+NormWeight::NormWeight(const core::NormConfig& cfg): shape_{cfg.dim}, dtype_{cfg.data_type}, norm_eps_{cfg.norm_eps} {}
+
+void NormWeight::prepare()
+{
+    EnsureFloatDtype(weight, dtype_);
+}
+
+TM_MODULE_REGISTER(NormWeight, core::NormConfig);
+
+TM_MODULE_METHODS(NormWeight, NORM_WEIGHT_CHILDREN, NORM_WEIGHT_PARAMS)
+
+}  // namespace turbomind
diff --git a/src/turbomind/models/norm_weight.h b/src/turbomind/models/norm_weight.h
new file mode 100644
index 0000000000..ccf78ecaf4
--- /dev/null
+++ b/src/turbomind/models/norm_weight.h
@@ -0,0 +1,54 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+#pragma once
+
+#include "src/turbomind/core/core.h"
+#include "src/turbomind/core/module.h"
+
+namespace turbomind::core {
+
+struct NormConfig: ModuleConfig {
+    NormConfig(): ModuleConfig{"NormWeight"} {}
+
+#define NORM_FIELDS(X)                                                                                                 \
+    X(int, dim)                                                                                                        \
+    X(DataType, data_type)                                                                                             \
+    X(float, norm_eps, 0.f)
+
+    NORM_FIELDS(TM_MEMBER)
+    TM_FOR_EACH(NormConfig, NORM_FIELDS)
+
+#undef NORM_FIELDS
+};
+
+}  // namespace turbomind::core
+
+namespace turbomind {
+
+class NormWeight: public core::Module {
+public:
+    const char* type() const override
+    {
+        return "NormWeight";
+    }
+
+    NormWeight() = default;
+
+    explicit NormWeight(const core::NormConfig& cfg);
+
+    /// Post-load: cast weight to configured dtype if needed.
+    void prepare() override;
+
+#define NORM_WEIGHT_CHILDREN(X)
+
+#define NORM_WEIGHT_PARAMS(X) X(weight)
+
+    TM_MODULE_DECLARE(NormWeight, NORM_WEIGHT_CHILDREN, NORM_WEIGHT_PARAMS)
+
+    float norm_eps_{};
+
+private:
+    std::vector<ssize_t> shape_;
+    DataType             dtype_{};
+};
+
+}  // namespace turbomind
diff --git a/src/turbomind/models/output_processor.cc b/src/turbomind/models/output_processor.cc
index 92f943de40..a0fdce788d 100644
--- a/src/turbomind/models/output_processor.cc
+++ b/src/turbomind/models/output_processor.cc
@@ -22,15 +22,8 @@ struct OutputProcessor::Impl {
 
     std::function<Tensor(const Tensor&)> lm_head_;
 
-    Impl(const ModelParam&                    model,
-         int                                  max_logits_len,
-         int                                  tp_rank,
-         int                                  phases,
-         std::function<Tensor(const Tensor&)> lm_head):
-        vocab_size_{(int)model.vocab_size},
-        max_logits_len_{max_logits_len},
-        tp_rank_{tp_rank},
-        lm_head_{std::move(lm_head)}
+    Impl(int vocab_size, int max_logits_len, int tp_rank, int phases, std::function<Tensor(const Tensor&)> lm_head):
+        vocab_size_{vocab_size}, max_logits_len_{max_logits_len}, tp_rank_{tp_rank}, lm_head_{std::move(lm_head)}
     {
         for (int i = 0; i < phases; ++i) {
             data_.emplace_back();
@@ -286,8 +279,8 @@ struct OutputProcessor::Impl {
 OutputProcessor::~OutputProcessor() = default;
 
 OutputProcessor::OutputProcessor(
-    const ModelParam& model, int max_logits_len, int tp_rank, int phases, std::function<Tensor(const Tensor&)> lm_head):
-    impl_{std::make_unique<Impl>(model, max_logits_len, tp_rank, phases, std::move(lm_head))}
+    int vocab_size, int max_logits_len, int tp_rank, int phases, std::function<Tensor(const Tensor&)> lm_head):
+    impl_{std::make_unique<Impl>(vocab_size, max_logits_len, tp_rank, phases, std::move(lm_head))}
 {
 }
 
diff --git a/src/turbomind/models/output_processor.h b/src/turbomind/models/output_processor.h
index 661cb72c74..2dcd569d4c 100644
--- a/src/turbomind/models/output_processor.h
+++ b/src/turbomind/models/output_processor.h
@@ -1,7 +1,6 @@
 #pragma once
 
 #include "src/turbomind/engine/batch.h"
-#include "src/turbomind/models/llama/llama_params.h"
 
 namespace turbomind {
 
@@ -9,11 +8,8 @@ class OutputProcessor {
 public:
     ~OutputProcessor();
 
-    OutputProcessor(const ModelParam&                    model,  //
-                    int                                  max_logits_len,
-                    int                                  tp_rank,
-                    int                                  phases,
-                    std::function<Tensor(const Tensor&)> lm_head);
+    OutputProcessor(
+        int vocab_size, int max_logits_len, int tp_rank, int phases, std::function<Tensor(const Tensor&)> lm_head);
 
     void Run(BatchOp op, int phase, TensorMap& env);
 
diff --git a/src/turbomind/python/CMakeLists.txt b/src/turbomind/python/CMakeLists.txt
index 2b4ceb557f..2df6e8fc52 100644
--- a/src/turbomind/python/CMakeLists.txt
+++ b/src/turbomind/python/CMakeLists.txt
@@ -14,6 +14,12 @@ endif()
 
 pybind11_add_module(${PROJECT_NAME} bind.cpp)
 target_link_libraries(${PROJECT_NAME} PRIVATE turbomind xgrammar)
+# Force-link all objects from static libraries that contain self-registration
+# globals (module type registrars).  Without --whole-archive, the linker
+# strips registrar objects because nobody references them directly.
+set_property(TARGET ${PROJECT_NAME} APPEND PROPERTY LINK_OPTIONS
+    "-Wl,--whole-archive" "$<TARGET_FILE:models>" "$<TARGET_FILE:turbomind>" "-Wl,--no-whole-archive")
+target_link_libraries(${PROJECT_NAME} PRIVATE models)
 
 pybind11_add_module(_xgrammar xgrammar_bind.cpp)
 target_link_libraries(_xgrammar PRIVATE core xgrammar)
diff --git a/src/turbomind/python/bind.cpp b/src/turbomind/python/bind.cpp
index 535c284535..1e36f35bad 100644
--- a/src/turbomind/python/bind.cpp
+++ b/src/turbomind/python/bind.cpp
@@ -14,9 +14,20 @@
 
 #include "xgrammar/compiler.h"
 
+#include "src/turbomind/core/data_format.h"
 #include "src/turbomind/core/data_type.h"
+#include "src/turbomind/core/module.h"
 #include "src/turbomind/core/tensor.h"
+#include "src/turbomind/engine/engine_config.h"
 #include "src/turbomind/engine/model_request.h"
+#include "src/turbomind/models/attention_weight.h"
+#include "src/turbomind/models/decoder_layer_weight.h"
+#include "src/turbomind/models/delta_net_weight.h"
+#include "src/turbomind/models/ffn_weight.h"
+#include "src/turbomind/models/linear_weight.h"
+#include "src/turbomind/models/model_weight.h"
+#include "src/turbomind/models/moe_weight.h"
+#include "src/turbomind/models/norm_weight.h"
 #include "src/turbomind/python/dlpack.h"
 #include "src/turbomind/turbomind.h"
 #include "src/turbomind/utils/cuda_utils.h"
@@ -292,6 +303,25 @@ struct ScopedGIL {
 
 }  // namespace
 
+// --- Generic config binding helper ---
+
+template<typename Config>
+void bind_config(py::module_& m, const char* name)
+{
+    py::class_<Config, turbomind::core::ModuleConfig> cls(m, name);
+    cls.def(py::init<>());
+    Config::for_each([&](const char* fname, auto member_ptr) { cls.def_readwrite(fname, member_ptr); });
+    cls.def("clone", [](const Config& c) { return Config(c); });
+}
+
+template<typename T>
+void bind_struct(py::module_& m, const char* name)
+{
+    py::class_<T> cls(m, name);
+    cls.def(py::init<>());
+    T::for_each([&](const char* fname, auto member_ptr) { cls.def_readwrite(fname, member_ptr); });
+}
+
 PYBIND11_MODULE(_turbomind, m)
 {
     py::class_<ft::RequestMetrics, std::shared_ptr<ft::RequestMetrics>>(m, "RequestMetrics")
@@ -378,7 +408,10 @@ PYBIND11_MODULE(_turbomind, m)
             .value("TYPE_FP16", kFloat16)
             .value("TYPE_FP32", kFloat32)
             .value("TYPE_FP64", kFloat64)
-            .value("TYPE_BF16", kBfloat16);
+            .value("TYPE_BF16", kBfloat16)
+            .value("TYPE_FP8_E4M3", kFloat8_e4m3)
+            .value("TYPE_FP4_E2M1", kFloat4_e2m1)
+            .value("TYPE_UINT4", kUint4);
 
         // memory type
         py::enum_<ft::DeviceType>(m, "MemoryType")
@@ -387,12 +420,53 @@ PYBIND11_MODULE(_turbomind, m)
             .value("MEMORY_GPU", ft::DeviceType::kDEVICE);
     }
 
+    // DataFormat descriptors
+    py::class_<turbomind::QuantParamDesc>(m, "QuantParamDesc")
+        .def_readonly("dtype", &turbomind::QuantParamDesc::dtype)
+        .def_readonly("transposed", &turbomind::QuantParamDesc::transposed)
+        .def("present", &turbomind::QuantParamDesc::present);
+
+    py::class_<turbomind::DataFormat>(m, "DataFormat")
+        .def_readonly("dtype", &turbomind::DataFormat::dtype)
+        .def_readonly("block_sizes", &turbomind::DataFormat::block_sizes)
+        .def_readonly("scales", &turbomind::DataFormat::scales)
+        .def_readonly("zeros", &turbomind::DataFormat::zeros)
+        .def("is_quantized", &turbomind::DataFormat::is_quantized)
+        .def("rank", &turbomind::DataFormat::rank);
+
+    m.def("ResolveLinearWeightFormat",
+          &turbomind::ResolveLinearWeightFormat,
+          py::arg("data_type"),
+          py::arg("weight_dtype"),
+          py::arg("block_in"),
+          py::arg("block_out"));
+
+    // --- Config struct bindings ---
+    py::class_<turbomind::core::ModuleConfig>(m, "ModuleConfig")
+        .def_property_readonly("module_type", [](const turbomind::core::ModuleConfig& c) -> std::string {
+            return std::string(c.module_type);
+        });
+
+    bind_config<turbomind::core::LinearConfig>(m, "LinearConfig");
+    bind_struct<turbomind::core::RopeConfig>(m, "RopeConfig");
+    bind_struct<turbomind::EngineConfig>(m, "EngineConfig");
+    bind_config<turbomind::core::AttentionConfig>(m, "AttentionConfig");
+    bind_config<turbomind::core::FfnConfig>(m, "FfnConfig");
+    bind_config<turbomind::core::MoeConfig>(m, "MoeConfig");
+    bind_config<turbomind::core::DeltaNetConfig>(m, "DeltaNetConfig");
+    bind_config<turbomind::core::ModuleListConfig>(m, "ModuleListConfig");
+    bind_config<turbomind::core::NormConfig>(m, "NormConfig");
+    bind_config<turbomind::core::DecoderLayerConfig>(m, "DecoderLayerConfig");
+    bind_config<turbomind::core::ModelWeightConfig>(m, "ModelWeightConfig");
+
     // tensor
     py::class_<Tensor, std::shared_ptr<Tensor>>(m, "Tensor")
         .def_property_readonly("where", [](const Tensor& t) { return t.device().type; })
         .def_property_readonly("type", [](const Tensor& t) { return t.dtype(); })
         .def_property_readonly("shape", [](const Tensor& t) { return t.shape(); })
         .def_property_readonly("data", [](const Tensor& t) { return t.raw_data(); })
+        .def_property_readonly("byte_size", [](const Tensor& t) { return t.byte_size(); })
+        .def("__bool__", [](const Tensor& t) { return t.byte_size() > 0; })
         .def(
             "copy_from",
             [](Tensor& self, py::object obj) {
@@ -502,12 +576,112 @@ PYBIND11_MODULE(_turbomind, m)
             py::call_guard<py::gil_scoped_release>(),
             "grammar"_a);
 
+    // Python context manager wrapper for ContextGuard.
+    // Stores copies of Stream + Allocator; constructs the real guard
+    // in-place on __enter__ and destroys it on __exit__.
+    struct PyContextGuard {
+        ft::core::Stream                        stream;
+        ft::core::Allocator                     alloc;
+        std::unique_ptr<ft::core::ContextGuard> guard;
+
+        PyContextGuard(ft::core::Stream s, ft::core::Allocator a): stream(std::move(s)), alloc(std::move(a)) {}
+
+        void enter()
+        {
+            guard = std::make_unique<ft::core::ContextGuard>(stream, alloc);
+        }
+        void exit()
+        {
+            guard.reset();
+        }
+    };
+
+    py::class_<PyContextGuard>(m, "ContextGuard")
+        .def("__enter__",
+             [](PyContextGuard& g) -> PyContextGuard& {
+                 g.enter();
+                 return g;
+             })
+        .def("__exit__", [](PyContextGuard& g, py::object, py::object, py::object) { g.exit(); });
+
+    // Param — lightweight handle to a Module parameter slot
+    py::class_<ft::core::Param>(m, "Param")
+        .def(
+            "alloc",
+            [](ft::core::Param& p, std::vector<size_t> shape, ft::DataType dtype) {
+                return std::make_shared<Tensor>(p.alloc(shape, dtype));
+            },
+            "shape"_a,
+            "dtype"_a)
+        .def("get", [](ft::core::Param& p) { return std::make_shared<Tensor>(p.get()); })
+        .def("__bool__", [](ft::core::Param& p) { return static_cast<bool>(p); });
+
+    // Module class — navigation and allocation interface
+    py::class_<ft::core::Module>(m, "Module")
+        .def(
+            "get",
+            [](ft::core::Module& m, const std::string& segment) -> ft::core::Module* { return m.get(segment); },
+            py::return_value_policy::reference,
+            "segment"_a)
+        .def(
+            "param",
+            [](ft::core::Module& m, const std::string& name) -> ft::core::Param { return m.param(name); },
+            "name"_a)
+        .def("prepare", [](ft::core::Module& m) { m.prepare(); })
+        .def(
+            "child",
+            [](ft::core::Module& m, const std::string& name) -> ft::core::Module* { return m.child(name); },
+            py::return_value_policy::reference,
+            "name"_a)
+        // Config-based create_child: accepts any ModuleConfig subclass
+        .def(
+            "create_child",
+            [](ft::core::Module&              m,
+               const std::string&             name,
+               turbomind::core::ModuleConfig& config) -> ft::core::Module* { return m.create_child(name, config); },
+            py::return_value_policy::reference,
+            "name"_a,
+            "config"_a)
+        .def("type", [](ft::core::Module& m) -> const char* { return m.type(); })
+        .def("full_path", [](ft::core::Module& m) -> std::string { return m.full_path(); })
+        .def(
+            "__getitem__",
+            [](ft::core::Module& m, const std::string& key) -> ft::core::Module* { return m.get(key); },
+            py::return_value_policy::reference)
+        .def(
+            "__getitem__",
+            [](ft::core::Module& m, int idx) -> ft::core::Module* { return m.get(std::to_string(idx)); },
+            py::return_value_policy::reference)
+        // Deferred parent binding — transfer ownership of a previously created module
+        .def(
+            "add_child_raw",
+            [](ft::core::Module& parent, const std::string& name, ft::core::Module* child) -> ft::core::Module* {
+                auto owned = std::unique_ptr<ft::core::Module>(child);
+                return parent.add_child(name, std::move(owned));
+            },
+            py::return_value_policy::reference,
+            "name"_a,
+            "child"_a);
+
+    // Standalone module creation (no parent needed)
+    m.def(
+        "create_module",
+        [](turbomind::core::ModuleConfig& config) -> ft::core::Module* {
+            auto mod = ft::core::Module::create(config);
+            return mod.release();
+        },
+        py::return_value_policy::reference,
+        "config"_a);
+
+    // LinearWeight — specific interface for weight loading
+    py::class_<turbomind::LinearWeight, ft::core::Module>(m, "LinearWeight");
+
     // transformer model
     using ft::TurboMind;
     py::class_<TurboMind, std::shared_ptr<TurboMind>>(m, "TurboMind")
         .def_static(
             "create",
-            [](std::string model_dir, std::string config, std::string weight_type) -> std::shared_ptr<TurboMind> {
+            [](std::string model_dir, turbomind::EngineConfig config) -> std::shared_ptr<TurboMind> {
                 auto gil_factory = [] {  //
                     // erase the type
                     return std::static_pointer_cast<void>(std::make_shared<ScopedGIL>());
@@ -517,22 +691,35 @@ PYBIND11_MODULE(_turbomind, m)
                     delete ptr;
                 };
 
-                std::shared_ptr<TurboMind> model(new TurboMind(model_dir, config, gil_factory), no_gil_deleter);
+                std::shared_ptr<TurboMind> model(new TurboMind(model_dir, std::move(config), gil_factory),
+                                                 no_gil_deleter);
                 return model;
             },
             "model_dir"_a,
-            "config"_a      = "",
-            "weight_type"_a = "half")
+            "engine_config"_a)
         .def(
             "create_request",
             [](TurboMind* model) { return model->CreateRequest(); },
             py::call_guard<py::gil_scoped_release>())
-        .def("create_weights", &TurboMind::CreateWeights, py::call_guard<py::gil_scoped_release>(), "index"_a)
+        .def("create_context", &TurboMind::CreateContext, py::call_guard<py::gil_scoped_release>(), "index"_a)
         .def(
-            "get_weights",
-            [](TurboMind* model, int index) { return model->GetWeights(index); },
+            "create_root",
+            [](TurboMind* model, int index) -> ft::core::Module* { return model->CreateRoot(index); },
+            py::return_value_policy::reference,
             py::call_guard<py::gil_scoped_release>(),
             "index"_a)
+        .def(
+            "root",
+            [](TurboMind* model, int index) -> ft::core::Module* { return model->root(index); },
+            py::return_value_policy::reference,
+            "index"_a)
+        .def(
+            "context",
+            [](ft::TurboMind* model, int index) -> std::unique_ptr<PyContextGuard> {
+                auto [stream, alloc] = model->weight_context(index);
+                return std::make_unique<PyContextGuard>(std::move(stream), std::move(alloc));
+            },
+            "index"_a)
         .def(
             "process_weight",
             [](TurboMind* model, int index) { model->ProcessWeights(index); },
@@ -548,17 +735,8 @@ PYBIND11_MODULE(_turbomind, m)
             [](TurboMind* model, int index) { return model->GetScheduleMetrics(index); },
             py::call_guard<py::gil_scoped_release>(),
             "index"_a)
-        .def(
-            "sleep",
-            [](TurboMind* model, int index, int level) { model->Sleep(index, level); },
-            py::call_guard<py::gil_scoped_release>(),
-            "index"_a,
-            "level"_a)
-        .def(
-            "wakeup",
-            [](TurboMind* model, int index, const std::vector<std::string>& tags) { model->WakeUp(index, tags); },
-            py::call_guard<py::gil_scoped_release>(),
-            "index"_a,
-            "tags"_a)
-        .def("is_dummy_node", [](TurboMind* model) { return model->is_dummy_node(); });
+        .def("is_dummy_node", [](TurboMind* model) { return model->is_dummy_node(); })
+        .def("attn_tp_rank", &TurboMind::GetAttnTpRank, "index"_a)
+        .def("mlp_tp_rank", &TurboMind::GetMlpTpRank, "index"_a)
+        .def("model_tp_rank", &TurboMind::GetModelTpRank, "index"_a);
 }
diff --git a/src/turbomind/turbomind.cc b/src/turbomind/turbomind.cc
index 1529269fde..664e6ca0bf 100644
--- a/src/turbomind/turbomind.cc
+++ b/src/turbomind/turbomind.cc
@@ -1,6 +1,5 @@
 // Copyright (c) OpenMMLab. All rights reserved.
 
-#include <filesystem>
 #include <future>
 #include <random>
 
@@ -18,18 +17,16 @@
 #include "src/turbomind/engine/model_request.h"
 
 #include "src/turbomind/models/language_model.h"
-#include "src/turbomind/models/llama/LlamaWeight.h"
 #include "src/turbomind/models/llama/context.h"
 #include "src/turbomind/models/llama/llama_params.h"
-#include "src/turbomind/models/llama/llama_utils.h"
+#include "src/turbomind/models/model_root.h"
+#include "src/turbomind/models/model_weight.h"
 
 #include "src/turbomind/kernels/gemm/tuner/params.h"
 
 #include "src/turbomind/utils/cuda_utils.h"
 #include "src/turbomind/utils/metrics.h"
 
-#include <yaml-cpp/yaml.h>
-
 // #include "dbg.h"
 
 namespace turbomind {
@@ -39,140 +36,10 @@ using std::string;
 using std::shared_ptr;
 using std::unique_ptr;
 
-static std::optional<MoeParam::Method> get_moe_method()
-{
-    static const auto value = []() -> std::optional<MoeParam::Method> {
-        const auto p = std::getenv("TM_MOE_METHOD");
-        if (p) {
-            std::string str(p);
-            for (auto& x : str) {
-                x = std::tolower(x);
-            }
-            if (str == "naive") {
-                return MoeParam::kNaive;
-            }
-            else if (str == "fused") {
-                return MoeParam::kFused;
-            }
-            else {
-                std::cerr << "[WARNING] unrecognised MoE method: " << str << "\n";
-            }
-        }
-        return {};
-    }();
-    return value;
-}
-
-/// TODO: move config parsing to suitable place
-static void parse_default_rope_param(const YAML::Node& node, RopeParam& param)
-{
-    param.base = node["base"].as<float>();
-    param.dim  = node["dim"].as<int>();
-    if (param.base == 0.f || param.dim == 0) {
-        TM_LOG_ERROR("invalid rope param: base = {}, dim = {}", param.base, param.dim);
-        FT_CHECK(0);
-    }
-}
-
-static void parse_linear_rope_param(const YAML::Node& node, RopeParam& param)
-{
-    parse_default_rope_param(node, param);
-    param.factor = node["factor"].as<float>();
-}
-
-static void parse_dynamic_rope_param(const YAML::Node& node, RopeParam& param)
-{
-    parse_linear_rope_param(node, param);
-    param.max_position_embeddings = node["max_position_embeddings"].as<int>();
-}
-
-static void parse_yarn_rope_param(const YAML::Node& node, RopeParam& param)
-{
-    parse_dynamic_rope_param(node, param);
-    param.yarn.attention_factor = node["attention_factor"].as<float>();
-    param.yarn.beta_fast        = node["beta_fast"].as<float>();
-    param.yarn.beta_slow        = node["beta_slow"].as<float>();
-}
-
-static void parse_llama3_rope_param(const YAML::Node& node, RopeParam& param)
-{
-    parse_linear_rope_param(node, param);
-    param.llama3.low_freq_factor                  = node["low_freq_factor"].as<float>();
-    param.llama3.high_freq_factor                 = node["high_freq_factor"].as<float>();
-    param.llama3.original_max_position_embeddings = node["original_max_position_embeddings"].as<int>();
-}
-
-static void parse_mrope_rope_param(const YAML::Node& node, RopeParam& param)
-{
-    parse_default_rope_param(node, param);
-    auto mrope_section = node["mrope_section"].as<std::vector<int>>();
-    FT_CHECK(mrope_section.size() == 3);
-    param.mrope.section = {mrope_section[0], mrope_section[1], mrope_section[2]};
-}
-
-static void parse_rope_param(const YAML::Node& node, RopeParam& rope)
-{
-    rope.type = GetRoPEType(node["type"].as<std::string>());
-
-    switch (rope.type) {
-        case RopeType::kDefault:
-            parse_default_rope_param(node, rope);
-            break;
-        case RopeType::kLinear:
-            parse_linear_rope_param(node, rope);
-            break;
-        case RopeType::kDynamic:
-            parse_dynamic_rope_param(node, rope);
-            break;
-        case RopeType::kYarn:
-            parse_yarn_rope_param(node, rope);
-            break;
-        case RopeType::kLlama3:
-            parse_llama3_rope_param(node, rope);
-            break;
-        case RopeType::kMrope:
-            parse_mrope_rope_param(node, rope);
-            break;
-        default:
-            FT_CHECK(0);
-            break;
-    }
-}
-
-static DataType data_type_from_string(std::string str)
-{
-    if (str == "fp16" || str == "float16") {
-        return kFloat16;
-    }
-    else if (str == "bf16" || str == "bfloat16") {
-        return kBfloat16;
-    }
-    else if (str == "fp32") {
-        return kFloat32;
-    }
-    else if (str == "int8") {
-        return kUint8;
-    }
-    else if (str == "int4") {
-        return kUint4;
-    }
-    else if (str == "fp8") {
-        return kFloat8_e4m3;
-    }
-    else if (str == "e2m1") {
-        return kFloat4_e2m1;
-    }
-    TM_CHECK(0) << "unsupported weight type: " << str;
-    return {};
-}
-
 struct TurboMind::Impl {
-    DataType       data_type_;
-    ModelParam     model_param_;
-    AttentionParam attn_param_;
-    MoeParam       moe_param_;
-    EngineParam    engine_param_;
-    size_t         comm_size_;
+    DataType    data_type_;
+    EngineParam engine_param_;
+    size_t      comm_size_;
 
     vector<EngineParam> engine_params_;
 
@@ -187,11 +54,10 @@ struct TurboMind::Impl {
     vector<int> global_rank_;
 
     // Weights & engine instances for the ranks
-    vector<shared_ptr<LlamaWeight>> weights_;
-    vector<shared_ptr<Context>>     contexts_;
-    vector<Engine>                  engines_;
+    vector<shared_ptr<ModelRoot>> weights_;
+    vector<shared_ptr<Context>>   contexts_;
+    vector<Engine>                engines_;
 
-    string model_name_;
     string model_dir_;
 
     vector<int> queue_id_;
@@ -202,37 +68,23 @@ struct TurboMind::Impl {
 
     ~Impl();
 
-    Impl(string model_dir, string config, FFICtxFactory ffi_ctx_factory);
+    Impl(string model_dir, EngineConfig config, FFICtxFactory ffi_ctx_factory);
 
     unique_ptr<ModelRequest> CreateRequest()
     {
         return std::make_unique<ModelRequest>(gateway_.get(),  //
                                               data_type_,
                                               engine_param_.session_len,
-                                              model_param_.vocab_size,
-                                              model_param_.hidden_units);
+                                              weights_[0]->text_model_ptr()->vocab_size,
+                                              weights_[0]->text_model_ptr()->hidden_units);
     }
 
-    void CreateWeights(int index)
+    core::Module* CreateRoot(int index)
     {
         CudaDeviceGuard dev_guard(engine_param_.devices[index]);
-
-        CreateContext(index);
-
-        weights_[index] = std::make_shared<LlamaWeight>(data_type_,  //
-                                                        model_param_,
-                                                        engine_params_.at(index),
-                                                        moe_param_);
-    }
-
-    TensorMap GetWeights(int index)
-    {
-        const auto& tensor_ptr_map = TM_CHECK_NOTNULL(weights_[index])->get_parameters();
-        TensorMap   params;
-        for (const auto& [name, tensor_ptr] : tensor_ptr_map) {
-            params[name] = *tensor_ptr;
-        }
-        return params;
+        TM_CHECK(contexts_[index] != nullptr) << "CreateContext(" << index << ") must run before CreateRoot";
+        weights_[index] = std::make_shared<ModelRoot>();
+        return weights_[index].get();
     }
 
     void ProcessWeights(int index)
@@ -240,10 +92,8 @@ struct TurboMind::Impl {
         CudaDeviceGuard dev_guard(engine_param_.devices[index]);
         FT_CHECK(weights_[index] != nullptr);
 
-        cudaDeviceProp props{};
-        check_cuda_error(cudaGetDeviceProperties(&props, engine_param_.devices[index]));
-
-        weights_[index]->prepare(props);
+        auto ctx_guard = weights_[index]->context();
+        weights_[index]->prepare();
         sync_check_cuda_error();
     }
 
@@ -255,54 +105,12 @@ struct TurboMind::Impl {
 
     void Sleep(int index, int level)
     {
-        CudaDeviceGuard dev_guard(engine_param_.devices[index]);
-
-        if (level == 2) {
-            // free weights
-            weights_[index]->release();
-        }
-        else {
-            // offload weights to CPU
-            TM_CHECK(moe_param_.experts_per_token == 0) << "level 1 sleep not supported for MoE model";
-            weights_[index]->to_device(kCPU);
-        }
-
-        // free model (kv cache and buffer)
-        if (index == 0) {
-            gateway_->shutdown();
-            gateway_.reset();
-        }
-
-        engines_[index] = {};
-        contexts_[index]->allocator->trim(0);
-
-        trim_default_mempool(engine_param_.devices[index]);
+        // Sleep/wakeup is broken — disabled
     }
 
     void WakeUp(int index, const std::vector<std::string>& tags)
     {
-        CudaDeviceGuard dev_guard(engine_param_.devices[index]);
-
-        std::set<std::string> keys(tags.begin(), tags.end());
-
-        auto& ctx = *TM_CHECK_NOTNULL(contexts_[index]);
-
-        if (keys.find("weights") != keys.end()) {
-            TM_CHECK(weights_[index] != nullptr);
-            if (weights_[index]->is_initialized()) {
-                weights_[index]->to_device(kDEVICE);
-            }
-            else {
-                weights_[index]->initialize();
-            }
-        }
-
-        if (keys.find("kv_cache") != keys.end()) {
-            if (index == 0) {
-                gateway_ = std::make_shared<Gateway>(n_queues_, ffi_ctx_factory_);
-            }
-            CreateEngine(index);
-        }
+        // Sleep/wakeup is broken — disabled
     }
 
     void HandleMissingParams()
@@ -337,136 +145,20 @@ TurboMind::Impl::~Impl()
     }
 }
 
-TurboMind::Impl::Impl(string model_dir, string config, FFICtxFactory ffi_ctx_factory):
-    data_type_{}, model_param_{}, attn_param_{}, moe_param_{}, engine_param_{}, ffi_ctx_factory_{ffi_ctx_factory}
+TurboMind::Impl::Impl(string model_dir, EngineConfig config, FFICtxFactory ffi_ctx_factory):
+    data_type_{}, engine_param_{}, ffi_ctx_factory_{ffi_ctx_factory}
 {
-    TM_CHECK(!config.empty());
-
-    YAML::Node node;
-    try {
-        node = YAML::Load(config);
-    }
-    catch (const YAML::Exception& e) {
-        TM_CHECK(0) << "Error loading YAML config: " << e.what() << "\nconfig:\n" << config;
-    }
-
-    /// TODO: move config parsing to suitable place
-    const auto model     = node["model_config"];
-    const auto attention = node["attention_config"];
-    const auto engine    = node["engine_config"];
-
-    data_type_ = model_param_.data_type = data_type_from_string(model["data_type"].as<std::string>());
+    data_type_ = config.data_type;
     TM_CHECK(data_type_ == kBfloat16 || data_type_ == kHalf);
 
-    model_name_                     = model["model_name"].as<std::string>();
-    model_param_.head_num           = model["head_num"].as<int>();
-    model_param_.head_dim           = model["size_per_head"].as<int>();
-    model_param_.kv_head_num        = model["kv_head_num"].as<int>(0);
-    model_param_.hidden_units       = model["hidden_units"].as<int>();
-    model_param_.layer_num          = model["num_layer"].as<int>();
-    model_param_.vocab_size         = model["vocab_size"].as<int>();
-    model_param_.embedding_size     = model["embedding_size"].as<int>();
-    model_param_.norm_eps           = model["norm_eps"].as<float>();
-    model_param_.tune_layer_num     = model["tune_layer_num"].as<int>(1);
-    model_param_.mla.q_lora_rank    = model["q_lora_rank"].as<int>();
-    model_param_.mla.kv_lora_rank   = model["kv_lora_rank"].as<int>();
-    model_param_.mla.qk_rope_dim    = model["qk_rope_dim"].as<int>();
-    model_param_.mla.v_head_dim     = model["v_head_dim"].as<int>();
-    attn_param_.cache_block_seq_len = attention["cache_block_seq_len"].as<int>(0);
-    model_param_.quant_policy       = engine["quant_policy"].as<int>(0);
-
-    auto inter_size = model["inter_size"];
-    for (auto it = inter_size.begin(); it != inter_size.end(); ++it) {
-        model_param_.inter_size.push_back(it->as<int>());
-    }
-
-    if (auto layer_types = model["layer_types"]) {
-        for (auto it = layer_types.begin(); it != layer_types.end(); ++it) {
-            auto type_str = it->as<std::string>("");
-            if (type_str == "linear_attention") {
-                model_param_.layer_types.push_back(1);
-            }
-            else if (type_str == "full_attention" || type_str.empty()) {
-                model_param_.layer_types.push_back(0);
-            }
-            else {
-                TM_LOG_WARN("Unknown layer_type '{}', treating as full_attention.", type_str);
-                model_param_.layer_types.push_back(0);
-            }
-        }
-    }
+    // Copy config into the EngineConfig base of engine_param_
+    static_cast<EngineConfig&>(engine_param_) = config;
 
-    // Qwen3.5 Gated DeltaNet linear attention parameters
-    model_param_.linear_key_head_dim    = model["linear_key_head_dim"].as<int>(0);
-    model_param_.linear_value_head_dim  = model["linear_value_head_dim"].as<int>(0);
-    model_param_.linear_conv_kernel_dim = model["linear_conv_kernel_dim"].as<int>(0);
-    model_param_.linear_num_key_heads   = model["linear_num_key_heads"].as<int>(0);
-    model_param_.linear_num_value_heads = model["linear_num_value_heads"].as<int>(0);
-    model_param_.attn_output_gate       = model["attn_output_gate"].as<bool>(false);
-    model_param_.linear_state_dtype     = data_type_;
-
-    if (auto uqel = model["unquantized_expert_layers"]) {
-        for (auto it = uqel.begin(); it != uqel.end(); ++it) {
-            model_param_.unquantized_expert_layers.insert(it->as<int>());
-        }
-    }
-    model_param_.attn_sink = model["attn_sink"].as<bool>();
-    model_param_.mlp_bias  = model["mlp_bias"].as<bool>();
-    if (model["activation_type"].as<std::string>("") == "gpt-oss") {
-        model_param_.act_type = ActivationType::kSiluGptOss;
-    }
-
-    auto window_size = model["window_size"];
-    for (auto it = window_size.begin(); it != window_size.end(); ++it) {
-        model_param_.window_size.push_back(it->as<int>());
-    }
-
-    model_param_.attn_bias  = model["attn_bias"].as<int>(0);
-    model_param_.qk_norm    = model["qk_norm"].as<bool>();
-    model_param_.group_size = model["group_size"].as<int>(0);
+    phases_ = config.async_ ? 2 : 1;
 
-    attn_param_.softmax_scale = attention["softmax_scale"].as<float>(0);
-    // logn attn for qwen model
-    attn_param_.use_logn_attn           = attention["use_logn_attn"].as<int>(0);
-    attn_param_.max_position_embeddings = attention["max_position_embeddings"].as<int>(0);
-    // rotary embedding parameters
-    parse_rope_param(attention["rope_param"], attn_param_.rope);
-
-    engine_param_.max_batch_size = engine["max_batch_size"].as<int>(0);
-    auto max_forward_token_num   = engine["max_prefill_token_num"].as<int>(0);
+    auto max_forward_token_num = config.max_prefill_token_num;
     max_forward_token_num += engine_param_.max_batch_size;
 
-    engine_param_.max_context_token_num = engine["max_context_token_num"].as<int>(0);
-    engine_param_.session_len           = model["session_len"].as<int>(0);
-
-    engine_param_.cache_max_block_count = engine["cache_max_entry_count"].as<float>(0);
-    engine_param_.cache_chunk_size      = engine["cache_chunk_size"].as<int>(0);
-    engine_param_.enable_prefix_caching = engine["enable_prefix_caching"].as<bool>(false);
-    engine_param_.enable_metrics        = engine["enable_metrics"].as<bool>(false);
-
-    if (engine_param_.enable_prefix_caching && HasLinearAttention(model_param_)) {
-        TM_CHECK(0) << "Prefix caching is unsupported when linear attention is present";
-    }
-
-    engine_param_.num_tokens_per_iter = engine["num_tokens_per_iter"].as<int>(0);
-    engine_param_.max_prefill_iters   = engine["max_prefill_iters"].as<int>(1);
-
-    phases_ = engine["async_"].as<int>() ? 2 : 1;
-
-    engine_param_.outer_dp_size = engine["outer_dp_size"].as<int>();
-
-    engine_param_.attn_dp_size = engine["attn_dp_size"].as<int>();
-    engine_param_.attn_tp_size = engine["attn_tp_size"].as<int>();
-    engine_param_.attn_cp_size = engine["attn_cp_size"].as<int>();
-
-    engine_param_.mlp_tp_size = engine["mlp_tp_size"].as<int>();
-
-    engine_param_.devices = engine["devices"].as<std::vector<int>>();
-
-    // multi-node information
-    engine_param_.nnodes    = engine["nnodes"].as<int>();
-    engine_param_.node_rank = engine["node_rank"].as<int>();
-
     {
         auto sp                             = engine_param_.attn_tp_size * engine_param_.attn_cp_size;
         engine_param_.max_forward_token_num = ((size_t)max_forward_token_num + sp - 1) / sp * sp;
@@ -475,23 +167,7 @@ TurboMind::Impl::Impl(string model_dir, string config, FFICtxFactory ffi_ctx_fac
     comm_size_ = engine_param_.attn_dp_size * engine_param_.attn_tp_size * engine_param_.attn_cp_size;
     FT_CHECK(engine_param_.mlp_tp_size == comm_size_);
 
-    communicator_type_ = engine["communicator"].as<std::string>();
-
-    moe_param_.experts_per_token = model["experts_per_token"].as<int>(0);
-    moe_param_.inter_size        = model["expert_inter_size"].as<int>(0);
-    moe_param_.shared_gate       = model["moe_shared_gate"].as<bool>();
-    moe_param_.norm_topk_prob    = model["norm_topk_prob"].as<bool>();
-    moe_param_.routed_scale      = model["routed_scale"].as<float>(1.f);
-    moe_param_.topk_group        = model["topk_group"].as<int>(1);
-    moe_param_.topk_method       = model["topk_method"].as<std::string>("greedy");
-    moe_param_.n_group           = model["moe_group_num"].as<int>(1);
-    moe_param_.scoring_func      = model["scoring_func"].as<std::string>("softmax");
-    moe_param_.router_n_groups   = model["router_n_groups"].as<int>(-1);
-    moe_param_.router_bias       = model["expert_router_bias"].as<bool>();
-    YAML::Node expert_num        = model["expert_num"];
-    for (auto it = expert_num.begin(); it != expert_num.end(); ++it) {
-        moe_param_.expert_num.push_back(it->as<int>());
-    }
+    communicator_type_ = std::move(config.communicator);
 
     HandleMissingParams();
 
@@ -499,18 +175,6 @@ TurboMind::Impl::Impl(string model_dir, string config, FFICtxFactory ffi_ctx_fac
     engines_.resize(engine_param_.devices.size());
     contexts_.resize(engine_param_.devices.size());
 
-    model_param_.weight_type        = data_type_from_string(model["weight_type"].as<std::string>());
-    model_param_.expert_weight_type = data_type_from_string(model["expert_weight_type"].as<std::string>());
-    model_param_.ffn_weight_type =
-        data_type_from_string(model["ffn_weight_type"].as<std::string>(model["weight_type"].as<std::string>()));
-
-    if (auto method = get_moe_method()) {
-        moe_param_.method = *method;
-    }
-    else {
-        moe_param_.method = MoeParam::kFused;
-    }
-
     // NOTE: This runs on Python main thread
     group_id_ = comm::CreateHostGroupId((engine_param_.nnodes == 1) ? "" : "hybrid");
     group_id_->Initialize();
@@ -575,8 +239,9 @@ void TurboMind::Impl::CreateContext(int index)
             p.attn_cp_rank = c.d_comm->rank(c.d_cp_group);
         }
 
-        p.attn_tp_rank = c.d_comm->rank(c.d_tp_group) / p.attn_cp_size;
-        p.mlp_tp_rank  = c.d_comm->rank(0);
+        p.model_tp_rank = c.d_comm->rank(c.d_tp_group);
+        p.attn_tp_rank  = p.model_tp_rank / p.attn_cp_size;
+        p.mlp_tp_rank   = c.d_comm->rank(0);
     }
 
     if (c.h_tp_group->rank() == 0) {
@@ -609,19 +274,12 @@ void TurboMind::Impl::CreateEngine(int index)
     ctx.comm.h_comm->Sync();
 
     // create model
-    LanguageModel model{data_type_,  //
-                        model_param_,
-                        param,
-                        attn_param_,
-                        moe_param_,
-                        ctx,
-                        *weights_[index],
-                        phases_};
+    LanguageModel model{param, ctx, *weights_[index]->text_model_ptr(), phases_};
 
     // create engine
-    engines_[index] = Engine{data_type_,  //
-                             param,
+    engines_[index] = Engine{param,
                              std::move(model),
+                             *weights_[index]->text_model_ptr(),
                              ctx,
                              *gateway_,
                              engine_param_.devices[index],
@@ -703,7 +361,7 @@ void TurboMind::Impl::WarmUp(int index)
             const auto                         max_bs = *std::max_element(bss.begin(), bss.end());
             Buffer_<int>                       input_ids(max_bs, kCPU);
             std::mt19937                       g{};
-            std::uniform_int_distribution<int> d{0, (int)model_param_.vocab_size - 1};
+            std::uniform_int_distribution<int> d{0, (int)weights_[index]->text_model_ptr()->vocab_size - 1};
             for (auto& x : input_ids) {
                 x = d(g);
             }
@@ -777,19 +435,31 @@ void TurboMind::Impl::WarmUp(int index)
 
 TurboMind::~TurboMind() = default;
 
-TurboMind::TurboMind(string model_dir, string config, FFICtxFactory ffi_ctx_factory):
-    impl_{std::make_unique<Impl>(model_dir, config, ffi_ctx_factory)}
+TurboMind::TurboMind(string model_dir, EngineConfig config, FFICtxFactory ffi_ctx_factory):
+    impl_{std::make_unique<Impl>(model_dir, std::move(config), ffi_ctx_factory)}
 {
 }
 
-void TurboMind::CreateWeights(int index)
+void TurboMind::CreateContext(int index)
 {
-    return impl_->CreateWeights(index);
+    return impl_->CreateContext(index);
 }
 
-TensorMap TurboMind::GetWeights(int index)
+core::Module* TurboMind::CreateRoot(int index)
 {
-    return impl_->GetWeights(index);
+    return impl_->CreateRoot(index);
+}
+
+core::Module* TurboMind::root(int index)
+{
+    return impl_->weights_[index].get();
+}
+
+std::pair<core::Stream, core::Allocator> TurboMind::weight_context(int index)
+{
+    auto& root = impl_->weights_.at(index);
+    TM_CHECK(root != nullptr);
+    return {root->stream(), root->allocator()};
 }
 
 void TurboMind::ProcessWeights(int index)
@@ -827,4 +497,19 @@ bool TurboMind::is_dummy_node() const noexcept
     return impl_->n_queues_ == 0;
 }
 
+int TurboMind::GetAttnTpRank(int index)
+{
+    return impl_->engine_params_.at(index).attn_tp_rank;
+}
+
+int TurboMind::GetMlpTpRank(int index)
+{
+    return impl_->engine_params_.at(index).mlp_tp_rank;
+}
+
+int TurboMind::GetModelTpRank(int index)
+{
+    return impl_->engine_params_.at(index).model_tp_rank;
+}
+
 }  // namespace turbomind
diff --git a/src/turbomind/turbomind.h b/src/turbomind/turbomind.h
index ede7c7c2e3..4d19d12641 100644
--- a/src/turbomind/turbomind.h
+++ b/src/turbomind/turbomind.h
@@ -5,8 +5,11 @@
 #include <functional>
 #include <memory>
 #include <string>
+#include <utility>
 
 #include "src/turbomind/core/core.h"
+#include "src/turbomind/core/module.h"
+#include "src/turbomind/engine/engine_config.h"
 #include "src/turbomind/engine/model_request.h"
 #include "src/turbomind/utils/metrics.h"
 
@@ -18,11 +21,16 @@ class TurboMind {
 
     ~TurboMind();
 
-    TurboMind(std::string model_dir, std::string config, FFICtxFactory ffi_ctx_factory);
+    TurboMind(std::string model_dir, EngineConfig config, FFICtxFactory ffi_ctx_factory);
 
-    void CreateWeights(int index);
+    void          CreateContext(int index);
+    core::Module* CreateRoot(int index);
 
-    TensorMap GetWeights(int index);
+    /// Returns the root `Module` for GPU `index`'s weight tree.
+    core::Module* root(int index);
+
+    /// Returns the Stream and Allocator for GPU `index`'s weight tree.
+    std::pair<core::Stream, core::Allocator> weight_context(int index);
 
     void ProcessWeights(int index);
 
@@ -38,6 +46,15 @@ class TurboMind {
 
     std::unique_ptr<ModelRequest> CreateRequest();
 
+    /// Attention TP rank for GPU *index*.
+    int GetAttnTpRank(int index);
+
+    /// MLP TP rank for GPU *index*.
+    int GetMlpTpRank(int index);
+
+    /// Model-level TP rank (rank within d_tp_group) for GPU *index*.
+    int GetModelTpRank(int index);
+
 private:
     struct Impl;
     std::unique_ptr<Impl> impl_;
diff --git a/src/turbomind/utils/memory_utils.cu b/src/turbomind/utils/memory_utils.cu
index a31bfd631d..0eee78e544 100644
--- a/src/turbomind/utils/memory_utils.cu
+++ b/src/turbomind/utils/memory_utils.cu
@@ -14,6 +14,9 @@
  * limitations under the License.
  */
 
+#include "src/turbomind/core/context.h"
+#include "src/turbomind/core/data_format.h"
+#include "src/turbomind/core/data_type.h"
 #include "src/turbomind/macro.h"
 #include "src/turbomind/utils/cuda_utils.h"
 #include "src/turbomind/utils/memory_utils.h"
@@ -59,4 +62,69 @@ template void invokeInPlaceTranspose102(uint16_t*    data,
                                         bool         copy,
                                         cudaStream_t stream);
 
+// -----------------------------------------------------------------------
+// Element-wise dtype cast kernel (fp32 <-> fp16 <-> bf16)
+// -----------------------------------------------------------------------
+
+template<typename To, typename Ti>
+__global__ void dtype_cast_kernel(To* dst, const Ti* src, size_t n)
+{
+    for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += blockDim.x * gridDim.x) {
+        dst[i] = static_cast<To>(src[i]);
+    }
+}
+
+void invokeDtypeCast(
+    void* dst, const void* src, size_t count, DataType dst_dtype, DataType src_dtype, cudaStream_t stream)
+{
+    const int block = 512;
+    const int grid  = std::min((count + block - 1) / block, (size_t)8192);
+
+    using half_t = turbomind::half_t;
+    using bf16_t = turbomind::bfloat16_t;
+
+    // fp32 -> fp16
+    if (src_dtype == turbomind::kFloat32 && dst_dtype == turbomind::kFloat16) {
+        dtype_cast_kernel<<<grid, block, 0, stream>>>((half_t*)dst, (const float*)src, count);
+    }
+    // fp32 -> bf16
+    else if (src_dtype == turbomind::kFloat32 && dst_dtype == turbomind::kBfloat16) {
+        dtype_cast_kernel<<<grid, block, 0, stream>>>((bf16_t*)dst, (const float*)src, count);
+    }
+    // fp16 -> fp32
+    else if (src_dtype == turbomind::kFloat16 && dst_dtype == turbomind::kFloat32) {
+        dtype_cast_kernel<<<grid, block, 0, stream>>>((float*)dst, (const half_t*)src, count);
+    }
+    // bf16 -> fp32
+    else if (src_dtype == turbomind::kBfloat16 && dst_dtype == turbomind::kFloat32) {
+        dtype_cast_kernel<<<grid, block, 0, stream>>>((float*)dst, (const bf16_t*)src, count);
+    }
+    // fp16 -> bf16
+    else if (src_dtype == turbomind::kFloat16 && dst_dtype == turbomind::kBfloat16) {
+        dtype_cast_kernel<<<grid, block, 0, stream>>>((bf16_t*)dst, (const half_t*)src, count);
+    }
+    // bf16 -> fp16
+    else if (src_dtype == turbomind::kBfloat16 && dst_dtype == turbomind::kFloat16) {
+        dtype_cast_kernel<<<grid, block, 0, stream>>>((half_t*)dst, (const bf16_t*)src, count);
+    }
+}
+
+// -----------------------------------------------------------------------
+// EnsureFloatDtype — cast tensor to target dtype if both are trivial float
+// -----------------------------------------------------------------------
+
+void EnsureFloatDtype(core::Tensor& tensor, DataType target_dtype)
+{
+    if (!tensor || tensor.dtype() == target_dtype) {
+        return;
+    }
+    if (!IsTrivialFloatType(tensor.dtype()) || !IsTrivialFloatType(target_dtype)) {
+        return;
+    }
+    auto         stream = core::Context::stream().handle();
+    core::Tensor casted{tensor.shape(), target_dtype, tensor.device()};
+    invokeDtypeCast(casted.raw_data(), tensor.raw_data(), tensor.size(), target_dtype, tensor.dtype(), stream);
+    tensor = std::move(casted);
+}
+
 }  // namespace turbomind
diff --git a/src/turbomind/utils/memory_utils.h b/src/turbomind/utils/memory_utils.h
index a61408281f..e49bfc6cf4 100644
--- a/src/turbomind/utils/memory_utils.h
+++ b/src/turbomind/utils/memory_utils.h
@@ -16,6 +16,8 @@
 
 #pragma once
 
+#include "src/turbomind/core/data_type.h"
+#include "src/turbomind/core/tensor.h"
 #include <cuda_runtime.h>
 
 namespace turbomind {
@@ -24,4 +26,13 @@ template<typename T>
 void invokeInPlaceTranspose102(
     T* data, T* workspace, const int dim0, const int dim1, const int dim2, bool copy = true, cudaStream_t stream = 0);
 
+/// Element-wise dtype cast kernel.  Supports fp32 <-> fp16 <-> bf16.
+void invokeDtypeCast(
+    void* dst, const void* src, size_t count, DataType dst_dtype, DataType src_dtype, cudaStream_t stream = 0);
+
+/// If *tensor* is a trivial float type that differs from *target_dtype*, cast
+/// it in-place (allocates a temporary, casts, move-assigns).  Uses
+/// Context::stream() internally — no stream parameter needed.
+void EnsureFloatDtype(core::Tensor& tensor, DataType target_dtype);
+
 }  // namespace turbomind
diff --git a/tests/test_lmdeploy/test_converter.py b/tests/test_lmdeploy/test_converter.py
new file mode 100644
index 0000000000..07b04af4c6
--- /dev/null
+++ b/tests/test_lmdeploy/test_converter.py
@@ -0,0 +1,49 @@
+import logging
+
+import pytest
+
+from lmdeploy.turbomind.converter import _deep_merge
+
+
+@pytest.fixture(autouse=True)
+def _caplog_lmdeploy(caplog):
+    caplog.set_level(logging.WARNING, logger='lmdeploy')
+    logger = logging.getLogger('lmdeploy')
+    logger.propagate = True
+    yield
+    logger.propagate = False
+
+
+class TestDeepMerge:
+
+    def test_flat_override(self):
+        base = {'a': 1, 'b': 2}
+        _deep_merge(base, {'b': 99})
+        assert base == {'a': 1, 'b': 99}
+
+    def test_nested_override(self):
+        base = {'rope_scaling': {'rope_type': 'default', 'factor': 1.0}}
+        _deep_merge(base, {'rope_scaling': {'factor': 4.0}})
+        assert base == {'rope_scaling': {'rope_type': 'default', 'factor': 4.0}}
+
+    def test_new_key_warns(self, caplog):
+        base = {'a': 1}
+        _deep_merge(base, {'nonexistent_key': 'val'})
+        assert base['nonexistent_key'] == 'val'
+        assert 'nonexistent_key' in caplog.text
+
+    def test_nested_new_key_warns(self, caplog):
+        base = {'rope_scaling': {'factor': 1.0}}
+        _deep_merge(base, {'rope_scaling': {'brand_new': 'yes'}})
+        assert base['rope_scaling']['brand_new'] == 'yes'
+        assert 'brand_new' in caplog.text
+
+    def test_empty_override_is_noop(self):
+        base = {'a': 1}
+        _deep_merge(base, {})
+        assert base == {'a': 1}
+
+    def test_scalar_overrides_dict(self):
+        base = {'a': {'nested': 1}}
+        _deep_merge(base, {'a': 'flat'})
+        assert base == {'a': 'flat'}
diff --git a/tests/test_lmdeploy/test_turbomind/test_compressed_tensors.py b/tests/test_lmdeploy/test_turbomind/test_compressed_tensors.py
index 7b8b75d4b1..8c565aab89 100644
--- a/tests/test_lmdeploy/test_turbomind/test_compressed_tensors.py
+++ b/tests/test_lmdeploy/test_turbomind/test_compressed_tensors.py
@@ -1,22 +1,10 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
-import pytest
 import torch
-
-from lmdeploy.turbomind.deploy import converter
 from lmdeploy.turbomind.deploy.parameter import QuantWeightOnly, pack_u4_row
 from lmdeploy.turbomind.deploy.source_model.qwen import Qwen3_5ReaderMixin
 
 
-class _FakeModelConfig:
-
-    def __init__(self, dtype=torch.float16):
-        self.dtype = dtype
-
-    def to_dict(self):
-        return {'architectures': ['Qwen3_5ForConditionalGeneration']}
-
-
 class _DummyQwen35Reader(Qwen3_5ReaderMixin):
 
     def __init__(self, params):
@@ -41,35 +29,6 @@ def _reference_compressed_tensors_dequant(weight_packed: torch.Tensor, weight_sc
             weight_scale.to(torch.float16).unsqueeze(-1)).reshape(weight.shape[0], -1)
 
 
-def test_compressed_tensors_support_matrix(monkeypatch):
-    fake_cfg = _FakeModelConfig()
-    monkeypatch.setattr(converter, 'get_model_arch', lambda _: ('Qwen3_5ForConditionalGeneration', fake_cfg))
-    monkeypatch.setattr(converter, '_get_and_verify_max_len', lambda *args, **kwargs: 4096)
-    monkeypatch.setattr(converter, 'is_bf16_supported', lambda: False)
-
-    _, default_cfg = converter.get_output_model_registered_name_and_config('dummy',
-                                                                           model_format='compressed-tensors',
-                                                                           dtype='auto',
-                                                                           group_size=0)
-    assert default_cfg.model_config.group_size == 128
-
-    _, gs32_cfg = converter.get_output_model_registered_name_and_config('dummy',
-                                                                        model_format='compressed-tensors',
-                                                                        dtype='auto',
-                                                                        group_size=32)
-    assert gs32_cfg.model_config.group_size == 32
-    assert gs32_cfg.model_config.model_format == 'awq'
-
-    with pytest.raises(ValueError, match='Unsupported group_size=64'):
-        converter.get_output_model_registered_name_and_config('dummy',
-                                                              model_format='compressed-tensors',
-                                                              dtype='auto',
-                                                              group_size=64)
-
-    with pytest.raises(ValueError, match='model_format=\"awq\"'):
-        converter.get_output_model_registered_name_and_config('dummy', model_format='awq', dtype='auto', group_size=32)
-
-
 def test_quant_weight_only_synthesizes_compressed_tensor_zero_points_from_scales():
     scales = (
         torch.rand(4, 8, dtype=torch.float32),
diff --git a/tests/test_lmdeploy/test_turbomind/test_converter.py b/tests/test_lmdeploy/test_turbomind/test_converter.py
index e91e0fefaa..689276934b 100644
--- a/tests/test_lmdeploy/test_turbomind/test_converter.py
+++ b/tests/test_lmdeploy/test_turbomind/test_converter.py
@@ -1,31 +1,8 @@
 # yapf: disable
-from lmdeploy import TurbomindEngineConfig
-from lmdeploy.turbomind import update_parallel_config
-from lmdeploy.turbomind.deploy.converter import (
-    get_input_model_registered_name,
-    get_output_model_registered_name_and_config,
-)
-from lmdeploy.turbomind.deploy.source_model.base import INPUT_MODELS
 
 # yapf: enable
 
 
-def test_torch_dtype_fallback():
-    """torch_dtype is deprecated in transformers v5+; dtype should be
-    preferred.
-
-    This test ensures get_output_model_registered_name_and_config still works
-    for models whose config exposes either `dtype` or `torch_dtype`.
-    """
-    _, config = get_output_model_registered_name_and_config(
-        'internlm/internlm2-chat-7b',
-        model_format='hf',
-        dtype='auto',
-        group_size=0,
-    )
-    assert config.weight_type in ('float16', 'bfloat16')
-
-
 def test_ffn_reader_kind_none():
     """FFN readers must handle kind=None (returns filter list, not tensors).
 
@@ -67,97 +44,3 @@ def test_ffn_reader_kind_none():
     assert len(result2) > 0
     assert all(isinstance(k, str) for k in result2)
     assert all(re.search(r'feed_forward', k) for k in result2)
-
-
-def test_registered_models():
-    for model, model_format, group_size, weight_type, register_name in [
-        ('internlm/internlm2-7b', 'hf', 0, 'bfloat16', 'tm'), ('baichuan-inc/Baichuan-7B', 'hf', 0, 'float16', 'tm'),
-        ('baichuan-inc/Baichuan2-7B-Chat', 'hf', 0, 'bfloat16', 'tm'),
-        ('baichuan-inc/Baichuan-13B-Chat', 'hf', 0, 'bfloat16', 'tm'),
-        ('baichuan-inc/Baichuan2-13B-Chat', 'hf', 0, 'bfloat16', 'tm'),
-        ('internlm/internlm-chat-7b', 'hf', 0, 'float16', 'tm'),
-        ('internlm/internlm2-chat-7b', 'hf', 0, 'bfloat16', 'tm'),
-        ('internlm/internlm-xcomposer2-4khd-7b', 'hf', 0, 'bfloat16', 'tm'),
-        ('internlm/internlm-xcomposer2-vl-7b', 'hf', 0, 'bfloat16', 'tm'),
-        ('internlm/internlm-xcomposer2-7b', 'hf', 0, 'bfloat16', 'tm'),
-        ('lmsys/vicuna-7b-v1.5', 'hf', 0, 'float16', 'tm'), ('01-ai/Yi-1.5-9B', 'hf', 0, 'bfloat16', 'tm'),
-        ('deepseek-ai/deepseek-coder-6.7b-instruct', 'hf', 0, 'bfloat16', 'tm'),
-        ('deepseek-ai/deepseek-llm-7b-chat', 'hf', 0, 'bfloat16', 'tm'),
-        ('Qwen/Qwen-7B-Chat', 'hf', 0, 'bfloat16', 'tm'), ('Qwen/Qwen1.5-7B-Chat', 'hf', 0, 'bfloat16', 'tm'),
-        ('Qwen/Qwen2-7B-Instruct', 'hf', 0, 'bfloat16', 'tm'), ('Qwen/Qwen-VL-Chat', 'hf', 0, 'bfloat16', 'tm'),
-        ('liuhaotian/llava-v1.6-34b', 'hf', 0, 'bfloat16', 'tm'),
-        ('liuhaotian/llava-v1.6-mistral-7b', 'hf', 0, 'bfloat16', 'tm'),
-        ('liuhaotian/llava-v1.6-vicuna-13b', 'hf', 0, 'bfloat16', 'tm'),
-        ('OpenGVLab/InternVL-Chat-V1-5', 'hf', 0, 'bfloat16', 'tm'),
-        ('deepseek-ai/deepseek-vl-7b-chat', 'hf', 0, 'float16', 'tm'),
-        ('Qwen/Qwen1.5-4B-Chat-AWQ', 'awq', 128, 'int4', 'tm'),
-        ('solidrust/Meta-Llama-3-8B-Instruct-hf-AWQ', 'awq', 128, 'int4', 'tm'),
-        ('internlm/internlm2-chat-20b-4bits', 'awq', 128, 'int4', 'tm'),
-        ('internlm/internlm-xcomposer2-vl-7b-4bit', 'awq', 128, 'int4', 'tm')
-    ]:
-        input_name = get_input_model_registered_name(model, model_format=model_format)
-        assert input_name in list(INPUT_MODELS.module_dict.keys())
-
-        output_name, config = get_output_model_registered_name_and_config(model,
-                                                                          model_format=model_format,
-                                                                          dtype='auto',
-                                                                          group_size=0)
-        assert output_name == register_name
-        assert config.model_config.group_size == group_size
-        assert config.session_len > 0
-        assert config.model_config.model_arch is not None
-
-
-def test_update_from_engine_config():
-    import copy
-    _, _config = get_output_model_registered_name_and_config('internlm/internlm2-chat-7b',
-                                                             model_format='hf',
-                                                             dtype='auto',
-                                                             group_size=0)
-    config = copy.deepcopy(_config)
-    config.update_from_engine_config(None)
-    assert (config == _config)
-
-    config = copy.deepcopy(_config)
-    engine_config = TurbomindEngineConfig()
-    update_parallel_config(engine_config)
-    config.update_from_engine_config(engine_config)
-    assert config.model_config.attn_tp_size == 1
-    assert config.session_len == 32768
-
-    config = copy.deepcopy(_config)
-    engine_config = TurbomindEngineConfig(model_format='hf',
-                                          tp=2,
-                                          device_num=2,
-                                          session_len=4000,
-                                          max_batch_size=100,
-                                          cache_max_entry_count=0.5,
-                                          quant_policy=8,
-                                          rope_scaling_factor=3.0,
-                                          use_logn_attn=True,
-                                          max_prefill_iters=64,
-                                          num_tokens_per_iter=256)
-    update_parallel_config(engine_config)
-    config.update_from_engine_config(engine_config)
-
-    assert (config.model_config.attn_tp_size == engine_config.attn_tp_size)
-    assert (config.session_len == engine_config.session_len)
-    assert (config.attention_config.rope_param.type == 'dynamic')
-    assert (config.attention_config.rope_param.factor == engine_config.rope_scaling_factor)
-    assert (config.attention_config.use_logn_attn == engine_config.use_logn_attn)
-
-
-def test_dtype():
-    testsets = [('auto', 'bfloat16'), ('float16', 'float16'), ('bfloat16', 'bfloat16')]
-    for specified_dtype, expected_dtype in testsets:
-        _, _config = get_output_model_registered_name_and_config('internlm/internlm2-chat-7b',
-                                                                 model_format='hf',
-                                                                 dtype=specified_dtype,
-                                                                 group_size=0)
-        assert _config.weight_type == expected_dtype
-    for specified_dtype in ['auto', 'float16', 'bfloat16']:
-        _, _config = get_output_model_registered_name_and_config('internlm/internlm2_5-20b-chat-4bit-awq',
-                                                                 model_format='awq',
-                                                                 dtype=specified_dtype,
-                                                                 group_size=128)
-        assert _config.weight_type == 'int4'
diff --git a/tests/test_lmdeploy/test_turbomind/test_internvl3_5.py b/tests/test_lmdeploy/test_turbomind/test_internvl3_5.py
new file mode 100644
index 0000000000..200832cb80
--- /dev/null
+++ b/tests/test_lmdeploy/test_turbomind/test_internvl3_5.py
@@ -0,0 +1,184 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from unittest.mock import Mock
+
+import _turbomind as _tm
+import pytest
+from transformers import PretrainedConfig
+
+from lmdeploy.turbomind.models.qwen3 import Qwen3TextModel
+
+
+def _make_qwen3_stub():
+    model = Qwen3TextModel.__new__(Qwen3TextModel)
+    model.cfg = PretrainedConfig(hidden_size=4, vocab_size=8, tie_word_embeddings=False)
+    model._contexts = []
+    model._root_handles = []
+    model._model_tp_ranks = []
+    model._layer_prefix = 'model.layers'
+    model._tie_embeddings = False
+    model._get = Mock(side_effect=lambda key: f'tensor:{key}')
+    model._linear = Mock(side_effect=lambda key: f'linear:{key}')
+    model.norm = Mock(side_effect=lambda weight: f'norm:{weight}')
+    model.layers = Mock(side_effect=lambda pfx: f'layers:{pfx}')
+    return model
+
+
+class _FakeRoot:
+
+    last = None
+
+    def __init__(self, *args, **kwargs):
+        self.add_token_embeds = Mock()
+        self.add_lm_head = Mock()
+        self.norm = None
+        self.layers = None
+        self.build = Mock()
+        _FakeRoot.last = self
+
+
+def test_qwen3_model_uses_optional_checkpoint_prefix(monkeypatch):
+    import lmdeploy.turbomind.models.qwen3 as qwen3_mod
+
+    monkeypatch.setattr(qwen3_mod, 'TextModelBuilder', _FakeRoot)
+    model = _make_qwen3_stub()
+
+    model.model(pfx='language_model.')
+
+    model._get.assert_any_call('language_model.model.embed_tokens.weight')
+    model._get.assert_any_call('language_model.model.norm.weight')
+    model._linear.assert_called_once_with('language_model.lm_head')
+    model.layers.assert_called_once_with('language_model.model.layers')
+
+    root = _FakeRoot.last
+    root.add_token_embeds.assert_called_once_with(
+        'tensor:language_model.model.embed_tokens.weight')
+    root.add_lm_head.assert_called_once_with('linear:language_model.lm_head')
+    root.build.assert_called_once_with()
+
+
+def test_qwen3_model_default_prefix_preserves_plain_keys(monkeypatch):
+    import lmdeploy.turbomind.models.qwen3 as qwen3_mod
+
+    monkeypatch.setattr(qwen3_mod, 'TextModelBuilder', _FakeRoot)
+    model = _make_qwen3_stub()
+
+    model.model()
+
+    model._get.assert_any_call('model.embed_tokens.weight')
+    model._get.assert_any_call('model.norm.weight')
+    model._linear.assert_called_once_with('lm_head')
+    model.layers.assert_called_once_with('model.layers')
+
+
+def _internvl_cfg(inner_arch='Qwen3ForCausalLM'):
+    return PretrainedConfig(
+        architectures=['InternVLChatModel'],
+        llm_config=PretrainedConfig(
+            architectures=[inner_arch],
+            num_hidden_layers=1,
+            vocab_size=8,
+            rms_norm_eps=1e-6,
+            tie_word_embeddings=False,
+            model_type='qwen3',
+            num_attention_heads=2,
+            hidden_size=4,
+            head_dim=2,
+            num_key_value_heads=2,
+            max_position_embeddings=16,
+            intermediate_size=8,
+            attention_bias=False,
+        ),
+    )
+
+
+def test_internvl35_model_creates_qwen3_text_model():
+    from lmdeploy.turbomind.models.internvl3_5 import InternVL3_5Model
+
+    model = InternVL3_5Model(
+        _internvl_cfg(),
+        resolver=Mock(data_type=_tm.DataType.TYPE_FP16))
+
+    assert isinstance(model.text_model, Qwen3TextModel)
+    assert model.vision_model is None
+
+
+def test_internvl35_model_delegates_runtime_params_and_export(monkeypatch):
+    from lmdeploy.turbomind.models.internvl3_5 import InternVL3_5Model
+
+    fake_text_model = Mock()
+    fake_text_cls = Mock(return_value=fake_text_model)
+    monkeypatch.setattr(
+        'lmdeploy.turbomind.models.internvl3_5.Qwen3TextModel',
+        fake_text_cls)
+
+    resolver = Mock()
+    model = InternVL3_5Model(_internvl_cfg(), resolver=resolver)
+
+    assert fake_text_cls.call_args.args[0].architectures == ['Qwen3ForCausalLM']
+    assert fake_text_cls.call_args.kwargs == {'resolver': resolver}
+
+    attn_tp = Mock()
+    mlp_tp = Mock()
+    model_tp = Mock()
+    model.bind_runtime(
+        ctx='ctx',
+        root_handles=['root'],
+        attn_tp=attn_tp,
+        mlp_tp=mlp_tp,
+        model_tp=model_tp,
+    )
+    fake_text_model.bind_runtime.assert_called_once_with(
+        ctx='ctx',
+        root_handles=['root'],
+        attn_tp=attn_tp,
+        mlp_tp=mlp_tp,
+        model_tp=model_tp,
+    )
+
+    params = {'language_model.lm_head.weight': object()}
+    model.set_params(params)
+    fake_text_model.set_params.assert_called_once_with(params)
+
+    model.model()
+    fake_text_model.model.assert_called_once_with(pfx='language_model.')
+
+    fake_text_model.cfg.vocab_size = 32000
+    assert model._vocab_size == 32000
+
+
+def test_internvl35_model_requires_llm_config():
+    from lmdeploy.turbomind.models.internvl3_5 import InternVL3_5Model
+
+    cfg = {'architectures': ['InternVLChatModel']}
+
+    with pytest.raises(ValueError, match='llm_config'):
+        InternVL3_5Model(cfg, resolver=Mock())
+
+
+def test_internvl35_model_requires_inner_architecture():
+    from lmdeploy.turbomind.models.internvl3_5 import InternVL3_5Model
+
+    cfg = {'architectures': ['InternVLChatModel'], 'llm_config': {}}
+
+    with pytest.raises(ValueError, match='llm_config.architectures'):
+        InternVL3_5Model(cfg, resolver=Mock())
+
+
+def test_internvl35_model_rejects_unsupported_inner_architecture():
+    from lmdeploy.turbomind.models.internvl3_5 import InternVL3_5Model
+
+    with pytest.raises(ValueError, match='GptOssForCausalLM'):
+        InternVL3_5Model(_internvl_cfg('GptOssForCausalLM'), resolver=Mock())
+
+
+def test_supported_archs_maps_internvl_chat_model():
+    from lmdeploy.turbomind.supported_models import SUPPORTED_ARCHS
+
+    assert SUPPORTED_ARCHS['InternVLChatModel'] == 'internvl3_5'
+
+
+def test_internvl35_model_is_registered():
+    from lmdeploy.turbomind.models import InternVL3_5Model  # noqa: F401
+    from lmdeploy.turbomind.models.base import INPUT_MODELS
+
+    assert INPUT_MODELS.get('internvl3_5') is InternVL3_5Model
diff --git a/tests/test_lmdeploy/test_turbomind/test_loader.py b/tests/test_lmdeploy/test_turbomind/test_loader.py
new file mode 100644
index 0000000000..b9d8144a45
--- /dev/null
+++ b/tests/test_lmdeploy/test_turbomind/test_loader.py
@@ -0,0 +1,88 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from queue import Queue
+from unittest.mock import Mock
+
+from lmdeploy.turbomind.loader import BaseLoader, StateDictLoader, create_loader
+from lmdeploy.turbomind.model_loader import ModelLoader
+
+
+class _FakeModelComm:
+
+    def attn_tp_rank(self, gpu):
+        return gpu
+
+    def mlp_tp_rank(self, gpu):
+        return gpu
+
+    def model_tp_rank(self, gpu):
+        return gpu
+
+    def context(self, gpu):
+        return f'ctx:{gpu}'
+
+    def root(self, gpu):
+        return f'root:{gpu}'
+
+
+def test_base_loader_defaults_to_no_layer_pattern():
+    class _TestLoader(BaseLoader):
+        def items(self):
+            raise NotImplementedError
+
+        def all_items(self):
+            raise NotImplementedError
+
+    loader = _TestLoader('model-path')
+
+    assert loader.model_path == 'model-path'
+    assert loader.pattern is None
+    assert loader.mappings == []
+
+
+def test_state_dict_loader_can_be_created_without_pattern_or_mappings():
+    queue = Queue()
+
+    loader = create_loader(queue)
+
+    assert isinstance(loader, StateDictLoader)
+    assert loader.pattern is None
+
+
+def test_model_loader_export_uses_all_params_loader_without_model_metadata(monkeypatch):
+    import lmdeploy.turbomind.model_loader as model_loader_mod
+
+    loader = Mock()
+    loader.all_items.return_value = {'weight': object()}
+    create_loader = Mock(return_value=loader)
+    monkeypatch.setattr(model_loader_mod, 'create_loader', create_loader)
+
+    model = Mock()
+    model._loader_mappings = []
+    model_loader = ModelLoader(model, _FakeModelComm(), 1, 'model-path',
+                               data_type=Mock(), engine_config=Mock(attn_tp_size=1, attn_cp_size=1, mlp_tp_size=1))
+
+    model_loader.export()
+
+    create_loader.assert_called_once_with('model-path', None, [])
+    model.set_params.assert_called_once_with(loader.all_items.return_value)
+    model.model.assert_called_once_with()
+
+
+def test_model_loader_export_iter_uses_all_params_loader_without_model_metadata(monkeypatch):
+    import lmdeploy.turbomind.model_loader as model_loader_mod
+
+    loader = Mock()
+    loader.all_items.return_value = {'weight': object()}
+    create_loader = Mock(return_value=loader)
+    monkeypatch.setattr(model_loader_mod, 'create_loader', create_loader)
+
+    model = Mock()
+    model._loader_mappings = []
+    model_loader = ModelLoader(model, _FakeModelComm(), 1, 'model-path',
+                               data_type=Mock(), engine_config=Mock(attn_tp_size=1, attn_cp_size=1, mlp_tp_size=1))
+
+    assert list(model_loader.export_iter()) == [-1]
+
+    create_loader.assert_called_once_with('model-path', None, [])
+    model.set_params.assert_called_once_with(loader.all_items.return_value)
+    model.model.assert_called_once_with()
diff --git a/tests/test_lmdeploy/test_turbomind/test_local_autoconfig.py b/tests/test_lmdeploy/test_turbomind/test_local_autoconfig.py
new file mode 100644
index 0000000000..72c5020f24
--- /dev/null
+++ b/tests/test_lmdeploy/test_turbomind/test_local_autoconfig.py
@@ -0,0 +1,353 @@
+from types import SimpleNamespace
+
+import _turbomind as _tm
+from transformers import PretrainedConfig
+
+
+class DummyConfig(PretrainedConfig):
+    model_type = 'dummy'
+
+
+def test_load_model_config_returns_text_config_object(monkeypatch):
+    from lmdeploy.turbomind.models import utils
+
+    text_cfg = DummyConfig(
+        hidden_size=16,
+        num_attention_heads=2,
+        num_hidden_layers=1,
+        vocab_size=32,
+        rms_norm_eps=1e-6,
+    )
+    outer_cfg = DummyConfig(text_config=text_cfg)
+
+    monkeypatch.setattr(utils, 'get_model_arch', lambda model_path: ('DummyForCausalLM', outer_cfg))
+
+    assert utils.load_model_config('/fake/model') is text_cfg
+
+
+def test_load_model_config_returns_outer_object_without_text_config(monkeypatch):
+    from lmdeploy.turbomind.models import utils
+
+    cfg = DummyConfig(
+        hidden_size=16,
+        num_attention_heads=2,
+        num_hidden_layers=1,
+        vocab_size=32,
+        rms_norm_eps=1e-6,
+    )
+
+    monkeypatch.setattr(utils, 'get_model_arch', lambda model_path: ('DummyForCausalLM', cfg))
+
+    assert utils.load_model_config('/fake/model') is cfg
+
+
+def test_apply_hf_overrides_updates_config_object():
+    from lmdeploy.turbomind.converter import _apply_hf_overrides
+
+    cfg = DummyConfig(hidden_size=16, rope_scaling={'type': 'linear', 'factor': 2.0})
+
+    _apply_hf_overrides(cfg, {
+        'hidden_size': 32,
+        'rope_scaling': {'factor': 4.0},
+        'new_field': 'kept',
+    })
+
+    assert cfg.hidden_size == 32
+    assert cfg.rope_scaling == {'type': 'linear', 'factor': 4.0}
+    assert cfg.new_field == 'kept'
+
+
+def test_apply_hf_overrides_updates_nested_config_object():
+    from lmdeploy.turbomind.converter import _apply_hf_overrides
+
+    cfg = DummyConfig(llm_config=DummyConfig(hidden_size=16, rope_scaling={'type': 'linear', 'factor': 2.0}))
+
+    _apply_hf_overrides(cfg, {
+        'llm_config': {
+            'hidden_size': 32,
+            'rope_scaling': {'factor': 4.0},
+        },
+    })
+
+    assert cfg.llm_config.hidden_size == 32
+    assert cfg.llm_config.rope_scaling == {'type': 'linear', 'factor': 4.0}
+
+
+def test_parse_rope_param_reads_config_object_fields():
+    from lmdeploy.turbomind.models.utils import parse_rope_param
+
+    cfg = DummyConfig(
+        rope_theta=500000.0,
+        max_position_embeddings=4096,
+        rope_scaling={
+            'rope_type': 'llama3',
+            'factor': 8.0,
+            'low_freq_factor': 1.0,
+            'high_freq_factor': 4.0,
+            'original_max_position_embeddings': 8192,
+        },
+    )
+
+    rope, max_pos = parse_rope_param(cfg, head_dim=128)
+
+    assert max_pos == 4096
+    assert rope.type == 'llama3'
+    assert rope.base == 500000.0
+    assert rope.dim == 128
+    assert rope.factor == 8.0
+    assert rope.low_freq_factor == 1.0
+    assert rope.high_freq_factor == 4.0
+    assert rope.original_max_position_embeddings == 8192
+
+
+def test_make_attention_config_reads_only_common_attention_fields():
+    from lmdeploy.turbomind.models.utils import make_attention_config
+
+    cfg = DummyConfig(
+        hidden_size=16,
+        num_attention_heads=4,
+        num_key_value_heads=2,
+        head_dim=8,
+        rope_theta=10000.0,
+        max_position_embeddings=128,
+    )
+    engine_cfg = SimpleNamespace(attn_tp_size=2)
+
+    attn_cfg = make_attention_config(
+        cfg,
+        engine_cfg,
+        data_type=_tm.DataType.TYPE_FP16,
+    )
+
+    assert attn_cfg.hidden_dim == 16
+    assert attn_cfg.head_num == 4
+    assert attn_cfg.kv_head_num == 2
+    assert attn_cfg.head_dim == 8
+    assert attn_cfg.tp_size == 2
+    assert attn_cfg.data_type == _tm.DataType.TYPE_FP16
+    assert attn_cfg.rope.dim == 8
+
+
+def test_make_attention_config_applies_rope_scaling_factor_override():
+    from lmdeploy.turbomind.models.utils import make_attention_config
+
+    cfg = DummyConfig(
+        hidden_size=16,
+        num_attention_heads=4,
+        num_key_value_heads=4,
+        head_dim=4,
+        rope_theta=10000.0,
+        max_position_embeddings=128,
+    )
+    engine_cfg = SimpleNamespace(attn_tp_size=2, rope_scaling_factor=2.0)
+
+    attn_cfg = make_attention_config(
+        cfg,
+        engine_cfg,
+        data_type=_tm.DataType.TYPE_FP16,
+    )
+
+    assert attn_cfg.rope.factor == 2.0
+    assert attn_cfg.rope.max_position_embeddings == 128
+
+
+def test_model_weight_and_ffn_helpers_read_module_fields():
+    from lmdeploy.turbomind.builders import _act_type_id
+    from lmdeploy.turbomind.models.utils import make_ffn_config, make_model_weight_config
+
+    cfg = DummyConfig(hidden_size=16, intermediate_size=64)
+    engine_cfg = SimpleNamespace(attn_tp_size=2, attn_cp_size=1, mlp_tp_size=4)
+
+    root_cfg = make_model_weight_config(
+        cfg,
+        engine_cfg,
+        data_type=_tm.DataType.TYPE_FP16,
+    )
+    ffn_cfg = make_ffn_config(
+        cfg,
+        engine_cfg,
+        data_type=_tm.DataType.TYPE_FP16,
+        act_type=_act_type_id('silu'),
+    )
+
+    assert root_cfg.hidden_units == 16
+    assert root_cfg.tp_size == 2
+    assert root_cfg.data_type == _tm.DataType.TYPE_FP16
+    assert ffn_cfg.hidden_dim == 16
+    assert ffn_cfg.inter_size == 64
+    assert ffn_cfg.tp_size == 4
+    assert ffn_cfg.data_type == _tm.DataType.TYPE_FP16
+    assert ffn_cfg.act_type == _act_type_id('silu')
+
+
+def test_make_moe_config_returns_populated_config():
+    from lmdeploy.turbomind.models.utils import make_moe_config
+
+    cfg = DummyConfig(hidden_size=16)
+    engine_cfg = SimpleNamespace(mlp_tp_size=4)
+
+    moe_cfg = make_moe_config(
+        cfg, engine_cfg,
+        data_type=_tm.DataType.TYPE_FP16,
+        experts_per_token=4,
+    )
+
+    assert moe_cfg.method == 1
+    assert moe_cfg.experts_per_token == 4
+    assert moe_cfg.norm_topk_prob is True
+    assert moe_cfg.shared_gate is False
+    assert moe_cfg.routed_scale == 1.0
+    assert moe_cfg.router_bias is False
+    assert moe_cfg.topk_group == 1
+    assert moe_cfg.topk_method == 'greedy'
+    assert moe_cfg.n_group == 1
+    assert moe_cfg.scoring_func == 'softmax'
+    assert moe_cfg.router_n_groups == 0
+    assert moe_cfg.hidden_dim == 16
+    assert moe_cfg.mlp_bias is False
+    assert moe_cfg.data_type == _tm.DataType.TYPE_FP16
+    assert moe_cfg.tp_size == 4
+    assert moe_cfg.act_type == 0  # silu
+    assert moe_cfg.fuse_silu is True
+
+
+def test_make_moe_config_overrides_defaults():
+    from lmdeploy.turbomind.models.utils import make_moe_config
+
+    cfg = DummyConfig(hidden_size=32)
+    engine_cfg = SimpleNamespace(mlp_tp_size=2)
+
+    moe_cfg = make_moe_config(
+        cfg, engine_cfg,
+        data_type=_tm.DataType.TYPE_BF16,
+        experts_per_token=8,
+        act_type=1,
+        norm_topk_prob=False,
+        shared_gate=True,
+        router_bias=True,
+        mlp_bias=True,
+        topk_method='noaux_tc',
+        scoring_func='sigmoid',
+        routed_scale=2.0,
+        topk_group=2,
+        n_group=2,
+        router_n_groups=4,
+    )
+
+    assert moe_cfg.experts_per_token == 8
+    assert moe_cfg.act_type == 1
+    assert moe_cfg.norm_topk_prob is False
+    assert moe_cfg.shared_gate is True
+    assert moe_cfg.router_bias is True
+    assert moe_cfg.mlp_bias is True
+    assert moe_cfg.topk_method == 'noaux_tc'
+    assert moe_cfg.scoring_func == 'sigmoid'
+    assert moe_cfg.routed_scale == 2.0
+    assert moe_cfg.topk_group == 2
+    assert moe_cfg.n_group == 2
+    assert moe_cfg.router_n_groups == 4
+    assert moe_cfg.hidden_dim == 32
+    assert moe_cfg.data_type == _tm.DataType.TYPE_BF16
+    assert moe_cfg.tp_size == 2
+
+def _engine_cfg():
+    return SimpleNamespace(
+        rope_scaling_factor=0,
+        attn_tp_size=2,
+        attn_cp_size=1,
+        mlp_tp_size=4,
+    )
+
+
+def _resolver():
+    return SimpleNamespace(data_type=_tm.DataType.TYPE_FP16)
+
+
+def test_llama_constructor_preserves_common_config_fields():
+    from lmdeploy.turbomind.models.llama import LlamaModel
+
+    cfg = DummyConfig(
+        num_hidden_layers=2,
+        vocab_size=128,
+        rms_norm_eps=1e-6,
+        tie_word_embeddings=False,
+        model_type='llama',
+        num_attention_heads=4,
+        num_key_value_heads=2,
+        hidden_size=32,
+        head_dim=8,
+        max_position_embeddings=1024,
+        intermediate_size=64,
+        attention_bias=False,
+    )
+
+    model = LlamaModel(cfg, _engine_cfg(), resolver=_resolver())
+
+    assert model.cfg is cfg
+    assert model.cfg.num_hidden_layers == 2
+    assert model.cfg.vocab_size == 128
+    assert model.cfg.rms_norm_eps == 1e-6
+    assert model._attn_cfg.hidden_dim == 32
+    assert model._attn_cfg.head_num == 4
+    assert model._attn_cfg.kv_head_num == 2
+    assert not hasattr(model, '_head_dim')
+    assert not hasattr(model, '_head_num')
+    assert not hasattr(model, '_kv_head_num')
+    assert not hasattr(model, '_rope')
+    assert model._attn_cfg.has_bias is False
+    assert model._ffn_cfg.inter_size == 64
+    assert model._ffn_cfg.tp_size == 4
+
+
+def test_qwen2_constructor_keeps_qkv_bias_local():
+    from lmdeploy.turbomind.models.qwen2 import Qwen2Model
+
+    cfg = DummyConfig(
+        num_hidden_layers=2,
+        vocab_size=128,
+        rms_norm_eps=1e-6,
+        tie_word_embeddings=False,
+        model_type='qwen2',
+        num_attention_heads=4,
+        num_key_value_heads=2,
+        hidden_size=32,
+        head_dim=8,
+        max_position_embeddings=1024,
+        intermediate_size=64,
+        qkv_bias=True,
+    )
+
+    model = Qwen2Model(cfg, _engine_cfg(), resolver=_resolver())
+
+    assert model._attn_cfg.has_bias is True
+    assert model._ffn_cfg.inter_size == 64
+
+
+def test_gpt_oss_constructor_keeps_sliding_window_local():
+    from lmdeploy.turbomind.models.gpt_oss import GptOssModel
+
+    cfg = DummyConfig(
+        num_hidden_layers=2,
+        vocab_size=128,
+        rms_norm_eps=1e-6,
+        tie_word_embeddings=False,
+        model_type='gpt-oss',
+        num_attention_heads=4,
+        num_key_value_heads=2,
+        hidden_size=32,
+        head_dim=8,
+        max_position_embeddings=1024,
+        intermediate_size=64,
+        attention_bias=True,
+        num_local_experts=4,
+        experts_per_token=2,
+        layer_types=['sliding_attention', 'full_attention'],
+        sliding_window=256,
+    )
+
+    model = GptOssModel(cfg, _engine_cfg(), resolver=_resolver())
+
+    assert model._attn_cfg.attn_sink is True
+    assert model._attn_cfg.has_bias == 1
+    assert model._window_sizes == [256, 0]
+    assert model._expert_nums == [4, 4]
diff --git a/tests/test_lmdeploy/test_turbomind/test_transform_tensors.py b/tests/test_lmdeploy/test_turbomind/test_transform_tensors.py
new file mode 100644
index 0000000000..541bb1c4da
--- /dev/null
+++ b/tests/test_lmdeploy/test_turbomind/test_transform_tensors.py
@@ -0,0 +1,362 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""Tests for the @transform_output_dim decorator."""
+
+from __future__ import annotations
+
+import importlib
+import importlib.util
+import os
+import sys
+import types
+
+import torch
+
+# ---------------------------------------------------------------------------
+# Bootstrap: make _turbomind available as a lightweight stub so that
+# ``lmdeploy.turbomind.linear`` and ``_base`` can be imported without
+# the real C extension.
+# ---------------------------------------------------------------------------
+
+_repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..'))
+
+
+def _setup_fake_tm():
+    """Register a minimal ``_turbomind`` stub in ``sys.modules``."""
+    if '_turbomind' in sys.modules:
+        return
+
+    tm = types.ModuleType('_turbomind')
+
+    class DataType:
+        TYPE_FP32 = 0
+        TYPE_FP16 = 1
+        TYPE_BF16 = 2
+        TYPE_INVALID = 3
+        TYPE_INT32 = 4
+        TYPE_INT64 = 5
+        TYPE_INT8 = 6
+        TYPE_UINT8 = 7
+        TYPE_UINT4 = 8
+        TYPE_FP8_E4M3 = 9
+        TYPE_FP4_E2M1 = 10
+
+    tm.DataType = DataType
+
+    # Stub functions / classes referenced throughout turbomind/
+    tm.create_module = lambda cfg: None
+    tm.LinearConfig = type('LinearConfig', (), {})()
+    tm.ResolveLinearWeightFormat = lambda *a, **kw: None
+
+    sys.modules['_turbomind'] = tm
+
+
+_setup_fake_tm()
+
+# ---------------------------------------------------------------------------
+# Import modules under test by loading their files directly so we avoid
+# triggering the ``lmdeploy.turbomind`` package __init__ (which drags in
+# the real TurboMind runtime).
+# ---------------------------------------------------------------------------
+
+# Ensure ``lmdeploy`` top-level is importable.
+import lmdeploy  # noqa: F401
+
+# Register the sub-package stubs so that relative imports resolve.
+_turbomind_pkg = sys.modules.get('lmdeploy.turbomind')
+if _turbomind_pkg is None:
+    _turbomind_pkg = types.ModuleType('lmdeploy.turbomind')
+    _turbomind_pkg.__path__ = [os.path.join(_repo_root, 'lmdeploy', 'turbomind')]
+    _turbomind_pkg.__package__ = 'lmdeploy.turbomind'
+    sys.modules['lmdeploy.turbomind'] = _turbomind_pkg
+
+# (No longer need 'lmdeploy.turbomind.deploy' stub -- deploy/ was promoted.)
+
+
+def _load_module_from_file(mod_name: str, file_path: str):
+    """Load a Python module from *file_path* and register it as *mod_name*."""
+    spec = importlib.util.spec_from_file_location(mod_name, file_path)
+    mod = importlib.util.module_from_spec(spec)
+    sys.modules[mod_name] = mod
+    spec.loader.exec_module(mod)
+    return mod
+
+
+# Load linear.py first — weight_format.py imports from .linear at module level.
+_linear_path = os.path.join(_repo_root, 'lmdeploy', 'turbomind', 'linear.py')
+_linear_mod = _load_module_from_file('lmdeploy.turbomind.linear', _linear_path)
+Linear = _linear_mod.Linear
+
+# Load weight_format (needed by _base for TrivialFormat)
+_wf_path = os.path.join(_repo_root, 'lmdeploy', 'turbomind', 'weight_format.py')
+_load_module_from_file('lmdeploy.turbomind.weight_format', _wf_path)
+
+# Load builder/_base.py
+_base_path = os.path.join(_repo_root, 'lmdeploy', 'turbomind', 'builders', '_base.py')
+_base_mod = _load_module_from_file('lmdeploy.turbomind.builders._base', _base_path)
+transform_output_dim = _linear_mod.transform_output_dim
+transform_input_dim = _linear_mod.transform_input_dim
+
+# Register builder sub-package
+_builder_pkg = sys.modules.get('lmdeploy.turbomind.builders')
+if _builder_pkg is None:
+    _builder_pkg = types.ModuleType('lmdeploy.turbomind.builders')
+    _builder_pkg.__path__ = [os.path.join(_repo_root, 'lmdeploy', 'turbomind', 'builders')]
+    _builder_pkg.__package__ = 'lmdeploy.turbomind.builders'
+    sys.modules['lmdeploy.turbomind.builders'] = _builder_pkg
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _make_linear(out_dim: int, in_dim: int | None = None,
+                 has_bias: bool = False) -> Linear:
+    """Create a trivial Linear for testing.
+
+    If *in_dim* is given the weight is 2-D (in_dim, out_dim); otherwise it is 1-D (out_dim,) -- simulating a bias-only
+    tensor.
+    """
+    tensors: dict[str, torch.Tensor] = {}
+    if in_dim is not None:
+        tensors['weight'] = torch.randn(in_dim, out_dim)
+    else:
+        tensors['weight'] = torch.randn(out_dim)
+    if has_bias:
+        tensors['bias'] = torch.randn(out_dim)
+    return Linear(tensors=tensors,
+                  weight_format='placeholder')
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+
+class TestTransformTensors:
+
+    # -- 1-in / 1-out -------------------------------------------------------
+
+    def test_1in_1out_2d_weight_only(self):
+        """1-in/1-out with a 2-D weight tensor."""
+
+        @transform_output_dim
+        def double(x: torch.Tensor) -> torch.Tensor:
+            return x * 2
+
+        lin = _make_linear(out_dim=8, in_dim=4)
+        result = double(lin)
+        assert isinstance(result, Linear)
+        assert set(result.tensors) == {'weight'}
+        assert result.tensors['weight'].shape == (4, 8)
+        assert torch.allclose(result.tensors['weight'],
+                              lin.tensors['weight'] * 2)
+
+    def test_1in_1out_1d_bias_only(self):
+        """1-in/1-out with a 1-D tensor (bias-only shape)."""
+
+        @transform_output_dim
+        def add_one(x: torch.Tensor) -> torch.Tensor:
+            return x + 1.0
+
+        lin = _make_linear(out_dim=6)  # 1-D weight
+        result = add_one(lin)
+        assert isinstance(result, Linear)
+        assert result.tensors['weight'].shape == (6,)
+        assert torch.allclose(result.tensors['weight'],
+                              lin.tensors['weight'] + 1.0)
+
+    def test_1in_1out_mixed_dims(self):
+        """1-in/1-out with 2-D weight + 1-D bias."""
+
+        @transform_output_dim
+        def negate(x: torch.Tensor) -> torch.Tensor:
+            return -x
+
+        lin = _make_linear(out_dim=5, in_dim=3, has_bias=True)
+        result = negate(lin)
+        assert isinstance(result, Linear)
+        assert set(result.tensors) == {'weight', 'bias'}
+        # weight stays 2-D
+        assert result.tensors['weight'].shape == (3, 5)
+        assert torch.allclose(result.tensors['weight'],
+                              -lin.tensors['weight'])
+        # bias stays 1-D
+        assert result.tensors['bias'].shape == (5,)
+        assert torch.allclose(result.tensors['bias'],
+                              -lin.tensors['bias'])
+
+    # -- 1-in / 2-out (split) -----------------------------------------------
+
+    def test_1in_2out_split(self):
+        """1-in/2-out: split one Linear into two along last dim."""
+
+        @transform_output_dim
+        def split_in_half(x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+            mid = x.shape[-1] // 2
+            return x[..., :mid], x[..., mid:]
+
+        lin = _make_linear(out_dim=8, in_dim=4, has_bias=True)
+        a, b = split_in_half(lin)
+        assert isinstance(a, Linear)
+        assert isinstance(b, Linear)
+        assert a.tensors['weight'].shape == (4, 4)
+        assert b.tensors['weight'].shape == (4, 4)
+        assert a.tensors['bias'].shape == (4,)
+        assert b.tensors['bias'].shape == (4,)
+
+    def test_1in_2out_1d_only(self):
+        """1-in/2-out with 1-D tensors."""
+
+        @transform_output_dim
+        def split_1d(x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+            mid = x.shape[-1] // 2
+            return x[..., :mid], x[..., mid:]
+
+        lin = _make_linear(out_dim=6)  # 1-D
+        a, b = split_1d(lin)
+        assert a.tensors['weight'].shape == (3,)
+        assert b.tensors['weight'].shape == (3,)
+
+    # -- multi-in / 1-out (concat) ------------------------------------------
+
+    def test_multi_in_1out_concat(self):
+        """Multi-in/1-out: concatenate three Linears along last dim."""
+
+        @transform_output_dim
+        def concat3(a: torch.Tensor, b: torch.Tensor,
+                    c: torch.Tensor) -> torch.Tensor:
+            return torch.cat([a, b, c], dim=-1)
+
+        la = _make_linear(out_dim=4, in_dim=3, has_bias=True)
+        lb = _make_linear(out_dim=4, in_dim=3, has_bias=True)
+        lc = _make_linear(out_dim=4, in_dim=3, has_bias=True)
+        result = concat3(la, lb, lc)
+        assert isinstance(result, Linear)
+        assert result.tensors['weight'].shape == (3, 12)
+        assert result.tensors['bias'].shape == (12,)
+
+    # -- optional tensor arg -------------------------------------------------
+
+    def test_optional_tensor_none(self):
+        """Optional tensor arg passed as None -> inner fn receives None."""
+
+        @transform_output_dim
+        def maybe_add(x: torch.Tensor,
+                      y: torch.Tensor | None) -> torch.Tensor:
+            if y is None:
+                return x
+            return x + y
+
+        lin = _make_linear(out_dim=4, in_dim=3)
+        result = maybe_add(lin, None)
+        assert isinstance(result, Linear)
+        assert torch.allclose(result.tensors['weight'],
+                              lin.tensors['weight'])
+
+    def test_optional_tensor_provided(self):
+        """Optional tensor arg provided -> inner fn receives the tensor."""
+
+        @transform_output_dim
+        def maybe_add(x: torch.Tensor,
+                      y: torch.Tensor | None) -> torch.Tensor:
+            return x + y
+
+        la = _make_linear(out_dim=4, in_dim=3)
+        lb = _make_linear(out_dim=4, in_dim=3)
+        result = maybe_add(la, lb)
+        assert isinstance(result, Linear)
+        assert torch.allclose(result.tensors['weight'],
+                              la.tensors['weight'] + lb.tensors['weight'])
+
+    # -- format propagation --------------------------------------------------
+
+    def test_format_propagation(self):
+        """Output inherits weight_format from first input."""
+
+        @transform_output_dim
+        def identity(x: torch.Tensor) -> torch.Tensor:
+            return x
+
+        lin = _make_linear(out_dim=4, in_dim=3)
+        object.__setattr__(lin, 'weight_format', 'fake_fmt')
+        result = identity(lin)
+        assert result.weight_format == 'fake_fmt'
+
+    # -- kwargs passthrough ---------------------------------------------------
+
+    def test_kwargs_passthrough(self):
+        """Non-tensor kwargs are forwarded unchanged."""
+
+        @transform_output_dim
+        def scale(x: torch.Tensor, factor: float) -> torch.Tensor:
+            return x * factor
+
+        lin = _make_linear(out_dim=4, in_dim=3)
+        result = scale(lin, factor=3.0)
+        assert isinstance(result, Linear)
+        assert torch.allclose(result.tensors['weight'],
+                              lin.tensors['weight'] * 3.0)
+
+
+class TestTransformInputDim:
+
+    def test_2d_transformed(self):
+        """2-D tensors are passed through the inner function."""
+
+        @transform_input_dim
+        def pad_first_dim(tensor: torch.Tensor,
+                          *, target: int) -> torch.Tensor:
+            return torch.nn.functional.pad(
+                tensor, [0, 0, 0, target - tensor.size(0)])
+
+        lin = _make_linear(out_dim=4, in_dim=2)
+        result = pad_first_dim(lin, target=6)
+        assert isinstance(result, Linear)
+        assert result.tensors['weight'].shape == (6, 4)
+
+    def test_1d_passthrough(self):
+        """1-D tensors (bias) pass through unchanged."""
+
+        @transform_input_dim
+        def pad_first_dim(tensor: torch.Tensor,
+                          *, target: int) -> torch.Tensor:
+            return torch.nn.functional.pad(
+                tensor, [0, 0, 0, target - tensor.size(0)])
+
+        lin = _make_linear(out_dim=4)  # 1-D weight
+        result = pad_first_dim(lin, target=6)
+        assert isinstance(result, Linear)
+        assert result.tensors['weight'].shape == (4,)  # unchanged
+
+    def test_mixed_dims_2d_transformed_1d_passthrough(self):
+        """2-D weight is transformed; 1-D bias passes through."""
+
+        @transform_input_dim
+        def double_input_dim(tensor: torch.Tensor) -> torch.Tensor:
+            return tensor.repeat(2, 1)
+
+        lin = _make_linear(out_dim=4, in_dim=3, has_bias=True)
+        result = double_input_dim(lin)
+        assert isinstance(result, Linear)
+        assert set(result.tensors) == {'weight', 'bias'}
+        assert result.tensors['weight'].shape == (6, 4)  # doubled
+        assert result.tensors['bias'].shape == (4,)  # unchanged
+
+    def test_1in_2out_distributes_1d(self):
+        """Multi-output: 1-D tensors duplicated into all output buckets."""
+
+        @transform_input_dim
+        def split_input(tensor: torch.Tensor
+                        ) -> tuple[torch.Tensor, torch.Tensor]:
+            mid = tensor.size(0) // 2
+            return tensor[:mid], tensor[mid:]
+
+        lin = _make_linear(out_dim=4, in_dim=6, has_bias=True)
+        a, b = split_input(lin)
+        assert isinstance(a, Linear)
+        assert isinstance(b, Linear)
+        assert a.tensors['weight'].shape == (3, 4)
+        assert b.tensors['weight'].shape == (3, 4)
+        assert a.tensors['bias'].shape == (4,)  # duplicated
+        assert b.tensors['bias'].shape == (4,)  # duplicated
diff --git a/tests/test_lmdeploy/test_turbomind/test_weight_format_resolver.py b/tests/test_lmdeploy/test_turbomind/test_weight_format_resolver.py
new file mode 100644
index 0000000000..2810205ad4
--- /dev/null
+++ b/tests/test_lmdeploy/test_turbomind/test_weight_format_resolver.py
@@ -0,0 +1,253 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""Tests for WeightFormatResolver dispatch logic.
+
+Uses a lightweight fake WeightFormat subclass that stubs out
+``make_data_format`` so the resolver can be exercised without the real
+``_turbomind`` extension.
+"""
+from __future__ import annotations
+
+import importlib
+import importlib.util
+import os
+import sys
+import types
+
+import pytest
+import torch
+
+# ---------------------------------------------------------------------------
+# _turbomind stub (same pattern as test_transform_tensors.py)
+# ---------------------------------------------------------------------------
+
+_repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..'))
+
+
+def _setup_fake_tm():
+    """Ensure ``_turbomind`` in sys.modules has every attribute the resolver
+    and weight_format class bodies touch.
+
+    Idempotent: augments whatever is
+    already there so running after test_transform_tensors.py (which sets up
+    a minimal stub) still leaves a usable module.
+    """
+    tm = sys.modules.get('_turbomind')
+    if tm is None:
+        tm = types.ModuleType('_turbomind')
+        sys.modules['_turbomind'] = tm
+
+    dt = getattr(tm, 'DataType', None)
+    if dt is None:
+        class DataType:
+            pass
+        dt = DataType
+        tm.DataType = dt
+
+    # Class-body references in weight_format.py and builder/_base.py
+    # (_STR_TO_DTYPE, _TORCH_TO_CPP) need these specific names present at
+    # module load time.
+    for name, val in (('TYPE_FP32', 0), ('TYPE_FP16', 1), ('TYPE_BF16', 2),
+                      ('TYPE_INVALID', 3), ('TYPE_INT32', 4),
+                      ('TYPE_INT64', 5), ('TYPE_INT8', 6),
+                      ('TYPE_UINT8', 7), ('TYPE_UINT4', 10),
+                      ('TYPE_FP8_E4M3', 11), ('TYPE_FP4_E2M1', 12)):
+        if not hasattr(dt, name):
+            setattr(dt, name, val)
+
+    if not hasattr(tm, 'ResolveLinearWeightFormat'):
+        tm.ResolveLinearWeightFormat = lambda d, w, bi, bo: ('DataFormat', d, w, bi, bo)
+
+
+_setup_fake_tm()
+
+# Register package stubs.
+import lmdeploy  # noqa: F401
+
+for _pkg in ('lmdeploy.turbomind',):
+    if _pkg not in sys.modules:
+        mod = types.ModuleType(_pkg)
+        mod.__path__ = [os.path.join(_repo_root, *_pkg.split('.'))]
+        mod.__package__ = _pkg
+        sys.modules[_pkg] = mod
+
+
+def _load(mod_name, file_rel_path):
+    path = os.path.join(_repo_root, *file_rel_path.split('/'))
+    spec = importlib.util.spec_from_file_location(mod_name, path)
+    mod = importlib.util.module_from_spec(spec)
+    sys.modules[mod_name] = mod
+    spec.loader.exec_module(mod)
+    return mod
+
+
+_linear_mod = _load('lmdeploy.turbomind.linear',
+                    'lmdeploy/turbomind/linear.py')
+_wf_mod = _load('lmdeploy.turbomind.weight_format',
+                'lmdeploy/turbomind/weight_format.py')
+
+Linear = _linear_mod.Linear
+WeightFormat = _wf_mod.WeightFormat
+WeightFormatResolver = _wf_mod.WeightFormatResolver
+
+
+# ---------------------------------------------------------------------------
+# Fake format used by the tests
+# ---------------------------------------------------------------------------
+
+
+class _FakeQuant(WeightFormat):
+    """Accepts when a ``.qfoo`` tensor is present.
+
+    ``normalize`` is identity.
+    """
+    name = 'fakeq'
+    suffix_map = {'.qfoo': 'weight', '.scales': 'scales', '.bias': 'bias'}
+    weight_dtype = 0  # TYPE_FP32 from our stub
+    has_zero_point = False
+
+    def __init__(self, *, block_in=None, block_out=None):
+        super().__init__(block_in=block_in, block_out=block_out)
+
+    def accepts(self, available):
+        return '.qfoo' in available
+
+    def normalize(self, x, kind):
+        return x
+
+
+class _FakeQuantWithZeros(_FakeQuant):
+    name = 'fakeqz'
+    suffix_map = {'.qfoo': 'weight', '.scales': 'scales',
+                  '.qzeros': 'zeros', '.bias': 'bias'}
+    has_zero_point = True
+
+    def synthesize_zeros(self, scales):
+        return torch.zeros_like(scales)
+
+
+class _FakeTrivial(WeightFormat):
+    name = 'faketr'
+    suffix_map = {'.weight': 'weight', '.bias': 'bias'}
+    weight_dtype = None
+    has_zero_point = False
+
+    def accepts(self, available):
+        return available.keys() <= {'.weight', '.bias'} and '.weight' in available
+
+    def normalize(self, x, kind):
+        return x
+
+    def dequant(self, tensors, data_type):
+        return tensors
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+
+class TestResolveQuantized:
+
+    def _make_resolver(self):
+        return WeightFormatResolver(
+            data_type=0,
+            formats=[_FakeQuant(), _FakeTrivial()])
+
+    def test_quant_prefix_picks_quant_format(self):
+        params = {
+            'layer.qfoo':   torch.randn(4, 4),
+            'layer.scales': torch.randn(1, 4),
+        }
+        lin = self._make_resolver().resolve(params, 'layer')
+        assert isinstance(lin.weight_format, _FakeQuant)
+        assert set(lin.tensors) == {'weight', 'scales'}
+
+    def test_trivial_prefix_falls_through(self):
+        params = {'layer.weight': torch.randn(4, 4)}
+        lin = self._make_resolver().resolve(params, 'layer')
+        assert isinstance(lin.weight_format, _FakeTrivial)
+
+
+class TestResolveFailureModes:
+
+    def _make_resolver(self):
+        return WeightFormatResolver(
+            data_type=0,
+            formats=[_FakeQuant(), _FakeTrivial()])
+
+    def test_missing_prefix_default_raises_key_error(self):
+        with pytest.raises(KeyError, match='no checkpoint tensors found'):
+            self._make_resolver().resolve({}, 'missing.prefix')
+
+    def test_missing_prefix_optional_returns_none(self):
+        assert self._make_resolver().resolve(
+            {}, 'missing.prefix', optional=True) is None
+
+    def test_tensors_present_no_match_raises_value_error(self):
+        class _PickyTrivial(_FakeTrivial):
+            def accepts(self, available):
+                return False
+
+        resolver = WeightFormatResolver(
+            data_type=0,
+            formats=[_FakeQuant(), _PickyTrivial()])
+        params = {'layer.weight': torch.randn(4, 4)}
+        with pytest.raises(ValueError, match='no weight format accepts'):
+            resolver.resolve(params, 'layer')
+
+
+class TestIndexedProbe:
+
+    def test_index_slices_available_tensors(self):
+        resolver = WeightFormatResolver(
+            data_type=0, formats=[_FakeTrivial()])
+        params = {'experts.weight': torch.arange(24).reshape(3, 4, 2).float()}
+        lin = resolver.resolve(params, 'experts', index=1)
+        assert lin.tensors['weight'].shape == (4, 2)
+        torch.testing.assert_close(
+            lin.tensors['weight'],
+            torch.arange(8, 16).reshape(4, 2).float())
+
+
+class TestZerosSynthesis:
+
+    def test_synthesize_zeros_called_when_missing(self):
+        params = {
+            'layer.qfoo':   torch.randn(4, 4),
+            'layer.scales': torch.ones(1, 4),
+        }
+        resolver = WeightFormatResolver(
+            data_type=0, formats=[_FakeQuantWithZeros()])
+        lin = resolver.resolve(params, 'layer')
+        assert 'zeros' in lin.tensors
+        torch.testing.assert_close(
+            lin.tensors['zeros'], torch.zeros(1, 4))
+
+    def test_synthesize_zeros_skipped_when_present(self):
+        scales   = torch.ones(1, 4)
+        supplied = torch.full_like(scales, 5.0)
+        params = {
+            'layer.qfoo':   torch.randn(4, 4),
+            'layer.scales': scales,
+            'layer.qzeros': supplied,
+        }
+        resolver = WeightFormatResolver(
+            data_type=0, formats=[_FakeQuantWithZeros()])
+        lin = resolver.resolve(params, 'layer')
+        torch.testing.assert_close(lin.tensors['zeros'], supplied)
+
+
+class TestEquality:
+
+    def test_same_class_same_blocks_equal(self):
+        a = _FakeQuant(block_in=128)
+        b = _FakeQuant(block_in=128)
+        assert a == b
+        assert hash(a) == hash(b)
+        assert {a, b} == {a}
+
+    def test_different_blocks_unequal(self):
+        assert _FakeQuant(block_in=128) != _FakeQuant(block_in=64)
+
+    def test_different_classes_unequal(self):
+        assert _FakeQuant() != _FakeTrivial()