1 change: 1 addition & 0 deletions docs/en/index.rst
@@ -71,6 +71,7 @@ Documentation

   multi_modal/vl_pipeline.md
   multi_modal/api_server_vl.md
   multi_modal/multimodal_inputs.md
   multi_modal/index.rst

.. _quantization:
6 changes: 0 additions & 6 deletions docs/en/multi_modal/index.rst
@@ -1,12 +1,6 @@
Vision-Language Models
=================================

.. toctree::
   :maxdepth: 2
   :caption: Guides

   multimodal_inputs.md

.. toctree::
   :maxdepth: 2
   :caption: Examples
86 changes: 83 additions & 3 deletions docs/en/multi_modal/multimodal_inputs.md
@@ -9,6 +9,7 @@ LMDeploy uses the OpenAI message format for all modalities. Each content item in
| Text | `text` | — |
| Image | `image_url` | `image_url.url` |
| Video | `video_url` | `video_url.url` |
| Audio | `audio_url` | `audio_url.url` |
| Time Series | `time_series_url` | `time_series_url.url` |

All examples below target the lmdeploy OpenAI-compatible API server. Start it with:
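A minimal sketch, assuming the standard `lmdeploy serve api_server` CLI and the default port the examples use (the model path is an illustrative placeholder):

```bash
# Sketch: serve an audio-capable checkpoint on the port the examples below assume.
# The model path is an illustrative placeholder.
lmdeploy serve api_server Qwen/Qwen3-Omni-30B-A3B-Instruct --server-port 23333
```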
@@ -133,7 +134,7 @@ ______________________________________________________________________

## Single Video

> **Note:** Native video input is currently supported for **Qwen3-VL**, **Qwen3.5**, and **InternS1-Pro** models only.
> **Note:** Native video input is currently supported for **Qwen3-VL**, **Qwen3.5**, **Qwen3-Omni**, and **InternS1-Pro** models only.

<details>
<summary>Complete example</summary>
@@ -176,7 +177,7 @@ ______________________________________________________________________

## Multiple Videos

> **Note:** Native video input is currently supported for **Qwen3-VL**, **Qwen3.5**, and **InternS1-Pro** models only.
> **Note:** Native video input is currently supported for **Qwen3-VL**, **Qwen3.5**, **Qwen3-Omni**, and **InternS1-Pro** models only.

<details>
<summary>Complete example</summary>
@@ -222,7 +223,7 @@ ______________________________________________________________________

## Mixed Image and Video

> **Note:** Native video input is currently supported for **Qwen3-VL**, **Qwen3.5**, and **InternS1-Pro** models only.
> **Note:** Native video input is currently supported for **Qwen3-VL**, **Qwen3.5**, **Qwen3-Omni**, and **InternS1-Pro** models only.

<details>
<summary>Complete example</summary>
@@ -266,6 +267,85 @@ print(response.choices[0].message.content)

______________________________________________________________________

## Single Audio

> **Note:** Audio input is currently supported for **Qwen3-Omni** models only.

<details>
<summary>Complete example</summary>

```python
from openai import OpenAI

client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
model_name = client.models.list().data[0].id

response = client.chat.completions.create(
    model=model_name,
    messages=[{
        'role': 'user',
        'content': [
            {
                'type': 'audio_url',
                'audio_url': {
                    'url': 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/cough.wav',
                },
            },
            {
                'type': 'text',
                'text': 'Describe this audio.',
            },
        ],
    }],
    temperature=0.8,
    top_p=0.8,
)
print(response.choices[0].message.content)
```

</details>

______________________________________________________________________

## Multiple Audios

> **Note:** Audio input is currently supported for **Qwen3-Omni** models only.

<details>
<summary>Complete example</summary>

```python
from openai import OpenAI

client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
model_name = client.models.list().data[0].id

audio_url_1 = 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/cough.wav'
audio_url_2 = 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/cough.wav'

response = client.chat.completions.create(
    model=model_name,
    messages=[{
        'role': 'user',
        'content': [
            {'type': 'audio_url', 'audio_url': {'url': audio_url_1}},
            {'type': 'audio_url', 'audio_url': {'url': audio_url_2}},
            {
                'type': 'text',
                'text': 'Compare these two audio clips. What are the similarities and differences?',
            },
        ],
    }],
    temperature=0.8,
    top_p=0.8,
)
print(response.choices[0].message.content)
```

</details>

______________________________________________________________________

## Time Series

> **Note:** Time series input is currently supported for the **InternS1-Pro** model only.
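The request follows the same content-item pattern as the other modalities. A minimal sketch, assuming a hypothetical data URL (`time_series_url` and its `url` field come from the table above):

```python
from openai import OpenAI

client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
model_name = client.models.list().data[0].id

response = client.chat.completions.create(
    model=model_name,
    messages=[{
        'role': 'user',
        'content': [
            # 'https://example.com/series.json' is a placeholder, not a real dataset.
            {'type': 'time_series_url', 'time_series_url': {'url': 'https://example.com/series.json'}},
            {'type': 'text', 'text': 'Describe the trend in this time series.'},
        ],
    }],
    temperature=0.8,
    top_p=0.8,
)
print(response.choices[0].message.content)
```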
2 changes: 1 addition & 1 deletion docs/en/multi_modal/vl_pipeline.md
@@ -10,7 +10,7 @@ Moreover, we will provide practical inference examples tailored to scenarios wit

Using the pipeline interface to infer other VLM models is similar, with the main difference being the configuration and installation dependencies of the models. You can read [here](https://lmdeploy.readthedocs.io/en/latest/multi_modal/index.html) for environment installation and configuration methods for different models.

> **See also:** [Multi-Modal Inputs](multimodal_inputs.md) — message format reference for all modalities (image, video, time series) with OpenAI-style examples.
> **See also:** [Multi-Modal Inputs](multimodal_inputs.md) — message format reference for all modalities (image, video, audio, time series) with OpenAI-style examples.

## A 'Hello, world' example
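A minimal sketch of the offline pipeline, assuming the standard `lmdeploy.pipeline` and `lmdeploy.vl.load_image` APIs (the model path and image URL are illustrative placeholders):

```python
from lmdeploy import pipeline
from lmdeploy.vl import load_image

# Model path and image URL are illustrative placeholders.
pipe = pipeline('Qwen/Qwen3-VL-8B-Instruct')
image = load_image('https://example.com/tiger.jpeg')

# A (prompt, image) tuple is a single multimodal query.
response = pipe(('describe this image', image))
print(response)
```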

1 change: 1 addition & 0 deletions docs/zh_cn/index.rst
@@ -71,6 +71,7 @@ The LMDeploy toolbox provides the following core features:

   multi_modal/vl_pipeline.md
   multi_modal/api_server_vl.md
   multi_modal/multimodal_inputs.md
   multi_modal/index.rst


6 changes: 0 additions & 6 deletions docs/zh_cn/multi_modal/index.rst
@@ -1,12 +1,6 @@
Vision-Language Models
=================================

.. toctree::
   :maxdepth: 2
   :caption: Guides

   multimodal_inputs.md

.. toctree::
   :maxdepth: 2
   :caption: Examples
86 changes: 83 additions & 3 deletions docs/zh_cn/multi_modal/multimodal_inputs.md
@@ -9,6 +9,7 @@ LMDeploy uses the OpenAI message format for all modalities. Each content item in
| Text | `text` | — |
| Image | `image_url` | `image_url.url` |
| Video | `video_url` | `video_url.url` |
| Audio | `audio_url` | `audio_url.url` |
| Time Series | `time_series_url` | `time_series_url.url` |

All examples below target the lmdeploy OpenAI-compatible API server. Start the server with:
@@ -133,7 +134,7 @@ ______________________________________________________________________

## Single Video

> **Note:** Native video input is currently supported for **Qwen3-VL**, **Qwen3.5**, and **InternS1-Pro** models only.
> **Note:** Native video input is currently supported for **Qwen3-VL**, **Qwen3.5**, **Qwen3-Omni**, and **InternS1-Pro** models only.

<details>
<summary>Complete example</summary>
@@ -176,7 +177,7 @@ ______________________________________________________________________

## Multiple Videos

> **Note:** Native video input is currently supported for **Qwen3-VL**, **Qwen3.5**, and **InternS1-Pro** models only.
> **Note:** Native video input is currently supported for **Qwen3-VL**, **Qwen3.5**, **Qwen3-Omni**, and **InternS1-Pro** models only.

<details>
<summary>Complete example</summary>
@@ -222,7 +223,7 @@ ______________________________________________________________________

## Mixed Image and Video

> **Note:** Native video input is currently supported for **Qwen3-VL**, **Qwen3.5**, and **InternS1-Pro** models only.
> **Note:** Native video input is currently supported for **Qwen3-VL**, **Qwen3.5**, **Qwen3-Omni**, and **InternS1-Pro** models only.

<details>
<summary>Complete example</summary>
@@ -266,6 +267,85 @@ print(response.choices[0].message.content)

______________________________________________________________________

## Single Audio

> **Note:** Audio input is currently supported for **Qwen3-Omni** models only.

<details>
<summary>Complete example</summary>

```python
from openai import OpenAI

client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
model_name = client.models.list().data[0].id

response = client.chat.completions.create(
    model=model_name,
    messages=[{
        'role': 'user',
        'content': [
            {
                'type': 'audio_url',
                'audio_url': {
                    'url': 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/cough.wav',
                },
            },
            {
                'type': 'text',
                'text': 'Describe this audio.',
            },
        ],
    }],
    temperature=0.8,
    top_p=0.8,
)
print(response.choices[0].message.content)
```

</details>

______________________________________________________________________

## Multiple Audios

> **Note:** Audio input is currently supported for **Qwen3-Omni** models only.

<details>
<summary>Complete example</summary>

```python
from openai import OpenAI

client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
model_name = client.models.list().data[0].id

audio_url_1 = 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/cough.wav'
audio_url_2 = 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/cough.wav'

response = client.chat.completions.create(
    model=model_name,
    messages=[{
        'role': 'user',
        'content': [
            {'type': 'audio_url', 'audio_url': {'url': audio_url_1}},
            {'type': 'audio_url', 'audio_url': {'url': audio_url_2}},
            {
                'type': 'text',
                'text': 'Compare these two audio clips. What are the similarities and differences?',
            },
        ],
    }],
    temperature=0.8,
    top_p=0.8,
)
print(response.choices[0].message.content)
```

</details>

______________________________________________________________________

## Time Series

> **Note:** Time series input is currently supported for the **InternS1-Pro** model only.
2 changes: 1 addition & 1 deletion docs/zh_cn/multi_modal/vl_pipeline.md
@@ -10,7 +10,7 @@ LMDeploy abstracts the complex inference process of vision-language models (VLMs) into a simple

Using the pipeline interface to infer other VLM models is much the same; the main difference lies in each model's configuration and installation dependencies. You can read [here](https://lmdeploy.readthedocs.io/zh-cn/latest/multi_modal/) for the environment installation and configuration of different models.

> **See also:** [Multi-Modal Inputs](multimodal_inputs.md) — message format reference for all modalities (image, video, time series) with OpenAI-style examples.
> **See also:** [Multi-Modal Inputs](multimodal_inputs.md) — message format reference for all modalities (image, video, audio, time series) with OpenAI-style examples.

## "Hello, world" 示例

6 changes: 3 additions & 3 deletions lmdeploy/archs.py
@@ -111,9 +111,9 @@ def check_vl_llm(backend: str, config: dict) -> bool:
        'InternVLChatModel', 'MiniCPMV', 'LlavaForConditionalGeneration', 'LlavaNextForConditionalGeneration',
        'Phi3VForCausalLM', 'Qwen2VLForConditionalGeneration', 'Qwen2_5_VLForConditionalGeneration',
        'Qwen3VLForConditionalGeneration', 'Qwen3VLMoeForConditionalGeneration', 'Qwen3_5ForConditionalGeneration',
        'Qwen3_5MoeForConditionalGeneration', 'MllamaForConditionalGeneration', 'MolmoForCausalLM',
        'Gemma3ForConditionalGeneration', 'Llama4ForConditionalGeneration', 'InternVLForConditionalGeneration',
        'InternS1ForConditionalGeneration', 'InternS1ProForConditionalGeneration',
        'Qwen3_5MoeForConditionalGeneration', 'Qwen3OmniMoeForConditionalGeneration', 'MllamaForConditionalGeneration',
        'MolmoForCausalLM', 'Gemma3ForConditionalGeneration', 'Llama4ForConditionalGeneration',
        'InternVLForConditionalGeneration', 'InternS1ForConditionalGeneration', 'InternS1ProForConditionalGeneration',
        'InternS1_1_ForConditionalGeneration', 'Glm4vForConditionalGeneration'
    ])
    if arch == 'QWenLMHeadModel' and 'visual' in config:
13 changes: 12 additions & 1 deletion lmdeploy/model.py
@@ -687,8 +687,19 @@ class HFChatTemplate(BaseChatTemplate):
    def __init__(self, model_path: str = '', trust_remote_code: bool = False, **kwargs):
        self.model_path = model_path
        try:
            from transformers import AutoTokenizer
            from transformers import AutoProcessor, AutoTokenizer
            self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=trust_remote_code)

            # Some tokenizers do not have a chat_template; in that case, try to get the
            # chat_template from the processor. If that still fails, fall back to BaseChatTemplate.
            if getattr(self.tokenizer, 'chat_template', None) is None:
                try:
                    processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=trust_remote_code)
                    self.tokenizer.chat_template = getattr(processor, 'chat_template', None)
                except Exception as e:
                    logger.warning(f'Failed to load processor from {model_path} for chat template. '
                                   f'Fallback to tokenizer only. Error: {e}')

            # Verify that the model can apply_chat_template with different roles.
            self.user_start, self.user_end, _, _ = self._user_instruction()
            self.assistant_start, self.assistant_end, _, _ = self._assistant_instruction()
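What the fallback buys, as a standalone sketch (the model path is an illustrative placeholder; it assumes a checkpoint whose processor carries the chat template while its tokenizer does not):

```python
from transformers import AutoProcessor, AutoTokenizer

model_path = 'Qwen/Qwen3-Omni-30B-A3B-Instruct'  # illustrative placeholder
tokenizer = AutoTokenizer.from_pretrained(model_path)

if getattr(tokenizer, 'chat_template', None) is None:
    # Processor-based checkpoints often store the template on the processor.
    processor = AutoProcessor.from_pretrained(model_path)
    tokenizer.chat_template = getattr(processor, 'chat_template', None)

# With the template in place, role-tagged messages render to a prompt string.
prompt = tokenizer.apply_chat_template(
    [{'role': 'user', 'content': 'hello'}],
    tokenize=False,
    add_generation_prompt=True,
)
```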
19 changes: 19 additions & 0 deletions lmdeploy/pytorch/configurations/qwen3_omni.py
@@ -0,0 +1,19 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .builder import AutoModelConfigBuilder
from .default import DefaultModelConfigBuilder


class Qwen3OmniModelConfigBuilder(AutoModelConfigBuilder):

    @classmethod
    def condition(cls, hf_config):
        """config."""
        return hf_config.model_type == 'qwen3_omni_moe'

    @classmethod
    def build(cls, hf_config, model_path: str = None, **kwargs):
        """build."""
        cfg = DefaultModelConfigBuilder.build(hf_config.thinker_config.text_config, model_path, **kwargs)
        cfg.hf_config = hf_config
        cfg.use_mrope = True
        return cfg
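How the builder is selected and what it derives, as a sketch (assumes a Qwen3-Omni checkpoint's Hugging Face config; the model path is an illustrative placeholder):

```python
from transformers import AutoConfig

# Illustrative checkpoint; any config with model_type == 'qwen3_omni_moe' matches.
hf_config = AutoConfig.from_pretrained('Qwen/Qwen3-Omni-30B-A3B-Instruct', trust_remote_code=True)

assert hf_config.model_type == 'qwen3_omni_moe'      # what condition() checks
text_config = hf_config.thinker_config.text_config   # what build() hands to the default builder
```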
7 changes: 7 additions & 0 deletions lmdeploy/pytorch/models/module_map.py
@@ -190,6 +190,13 @@
    'Qwen3_5MTPModel': f'{LMDEPLOY_PYTORCH_MODEL_PATH}.qwen3_5_mtp.Qwen3_5MTPModel',
})

# qwen3 omni moe thinker
# only the thinker module is supported, so map to Qwen3OmniMoeThinkerForConditionalGeneration
MODULE_MAP.update({
    'Qwen3OmniMoeForConditionalGeneration':
    f'{LMDEPLOY_PYTORCH_MODEL_PATH}.qwen3_omni_moe_thinker.Qwen3OmniMoeThinkerForConditionalGeneration',
})

# starcoder2
MODULE_MAP.update({
    'Starcoder2ForCausalLM': f'{LMDEPLOY_PYTORCH_MODEL_PATH}.starcoder2.Starcoder2ForCausalLM',