10 changes: 6 additions & 4 deletions lmdeploy/serve/openai/api_server.py
@@ -417,15 +417,17 @@ async def chat_completions_v1(request: ChatCompletionRequest, raw_request: Reque

     random_seed = request.seed if request.seed is not None else None
     max_new_tokens = (request.max_completion_tokens if request.max_completion_tokens else request.max_tokens)
-    response_format = None
-    if request.response_format and request.response_format.type != 'text':
-        response_format = request.response_format.model_dump()
 
     parser_cls = VariableInterface.response_parser_cls
     response_parser = parser_cls(request=request, tokenizer=tokenizer)
-    # request might be adjusted by tool parser
+    # request might be adjusted by the response parser (e.g. GPT-OSS clears
+    # response_format and injects the schema into messages instead)
     request = response_parser.request
 
+    response_format = None
+    if request.response_format and request.response_format.type != 'text':
+        response_format = request.response_format.model_dump()
+
     gen_config = GenerationConfig(
         max_new_tokens=max_new_tokens,
         do_sample=True,
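The comment in the hunk above is the crux of the server-side change: response_format must be derived from the request only after the response parser has had a chance to rewrite it. A minimal sketch of that ordering follows, with a stand-in parser and plain dicts in place of the real ChatCompletionRequest/VariableInterface/GenerationConfig wiring; all names below are illustrative, not lmdeploy's actual helpers.

    # Hedged sketch of the ordering only; _StubParser mimics the GPT-OSS behaviour of
    # moving the schema into the messages and clearing response_format.
    class _StubParser:

        def __init__(self, request, tokenizer=None):
            fmt = request.get('response_format')
            if fmt and fmt.get('type') != 'text':
                note = {'role': 'system', 'content': '# Response Formats\n' + str(fmt)}
                request = {**request, 'messages': [note] + request['messages'], 'response_format': None}
            self.request = request


    raw = {
        'messages': [{'role': 'user', 'content': 'hi'}],
        'response_format': {'type': 'json_schema', 'json_schema': {'name': 't', 'schema': {'type': 'object'}}},
    }

    parser = _StubParser(raw, tokenizer=None)
    request = parser.request  # the parser may have rewritten the request

    # Derive the engine-facing value only after the parser ran, as the diff now does:
    # for GPT-OSS it is None here, so guided/JSON decoding is not enabled a second time.
    response_format = None
    if request['response_format'] and request['response_format']['type'] != 'text':
        response_format = request['response_format']
    assert response_format is None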
79 changes: 79 additions & 0 deletions lmdeploy/serve/parsers/_openai_harmony.py
@@ -3,6 +3,7 @@
available."""
from __future__ import annotations

import json
import re
from typing import TYPE_CHECKING

@@ -16,6 +17,7 @@
    FunctionCall,
    ToolCall,
)
from lmdeploy.utils import get_logger

from .response_parser import ResponseParser, ResponseParserManager

@@ -24,6 +26,8 @@

from lmdeploy.serve.openai.protocol import ChatCompletionRequest

logger = get_logger('lmdeploy')

_harmony_encoding = None


@@ -55,6 +59,7 @@ def __init__(self, request: ChatCompletionRequest, tokenizer: PreTrainedTokenize
        else:
            # Unit tests may inject a lightweight sentinel request object.
            self.request = request
        self._convert_response_format_to_harmony()
        self.model_tokenizer = tokenizer
        self.parser = StreamableParser(get_encoding(), role=Role.ASSISTANT)
        self._seen_any = False
@@ -64,6 +69,80 @@ def __init__(self, request: ChatCompletionRequest, tokenizer: PreTrainedTokenize
        self._active_tool_name: str | None = None
        self.tool_parser = object()  # API server checks `is not None` for tool support.

    def _convert_response_format_to_harmony(self):
        """Convert response_format to Harmony-native mode for GPT-OSS.

        GPT-OSS uses Harmony mode for structured output, which conflicts with
        the engine's built-in JSON/response-format mode. This method injects
        the response_format schema into the system prompt as a
        ``# Response Formats`` section and clears ``response_format`` on the
        request so that only the Harmony-native instructions are used.
        """
        fmt = getattr(self.request, 'response_format', None)
        if fmt is None or getattr(fmt, 'type', 'text') == 'text':
            return

        try:
            format_json = json.dumps(fmt.model_dump())
            format_body = f'# Response Formats\n{format_json}'
            messages = self.request.messages

            if isinstance(messages, str):
                messages = messages + '\n\n' + format_body
                self._clear_response_format(messages=messages)
                return

            if not isinstance(messages, list):
                logger.warning('Cannot inject response_format schema into '
                               'non-list messages for GPT-OSS; clearing response_format only.')
                self._clear_response_format()
                return

            new_messages = list(messages)
            system_idx = next(
                (i for i, msg in enumerate(new_messages) if isinstance(msg, dict) and msg.get('role') == 'system'),
                None,
            )

            if system_idx is not None:
                content = new_messages[system_idx].get('content')
                if isinstance(content, list):
                    # Multimodal content blocks — append a text block.
                    new_messages[system_idx] = {
                        **new_messages[system_idx],
                        'content': content + [{'type': 'text', 'text': format_body}],
                    }
                elif isinstance(content, str):
                    new_messages[system_idx] = {
                        **new_messages[system_idx],
                        'content': (content + '\n\n' + format_body) if content else format_body,
                    }
                else:
                    # content is None or unexpected type — insert a separate
                    # system message so the schema is still available.
                    new_messages.insert(0, {'role': 'system', 'content': format_body})
            else:
                new_messages.insert(0, {'role': 'system', 'content': format_body})

            self._clear_response_format(messages=new_messages)
        except Exception:
            logger.exception('Failed to convert response_format to Harmony-native mode for GPT-OSS')
            # Still clear response_format to avoid the Harmony/JSON mode conflict
            self._clear_response_format()

    def _clear_response_format(self, messages=None):
        """Clear response_format on the request, handling both Pydantic and
        plain objects."""
        if hasattr(self.request, 'model_copy'):
            update = {'response_format': None}
            if messages is not None:
                update['messages'] = messages
            self.request = self.request.model_copy(update=update)
        else:
            self.request.response_format = None
            if messages is not None:
                self.request.messages = messages

    def stream_chunk(self, delta_text: str, delta_token_ids: list[int], **kwargs) -> tuple[DeltaMessage | None, bool]:
        if (
            not delta_text
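To make the docstrings above concrete, the sketch below reproduces the two moving parts in isolation: the '# Response Formats' section built from the dumped schema, and the copy-on-write update that _clear_response_format performs for Pydantic requests. The _Req model is an illustrative stand-in, not the real ChatCompletionRequest; only json and pydantic are assumed.

    import json
    from typing import Any, Optional

    from pydantic import BaseModel


    class _Req(BaseModel):
        """Illustrative stand-in for ChatCompletionRequest, not the real model."""
        messages: Any
        response_format: Optional[dict] = None


    fmt = {'type': 'json_schema',
           'json_schema': {'name': 'point',
                           'schema': {'type': 'object', 'properties': {'x': {'type': 'integer'}}}}}
    req = _Req(messages=[{'role': 'system', 'content': 'You are helpful.'},
                         {'role': 'user', 'content': 'hi'}],
               response_format=fmt)

    # Same shape as the injected section: a '# Response Formats' header followed by
    # the JSON-dumped response_format, appended after a blank-line separator.
    format_body = f'# Response Formats\n{json.dumps(fmt)}'
    new_messages = [dict(req.messages[0], content=req.messages[0]['content'] + '\n\n' + format_body),
                    *req.messages[1:]]

    # Copy-on-write rather than in-place mutation, mirroring _clear_response_format.
    adjusted = req.model_copy(update={'messages': new_messages, 'response_format': None})

    assert adjusted.response_format is None
    assert '# Response Formats' in adjusted.messages[0]['content']
    assert req.response_format is not None  # the caller's original request is untouched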
231 changes: 231 additions & 0 deletions tests/test_lmdeploy/serve/parsers/test_gpt_oss_parser.py
@@ -329,3 +329,234 @@ def test_parse_complete_appends_tool_call_still_open_at_eof(self, monkeypatch):
    )
    def test_extract_tool_name(self, recipient, expected):
        assert gpt_oss_mod.GptOssResponseParser._extract_tool_name(recipient) == expected


class TestGptOssResponseFormatHarmonyConversion:
    """Tests for
    :meth:`GptOssResponseParser._convert_response_format_to_harmony`."""

    @pytest.fixture(autouse=True)
    def _patch_streamable_parser(self, monkeypatch):
        monkeypatch.setattr(
            openai_harmony_mod,
            'StreamableParser',
            lambda *args, **kwargs: _FakeStreamableParser({}),
        )

    def test_response_format_cleared_after_conversion(self):
        """response_format must be None after the parser processes it."""
        from lmdeploy.serve.openai.protocol import JsonSchema, ResponseFormat

        request = ChatCompletionRequest(
            model='openai/gpt-oss-20b',
            messages=[{'role': 'user', 'content': 'hi'}],
            response_format=ResponseFormat(
                type='json_schema',
                json_schema=JsonSchema(
                    name='test',
                    schema={'type': 'object', 'properties': {'x': {'type': 'integer'}}},
                ),
            ),
        )
        parser = gpt_oss_mod.GptOssResponseParser(request=request, tokenizer=object())
        assert parser.request.response_format is None

    def test_schema_appended_to_existing_system_message(self):
        """When a system message already exists the schema is appended to
        it."""
        import json as _json

        from lmdeploy.serve.openai.protocol import JsonSchema, ResponseFormat

        schema_dict = {'type': 'object', 'properties': {'x': {'type': 'integer'}}}
        request = ChatCompletionRequest(
            model='openai/gpt-oss-20b',
            messages=[
                {'role': 'system', 'content': 'You are helpful.'},
                {'role': 'user', 'content': 'hi'},
            ],
            response_format=ResponseFormat(
                type='json_schema',
                json_schema=JsonSchema(name='test', schema=schema_dict),
            ),
        )
        parser = gpt_oss_mod.GptOssResponseParser(request=request, tokenizer=object())

        msgs = parser.request.messages
        assert msgs[0]['role'] == 'system'
        assert parser.request.response_format is None
        # The schema body must appear in the system message
        assert '# Response Formats' in msgs[0]['content']
        assert _json.dumps(schema_dict) in msgs[0]['content']
        # The original content is preserved before the appended section
        assert msgs[0]['content'].startswith('You are helpful.')
        # The section header follows the blank-line separator immediately
        assert '\n\n# Response Formats' in msgs[0]['content']

    def test_schema_inserted_as_new_system_message_when_none_exists(self):
        """When no system message exists a new one is inserted at position
        0."""
        import json as _json

        from lmdeploy.serve.openai.protocol import JsonSchema, ResponseFormat

        schema_dict = {'type': 'object', 'properties': {'name': {'type': 'string'}}}
        request = ChatCompletionRequest(
            model='openai/gpt-oss-20b',
            messages=[{'role': 'user', 'content': 'hi'}],
            response_format=ResponseFormat(
                type='json_schema',
                json_schema=JsonSchema(name='test', schema=schema_dict),
            ),
        )
        parser = gpt_oss_mod.GptOssResponseParser(request=request, tokenizer=object())

        msgs = parser.request.messages
        assert msgs[0]['role'] == 'system'
        assert parser.request.response_format is None
        # New system message content must NOT start with blank lines
        assert not msgs[0]['content'].startswith('\n')
        assert msgs[0]['content'].startswith('# Response Formats')
        assert _json.dumps(schema_dict) in msgs[0]['content']
        # The user message is still present after the inserted system message
        assert msgs[1]['role'] == 'user'

    def test_text_response_format_is_not_converted(self):
        """A text-type response_format should be left untouched."""
        from lmdeploy.serve.openai.protocol import ResponseFormat

        request = ChatCompletionRequest(
            model='openai/gpt-oss-20b',
            messages=[{'role': 'user', 'content': 'hi'}],
            response_format=ResponseFormat(type='text'),
        )
        parser = gpt_oss_mod.GptOssResponseParser(request=request, tokenizer=object())
        assert parser.request.response_format is not None
        assert parser.request.response_format.type == 'text'

    def test_no_response_format_leaves_request_unchanged(self):
        """When response_format is None the request is not modified."""
        request = ChatCompletionRequest(
            model='openai/gpt-oss-20b',
            messages=[{'role': 'user', 'content': 'hi'}],
        )
        parser = gpt_oss_mod.GptOssResponseParser(request=request, tokenizer=object())
        assert parser.request.response_format is None
        assert len(parser.request.messages) == 1

    def test_str_messages_gets_schema_appended(self):
        """When messages is a string, the schema section is appended to it."""
        import json as _json

        from lmdeploy.serve.openai.protocol import JsonSchema, ResponseFormat

        schema_dict = {'type': 'object', 'properties': {'x': {'type': 'integer'}}}
        request = ChatCompletionRequest(
            model='openai/gpt-oss-20b',
            messages='Tell me a joke',
            response_format=ResponseFormat(
                type='json_schema',
                json_schema=JsonSchema(name='test', schema=schema_dict),
            ),
        )
        parser = gpt_oss_mod.GptOssResponseParser(request=request, tokenizer=object())

        assert parser.request.response_format is None
        assert isinstance(parser.request.messages, str)
        assert parser.request.messages.startswith('Tell me a joke')
        assert '# Response Formats' in parser.request.messages
        assert _json.dumps(schema_dict) in parser.request.messages

    def test_non_pydantic_request_messages_updated(self):
        """Non-Pydantic sentinel requests also get messages updated."""
        import json as _json

        from lmdeploy.serve.openai.protocol import JsonSchema, ResponseFormat

        schema_dict = {'type': 'object', 'properties': {'y': {'type': 'number'}}}
        fmt = ResponseFormat(
            type='json_schema',
            json_schema=JsonSchema(name='test', schema=schema_dict),
        )

        # Sentinel must NOT have tools/tool_choice attrs so that __init__
        # skips the Pydantic-dependent tool-rendering branch.
        class _Sentinel:
            messages = [{'role': 'user', 'content': 'hi'}]
            response_format = fmt

        sentinel = _Sentinel()
        parser = gpt_oss_mod.GptOssResponseParser(request=sentinel, tokenizer=object())

        assert parser.request.response_format is None
        msgs = parser.request.messages
        assert isinstance(msgs, list)
        assert msgs[0]['role'] == 'system'
        assert '# Response Formats' in msgs[0]['content']
        assert _json.dumps(schema_dict) in msgs[0]['content']

    def test_list_content_system_message_gets_text_block_appended(self):
        """When system message content is a list (multimodal), append a text
        block."""
        import json as _json

        from lmdeploy.serve.openai.protocol import JsonSchema, ResponseFormat

        schema_dict = {'type': 'object', 'properties': {'z': {'type': 'boolean'}}}
        request = ChatCompletionRequest(
            model='openai/gpt-oss-20b',
            messages=[
                {'role': 'system', 'content': [
                    {'type': 'text', 'text': 'You are helpful.'},
                    {'type': 'image_url', 'image_url': {'url': 'http://example.com/img.png'}},
                ]},
                {'role': 'user', 'content': 'hi'},
            ],
            response_format=ResponseFormat(
                type='json_schema',
                json_schema=JsonSchema(name='test', schema=schema_dict),
            ),
        )
        parser = gpt_oss_mod.GptOssResponseParser(request=request, tokenizer=object())

        assert parser.request.response_format is None
        sys_msg = parser.request.messages[0]
        assert sys_msg['role'] == 'system'
        content = sys_msg['content']
        assert isinstance(content, list)
        assert len(content) == 3
        # Original two blocks preserved
        assert content[0]['type'] == 'text'
        assert content[0]['text'] == 'You are helpful.'
        assert content[1]['type'] == 'image_url'
        # Schema appended as a text block
        assert content[2]['type'] == 'text'
        assert '# Response Formats' in content[2]['text']
        assert _json.dumps(schema_dict) in content[2]['text']

    def test_none_content_system_message_inserts_separate_system(self):
        """When system message content is None, insert a new system message."""
        import json as _json

        from lmdeploy.serve.openai.protocol import JsonSchema, ResponseFormat

        schema_dict = {'type': 'object', 'properties': {'w': {'type': 'string'}}}
        request = ChatCompletionRequest(
            model='openai/gpt-oss-20b',
            messages=[
                {'role': 'system', 'content': None},
                {'role': 'user', 'content': 'hi'},
            ],
            response_format=ResponseFormat(
                type='json_schema',
                json_schema=JsonSchema(name='test', schema=schema_dict),
            ),
        )
        parser = gpt_oss_mod.GptOssResponseParser(request=request, tokenizer=object())

        assert parser.request.response_format is None
        msgs = parser.request.messages
        # A new system message with the schema is inserted at position 0
        assert msgs[0]['role'] == 'system'
        assert '# Response Formats' in msgs[0]['content']
        assert _json.dumps(schema_dict) in msgs[0]['content']
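
Seen from a client, the behaviour these tests pin down looks roughly like the sketch below, using the openai Python client against LMDeploy's OpenAI-compatible endpoint. The URL, port, and model name are placeholders for an actual deployment; with a GPT-OSS model the schema is enforced through the Harmony prompt rather than engine-side guided decoding.

    import json

    from openai import OpenAI

    # Placeholder endpoint and model; adjust to the actual deployment.
    client = OpenAI(base_url='http://localhost:23333/v1', api_key='none')

    schema = {'type': 'object', 'properties': {'x': {'type': 'integer'}}, 'required': ['x']}

    resp = client.chat.completions.create(
        model='openai/gpt-oss-20b',
        messages=[{'role': 'user', 'content': 'Give me x as a small integer.'}],
        # For GPT-OSS the server moves this schema into a '# Response Formats'
        # system section and clears response_format before generation.
        response_format={'type': 'json_schema', 'json_schema': {'name': 'x_only', 'schema': schema}},
    )

    print(json.loads(resp.choices[0].message.content))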