diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py index e10f20f44e..45229a272d 100644 --- a/lmdeploy/serve/openai/api_server.py +++ b/lmdeploy/serve/openai/api_server.py @@ -417,15 +417,17 @@ async def chat_completions_v1(request: ChatCompletionRequest, raw_request: Reque random_seed = request.seed if request.seed is not None else None max_new_tokens = (request.max_completion_tokens if request.max_completion_tokens else request.max_tokens) - response_format = None - if request.response_format and request.response_format.type != 'text': - response_format = request.response_format.model_dump() parser_cls = VariableInterface.response_parser_cls response_parser = parser_cls(request=request, tokenizer=tokenizer) - # request might be adjusted by tool parser + # request might be adjusted by the response parser (e.g. GPT-OSS clears + # response_format and injects the schema into messages instead) request = response_parser.request + response_format = None + if request.response_format and request.response_format.type != 'text': + response_format = request.response_format.model_dump() + gen_config = GenerationConfig( max_new_tokens=max_new_tokens, do_sample=True, diff --git a/lmdeploy/serve/parsers/_openai_harmony.py b/lmdeploy/serve/parsers/_openai_harmony.py index b1bb492709..6957d9f3f7 100644 --- a/lmdeploy/serve/parsers/_openai_harmony.py +++ b/lmdeploy/serve/parsers/_openai_harmony.py @@ -3,6 +3,7 @@ available.""" from __future__ import annotations +import json import re from typing import TYPE_CHECKING @@ -16,6 +17,7 @@ FunctionCall, ToolCall, ) +from lmdeploy.utils import get_logger from .response_parser import ResponseParser, ResponseParserManager @@ -24,6 +26,8 @@ from lmdeploy.serve.openai.protocol import ChatCompletionRequest +logger = get_logger('lmdeploy') + _harmony_encoding = None @@ -55,6 +59,7 @@ def __init__(self, request: ChatCompletionRequest, tokenizer: PreTrainedTokenize else: # Unit tests may inject a lightweight sentinel request object. self.request = request + self._convert_response_format_to_harmony() self.model_tokenizer = tokenizer self.parser = StreamableParser(get_encoding(), role=Role.ASSISTANT) self._seen_any = False @@ -64,6 +69,80 @@ def __init__(self, request: ChatCompletionRequest, tokenizer: PreTrainedTokenize self._active_tool_name: str | None = None self.tool_parser = object() # API server checks `is not None` for tool support. + def _convert_response_format_to_harmony(self): + """Convert response_format to Harmony-native mode for GPT-OSS. + + GPT-OSS uses Harmony mode for structured output, which conflicts with + the engine's built-in JSON/response-format mode. This method injects + the response_format schema into the system prompt as a + ``# Response Formats`` section and clears ``response_format`` on the + request so that only the Harmony-native instructions are used. + """ + fmt = getattr(self.request, 'response_format', None) + if fmt is None or getattr(fmt, 'type', 'text') == 'text': + return + + try: + format_json = json.dumps(fmt.model_dump()) + format_body = f'# Response Formats\n{format_json}' + messages = self.request.messages + + if isinstance(messages, str): + messages = messages + '\n\n' + format_body + self._clear_response_format(messages=messages) + return + + if not isinstance(messages, list): + logger.warning('Cannot inject response_format schema into ' + 'non-list messages for GPT-OSS; clearing response_format only.') + self._clear_response_format() + return + + new_messages = list(messages) + system_idx = next( + (i for i, msg in enumerate(new_messages) if isinstance(msg, dict) and msg.get('role') == 'system'), + None, + ) + + if system_idx is not None: + content = new_messages[system_idx].get('content') + if isinstance(content, list): + # Multimodal content blocks — append a text block. + new_messages[system_idx] = { + **new_messages[system_idx], + 'content': content + [{'type': 'text', 'text': format_body}], + } + elif isinstance(content, str): + new_messages[system_idx] = { + **new_messages[system_idx], + 'content': (content + '\n\n' + format_body) if content else format_body, + } + else: + # content is None or unexpected type — insert a separate + # system message so the schema is still available. + new_messages.insert(0, {'role': 'system', 'content': format_body}) + else: + new_messages.insert(0, {'role': 'system', 'content': format_body}) + + self._clear_response_format(messages=new_messages) + except Exception: + logger.exception('Failed to convert response_format to Harmony-native mode for GPT-OSS') + # Still clear response_format to avoid the Harmony/JSON mode conflict + self._clear_response_format() + + def _clear_response_format(self, messages=None): + """Clear response_format on the request, handling both Pydantic and + plain objects.""" + if hasattr(self.request, 'model_copy'): + update = {'response_format': None} + if messages is not None: + update['messages'] = messages + self.request = self.request.model_copy(update=update) + else: + self.request.response_format = None + if messages is not None: + self.request.messages = messages + def stream_chunk(self, delta_text: str, delta_token_ids: list[int], **kwargs) -> tuple[DeltaMessage | None, bool]: if ( not delta_text diff --git a/tests/test_lmdeploy/serve/parsers/test_gpt_oss_parser.py b/tests/test_lmdeploy/serve/parsers/test_gpt_oss_parser.py index a47b632b28..05bea87b7b 100644 --- a/tests/test_lmdeploy/serve/parsers/test_gpt_oss_parser.py +++ b/tests/test_lmdeploy/serve/parsers/test_gpt_oss_parser.py @@ -329,3 +329,234 @@ def test_parse_complete_appends_tool_call_still_open_at_eof(self, monkeypatch): ) def test_extract_tool_name(self, recipient, expected): assert gpt_oss_mod.GptOssResponseParser._extract_tool_name(recipient) == expected + + +class TestGptOssResponseFormatHarmonyConversion: + """Tests for + :meth:`GptOssResponseParser._convert_response_format_to_harmony`.""" + + @pytest.fixture(autouse=True) + def _patch_streamable_parser(self, monkeypatch): + monkeypatch.setattr( + openai_harmony_mod, + 'StreamableParser', + lambda *args, **kwargs: _FakeStreamableParser({}), + ) + + def test_response_format_cleared_after_conversion(self): + """response_format must be None after the parser processes it.""" + from lmdeploy.serve.openai.protocol import JsonSchema, ResponseFormat + + request = ChatCompletionRequest( + model='openai/gpt-oss-20b', + messages=[{'role': 'user', 'content': 'hi'}], + response_format=ResponseFormat( + type='json_schema', + json_schema=JsonSchema( + name='test', + schema={'type': 'object', 'properties': {'x': {'type': 'integer'}}}, + ), + ), + ) + parser = gpt_oss_mod.GptOssResponseParser(request=request, tokenizer=object()) + assert parser.request.response_format is None + + def test_schema_appended_to_existing_system_message(self): + """When a system message already exists the schema is appended to + it.""" + import json as _json + + from lmdeploy.serve.openai.protocol import JsonSchema, ResponseFormat + + schema_dict = {'type': 'object', 'properties': {'x': {'type': 'integer'}}} + request = ChatCompletionRequest( + model='openai/gpt-oss-20b', + messages=[ + {'role': 'system', 'content': 'You are helpful.'}, + {'role': 'user', 'content': 'hi'}, + ], + response_format=ResponseFormat( + type='json_schema', + json_schema=JsonSchema(name='test', schema=schema_dict), + ), + ) + parser = gpt_oss_mod.GptOssResponseParser(request=request, tokenizer=object()) + + msgs = parser.request.messages + assert msgs[0]['role'] == 'system' + assert parser.request.response_format is None + # The schema body must appear in the system message + assert '# Response Formats' in msgs[0]['content'] + assert _json.dumps(schema_dict) in msgs[0]['content'] + # The original content is preserved before the appended section + assert msgs[0]['content'].startswith('You are helpful.') + # No leading blank lines in the appended section + assert '\n\n# Response Formats' in msgs[0]['content'] + + def test_schema_inserted_as_new_system_message_when_none_exists(self): + """When no system message exists a new one is inserted at position + 0.""" + import json as _json + + from lmdeploy.serve.openai.protocol import JsonSchema, ResponseFormat + + schema_dict = {'type': 'object', 'properties': {'name': {'type': 'string'}}} + request = ChatCompletionRequest( + model='openai/gpt-oss-20b', + messages=[{'role': 'user', 'content': 'hi'}], + response_format=ResponseFormat( + type='json_schema', + json_schema=JsonSchema(name='test', schema=schema_dict), + ), + ) + parser = gpt_oss_mod.GptOssResponseParser(request=request, tokenizer=object()) + + msgs = parser.request.messages + assert msgs[0]['role'] == 'system' + assert parser.request.response_format is None + # New system message content must NOT start with blank lines + assert not msgs[0]['content'].startswith('\n') + assert msgs[0]['content'].startswith('# Response Formats') + assert _json.dumps(schema_dict) in msgs[0]['content'] + # The user message is still present after the inserted system message + assert msgs[1]['role'] == 'user' + + def test_text_response_format_is_not_converted(self): + """A text-type response_format should be left untouched.""" + from lmdeploy.serve.openai.protocol import ResponseFormat + + request = ChatCompletionRequest( + model='openai/gpt-oss-20b', + messages=[{'role': 'user', 'content': 'hi'}], + response_format=ResponseFormat(type='text'), + ) + parser = gpt_oss_mod.GptOssResponseParser(request=request, tokenizer=object()) + assert parser.request.response_format is not None + assert parser.request.response_format.type == 'text' + + def test_no_response_format_leaves_request_unchanged(self): + """When response_format is None the request is not modified.""" + request = ChatCompletionRequest( + model='openai/gpt-oss-20b', + messages=[{'role': 'user', 'content': 'hi'}], + ) + parser = gpt_oss_mod.GptOssResponseParser(request=request, tokenizer=object()) + assert parser.request.response_format is None + assert len(parser.request.messages) == 1 + + def test_str_messages_gets_schema_appended(self): + """When messages is a string, the schema section is appended to it.""" + import json as _json + + from lmdeploy.serve.openai.protocol import JsonSchema, ResponseFormat + + schema_dict = {'type': 'object', 'properties': {'x': {'type': 'integer'}}} + request = ChatCompletionRequest( + model='openai/gpt-oss-20b', + messages='Tell me a joke', + response_format=ResponseFormat( + type='json_schema', + json_schema=JsonSchema(name='test', schema=schema_dict), + ), + ) + parser = gpt_oss_mod.GptOssResponseParser(request=request, tokenizer=object()) + + assert parser.request.response_format is None + assert isinstance(parser.request.messages, str) + assert parser.request.messages.startswith('Tell me a joke') + assert '# Response Formats' in parser.request.messages + assert _json.dumps(schema_dict) in parser.request.messages + + def test_non_pydantic_request_messages_updated(self): + """Non-Pydantic sentinel requests also get messages updated.""" + import json as _json + + from lmdeploy.serve.openai.protocol import JsonSchema, ResponseFormat + + schema_dict = {'type': 'object', 'properties': {'y': {'type': 'number'}}} + fmt = ResponseFormat( + type='json_schema', + json_schema=JsonSchema(name='test', schema=schema_dict), + ) + + # Sentinel must NOT have tools/tool_choice attrs so that __init__ + # skips the Pydantic-dependent tool-rendering branch. + class _Sentinel: + messages = [{'role': 'user', 'content': 'hi'}] + response_format = fmt + + sentinel = _Sentinel() + parser = gpt_oss_mod.GptOssResponseParser(request=sentinel, tokenizer=object()) + + assert parser.request.response_format is None + msgs = parser.request.messages + assert isinstance(msgs, list) + assert msgs[0]['role'] == 'system' + assert '# Response Formats' in msgs[0]['content'] + assert _json.dumps(schema_dict) in msgs[0]['content'] + + def test_list_content_system_message_gets_text_block_appended(self): + """When system message content is a list (multimodal), append a text + block.""" + import json as _json + + from lmdeploy.serve.openai.protocol import JsonSchema, ResponseFormat + + schema_dict = {'type': 'object', 'properties': {'z': {'type': 'boolean'}}} + request = ChatCompletionRequest( + model='openai/gpt-oss-20b', + messages=[ + {'role': 'system', 'content': [ + {'type': 'text', 'text': 'You are helpful.'}, + {'type': 'image_url', 'image_url': {'url': 'http://example.com/img.png'}}, + ]}, + {'role': 'user', 'content': 'hi'}, + ], + response_format=ResponseFormat( + type='json_schema', + json_schema=JsonSchema(name='test', schema=schema_dict), + ), + ) + parser = gpt_oss_mod.GptOssResponseParser(request=request, tokenizer=object()) + + assert parser.request.response_format is None + sys_msg = parser.request.messages[0] + assert sys_msg['role'] == 'system' + content = sys_msg['content'] + assert isinstance(content, list) + assert len(content) == 3 + # Original two blocks preserved + assert content[0]['type'] == 'text' + assert content[0]['text'] == 'You are helpful.' + assert content[1]['type'] == 'image_url' + # Schema appended as a text block + assert content[2]['type'] == 'text' + assert '# Response Formats' in content[2]['text'] + assert _json.dumps(schema_dict) in content[2]['text'] + + def test_none_content_system_message_inserts_separate_system(self): + """When system message content is None, insert a new system message.""" + import json as _json + + from lmdeploy.serve.openai.protocol import JsonSchema, ResponseFormat + + schema_dict = {'type': 'object', 'properties': {'w': {'type': 'string'}}} + request = ChatCompletionRequest( + model='openai/gpt-oss-20b', + messages=[ + {'role': 'system', 'content': None}, + {'role': 'user', 'content': 'hi'}, + ], + response_format=ResponseFormat( + type='json_schema', + json_schema=JsonSchema(name='test', schema=schema_dict), + ), + ) + parser = gpt_oss_mod.GptOssResponseParser(request=request, tokenizer=object()) + + assert parser.request.response_format is None + msgs = parser.request.messages + # A new system message with the schema is inserted at position 0 + assert msgs[0]['role'] == 'system' + assert '# Response Formats' in msgs[0]['content'] + assert _json.dumps(schema_dict) in msgs[0]['content']