10 changes: 6 additions & 4 deletions lmdeploy/serve/openai/api_server.py
@@ -417,15 +417,17 @@ async def chat_completions_v1(request: ChatCompletionRequest, raw_request: Reque

     random_seed = request.seed if request.seed is not None else None
     max_new_tokens = (request.max_completion_tokens if request.max_completion_tokens else request.max_tokens)
-    response_format = None
-    if request.response_format and request.response_format.type != 'text':
-        response_format = request.response_format.model_dump()
 
     parser_cls = VariableInterface.response_parser_cls
     response_parser = parser_cls(request=request, tokenizer=tokenizer)
-    # request might be adjusted by tool parser
+    # request might be adjusted by the response parser (e.g. GPT-OSS clears
+    # response_format and injects the schema into messages instead)
     request = response_parser.request
 
+    response_format = None
+    if request.response_format and request.response_format.type != 'text':
+        response_format = request.response_format.model_dump()
+
     gen_config = GenerationConfig(
         max_new_tokens=max_new_tokens,
         do_sample=True,
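The comment in the hunk above is the crux of the server-side change: response_format must be derived from the request only after the response parser has had a chance to rewrite it. A minimal sketch of that ordering follows, with a stand-in parser and plain dicts in place of the real ChatCompletionRequest/VariableInterface/GenerationConfig wiring; all names below are illustrative, not lmdeploy's actual helpers.

    # Hedged sketch of the ordering only; _StubParser mimics the GPT-OSS behaviour of
    # moving the schema into the messages and clearing response_format.
    class _StubParser:

        def __init__(self, request, tokenizer=None):
            fmt = request.get('response_format')
            if fmt and fmt.get('type') != 'text':
                note = {'role': 'system', 'content': '# Response Formats\n' + str(fmt)}
                request = {**request, 'messages': [note] + request['messages'], 'response_format': None}
            self.request = request


    raw = {
        'messages': [{'role': 'user', 'content': 'hi'}],
        'response_format': {'type': 'json_schema', 'json_schema': {'name': 't', 'schema': {'type': 'object'}}},
    }

    parser = _StubParser(raw, tokenizer=None)
    request = parser.request  # the parser may have rewritten the request

    # Derive the engine-facing value only after the parser ran, as the diff now does:
    # for GPT-OSS it is None here, so guided/JSON decoding is not enabled a second time.
    response_format = None
    if request['response_format'] and request['response_format']['type'] != 'text':
        response_format = request['response_format']
    assert response_format is None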
79 changes: 79 additions & 0 deletions lmdeploy/serve/parsers/_openai_harmony.py
@@ -3,6 +3,7 @@
available."""
from __future__ import annotations

import json
import re
from typing import TYPE_CHECKING

@@ -16,6 +17,7 @@
    FunctionCall,
    ToolCall,
)
from lmdeploy.utils import get_logger

from .response_parser import ResponseParser, ResponseParserManager

@@ -24,6 +26,8 @@

from lmdeploy.serve.openai.protocol import ChatCompletionRequest

logger = get_logger('lmdeploy')

_harmony_encoding = None


@@ -55,6 +59,7 @@ def __init__(self, request: ChatCompletionRequest, tokenizer: PreTrainedTokenize
        else:
            # Unit tests may inject a lightweight sentinel request object.
            self.request = request
        self._convert_response_format_to_harmony()
        self.model_tokenizer = tokenizer
        self.parser = StreamableParser(get_encoding(), role=Role.ASSISTANT)
        self._seen_any = False
@@ -64,6 +69,80 @@ def __init__(self, request: ChatCompletionRequest, tokenizer: PreTrainedTokenize
        self._active_tool_name: str | None = None
        self.tool_parser = object()  # API server checks `is not None` for tool support.

    def _convert_response_format_to_harmony(self):
        """Convert response_format to Harmony-native mode for GPT-OSS.

        GPT-OSS uses Harmony mode for structured output, which conflicts with
        the engine's built-in JSON/response-format mode. This method injects
        the response_format schema into the system prompt as a
        ``# Response Formats`` section and clears ``response_format`` on the
        request so that only the Harmony-native instructions are used.
        """
        fmt = getattr(self.request, 'response_format', None)
        if fmt is None or getattr(fmt, 'type', 'text') == 'text':
            return

        try:
            format_json = json.dumps(fmt.model_dump())
            format_body = f'# Response Formats\n{format_json}'
            messages = self.request.messages

            if isinstance(messages, str):
                messages = messages + '\n\n' + format_body
                self._clear_response_format(messages=messages)
                return

            if not isinstance(messages, list):
                logger.warning('Cannot inject response_format schema into '
                               'non-list messages for GPT-OSS; clearing response_format only.')
                self._clear_response_format()
                return

            new_messages = list(messages)
            system_idx = next(
                (i for i, msg in enumerate(new_messages) if isinstance(msg, dict) and msg.get('role') == 'system'),
                None,
            )

            if system_idx is not None:
                content = new_messages[system_idx].get('content')
                if isinstance(content, list):
                    # Multimodal content blocks — append a text block.
                    new_messages[system_idx] = {
                        **new_messages[system_idx],
                        'content': content + [{'type': 'text', 'text': format_body}],
                    }
                elif isinstance(content, str):
                    new_messages[system_idx] = {
                        **new_messages[system_idx],
                        'content': (content + '\n\n' + format_body) if content else format_body,
                    }
                else:
                    # content is None or unexpected type — insert a separate
                    # system message so the schema is still available.
                    new_messages.insert(0, {'role': 'system', 'content': format_body})
            else:
                new_messages.insert(0, {'role': 'system', 'content': format_body})

            self._clear_response_format(messages=new_messages)
        except Exception:
            logger.exception('Failed to convert response_format to Harmony-native mode for GPT-OSS')
            # Still clear response_format to avoid the Harmony/JSON mode conflict
            self._clear_response_format()

    def _clear_response_format(self, messages=None):
        """Clear response_format on the request, handling both Pydantic and
        plain objects."""
        if hasattr(self.request, 'model_copy'):
            update = {'response_format': None}
            if messages is not None:
                update['messages'] = messages
            self.request = self.request.model_copy(update=update)
        else:
            self.request.response_format = None
            if messages is not None:
                self.request.messages = messages

    def stream_chunk(self, delta_text: str, delta_token_ids: list[int], **kwargs) -> tuple[DeltaMessage | None, bool]:
        if (
            not delta_text
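To make the docstrings above concrete, the sketch below reproduces the two moving parts in isolation: the '# Response Formats' section built from the dumped schema, and the copy-on-write update that _clear_response_format performs for Pydantic requests. The _Req model is an illustrative stand-in, not the real ChatCompletionRequest; only json and pydantic are assumed.

    import json
    from typing import Any, Optional

    from pydantic import BaseModel


    class _Req(BaseModel):
        """Illustrative stand-in for ChatCompletionRequest, not the real model."""
        messages: Any
        response_format: Optional[dict] = None


    fmt = {'type': 'json_schema',
           'json_schema': {'name': 'point',
                           'schema': {'type': 'object', 'properties': {'x': {'type': 'integer'}}}}}
    req = _Req(messages=[{'role': 'system', 'content': 'You are helpful.'},
                         {'role': 'user', 'content': 'hi'}],
               response_format=fmt)

    # Same shape as the injected section: a '# Response Formats' header followed by
    # the JSON-dumped response_format, appended after a blank-line separator.
    format_body = f'# Response Formats\n{json.dumps(fmt)}'
    new_messages = [dict(req.messages[0], content=req.messages[0]['content'] + '\n\n' + format_body),
                    *req.messages[1:]]

    # Copy-on-write rather than in-place mutation, mirroring _clear_response_format.
    adjusted = req.model_copy(update={'messages': new_messages, 'response_format': None})

    assert adjusted.response_format is None
    assert '# Response Formats' in adjusted.messages[0]['content']
    assert req.response_format is not None  # the caller's original request is untouched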
231 changes: 231 additions & 0 deletions tests/test_lmdeploy/serve/parsers/test_gpt_oss_parser.py
@@ -329,3 +329,234 @@ def test_parse_complete_appends_tool_call_still_open_at_eof(self, monkeypatch):
    )
    def test_extract_tool_name(self, recipient, expected):
        assert gpt_oss_mod.GptOssResponseParser._extract_tool_name(recipient) == expected


class TestGptOssResponseFormatHarmonyConversion:
    """Tests for
    :meth:`GptOssResponseParser._convert_response_format_to_harmony`."""

    @pytest.fixture(autouse=True)
    def _patch_streamable_parser(self, monkeypatch):
        monkeypatch.setattr(
            openai_harmony_mod,
            'StreamableParser',
            lambda *args, **kwargs: _FakeStreamableParser({}),
        )

    def test_response_format_cleared_after_conversion(self):
        """response_format must be None after the parser processes it."""
        from lmdeploy.serve.openai.protocol import JsonSchema, ResponseFormat

        request = ChatCompletionRequest(
            model='openai/gpt-oss-20b',
            messages=[{'role': 'user', 'content': 'hi'}],
            response_format=ResponseFormat(
                type='json_schema',
                json_schema=JsonSchema(
                    name='test',
                    schema={'type': 'object', 'properties': {'x': {'type': 'integer'}}},
                ),
            ),
        )
        parser = gpt_oss_mod.GptOssResponseParser(request=request, tokenizer=object())
        assert parser.request.response_format is None

    def test_schema_appended_to_existing_system_message(self):
        """When a system message already exists the schema is appended to
        it."""
        import json as _json

        from lmdeploy.serve.openai.protocol import JsonSchema, ResponseFormat

        schema_dict = {'type': 'object', 'properties': {'x': {'type': 'integer'}}}
        request = ChatCompletionRequest(
            model='openai/gpt-oss-20b',
            messages=[
                {'role': 'system', 'content': 'You are helpful.'},
                {'role': 'user', 'content': 'hi'},
            ],
            response_format=ResponseFormat(
                type='json_schema',
                json_schema=JsonSchema(name='test', schema=schema_dict),
            ),
        )
        parser = gpt_oss_mod.GptOssResponseParser(request=request, tokenizer=object())

        msgs = parser.request.messages
        assert msgs[0]['role'] == 'system'
        assert parser.request.response_format is None
        # The schema body must appear in the system message
        assert '# Response Formats' in msgs[0]['content']
        assert _json.dumps(schema_dict) in msgs[0]['content']
        # The original content is preserved before the appended section
        assert msgs[0]['content'].startswith('You are helpful.')
        # The section header follows the blank-line separator immediately
        assert '\n\n# Response Formats' in msgs[0]['content']

    def test_schema_inserted_as_new_system_message_when_none_exists(self):
        """When no system message exists a new one is inserted at position
        0."""
        import json as _json

        from lmdeploy.serve.openai.protocol import JsonSchema, ResponseFormat

        schema_dict = {'type': 'object', 'properties': {'name': {'type': 'string'}}}
        request = ChatCompletionRequest(
            model='openai/gpt-oss-20b',
            messages=[{'role': 'user', 'content': 'hi'}],
            response_format=ResponseFormat(
                type='json_schema',
                json_schema=JsonSchema(name='test', schema=schema_dict),
            ),
        )
        parser = gpt_oss_mod.GptOssResponseParser(request=request, tokenizer=object())

        msgs = parser.request.messages
        assert msgs[0]['role'] == 'system'
        assert parser.request.response_format is None
        # New system message content must NOT start with blank lines
        assert not msgs[0]['content'].startswith('\n')
        assert msgs[0]['content'].startswith('# Response Formats')
        assert _json.dumps(schema_dict) in msgs[0]['content']
        # The user message is still present after the inserted system message
        assert msgs[1]['role'] == 'user'

    def test_text_response_format_is_not_converted(self):
        """A text-type response_format should be left untouched."""
        from lmdeploy.serve.openai.protocol import ResponseFormat

        request = ChatCompletionRequest(
            model='openai/gpt-oss-20b',
            messages=[{'role': 'user', 'content': 'hi'}],
            response_format=ResponseFormat(type='text'),
        )
        parser = gpt_oss_mod.GptOssResponseParser(request=request, tokenizer=object())
        assert parser.request.response_format is not None
        assert parser.request.response_format.type == 'text'

    def test_no_response_format_leaves_request_unchanged(self):
        """When response_format is None the request is not modified."""
        request = ChatCompletionRequest(
            model='openai/gpt-oss-20b',
            messages=[{'role': 'user', 'content': 'hi'}],
        )
        parser = gpt_oss_mod.GptOssResponseParser(request=request, tokenizer=object())
        assert parser.request.response_format is None
        assert len(parser.request.messages) == 1

    def test_str_messages_gets_schema_appended(self):
        """When messages is a string, the schema section is appended to it."""
        import json as _json

        from lmdeploy.serve.openai.protocol import JsonSchema, ResponseFormat

        schema_dict = {'type': 'object', 'properties': {'x': {'type': 'integer'}}}
        request = ChatCompletionRequest(
            model='openai/gpt-oss-20b',
            messages='Tell me a joke',
            response_format=ResponseFormat(
                type='json_schema',
                json_schema=JsonSchema(name='test', schema=schema_dict),
            ),
        )
        parser = gpt_oss_mod.GptOssResponseParser(request=request, tokenizer=object())

        assert parser.request.response_format is None
        assert isinstance(parser.request.messages, str)
        assert parser.request.messages.startswith('Tell me a joke')
        assert '# Response Formats' in parser.request.messages
        assert _json.dumps(schema_dict) in parser.request.messages

    def test_non_pydantic_request_messages_updated(self):
        """Non-Pydantic sentinel requests also get messages updated."""
        import json as _json

        from lmdeploy.serve.openai.protocol import JsonSchema, ResponseFormat

        schema_dict = {'type': 'object', 'properties': {'y': {'type': 'number'}}}
        fmt = ResponseFormat(
            type='json_schema',
            json_schema=JsonSchema(name='test', schema=schema_dict),
        )

        # Sentinel must NOT have tools/tool_choice attrs so that __init__
        # skips the Pydantic-dependent tool-rendering branch.
        class _Sentinel:
            messages = [{'role': 'user', 'content': 'hi'}]
            response_format = fmt

        sentinel = _Sentinel()
        parser = gpt_oss_mod.GptOssResponseParser(request=sentinel, tokenizer=object())

        assert parser.request.response_format is None
        msgs = parser.request.messages
        assert isinstance(msgs, list)
        assert msgs[0]['role'] == 'system'
        assert '# Response Formats' in msgs[0]['content']
        assert _json.dumps(schema_dict) in msgs[0]['content']

    def test_list_content_system_message_gets_text_block_appended(self):
        """When system message content is a list (multimodal), append a text
        block."""
        import json as _json

        from lmdeploy.serve.openai.protocol import JsonSchema, ResponseFormat

        schema_dict = {'type': 'object', 'properties': {'z': {'type': 'boolean'}}}
        request = ChatCompletionRequest(
            model='openai/gpt-oss-20b',
            messages=[
                {'role': 'system', 'content': [
                    {'type': 'text', 'text': 'You are helpful.'},
                    {'type': 'image_url', 'image_url': {'url': 'http://example.com/img.png'}},
                ]},
                {'role': 'user', 'content': 'hi'},
            ],
            response_format=ResponseFormat(
                type='json_schema',
                json_schema=JsonSchema(name='test', schema=schema_dict),
            ),
        )
        parser = gpt_oss_mod.GptOssResponseParser(request=request, tokenizer=object())

        assert parser.request.response_format is None
        sys_msg = parser.request.messages[0]
        assert sys_msg['role'] == 'system'
        content = sys_msg['content']
        assert isinstance(content, list)
        assert len(content) == 3
        # Original two blocks preserved
        assert content[0]['type'] == 'text'
        assert content[0]['text'] == 'You are helpful.'
        assert content[1]['type'] == 'image_url'
        # Schema appended as a text block
        assert content[2]['type'] == 'text'
        assert '# Response Formats' in content[2]['text']
        assert _json.dumps(schema_dict) in content[2]['text']

    def test_none_content_system_message_inserts_separate_system(self):
        """When system message content is None, insert a new system message."""
        import json as _json

        from lmdeploy.serve.openai.protocol import JsonSchema, ResponseFormat

        schema_dict = {'type': 'object', 'properties': {'w': {'type': 'string'}}}
        request = ChatCompletionRequest(
            model='openai/gpt-oss-20b',
            messages=[
                {'role': 'system', 'content': None},
                {'role': 'user', 'content': 'hi'},
            ],
            response_format=ResponseFormat(
                type='json_schema',
                json_schema=JsonSchema(name='test', schema=schema_dict),
            ),
        )
        parser = gpt_oss_mod.GptOssResponseParser(request=request, tokenizer=object())

        assert parser.request.response_format is None
        msgs = parser.request.messages
        # A new system message with the schema is inserted at position 0
        assert msgs[0]['role'] == 'system'
        assert '# Response Formats' in msgs[0]['content']
        assert _json.dumps(schema_dict) in msgs[0]['content']
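
Seen from a client, the behaviour these tests pin down looks roughly like the sketch below, using the openai Python client against LMDeploy's OpenAI-compatible endpoint. The URL, port, and model name are placeholders for an actual deployment; with a GPT-OSS model the schema is enforced through the Harmony prompt rather than engine-side guided decoding.

    import json

    from openai import OpenAI

    # Placeholder endpoint and model; adjust to the actual deployment.
    client = OpenAI(base_url='http://localhost:23333/v1', api_key='none')

    schema = {'type': 'object', 'properties': {'x': {'type': 'integer'}}, 'required': ['x']}

    resp = client.chat.completions.create(
        model='openai/gpt-oss-20b',
        messages=[{'role': 'user', 'content': 'Give me x as a small integer.'}],
        # For GPT-OSS the server moves this schema into a '# Response Formats'
        # system section and clears response_format before generation.
        response_format={'type': 'json_schema', 'json_schema': {'name': 'x_only', 'schema': schema}},
    )

    print(json.loads(resp.choices[0].message.content))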