Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions lmdeploy/serve/openai/api_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -417,15 +417,17 @@ async def chat_completions_v1(request: ChatCompletionRequest, raw_request: Reque

random_seed = request.seed if request.seed is not None else None
max_new_tokens = (request.max_completion_tokens if request.max_completion_tokens else request.max_tokens)
response_format = None
if request.response_format and request.response_format.type != 'text':
response_format = request.response_format.model_dump()

parser_cls = VariableInterface.response_parser_cls
response_parser = parser_cls(request=request, tokenizer=tokenizer)
# request might be adjusted by tool parser
# request might be adjusted by the response parser (e.g. GPT-OSS clears
# response_format and injects the schema into messages instead)
request = response_parser.request

response_format = None
if request.response_format and request.response_format.type != 'text':
response_format = request.response_format.model_dump()

gen_config = GenerationConfig(
max_new_tokens=max_new_tokens,
do_sample=True,
Expand Down
50 changes: 50 additions & 0 deletions lmdeploy/serve/parsers/_openai_harmony.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
available."""
from __future__ import annotations

import json
import re
from typing import TYPE_CHECKING

Expand All @@ -16,6 +17,7 @@
FunctionCall,
ToolCall,
)
from lmdeploy.utils import get_logger

from .response_parser import ResponseParser, ResponseParserManager

Expand All @@ -24,6 +26,8 @@

from lmdeploy.serve.openai.protocol import ChatCompletionRequest

logger = get_logger('lmdeploy')

_harmony_encoding = None


Expand Down Expand Up @@ -55,6 +59,7 @@ def __init__(self, request: ChatCompletionRequest, tokenizer: PreTrainedTokenize
else:
# Unit tests may inject a lightweight sentinel request object.
self.request = request
self._convert_response_format_to_harmony()
self.model_tokenizer = tokenizer
self.parser = StreamableParser(get_encoding(), role=Role.ASSISTANT)
self._seen_any = False
Expand All @@ -64,6 +69,51 @@ def __init__(self, request: ChatCompletionRequest, tokenizer: PreTrainedTokenize
self._active_tool_name: str | None = None
self.tool_parser = object() # API server checks `is not None` for tool support.

def _convert_response_format_to_harmony(self):
    """Convert ``response_format`` to Harmony-native mode for GPT-OSS.

    GPT-OSS uses Harmony mode for structured output, which conflicts with
    the engine's built-in JSON/response-format mode. This method appends
    the JSON-serialized ``response_format`` to the first system message as
    a ``# Response Formats`` section (inserting a new system message at
    index 0 when none exists) and clears ``response_format`` on the
    request so that only the Harmony-native instructions are used.

    ``self.request`` is replaced with a copy produced by ``model_copy``;
    the original request object, its message list and its message dicts
    are not mutated.

    Nothing happens when the request has no ``response_format`` or when
    its ``type`` is ``'text'``. If the schema cannot be injected (messages
    is not a list, or serialization/injection raises), the problem is
    logged and ``response_format`` is still cleared.
    """
    fmt = getattr(self.request, 'response_format', None)
    if fmt is None or getattr(fmt, 'type', 'text') == 'text':
        return

    # ``response_format`` is cleared on every path below; ``messages`` is
    # only replaced when the schema was injected successfully.
    update = {'response_format': None}

    try:
        format_json = json.dumps(fmt.model_dump())
        format_section = f'\n\n# Response Formats\n{format_json}'
        messages = self.request.messages

        if isinstance(messages, list):
            # Shallow copy so the caller's list is left untouched.
            new_messages = list(messages)
            system_idx = next(
                (i for i, msg in enumerate(new_messages) if isinstance(msg, dict) and msg.get('role') == 'system'),
                None,
            )

            if system_idx is None:
                new_messages.insert(0, {'role': 'system', 'content': format_section})
            else:
                system_msg = new_messages[system_idx]
                content = system_msg.get('content') or ''
                if isinstance(content, list):
                    # Content given as a list of parts: ``list + str`` would
                    # raise TypeError and drop the schema, so append it as an
                    # extra text part instead.
                    # NOTE(review): assumes OpenAI-style
                    # ``{'type': 'text', 'text': ...}`` parts - confirm.
                    new_content = [*content, {'type': 'text', 'text': format_section}]
                else:
                    new_content = content + format_section
                # Build a new dict rather than editing the caller's message.
                new_messages[system_idx] = {**system_msg, 'content': new_content}

            update['messages'] = new_messages
        else:
            logger.warning('Cannot inject response_format schema into '
                           'non-list messages for GPT-OSS; clearing response_format only.')
    except Exception as e:
        # Deliberate best-effort: a failed injection must not fail the
        # request, but response_format still has to be cleared to avoid the
        # Harmony/JSON mode conflict.
        logger.error(f'Failed to convert response_format to Harmony-native mode for GPT-OSS: {e}')
        update = {'response_format': None}

    self.request = self.request.model_copy(update=update)

Comment thread
windreamer marked this conversation as resolved.
Comment thread
windreamer marked this conversation as resolved.
def stream_chunk(self, delta_text: str, delta_token_ids: list[int], **kwargs) -> tuple[DeltaMessage | None, bool]:
if (
not delta_text
Expand Down
Loading