Commit 40ac7e3 (parent: 7f107df)

feat(openai): Add gen_ai.client.time_to_first_token metric for streaming

Fixes #3932

6 files changed

Lines changed: 278 additions & 3 deletions

File tree

instrumentation-genai/opentelemetry-instrumentation-openai-v2/src/opentelemetry/instrumentation/openai_v2/instruments.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
from opentelemetry.metrics import Histogram, Meter
22
from opentelemetry.semconv._incubating.metrics import gen_ai_metrics
3+
from opentelemetry.util.genai.instruments import (
4+
GEN_AI_CLIENT_TIME_TO_FIRST_TOKEN,
5+
GEN_AI_CLIENT_TIME_TO_FIRST_TOKEN_BUCKETS,
6+
)
37

48
_GEN_AI_CLIENT_OPERATION_DURATION_BUCKETS = [
59
0.01,
@@ -50,3 +54,9 @@ def __init__(self, meter: Meter):
5054
unit="{token}",
5155
explicit_bucket_boundaries_advisory=_GEN_AI_CLIENT_TOKEN_USAGE_BUCKETS,
5256
)
57+
self.ttft_histogram: Histogram = meter.create_histogram(
58+
name=GEN_AI_CLIENT_TIME_TO_FIRST_TOKEN,
59+
description="Time to generate first token for successful responses",
60+
unit="s",
61+
explicit_bucket_boundaries_advisory=GEN_AI_CLIENT_TIME_TO_FIRST_TOKEN_BUCKETS,
62+
)

instrumentation-genai/opentelemetry-instrumentation-openai-v2/src/opentelemetry/instrumentation/openai_v2/patch.py

Lines changed: 76 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,10 @@ def traced_method(wrapped, instance, args, kwargs):
9090
parsed_result = result
9191
if is_streaming(kwargs):
9292
return LegacyChatStreamWrapper(
93-
parsed_result, span, logger, capture_content
93+
parsed_result, span, logger, capture_content,
94+
instruments=instruments,
95+
start_time=start,
96+
request_attributes=span_attributes,
9497
)
9598

9699
if span.is_recording():
@@ -195,7 +198,10 @@ async def traced_method(wrapped, instance, args, kwargs):
195198
parsed_result = result
196199
if is_streaming(kwargs):
197200
return LegacyChatStreamWrapper(
198-
parsed_result, span, logger, capture_content
201+
parsed_result, span, logger, capture_content,
202+
instruments=instruments,
203+
start_time=start,
204+
request_attributes=span_attributes,
199205
)
200206

201207
if span.is_recording():
@@ -631,6 +637,8 @@ def __init__(
631637
self.choice_buffers = []
632638
self._started = False
633639
self.capture_content = capture_content
640+
self._first_token_received = False
641+
self._first_token_time: Optional[float] = None
634642
self._setup()
635643

636644
def _setup(self):
@@ -752,8 +760,25 @@ def process_chunk(self, chunk):
752760
self.set_response_model(chunk)
753761
self.set_response_service_tier(chunk)
754762
self.build_streaming_response(chunk)
763+
self._detect_first_token(chunk)
755764
self.set_usage(chunk)
756765

766+
def _detect_first_token(self, chunk):
767+
if self._first_token_received:
768+
return
769+
if getattr(chunk, "choices", None) is None:
770+
return
771+
for choice in chunk.choices:
772+
if not choice.delta:
773+
continue
774+
if (
775+
choice.delta.content is not None
776+
or choice.delta.tool_calls is not None
777+
):
778+
self._first_token_received = True
779+
self._first_token_time = default_timer()
780+
return
781+
757782
def __getattr__(self, name):
758783
return getattr(self.stream, name)
759784

@@ -777,10 +802,16 @@ def __init__(
777802
span: Span,
778803
logger: Logger,
779804
capture_content: bool,
805+
instruments: Optional[Instruments] = None,
806+
start_time: Optional[float] = None,
807+
request_attributes: Optional[dict] = None,
780808
):
781809
super().__init__(stream, capture_content=capture_content)
782810
self.span = span
783811
self.logger = logger
812+
self._instruments = instruments
813+
self._start_time = start_time
814+
self._request_attributes = request_attributes or {}
784815

785816
def cleanup(self, error: Optional[BaseException] = None):
786817
if not self._started:
@@ -863,9 +894,43 @@ def cleanup(self, error: Optional[BaseException] = None):
863894
if error:
864895
handle_span_exception(self.span, error)
865896
else:
897+
self._record_ttft()
866898
self.span.end()
867899
self._started = False
868900

901+
def _record_ttft(self):
902+
if (
903+
self._instruments is None
904+
or self._start_time is None
905+
or self._first_token_time is None
906+
):
907+
return
908+
ttft = max(self._first_token_time - self._start_time, 0.0)
909+
common_attributes = {
910+
GenAIAttributes.GEN_AI_OPERATION_NAME: GenAIAttributes.GenAiOperationNameValues.CHAT.value,
911+
GenAIAttributes.GEN_AI_SYSTEM: GenAIAttributes.GenAiSystemValues.OPENAI.value,
912+
}
913+
if GenAIAttributes.GEN_AI_REQUEST_MODEL in self._request_attributes:
914+
common_attributes[GenAIAttributes.GEN_AI_REQUEST_MODEL] = (
915+
self._request_attributes[GenAIAttributes.GEN_AI_REQUEST_MODEL]
916+
)
917+
if self.response_model:
918+
common_attributes[GenAIAttributes.GEN_AI_RESPONSE_MODEL] = (
919+
self.response_model
920+
)
921+
if ServerAttributes.SERVER_ADDRESS in self._request_attributes:
922+
common_attributes[ServerAttributes.SERVER_ADDRESS] = (
923+
self._request_attributes[ServerAttributes.SERVER_ADDRESS]
924+
)
925+
if ServerAttributes.SERVER_PORT in self._request_attributes:
926+
common_attributes[ServerAttributes.SERVER_PORT] = (
927+
self._request_attributes[ServerAttributes.SERVER_PORT]
928+
)
929+
self._instruments.ttft_histogram.record(
930+
ttft,
931+
attributes=common_attributes,
932+
)
933+
869934

870935
class ChatStreamWrapper(BaseStreamWrapper):
871936
handler: TelemetryHandler
@@ -941,6 +1006,15 @@ def cleanup(self, error: Optional[BaseException] = None):
9411006
},
9421007
)
9431008

1009+
if (
1010+
self._first_token_time is not None
1011+
and self.invocation.monotonic_start_s is not None
1012+
):
1013+
self.invocation.time_to_first_token_s = max(
1014+
self._first_token_time - self.invocation.monotonic_start_s,
1015+
0.0,
1016+
)
1017+
9441018
self._set_output_messages()
9451019

9461020
if error:
Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
import pytest
2+
from tests.test_utils import DEFAULT_MODEL, USER_ONLY_PROMPT
3+
4+
from opentelemetry.semconv._incubating.attributes import (
5+
gen_ai_attributes as GenAIAttributes,
6+
)
7+
from opentelemetry.semconv._incubating.attributes import (
8+
server_attributes as ServerAttributes,
9+
)
10+
from opentelemetry.util.genai.instruments import (
11+
GEN_AI_CLIENT_TIME_TO_FIRST_TOKEN,
12+
GEN_AI_CLIENT_TIME_TO_FIRST_TOKEN_BUCKETS,
13+
get_metric_data_points,
14+
)
15+
from opentelemetry.util.genai.utils import is_experimental_mode
16+
17+
18+
def test_streaming_chat_records_ttft_metric(
19+
metric_reader, openai_client, instrument_with_content, vcr
20+
):
21+
"""TTFT metric is recorded for streaming chat completions."""
22+
with vcr.use_cassette("test_chat_completion_streaming.yaml"):
23+
response = openai_client.chat.completions.create(
24+
model=DEFAULT_MODEL,
25+
messages=USER_ONLY_PROMPT,
26+
stream=True,
27+
stream_options={"include_usage": True},
28+
)
29+
for _ in response:
30+
pass
31+
32+
data_points = get_metric_data_points(metric_reader, GEN_AI_CLIENT_TIME_TO_FIRST_TOKEN)
33+
assert len(data_points) == 1, (
34+
"expected exactly one TTFT data point for streaming"
35+
)
36+
37+
data_point = data_points[0]
38+
assert data_point.sum >= 0
39+
assert data_point.count == 1
40+
assert data_point.explicit_bounds == tuple(GEN_AI_CLIENT_TIME_TO_FIRST_TOKEN_BUCKETS)
41+
42+
latest_experimental_enabled = is_experimental_mode()
43+
assert GenAIAttributes.GEN_AI_OPERATION_NAME in data_point.attributes
44+
assert (
45+
data_point.attributes[GenAIAttributes.GEN_AI_OPERATION_NAME]
46+
== GenAIAttributes.GenAiOperationNameValues.CHAT.value
47+
)
48+
assert GenAIAttributes.GEN_AI_REQUEST_MODEL in data_point.attributes
49+
assert (
50+
data_point.attributes[GenAIAttributes.GEN_AI_REQUEST_MODEL]
51+
== "gpt-4o-mini"
52+
)
53+
assert ServerAttributes.SERVER_ADDRESS in data_point.attributes
54+
55+
56+
@pytest.mark.asyncio()
57+
async def test_async_streaming_chat_records_ttft_metric(
58+
metric_reader, async_openai_client, instrument_with_content, vcr
59+
):
60+
"""TTFT metric is recorded for async streaming chat completions."""
61+
with vcr.use_cassette("test_async_chat_completion_streaming.yaml"):
62+
response = await async_openai_client.chat.completions.create(
63+
model=DEFAULT_MODEL,
64+
messages=USER_ONLY_PROMPT,
65+
stream=True,
66+
stream_options={"include_usage": True},
67+
)
68+
async for _ in response:
69+
pass
70+
71+
data_points = get_metric_data_points(metric_reader, GEN_AI_CLIENT_TIME_TO_FIRST_TOKEN)
72+
assert len(data_points) == 1, (
73+
"expected exactly one TTFT data point for async streaming"
74+
)
75+
76+
data_point = data_points[0]
77+
assert data_point.sum >= 0
78+
assert data_point.count == 1
79+
assert data_point.explicit_bounds == tuple(GEN_AI_CLIENT_TIME_TO_FIRST_TOKEN_BUCKETS)
80+
81+
82+
def test_non_streaming_chat_does_not_record_ttft_metric(
83+
metric_reader, openai_client, instrument_with_content, vcr
84+
):
85+
"""TTFT metric should NOT be recorded for non-streaming requests."""
86+
with vcr.use_cassette("test_chat_completion_metrics.yaml"):
87+
openai_client.chat.completions.create(
88+
messages=USER_ONLY_PROMPT, model=DEFAULT_MODEL, stream=False
89+
)
90+
91+
data_points = get_metric_data_points(metric_reader, GEN_AI_CLIENT_TIME_TO_FIRST_TOKEN)
92+
assert len(data_points) == 0, (
93+
"gen_ai.client.time_to_first_token metric should not be recorded for non-streaming"
94+
)
95+
96+
97+
def test_streaming_tool_calls_records_ttft_metric(
98+
metric_reader, openai_client, instrument_with_content, vcr
99+
):
100+
"""TTFT metric is recorded for streaming responses with tool calls."""
101+
with vcr.use_cassette(
102+
"test_chat_completion_multiple_tools_streaming_with_content.yaml"
103+
):
104+
response = openai_client.chat.completions.create(
105+
model=DEFAULT_MODEL,
106+
messages=[{"role": "user", "content": "What's the weather?"}],
107+
stream=True,
108+
stream_options={"include_usage": True},
109+
tools=[
110+
{
111+
"type": "function",
112+
"function": {
113+
"name": "get_weather",
114+
"parameters": {
115+
"type": "object",
116+
"properties": {
117+
"location": {"type": "string"},
118+
},
119+
},
120+
},
121+
}
122+
],
123+
)
124+
for _ in response:
125+
pass
126+
127+
data_points = get_metric_data_points(metric_reader, GEN_AI_CLIENT_TIME_TO_FIRST_TOKEN)
128+
assert len(data_points) == 1, (
129+
"expected exactly one TTFT data point for streaming tool calls"
130+
)
131+
132+
data_point = data_points[0]
133+
assert data_point.sum >= 0
134+
assert data_point.count == 1

util/opentelemetry-util-genai/src/opentelemetry/util/genai/instruments.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,27 @@
3535
67108864,
3636
]
3737

38+
GEN_AI_CLIENT_TIME_TO_FIRST_TOKEN = "gen_ai.client.time_to_first_token"
39+
40+
GEN_AI_CLIENT_TIME_TO_FIRST_TOKEN_BUCKETS = [
41+
0.001,
42+
0.005,
43+
0.01,
44+
0.02,
45+
0.04,
46+
0.06,
47+
0.08,
48+
0.1,
49+
0.25,
50+
0.5,
51+
0.75,
52+
1.0,
53+
2.5,
54+
5.0,
55+
7.5,
56+
10.0,
57+
]
58+
3859

3960
def create_duration_histogram(meter: Meter) -> Histogram:
4061
return meter.create_histogram(
@@ -52,3 +73,25 @@ def create_token_histogram(meter: Meter) -> Histogram:
5273
unit="{token}",
5374
explicit_bucket_boundaries_advisory=_GEN_AI_CLIENT_TOKEN_USAGE_BUCKETS,
5475
)
76+
77+
78+
def create_ttft_histogram(meter: Meter) -> Histogram:
79+
return meter.create_histogram(
80+
name=GEN_AI_CLIENT_TIME_TO_FIRST_TOKEN,
81+
description="Time to generate first token for successful responses",
82+
unit="s",
83+
explicit_bucket_boundaries_advisory=GEN_AI_CLIENT_TIME_TO_FIRST_TOKEN_BUCKETS,
84+
)
85+
86+
87+
def get_metric_data_points(metric_reader, metric_name):
88+
"""Extract all data points for a given metric name from a metric reader."""
89+
results = []
90+
metrics = metric_reader.get_metrics_data().resource_metrics
91+
if not metrics:
92+
return results
93+
for scope_metrics in metrics[0].scope_metrics:
94+
for m in scope_metrics.metrics:
95+
if m.name == metric_name:
96+
results.extend(m.data.data_points)
97+
return results

util/opentelemetry-util-genai/src/opentelemetry/util/genai/metrics.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from opentelemetry.util.genai.instruments import (
1818
create_duration_histogram,
1919
create_token_histogram,
20+
create_ttft_histogram,
2021
)
2122
from opentelemetry.util.genai.types import LLMInvocation
2223
from opentelemetry.util.types import AttributeValue
@@ -28,6 +29,7 @@ class InvocationMetricsRecorder:
2829
def __init__(self, meter: Meter):
2930
self._duration_histogram: Histogram = create_duration_histogram(meter)
3031
self._token_histogram: Histogram = create_token_histogram(meter)
32+
self._ttft_histogram: Histogram = create_ttft_histogram(meter)
3133

3234
def record(
3335
self,
@@ -105,5 +107,12 @@ def record(
105107
context=span_context,
106108
)
107109

110+
if invocation.time_to_first_token_s is not None and not error_type:
111+
self._ttft_histogram.record(
112+
invocation.time_to_first_token_s,
113+
attributes=attributes,
114+
context=span_context,
115+
)
116+
108117

109118
__all__ = ["InvocationMetricsRecorder"]

util/opentelemetry-util-genai/src/opentelemetry/util/genai/types.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -335,7 +335,12 @@ class LLMInvocation(GenAIInvocation):
335335
seed: int | None = None
336336
server_address: str | None = None
337337
server_port: int | None = None
338-
338+
time_to_first_token_s: float | None = None
339+
"""
340+
Time to first token in seconds. This is the time from the start of
341+
the request (monotonic_start_s) to when the first output token is
342+
received. Only populated for streaming responses.
343+
"""
339344

340345
@dataclass
341346
class EmbeddingInvocation(GenAIInvocation):

Commit comments (0)