open-telemetry · keith-decker · Apr 15, 2026 · Apr 15, 2026 · Apr 16, 2026 · Apr 16, 2026
@@ -21,6 +21,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 of repeatedly failing on every upload ([https://github.com/open-telemetry/opentelemetry-python-contrib/pull/4390](#4390)).
 - Refactor public API: add factory methods (`start_inference`, `start_embedding`, `start_tool`, `start_workflow`) and invocation-owned lifecycle (`invocation.stop()` / `invocation.fail(exc)`); rename `LLMInvocation` → `InferenceInvocation` and `ToolCall` → `ToolInvocation`. Existing usages remain fully functional via deprecated aliases.
   ([#4391](https://github.com/open-telemetry/opentelemetry-python-contrib/pull/4391))
+- Record `gen_ai.client.operation.duration` metrics for `ToolInvocation` ([#4443](https://github.com/open-telemetry/opentelemetry-python-contrib/pull/4443))
 
 
 ## Version 0.3b0 (2026-02-20)

@@ -45,7 +45,7 @@ class ToolInvocation(GenAIInvocation):
     - error.type: Error type if operation failed (Conditionally Required)
     """
 
-    def __init__(
+    def __init__(  # pylint: disable=too-many-locals
         self,
         tracer: Tracer,
         metrics_recorder: InvocationMetricsRecorder,
@@ -79,6 +79,13 @@ def __init__(
         self.tool_result = tool_result
         self._start()
 
+    def _get_metric_attributes(self) -> dict[str, Any]:
+        attrs: dict[str, Any] = {  # start from the operation name
+            GenAI.GEN_AI_OPERATION_NAME: self._operation_name,
+        }
+        attrs.update(self.metric_attributes)  # merged last; wins on clashes
+        return attrs  # fresh dict; self.metric_attributes is not mutated
+
     def _apply_finish(self, error: Error | None = None) -> None:
         if error is not None:
             self._apply_error_attributes(error)
@@ -94,3 +101,4 @@ def _apply_finish(self, error: Error | None = None) -> None:
         }
         attributes.update(self.attributes)
         self.span.set_attributes(attributes)
+        self._metrics_recorder.record(self)
@@ -181,3 +181,69 @@ def _assert_metric_scope_schema_urls(
                 self.assertEqual(
                     scope_metric.scope.schema_url, expected_schema_url
                 )
+
+
+class TelemetryHandlerToolMetricsTest(TestBase):
+    def _harvest_metrics(self) -> Dict[str, List[Any]]:
+        metrics = self.get_sorted_metrics()
+        metrics_by_name: Dict[str, List[Any]] = {}  # metric name -> points
+        for metric in metrics or []:  # tolerate a None/empty result
+            points = metric.data.data_points or []  # same for data_points
+            metrics_by_name.setdefault(metric.name, []).extend(points)
+        return metrics_by_name
+
+    def test_stop_tool_records_duration(self) -> None:  # success path
+        handler = TelemetryHandler(
+            tracer_provider=self.tracer_provider,
+            meter_provider=self.meter_provider,
+        )
+        with patch("timeit.default_timer", return_value=1000.0):
+            invocation = handler.start_tool("get_weather")  # t0 = 1000.0
+        invocation.metric_attributes = {"custom.key": "custom_value"}
+
+        with patch("timeit.default_timer", return_value=1002.5):
+            invocation.stop()  # t1 = 1002.5
+
+        metrics = self._harvest_metrics()
+        self.assertIn("gen_ai.client.operation.duration", metrics)
+        duration_points = metrics["gen_ai.client.operation.duration"]
+        self.assertEqual(len(duration_points), 1)  # one stop() -> one point
+        duration_point = duration_points[0]
+
+        self.assertEqual(
+            duration_point.attributes[GenAI.GEN_AI_OPERATION_NAME],
+            "execute_tool",
+        )
+        self.assertEqual(  # attribute set after start must reach the point
+            duration_point.attributes["custom.key"], "custom_value"
+        )
+        self.assertAlmostEqual(duration_point.sum, 2.5, places=3)  # t1 - t0
+        self.assertNotIn("gen_ai.client.token.usage", metrics)  # duration only
+
+    def test_fail_tool_records_duration_with_error(self) -> None:  # fail path
+        handler = TelemetryHandler(
+            tracer_provider=self.tracer_provider,
+            meter_provider=self.meter_provider,
+        )
+        with patch("timeit.default_timer", return_value=500.0):
+            invocation = handler.start_tool("failing_tool")  # t0 = 500.0
+
+        error = Error(message="Tool execution failed", type=RuntimeError)
+        with patch("timeit.default_timer", return_value=501.5):
+            invocation.fail(error)  # t1 = 501.5
+
+        metrics = self._harvest_metrics()
+        self.assertIn("gen_ai.client.operation.duration", metrics)
+        duration_points = metrics["gen_ai.client.operation.duration"]
+        self.assertEqual(len(duration_points), 1)  # one fail() -> one point
+        duration_point = duration_points[0]
+
+        self.assertEqual(  # type=RuntimeError is expected as its class name
+            duration_point.attributes["error.type"], "RuntimeError"
+        )
+        self.assertEqual(
+            duration_point.attributes[GenAI.GEN_AI_OPERATION_NAME],
+            "execute_tool",
+        )
+        self.assertAlmostEqual(duration_point.sum, 1.5, places=3)  # t1 - t0
+        self.assertNotIn("gen_ai.client.token.usage", metrics)  # duration only