Skip to content

Commit ddb3870

Browse files
authored
feat(llmobs): emitting reasoning tokens metric for openai integration (#15478)
## Description Emit the reasoning tokens metric for the OpenAI integration, and remove reasoning token counts from span metadata (they are now reported as a metric instead). MLOB-4264 ## Testing Covered by updated integration tests and a new VCR cassette recording a `gpt-5-mini` Responses API call that returns `output_tokens_details.reasoning_tokens`. ## Risks None. ## Additional Notes Chat Completions reports this under `completion_tokens_details`, while the Responses API reports it under `output_tokens_details`; both are handled.
1 parent 2656bf2 commit ddb3870

File tree

7 files changed

+218
-23
lines changed

7 files changed

+218
-23
lines changed

ddtrace/llmobs/_constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
CACHE_WRITE_INPUT_TOKENS_METRIC_KEY = "cache_write_input_tokens"
4444
CACHE_READ_INPUT_TOKENS_METRIC_KEY = "cache_read_input_tokens"
4545
BILLABLE_CHARACTER_COUNT_METRIC_KEY = "billable_character_count"
46+
REASONING_OUTPUT_TOKENS_METRIC_KEY = "reasoning_output_tokens"
4647

4748
EVAL_ENDPOINT = "/api/intake/llm-obs/v2/eval-metric"
4849
SPAN_ENDPOINT = "/api/v2/llmobs"

ddtrace/llmobs/_integrations/openai.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from ddtrace.llmobs._constants import OUTPUT_TOKENS_METRIC_KEY
2020
from ddtrace.llmobs._constants import OUTPUT_VALUE
2121
from ddtrace.llmobs._constants import PROXY_REQUEST
22+
from ddtrace.llmobs._constants import REASONING_OUTPUT_TOKENS_METRIC_KEY
2223
from ddtrace.llmobs._constants import SPAN_KIND
2324
from ddtrace.llmobs._constants import TOTAL_TOKENS_METRIC_KEY
2425
from ddtrace.llmobs._integrations.base import BaseLLMIntegration
@@ -221,6 +222,13 @@ def _extract_llmobs_metrics_tags(
221222
cached_tokens = _get_attr(prompt_tokens_details, "cached_tokens", None)
222223
if cached_tokens is not None:
223224
metrics[CACHE_READ_INPUT_TOKENS_METRIC_KEY] = cached_tokens
225+
# Chat completion returns `completion_tokens_details` while responses api returns `output_tokens_details`
226+
reasoning_output_tokens_details = _get_attr(token_usage, "completion_tokens_details", {}) or _get_attr(
227+
token_usage, "output_tokens_details", {}
228+
)
229+
reasoning_output_tokens = _get_attr(reasoning_output_tokens_details, "reasoning_tokens", None)
230+
if reasoning_output_tokens is not None:
231+
metrics[REASONING_OUTPUT_TOKENS_METRIC_KEY] = reasoning_output_tokens
224232
return metrics
225233
elif kwargs.get("stream") and resp is not None:
226234
prompt_tokens = _compute_prompt_tokens(kwargs.get("prompt", None), kwargs.get("messages", None))

ddtrace/llmobs/_integrations/utils.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -768,11 +768,6 @@ def openai_get_metadata_from_response(
768768
if value is not None:
769769
metadata[field] = load_data_value(value)
770770

771-
usage = getattr(response, "usage", None)
772-
output_tokens_details = getattr(usage, "output_tokens_details", None)
773-
reasoning_tokens = getattr(output_tokens_details, "reasoning_tokens", 0)
774-
metadata["reasoning_tokens"] = reasoning_tokens
775-
776771
return metadata
777772

778773

@@ -1181,6 +1176,13 @@ def llmobs_metrics(self) -> Optional[Dict[str, Any]]:
11811176
metrics["output_tokens"] = usage.output_tokens
11821177
if hasattr(usage, "total_tokens"):
11831178
metrics["total_tokens"] = usage.total_tokens
1179+
# Chat completion returns `completion_tokens_details` while responses api returns `output_tokens_details`
1180+
reasoning_output_tokens_details = _get_attr(usage, "completion_tokens_details", {}) or _get_attr(
1181+
usage, "output_tokens_details", {}
1182+
)
1183+
reasoning_output_tokens = _get_attr(reasoning_output_tokens_details, "reasoning_tokens", None)
1184+
if reasoning_output_tokens is not None:
1185+
metrics["reasoning_output_tokens"] = reasoning_output_tokens
11841186

11851187
return metrics if metrics else None
11861188

@@ -1202,9 +1204,6 @@ def llmobs_metadata(self) -> Optional[Dict[str, Any]]:
12021204
if hasattr(self.response, "text") and self.response.text:
12031205
metadata["text"] = load_data_value(self.response.text)
12041206

1205-
if hasattr(self.response, "usage") and hasattr(self.response.usage, "output_tokens_details"):
1206-
metadata["reasoning_tokens"] = self.response.usage.output_tokens_details.reasoning_tokens
1207-
12081207
if self.span_type == "custom" and hasattr(self._raw_oai_span.span_data, "data"):
12091208
custom_data = getattr(self._raw_oai_span.span_data, "data", None)
12101209
if custom_data:
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
---
2+
features:
3+
- |
4+
LLM Observability: Reasoning token counts are now captured from OpenAI and OpenAI Agents responses.
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
interactions:
2+
- request:
3+
body: '{"input":[{"content":"You are a helpful assistant.","role":"system"},{"content":"Who
4+
won the world series in 2020?","role":"user"},{"content":"The Los Angeles Dodgers
5+
won the World Series in 2020.","role":"assistant"},{"content":"Where was it
6+
played?","role":"user"}],"max_output_tokens":500,"model":"gpt-5-mini","user":"ddtrace-test"}'
7+
headers:
8+
accept:
9+
- application/json
10+
accept-encoding:
11+
- gzip, deflate
12+
connection:
13+
- keep-alive
14+
content-length:
15+
- '335'
16+
content-type:
17+
- application/json
18+
host:
19+
- api.openai.com
20+
user-agent:
21+
- OpenAI/Python 2.3.0
22+
x-stainless-arch:
23+
- arm64
24+
x-stainless-async:
25+
- 'false'
26+
x-stainless-lang:
27+
- python
28+
x-stainless-os:
29+
- MacOS
30+
x-stainless-package-version:
31+
- 2.3.0
32+
x-stainless-read-timeout:
33+
- '600'
34+
x-stainless-retry-count:
35+
- '0'
36+
x-stainless-runtime:
37+
- CPython
38+
x-stainless-runtime-version:
39+
- 3.12.10
40+
method: POST
41+
uri: https://api.openai.com/v1/responses
42+
response:
43+
body:
44+
string: !!binary |
45+
H4sIAAAAAAAAAwAAAP//dFTBjts4DL3PVwg6tcCkkJ3EsedW7GKLBRboYYvdQ1EYtERntCNLgkRN
46+
Jyjy7wvLiWO3Mzebj6Qe+Uj+uGOMa8UfGA8YfSsQVQ2laOoSDyBLUVRNqfp9uce+ruuiUYBNqapt
47+
01UHhG134PdjCtf9h5KuaZyNONllQCBULYxYcah2VdHs94eMRQJKcYyRbvAGCdUU1IF8OgaX7Mir
48+
BxNxMmtjtD3yB/bjjjHGuIcThjFe4TMa5zHwO8bO2RlDcCNmkzHZoO31lVYhgTZxjUYKSZJ2dmUf
49+
4KV1iXyiltwTZnAvxIyRc6aVYNbZBqfQjMSOnjb7zaCt3pSi3G9EvRHXjuWs/IF9zcVMJd3EiG9L
50+
UZWykKMUXS8O0O12qtiJcretc+KchE4eJzEgOjs2bYZiGgYIp/Hhb9l2vn+NwBCPbzNododKZAYK
51+
5LbZV1W/3WJZil8ZDBgjHHHx/huqZ1A6S2hvXVkSW6W9aoIvNEdnB7DWEVx1/PptBRp39MF1ryA5
52+
0QPjXx6RlaIU7F8XjGJ/Y9AY2XeIzBs4oWJoSQc0JwbEPhnXIftL98j+0GgU05Z9DOOMkrP37Au+
53+
QGTvgFlMFMBsoiZkEALYIw5oiXUoIUVkrmf0iOy3z//8+fumaJgHq3DQ8v0HPpM8X75m3jw4k3sB
54+
MepIYGlyHh2zE/cQwBg06ymlkKZ98gGftUuxva5sm6Wfp9gHN3hqJchHbJ/w9CYWcBRNO7v0uI3e
55+
vK/Y9y7QNBVKp+Gi3GIkx+h5hSP0SKdWqzF5r3G1zhHDs5bYkr6egB6SmYaBR3IBV7eDcPAYgFK2
56+
Fx/ExZpVv9DrXRjg9r+Ytuy3XBb+jKFzUdNpUc1MfGr3o9Ny0ieR4zNwGz5OzreLkRSz0S85hmQl
57+
XLrLlY7QmeuhTHm15gK0Xd+p3f2v9sXxm8vMKqpboFiV+vP5K6rda8hriecZWESX9So7OQJzg8ui
58+
ntuY4kVaRQEkbgjjRd8BCRQQjG+d787/AwAA//8DAIgarZTFBgAA
59+
headers:
60+
CF-RAY:
61+
- 9a7533cccb62a16e-BOS
62+
Connection:
63+
- keep-alive
64+
Content-Encoding:
65+
- gzip
66+
Content-Type:
67+
- application/json
68+
Date:
69+
- Mon, 01 Dec 2025 20:06:02 GMT
70+
Server:
71+
- cloudflare
72+
Set-Cookie:
73+
- __cf_bm=2kGXnalD2OsHSa2nTDfcC.x2Ika0f2_xb7jheZf7DU4-1764619562-1.0.1.1-Au.Yl85NNbXOu16kuncX5kBMQzP4RpRl0PV6BVnlPTErwQVRCI0DupekNakHl7Axhdd.qH_fm2I0r0zPzdZKWnuDTN0VPhzRIJ4F0baZOQ0;
74+
path=/; expires=Mon, 01-Dec-25 20:36:02 GMT; domain=.api.openai.com; HttpOnly;
75+
Secure; SameSite=None
76+
- _cfuvid=HDI66hWV9Becq_pRyYlPYhGoaajJRBY_ADJKn2yxQxA-1764619562018-0.0.1.1-604800000;
77+
path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None
78+
Strict-Transport-Security:
79+
- max-age=31536000; includeSubDomains; preload
80+
Transfer-Encoding:
81+
- chunked
82+
X-Content-Type-Options:
83+
- nosniff
84+
alt-svc:
85+
- h3=":443"; ma=86400
86+
cf-cache-status:
87+
- DYNAMIC
88+
openai-organization:
89+
- datadog-staging
90+
openai-processing-ms:
91+
- '4080'
92+
openai-project:
93+
- proj_gt6TQZPRbZfoY2J9AQlEJMpd
94+
openai-version:
95+
- '2020-10-01'
96+
x-envoy-upstream-service-time:
97+
- '4083'
98+
x-ratelimit-limit-requests:
99+
- '30000'
100+
x-ratelimit-limit-tokens:
101+
- '180000000'
102+
x-ratelimit-remaining-requests:
103+
- '29999'
104+
x-ratelimit-remaining-tokens:
105+
- '180000000'
106+
x-ratelimit-reset-requests:
107+
- 2ms
108+
x-ratelimit-reset-tokens:
109+
- 0s
110+
x-request-id:
111+
- req_47f608c3045b489ea39bcb3e4ad37cb8
112+
status:
113+
code: 200
114+
message: OK
115+
version: 1

0 commit comments

Comments
 (0)