Skip to content

Commit 9e5a5a0

Browse files
authored
chore(llmobs): add base datasets for experiments (#13852)
Adds the basic support for creating and pulling dataset objects from the new Datadog LLMObs Experiments product. Testing is done using the new cassette logic added in 1.27.4 to the test agent which enables us to easily record requests made to the Datadog API. Instructions for running the tests are included at the top of the tests file. Dependent on DataDog/dd-apm-test-agent#223 being merged + released. [](https://datadoghq.atlassian.net/browse/MLOB-3260)
1 parent 2fe07c3 commit 9e5a5a0

18 files changed

+544
-4
lines changed

.gitlab/services.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
LOG_LEVEL: ERROR
1919
SNAPSHOT_DIR: ${CI_PROJECT_DIR}/tests/snapshots
2020
SNAPSHOT_CI: 1
21+
VCR_CASSETTES_DIRECTORY: ${CI_PROJECT_DIR}/tests/llmobs/llmobs_cassettes
2122
PORT: 9126
2223
DD_POOL_TRACE_CHECK_FAILURES: true
2324
DD_DISABLE_ERROR_RESPONSES: true

ddtrace/llmobs/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,10 @@
66
LLMObs.enable()
77
"""
88

9+
from ._experiment import Dataset
10+
from ._experiment import DatasetRecord
911
from ._llmobs import LLMObs
1012
from ._llmobs import LLMObsSpan
1113

1214

13-
__all__ = ["LLMObs", "LLMObsSpan"]
15+
__all__ = ["LLMObs", "LLMObsSpan", "Dataset", "DatasetRecord"]

ddtrace/llmobs/_constants.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,10 @@
4747
EVP_SUBDOMAIN_HEADER_NAME = "X-Datadog-EVP-Subdomain"
4848
SPAN_SUBDOMAIN_NAME = "llmobs-intake"
4949
EVAL_SUBDOMAIN_NAME = "api"
50+
EXP_SUBDOMAIN_NAME = "api"
5051
AGENTLESS_SPAN_BASE_URL = "https://{}".format(SPAN_SUBDOMAIN_NAME)
5152
AGENTLESS_EVAL_BASE_URL = "https://{}".format(EVAL_SUBDOMAIN_NAME)
53+
AGENTLESS_EXP_BASE_URL = "https://{}".format(EXP_SUBDOMAIN_NAME)
5254

5355
EVP_PAYLOAD_SIZE_LIMIT = 5 << 20 # 5MB (actual limit is 5.1MB)
5456
EVP_EVENT_SIZE_LIMIT = (1 << 20) - 1024 # 999KB (actual limit is 1MB)

ddtrace/llmobs/_experiment.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
from typing import Any
2+
from typing import Dict
3+
from typing import List
4+
from typing import Optional
5+
from typing import TypedDict
6+
from typing import Union
7+
8+
from typing_extensions import NotRequired
9+
10+
11+
JSONType = Union[str, int, float, bool, None, List["JSONType"], Dict[str, "JSONType"]]
12+
NonNoneJSONType = Union[str, int, float, bool, List[JSONType], Dict[str, JSONType]]
13+
14+
15+
class DatasetRecord(TypedDict):
16+
input: NonNoneJSONType
17+
expected_output: JSONType
18+
metadata: Dict[str, Any]
19+
record_id: NotRequired[Optional[str]]
20+
21+
22+
class Dataset:
23+
name: str
24+
_id: str
25+
_data: List[DatasetRecord]
26+
27+
def __init__(self, name: str, dataset_id: str, data: List[DatasetRecord]) -> None:
28+
self.name = name
29+
self._id = dataset_id
30+
self._data = data

ddtrace/llmobs/_llmobs.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@
7373
from ddtrace.llmobs._constants import TAGS
7474
from ddtrace.llmobs._context import LLMObsContextProvider
7575
from ddtrace.llmobs._evaluators.runner import EvaluatorRunner
76+
from ddtrace.llmobs._experiment import Dataset
7677
from ddtrace.llmobs._utils import AnnotationContext
7778
from ddtrace.llmobs._utils import LinkTracker
7879
from ddtrace.llmobs._utils import ToolCallTracker
@@ -85,6 +86,7 @@
8586
from ddtrace.llmobs._utils import validate_prompt
8687
from ddtrace.llmobs._writer import LLMObsEvalMetricWriter
8788
from ddtrace.llmobs._writer import LLMObsEvaluationMetricEvent
89+
from ddtrace.llmobs._writer import LLMObsExperimentsClient
8890
from ddtrace.llmobs._writer import LLMObsSpanEvent
8991
from ddtrace.llmobs._writer import LLMObsSpanWriter
9092
from ddtrace.llmobs._writer import should_use_agentless
@@ -155,6 +157,7 @@ def get_tag(self, key: str) -> Optional[str]:
155157
class LLMObs(Service):
156158
_instance = None # type: LLMObs
157159
enabled = False
160+
_app_key: str = os.environ.get("DD_APP_KEY", "")
158161

159162
def __init__(
160163
self,
@@ -180,6 +183,12 @@ def __init__(
180183
interval=float(os.getenv("_DD_LLMOBS_EVALUATOR_INTERVAL", 1.0)),
181184
llmobs_service=self,
182185
)
186+
self._dne_client = LLMObsExperimentsClient(
187+
interval=float(os.getenv("_DD_LLMOBS_WRITER_INTERVAL", 1.0)),
188+
timeout=float(os.getenv("_DD_LLMOBS_WRITER_TIMEOUT", 5.0)),
189+
_app_key=self._app_key,
190+
is_agentless=True, # agent proxy doesn't seem to work for experiments
191+
)
183192

184193
forksafe.register(self._child_after_fork)
185194

@@ -447,6 +456,7 @@ def enable(
447456
instrumented_proxy_urls: Optional[Set[str]] = None,
448457
site: Optional[str] = None,
449458
api_key: Optional[str] = None,
459+
app_key: Optional[str] = None,
450460
env: Optional[str] = None,
451461
service: Optional[str] = None,
452462
span_processor: Optional[Callable[[LLMObsSpan], LLMObsSpan]] = None,
@@ -462,6 +472,7 @@ def enable(
462472
:param set[str] instrumented_proxy_urls: A set of instrumented proxy URLs to help detect when to emit LLM spans.
463473
:param str site: Your datadog site.
464474
:param str api_key: Your datadog api key.
475+
:param str app_key: Your datadog application key.
465476
:param str env: Your environment name.
466477
:param str service: Your service name.
467478
:param Callable[[LLMObsSpan], LLMObsSpan] span_processor: A function that takes an LLMObsSpan and returns an
@@ -477,6 +488,7 @@ def enable(
477488
# grab required values for LLMObs
478489
config._dd_site = site or config._dd_site
479490
config._dd_api_key = api_key or config._dd_api_key
491+
cls._app_key = app_key or cls._app_key
480492
config.env = env or config.env
481493
config.service = service or config.service
482494
config._llmobs_ml_app = ml_app or config._llmobs_ml_app
@@ -549,6 +561,18 @@ def enable(
549561
config._llmobs_ml_app,
550562
)
551563

564+
@classmethod
565+
def pull_dataset(cls, name: str) -> Dataset:
566+
return cls._instance._dne_client.dataset_pull(name)
567+
568+
@classmethod
569+
def create_dataset(cls, name: str, description: str) -> Dataset:
570+
return cls._instance._dne_client.dataset_create(name, description)
571+
572+
@classmethod
573+
def _delete_dataset(cls, dataset_id: str) -> None:
574+
return cls._instance._dne_client.dataset_delete(dataset_id)
575+
552576
@classmethod
553577
def register_processor(cls, processor: Optional[Callable[[LLMObsSpan], LLMObsSpan]] = None) -> None:
554578
"""Register a processor to be called on each LLMObs span.

ddtrace/llmobs/_writer.py

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
import atexit
2+
import json
23
from typing import Any
34
from typing import Dict
45
from typing import List
56
from typing import Optional
67
from typing import Union
8+
from urllib.parse import quote
79

810

911
# TypedDict was added to typing in python 3.8
@@ -24,6 +26,7 @@
2426
from ddtrace.internal.utils.retry import fibonacci_backoff_with_jitter
2527
from ddtrace.llmobs import _telemetry as telemetry
2628
from ddtrace.llmobs._constants import AGENTLESS_EVAL_BASE_URL
29+
from ddtrace.llmobs._constants import AGENTLESS_EXP_BASE_URL
2730
from ddtrace.llmobs._constants import AGENTLESS_SPAN_BASE_URL
2831
from ddtrace.llmobs._constants import DROPPED_IO_COLLECTION_ERROR
2932
from ddtrace.llmobs._constants import DROPPED_VALUE_TEXT
@@ -33,8 +36,12 @@
3336
from ddtrace.llmobs._constants import EVP_PAYLOAD_SIZE_LIMIT
3437
from ddtrace.llmobs._constants import EVP_PROXY_AGENT_BASE_PATH
3538
from ddtrace.llmobs._constants import EVP_SUBDOMAIN_HEADER_NAME
39+
from ddtrace.llmobs._constants import EXP_SUBDOMAIN_NAME
3640
from ddtrace.llmobs._constants import SPAN_ENDPOINT
3741
from ddtrace.llmobs._constants import SPAN_SUBDOMAIN_NAME
42+
from ddtrace.llmobs._experiment import Dataset
43+
from ddtrace.llmobs._experiment import DatasetRecord
44+
from ddtrace.llmobs._experiment import JSONType
3845
from ddtrace.llmobs._utils import safe_json
3946
from ddtrace.settings._agent import config as agent_config
4047

@@ -110,6 +117,7 @@ def __init__(
110117
is_agentless: bool,
111118
_site: str = "",
112119
_api_key: str = "",
120+
_app_key: str = "",
113121
_override_url: str = "",
114122
) -> None:
115123
super(BaseLLMObsWriter, self).__init__(interval=interval)
@@ -119,6 +127,7 @@ def __init__(
119127
self._timeout: float = timeout
120128
self._api_key: str = _api_key or config._dd_api_key
121129
self._site: str = _site or config._dd_site
130+
self._app_key: str = _app_key
122131
self._override_url: str = _override_url
123132

124133
self._agentless: bool = is_agentless
@@ -264,6 +273,97 @@ def _data(self, events: List[LLMObsEvaluationMetricEvent]) -> Dict[str, Any]:
264273
return {"data": {"type": "evaluation_metric", "attributes": {"metrics": events}}}
265274

266275

276+
class LLMObsExperimentsClient(BaseLLMObsWriter):
277+
EVP_SUBDOMAIN_HEADER_VALUE = EXP_SUBDOMAIN_NAME
278+
AGENTLESS_BASE_URL = AGENTLESS_EXP_BASE_URL
279+
ENDPOINT = ""
280+
281+
def request(self, method: str, path: str, body: JSONType = None) -> Response:
282+
headers = {
283+
"Content-Type": "application/json",
284+
"DD-API-KEY": self._api_key,
285+
"DD-APPLICATION-KEY": self._app_key,
286+
}
287+
if not self._agentless:
288+
headers[EVP_SUBDOMAIN_HEADER_NAME] = self.EVP_SUBDOMAIN_HEADER_VALUE
289+
290+
encoded_body = json.dumps(body).encode("utf-8") if body else b""
291+
conn = get_connection(self._intake)
292+
try:
293+
url = self._intake + self._endpoint + path
294+
logger.debug("requesting %s", url)
295+
conn.request(method, url, encoded_body, headers)
296+
resp = conn.getresponse()
297+
return Response.from_http_response(resp)
298+
finally:
299+
conn.close()
300+
301+
def dataset_delete(self, dataset_id: str) -> None:
302+
path = "/api/unstable/llm-obs/v1/datasets/delete"
303+
resp = self.request(
304+
"POST",
305+
path,
306+
body={
307+
"data": {
308+
"type": "datasets",
309+
"attributes": {
310+
"type": "soft",
311+
"dataset_ids": [dataset_id],
312+
},
313+
},
314+
},
315+
)
316+
if resp.status != 200:
317+
raise ValueError(f"Failed to delete dataset {id}: {resp.get_json()}")
318+
return None
319+
320+
def dataset_create(self, name: str, description: str) -> Dataset:
321+
path = "/api/unstable/llm-obs/v1/datasets"
322+
body: JSONType = {
323+
"data": {
324+
"type": "datasets",
325+
"attributes": {"name": name, "description": description},
326+
}
327+
}
328+
resp = self.request("POST", path, body)
329+
if resp.status != 200:
330+
raise ValueError(f"Failed to create dataset {name}: {resp.status} {resp.get_json()}")
331+
response_data = resp.get_json()
332+
dataset_id = response_data["data"]["id"]
333+
return Dataset(name, dataset_id, [])
334+
335+
def dataset_pull(self, name: str) -> Dataset:
336+
path = f"/api/unstable/llm-obs/v1/datasets?filter[name]={quote(name)}"
337+
resp = self.request("GET", path)
338+
if resp.status != 200:
339+
raise ValueError(f"Failed to pull dataset {name}: {resp.status} {resp.get_json()}")
340+
341+
response_data = resp.get_json()
342+
data = response_data["data"]
343+
if not data:
344+
raise ValueError(f"Dataset '{name}' not found")
345+
346+
dataset_id = data[0]["id"]
347+
url = f"/api/unstable/llm-obs/v1/datasets/{dataset_id}/records"
348+
resp = self.request("GET", url)
349+
if resp.status == 404:
350+
raise ValueError(f"Dataset '{name}' not found")
351+
records_data = resp.get_json()
352+
353+
class_records: List[DatasetRecord] = []
354+
for record in records_data.get("data", []):
355+
attrs = record.get("attributes", {})
356+
class_records.append(
357+
{
358+
"record_id": record["id"],
359+
"input": attrs["input"],
360+
"expected_output": attrs["expected_output"],
361+
"metadata": attrs.get("metadata", {}),
362+
}
363+
)
364+
return Dataset(name, dataset_id, class_records)
365+
366+
267367
class LLMObsSpanWriter(BaseLLMObsWriter):
268368
"""Writes span events to the LLMObs Span Endpoint."""
269369

docker-compose.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,9 +126,11 @@ services:
126126
- "127.0.0.1:9126:8126"
127127
volumes:
128128
- ./tests/snapshots:/snapshots
129+
- ./tests/llmobs/llmobs_cassettes:/cassettes
129130
environment:
130131
- LOG_LEVEL=WARNING
131132
- SNAPSHOT_DIR=/snapshots
133+
- VCR_CASSETTES_DIRECTORY=/cassettes
132134
- SNAPSHOT_CI=0
133135
- DD_POOL_TRACE_CHECK_FAILURES=true
134136
- DD_DISABLE_ERROR_RESPONSES=true

tests/llmobs/_utils.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,14 @@
2424
cassette_library_dir=os.path.join(os.path.dirname(__file__), "llmobs_cassettes/"),
2525
record_mode="once",
2626
match_on=["path"],
27-
filter_headers=["authorization", "OpenAI-Organization", "api-key", "x-api-key", ("DD-API-KEY", "XXXXXX")],
27+
filter_headers=[
28+
"authorization",
29+
"OpenAI-Organization",
30+
"api-key",
31+
"x-api-key",
32+
("DD-API-KEY", "XXXXXX"),
33+
("DD-APPLICATION-KEY", "XXXXXX"),
34+
],
2835
# Ignore requests to the agent
2936
ignore_localhost=True,
3037
)

tests/llmobs/conftest.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,11 @@ def ddtrace_global_config():
9999

100100

101101
def default_global_config():
102-
return {"_dd_api_key": "<not-a-real-api_key>", "_llmobs_ml_app": "unnamed-ml-app", "service": "tests.llmobs"}
102+
return {
103+
"_dd_api_key": os.environ.get("DD_API_KEY", "<not-a-real-api_key>"),
104+
"_llmobs_ml_app": "unnamed-ml-app",
105+
"service": "tests.llmobs",
106+
}
103107

104108

105109
@pytest.fixture
@@ -172,7 +176,7 @@ def tracer():
172176
@pytest.fixture
173177
def llmobs_env():
174178
return {
175-
"DD_API_KEY": "<default-not-a-real-key>",
179+
"DD_API_KEY": os.environ.get("DD_API_KEY", "<default-not-a-real-key>"),
176180
"DD_LLMOBS_ML_APP": "unnamed-ml-app",
177181
}
178182

@@ -269,6 +273,7 @@ def llmobs(
269273
llmobs_service.enable(_tracer=tracer, **llmobs_enable_opts)
270274
llmobs_service._instance._llmobs_span_writer = llmobs_span_writer
271275
llmobs_service._instance._llmobs_span_writer.start()
276+
llmobs_service._instance._dne_client._intake = "http://localhost:9126/vcr/datadog"
272277
yield llmobs_service
273278
tracer.shutdown()
274279
llmobs_service.disable()
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
interactions:
2+
- request:
3+
body: '{"data": {"type": "datasets", "attributes": {"type": "soft", "dataset_ids":
4+
["e3608c37-7422-4a91-8897-9dc6097e86b4"]}}}'
5+
headers:
6+
Accept:
7+
- '*/*'
8+
? !!python/object/new:multidict._multidict.istr
9+
- Accept-Encoding
10+
: - identity
11+
Connection:
12+
- keep-alive
13+
Content-Length:
14+
- '119'
15+
? !!python/object/new:multidict._multidict.istr
16+
- Content-Type
17+
: - application/json
18+
User-Agent:
19+
- python-requests/2.32.3
20+
method: POST
21+
uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/delete
22+
response:
23+
body:
24+
string: '{"data":[{"id":"e3608c37-7422-4a91-8897-9dc6097e86b4","type":"datasets","attributes":{"author":{"id":"0dd9d379-c1a3-11ed-b4e0-566658a732f8"},"created_at":"2025-07-07T19:14:51.167054Z","current_version":0,"deleted_at":"2025-07-07T19:14:53.628842Z","description":"A
25+
test dataset","name":"test-dataset","updated_at":"2025-07-07T19:14:51.167054Z"}}]}'
26+
headers:
27+
content-length:
28+
- '346'
29+
content-security-policy:
30+
- frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
31+
content-type:
32+
- application/vnd.api+json
33+
date:
34+
- Mon, 07 Jul 2025 19:14:53 GMT
35+
strict-transport-security:
36+
- max-age=31536000; includeSubDomains; preload
37+
vary:
38+
- Accept-Encoding
39+
x-content-type-options:
40+
- nosniff
41+
x-frame-options:
42+
- SAMEORIGIN
43+
status:
44+
code: 200
45+
message: OK
46+
version: 1

0 commit comments

Comments
 (0)