Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions ddtrace/llmobs/_experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -1072,6 +1072,8 @@ def _generate_metric_from_evaluation(
metric_type = "boolean"
elif isinstance(eval_value, (int, float)):
metric_type = "score"
elif isinstance(eval_value, dict):
metric_type = "json"
else:
metric_type = "categorical"
eval_value = str(eval_value).lower()
Expand Down
7 changes: 5 additions & 2 deletions ddtrace/llmobs/_llmobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -2125,9 +2125,9 @@ def submit_evaluation(
raise ValueError("label value must not contain a '.'.")

metric_type = metric_type.lower()
if metric_type not in ("categorical", "score", "boolean"):
if metric_type not in ("categorical", "score", "boolean", "json"):
error = "invalid_metric_type"
raise ValueError("metric_type must be one of 'categorical', 'score', or 'boolean'.")
raise ValueError("metric_type must be one of 'categorical', 'score', 'boolean', or 'json'.")

if metric_type == "categorical" and not isinstance(value, str):
error = "invalid_metric_value"
Expand All @@ -2138,6 +2138,9 @@ def submit_evaluation(
if metric_type == "boolean" and not isinstance(value, bool):
error = "invalid_metric_value"
raise TypeError("value must be a boolean for a boolean metric.")
if metric_type == "json" and not isinstance(value, dict):
error = "invalid_metric_value"
raise TypeError("value must be a dict for a json metric.")

if tags is not None and not isinstance(tags, dict):
raise LLMObsSubmitEvaluationError("tags must be a dictionary of string key-value pairs.")
Expand Down
2 changes: 1 addition & 1 deletion ddtrace/llmobs/_telemetry.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ def record_llmobs_user_processor_called(error: bool) -> None:


def record_llmobs_submit_evaluation(join_on: Dict[str, Any], metric_type: str, error: Optional[str]):
_metric_type = metric_type if metric_type in ("categorical", "score", "boolean") else "other"
_metric_type = metric_type if metric_type in ("categorical", "score", "boolean", "json") else "other"
custom_joining_key = str(int(join_on.get("tag") is not None))
tags = _base_tags(error)
tags.extend([("metric_type", _metric_type), ("custom_joining_key", custom_joining_key)])
Expand Down
1 change: 1 addition & 0 deletions ddtrace/llmobs/_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ class LLMObsExperimentEvalMetricEvent(TypedDict, total=False):
categorical_value: str
score_value: float
boolean_value: bool
json_value: Dict[str, JSONType]
error: Optional[Dict[str, str]]
tags: List[str]
experiment_id: str
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
features:
- |
LLM Observability: Adds support for ``json`` metric type in evaluation metrics.
Users can now submit ``dict`` values as evaluation metrics using ``LLMObs.submit_evaluation()``
with ``metric_type="json"``. Additionally, experiment evaluators that return ``dict`` values
are automatically detected as ``json`` metric type.
11 changes: 9 additions & 2 deletions tests/llmobs/test_llmobs_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -1763,11 +1763,11 @@ def test_submit_evaluation_label_value_with_a_period_raises_error(llmobs, mock_l


def test_submit_evaluation_incorrect_metric_type_raises_error(llmobs, mock_llmobs_logs):
    """An unrecognized or empty ``metric_type`` raises ValueError naming the accepted types.

    NOTE(review): the scraped diff view interleaved the pre-change and post-change
    versions of the ``match`` strings; this is the post-merge version, whose error
    message includes the newly supported ``json`` metric type.
    """
    # Unknown metric type.
    with pytest.raises(ValueError, match="metric_type must be one of 'categorical', 'score', 'boolean', or 'json'."):
        llmobs.submit_evaluation(
            span={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="wrong", value="high"
        )
    # Empty metric type is rejected the same way.
    with pytest.raises(ValueError, match="metric_type must be one of 'categorical', 'score', 'boolean', or 'json'."):
        llmobs.submit_evaluation(
            span={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="", value="high"
        )
Expand Down Expand Up @@ -2163,3 +2163,10 @@ def test_submit_evaluation_incorrect_categorical_value_type_raises_error(llmobs,
llmobs.submit_evaluation(
span={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", value=123
)


def test_submit_evaluation_incorrect_json_value_type_raises_error(llmobs, mock_llmobs_logs):
    """A non-dict value submitted with ``metric_type="json"`` raises TypeError."""
    join_on = {"span_id": "123", "trace_id": "456"}
    with pytest.raises(TypeError, match="value must be a dict for a json metric."):
        llmobs.submit_evaluation(span=join_on, label="toxicity", metric_type="json", value="high")
Loading