diff --git a/ddtrace/llmobs/_experiment.py b/ddtrace/llmobs/_experiment.py
index 44d3691e8bf..c53bfef98ca 100644
--- a/ddtrace/llmobs/_experiment.py
+++ b/ddtrace/llmobs/_experiment.py
@@ -1072,6 +1072,8 @@ def _generate_metric_from_evaluation(
         metric_type = "boolean"
     elif isinstance(eval_value, (int, float)):
         metric_type = "score"
+    elif isinstance(eval_value, dict):
+        metric_type = "json"
     else:
         metric_type = "categorical"
         eval_value = str(eval_value).lower()
diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py
index 87039a5ca3b..b7aacf4c950 100644
--- a/ddtrace/llmobs/_llmobs.py
+++ b/ddtrace/llmobs/_llmobs.py
@@ -2125,9 +2125,9 @@ def submit_evaluation(
             raise ValueError("label value must not contain a '.'.")
 
         metric_type = metric_type.lower()
-        if metric_type not in ("categorical", "score", "boolean"):
+        if metric_type not in ("categorical", "score", "boolean", "json"):
             error = "invalid_metric_type"
-            raise ValueError("metric_type must be one of 'categorical', 'score', or 'boolean'.")
+            raise ValueError("metric_type must be one of 'categorical', 'score', 'boolean', or 'json'.")
 
         if metric_type == "categorical" and not isinstance(value, str):
             error = "invalid_metric_value"
@@ -2138,6 +2138,9 @@ def submit_evaluation(
         if metric_type == "boolean" and not isinstance(value, bool):
             error = "invalid_metric_value"
             raise TypeError("value must be a boolean for a boolean metric.")
+        if metric_type == "json" and not isinstance(value, dict):
+            error = "invalid_metric_value"
+            raise TypeError("value must be a dict for a json metric.")
 
         if tags is not None and not isinstance(tags, dict):
             raise LLMObsSubmitEvaluationError("tags must be a dictionary of string key-value pairs.")
diff --git a/ddtrace/llmobs/_telemetry.py b/ddtrace/llmobs/_telemetry.py
index 463fe545eb0..ee0b733d5b4 100644
--- a/ddtrace/llmobs/_telemetry.py
+++ b/ddtrace/llmobs/_telemetry.py
@@ -197,7 +197,7 @@ def record_llmobs_user_processor_called(error: bool) -> None:
 
 
 def record_llmobs_submit_evaluation(join_on: Dict[str, Any], metric_type: str, error: Optional[str]):
-    _metric_type = metric_type if metric_type in ("categorical", "score", "boolean") else "other"
+    _metric_type = metric_type if metric_type in ("categorical", "score", "boolean", "json") else "other"
     custom_joining_key = str(int(join_on.get("tag") is not None))
     tags = _base_tags(error)
     tags.extend([("metric_type", _metric_type), ("custom_joining_key", custom_joining_key)])
diff --git a/ddtrace/llmobs/_writer.py b/ddtrace/llmobs/_writer.py
index 3ac1631571e..12a8f586237 100644
--- a/ddtrace/llmobs/_writer.py
+++ b/ddtrace/llmobs/_writer.py
@@ -101,6 +101,7 @@ class LLMObsExperimentEvalMetricEvent(TypedDict, total=False):
     categorical_value: str
     score_value: float
     boolean_value: bool
+    json_value: Dict[str, JSONType]
     error: Optional[Dict[str, str]]
     tags: List[str]
     experiment_id: str
diff --git a/releasenotes/notes/llmobs-json-metric-type-2e58cd6c746f9947.yaml b/releasenotes/notes/llmobs-json-metric-type-2e58cd6c746f9947.yaml
new file mode 100644
index 00000000000..cc20d134aa3
--- /dev/null
+++ b/releasenotes/notes/llmobs-json-metric-type-2e58cd6c746f9947.yaml
@@ -0,0 +1,7 @@
+---
+features:
+  - |
+    LLM Observability: Adds support for the ``json`` metric type in evaluation metrics.
+    Users can now submit ``dict`` values as evaluation metrics using ``LLMObs.submit_evaluation()``
+    with ``metric_type="json"``. Additionally, experiment evaluators that return ``dict`` values
+    are automatically detected as the ``json`` metric type.
diff --git a/tests/llmobs/test_llmobs_service.py b/tests/llmobs/test_llmobs_service.py
index 94a1ca0758a..3d2658c2290 100644
--- a/tests/llmobs/test_llmobs_service.py
+++ b/tests/llmobs/test_llmobs_service.py
@@ -1763,11 +1763,11 @@ def test_submit_evaluation_label_value_with_a_period_raises_error(llmobs, mock_l
 
 
 def test_submit_evaluation_incorrect_metric_type_raises_error(llmobs, mock_llmobs_logs):
-    with pytest.raises(ValueError, match="metric_type must be one of 'categorical', 'score', or 'boolean'."):
+    with pytest.raises(ValueError, match="metric_type must be one of 'categorical', 'score', 'boolean', or 'json'."):
         llmobs.submit_evaluation(
             span={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="wrong", value="high"
         )
-    with pytest.raises(ValueError, match="metric_type must be one of 'categorical', 'score', or 'boolean'."):
+    with pytest.raises(ValueError, match="metric_type must be one of 'categorical', 'score', 'boolean', or 'json'."):
         llmobs.submit_evaluation(
             span={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="", value="high"
         )
@@ -2163,3 +2163,10 @@ def test_submit_evaluation_incorrect_categorical_value_type_raises_error(llmobs,
         llmobs.submit_evaluation(
             span={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", value=123
         )
+
+
+def test_submit_evaluation_incorrect_json_value_type_raises_error(llmobs, mock_llmobs_logs):
+    with pytest.raises(TypeError, match="value must be a dict for a json metric."):
+        llmobs.submit_evaluation(
+            span={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="json", value="high"
+        )
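
Reviewer note, not part of the patch: a minimal usage sketch of the new metric type, assuming the public ``LLMObs.submit_evaluation()`` API exactly as exercised in the tests above. The ``label`` and ``value`` contents here are hypothetical examples; the ``span`` join keys mirror the test fixtures.

    from ddtrace.llmobs import LLMObs

    # With metric_type="json", a dict value is accepted; any non-dict value
    # raises the TypeError exercised in
    # test_submit_evaluation_incorrect_json_value_type_raises_error.
    LLMObs.submit_evaluation(
        span={"span_id": "123", "trace_id": "456"},  # join keys, as in the tests
        label="rubric",                              # hypothetical label
        metric_type="json",
        value={"accuracy": 0.9, "verdict": "pass"},  # arbitrary JSON-serializable dict
    )

    # In experiments, an evaluator that returns a dict is now mapped to the
    # "json" metric type by _generate_metric_from_evaluation (the new
    # `elif isinstance(eval_value, dict)` branch); other unrecognized types
    # still fall back to "categorical". The signature below is a hedged
    # sketch, not the exact experiments evaluator API.
    def quality_evaluator(input_data, output_data, expected_output):
        return {"exact_match": output_data == expected_output}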