Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
# Release History

## 1.17.1 (Unreleased)

### Features Added

- Enabled `ToolCallAccuracyEvaluator`, `_ToolInputAccuracyEvaluator`, and `_ToolCallSuccessEvaluator` to run on conversations that include built-in restricted tools (`bing_grounding`, `bing_custom_search`, `azure_ai_search`, `azure_fabric`, `sharepoint_grounding`). These three evaluators grade the agent's tool selection, input arguments, and call status — none of which require the (redacted) tool output body — so the previous unconditional rejection of conversations containing restricted tools is now lifted. Achieved by setting `check_for_unsupported_tools=False` on each evaluator's input validator. `GroundednessEvaluator` and `ToolOutputUtilizationEvaluator` continue to reject restricted tools because they consume the tool output body.
- Exported `_ToolInputAccuracyEvaluator` from the top-level `azure.ai.evaluation` namespace so consumers no longer need to reach into the private `_evaluators._tool_input_accuracy` submodule. The other tool evaluators were already exposed there; this brings the four siblings in line.
- `_ToolCallSuccessEvaluator` now deterministically returns `fail` (score `0`, `_passed=False`) without invoking the LLM when any `tool_call` or `tool_result` in the response carries a known-failure `status` (`failed`, `error`, `incomplete`, `cancelled`/`canceled`). This matches the evaluator's binary contract ("FALSE: at least one tool call failed") and prevents the prompty rubric — which doesn't see the `status` field — from mis-grading conversations whose only failure signal is the runtime-reported execution status. Behavior is unchanged for responses where no `status` is populated.

## 1.17.0 (2026-06-03)

### Breaking Changes
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
from ._evaluators._document_retrieval import DocumentRetrievalEvaluator
from ._evaluators._tool_output_utilization import _ToolOutputUtilizationEvaluator
from ._evaluators._tool_call_success import _ToolCallSuccessEvaluator
from ._evaluators._tool_input_accuracy import _ToolInputAccuracyEvaluator
from ._model_configurations import (
AzureAIProject,
AzureOpenAIModelConfiguration,
Expand Down Expand Up @@ -135,6 +136,7 @@ def lazy_import():
"ToolCallAccuracyEvaluator",
"_ToolOutputUtilizationEvaluator",
"_ToolCallSuccessEvaluator",
"_ToolInputAccuracyEvaluator",
"AzureOpenAIGrader",
"AzureOpenAILabelGrader",
"AzureOpenAIStringCheckGrader",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE,
# Initialize input validator
self._validator = ToolCallsValidator(
error_target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
check_for_unsupported_tools=True,
check_for_unsupported_tools=False,
)

super().__init__(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def __init__(self, model_config, *, credential=None, **kwargs):
self._validator = ToolDefinitionsValidator(
error_target=ErrorTarget.TOOL_CALL_SUCCESS_EVALUATOR,
requires_query=False,
check_for_unsupported_tools=True,
check_for_unsupported_tools=False,
)

super().__init__(
Expand Down Expand Up @@ -179,6 +179,37 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]: # t
target=ErrorTarget.TOOL_CALL_SUCCESS_EVALUATOR,
)

# Short-circuit: if the agent runtime already reported a failed tool
# execution via a known-failure ``status`` (e.g. "failed", "error",
# "incomplete"), deterministically return ``fail`` without calling the
# LLM. The evaluator's scoring contract is binary -- "FALSE: at least
# one tool call failed" -- and the prompty rubric doesn't see the
# ``status`` field, so it would otherwise grade only the (typically
# empty) result body and frequently mis-score the conversation as a
# pass. ``status`` is only populated by upstream converters that
# preserve it; absent ``status``, behavior is unchanged.
if isinstance(eval_input.get("response"), list):
failed_statuses = _collect_failed_tool_statuses(eval_input["response"])
if failed_statuses:
reason = (
"Detected failed tool execution(s) with status "
+ ", ".join(sorted(set(failed_statuses)))
+ ". Marked as fail without LLM grading."
)
return {
self._result_key: 0.0,
f"{self._result_key}_score": 0.0,
f"{self._result_key}_passed": False,
f"{self._result_key}_result": "fail",
f"{self._result_key}_reason": reason,
f"{self._result_key}_status": "completed",
f"{self._result_key}_threshold": self._threshold,
f"{self._result_key}_properties": {
"short_circuit": "tool_status",
"failed_statuses": sorted(set(failed_statuses)),
},
}

if isinstance(eval_input.get("response"), list):
eval_input["response"] = _preprocess_messages(eval_input["response"])
eval_input["tool_calls"] = _reformat_tool_calls_results(eval_input["response"], logger)
Expand Down Expand Up @@ -271,6 +302,43 @@ def _filter_to_used_tools(tool_definitions, msgs_list, logger=None):
return tool_definitions


_FAILED_TOOL_STATUSES = frozenset({"failed", "error", "incomplete", "cancelled", "canceled"})


def _collect_failed_tool_statuses(agent_response_msgs):
"""Return the list of failure statuses seen on any ``tool_call`` or
``tool_result`` content block in ``agent_response_msgs``.

Inputs are intentionally tolerated -- malformed messages / non-dict
content blocks are skipped rather than raised on, so this helper is safe
to call on freshly-deserialized agent traces.

:param agent_response_msgs: The agent's response message list (already
validated to be a list by the caller).
:type agent_response_msgs: list
:return: A list (with duplicates preserved) of lowercased failure status
strings. Empty list means no failure signal was found.
:rtype: list[str]
"""
found = []
if not isinstance(agent_response_msgs, list):
return found
for msg in agent_response_msgs:
if not isinstance(msg, dict):
continue
content = msg.get("content")
if not isinstance(content, list):
continue
for block in content:
if not isinstance(block, dict):
continue
if block.get("type") in ("tool_call", "tool_result"):
status = block.get("status")
if isinstance(status, str) and status.lower() in _FAILED_TOOL_STATUSES:
found.append(status.lower())
return found


def _get_tool_calls_results(agent_response_msgs):
"""Extract formatted agent tool calls and results from response."""
agent_response_text = []
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def __init__(
self._validator = ToolDefinitionsValidator(
error_target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR,
optional_tool_definitions=False,
check_for_unsupported_tools=True,
check_for_unsupported_tools=False,
)

super().__init__(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@
# ---------------------------------------------------------
# represents upcoming version

VERSION = "1.17.0"
VERSION = "1.17.1"
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
from unittest.mock import MagicMock

import pytest

from azure.ai.evaluation import _ToolCallSuccessEvaluator
from azure.ai.evaluation._evaluators._tool_call_success._tool_call_success import (
_collect_failed_tool_statuses,
)


# Default prompty mock that always grades as PASS. Tests that exercise the
# deterministic short-circuit path rely on this mock NOT being called.
async def _flow_pass(timeout, **kwargs): # pylint: disable=unused-argument
return {
"llm_output": {
"reason": "All tool calls completed successfully.",
"score": 1,
"properties": {},
}
}


def _assistant_tool_call(tool_call_id="call_1", name="get_weather", arguments=None, status=None):
block = {
"type": "tool_call",
"tool_call_id": tool_call_id,
"name": name,
"arguments": arguments or {"location": "NYC"},
}
if status is not None:
block["status"] = status
return {"role": "assistant", "content": [block]}


def _tool_result(tool_call_id="call_1", result="72F sunny", status=None):
block = {"type": "tool_result", "tool_result": result}
if status is not None:
block["status"] = status
return {"role": "tool", "tool_call_id": tool_call_id, "content": [block]}


# ---------------------------------------------------------------------------
# _collect_failed_tool_statuses
# ---------------------------------------------------------------------------


@pytest.mark.unittest
class TestCollectFailedToolStatuses:
    """Unit tests for the ``_collect_failed_tool_statuses`` helper.

    Marked ``unittest`` like the other test class in this module so that a
    marker-filtered run selects both classes.
    """

    def test_no_status_returns_empty(self):
        """Blocks without a ``status`` key produce no failure signal."""
        msgs = [_assistant_tool_call(), _tool_result()]
        assert _collect_failed_tool_statuses(msgs) == []

    def test_completed_status_returns_empty(self):
        """A ``completed`` status is not a failure."""
        msgs = [
            _assistant_tool_call(status="completed"),
            _tool_result(status="completed"),
        ]
        assert _collect_failed_tool_statuses(msgs) == []

    @pytest.mark.parametrize("status", ["failed", "error", "incomplete", "cancelled", "canceled"])
    def test_known_failure_status_on_tool_call_is_collected(self, status):
        """Each known failure status on a ``tool_call`` block is reported."""
        msgs = [_assistant_tool_call(status=status)]
        assert _collect_failed_tool_statuses(msgs) == [status]

    @pytest.mark.parametrize("status", ["FAILED", "Error", "Incomplete"])
    def test_failure_status_is_case_insensitive(self, status):
        """Matching ignores case and the reported value is lowercased."""
        msgs = [_assistant_tool_call(status=status)]
        assert _collect_failed_tool_statuses(msgs) == [status.lower()]

    def test_failure_status_on_tool_result_is_collected(self):
        """A failure status on a ``tool_result`` block is reported too."""
        msgs = [_assistant_tool_call(), _tool_result(status="failed")]
        assert _collect_failed_tool_statuses(msgs) == ["failed"]

    def test_unknown_status_string_is_ignored(self):
        """Unrecognized status strings are not treated as failures."""
        msgs = [_assistant_tool_call(status="something_weird")]
        assert _collect_failed_tool_statuses(msgs) == []

    def test_non_string_status_is_ignored(self):
        """Non-string status values are skipped rather than raising."""
        msgs = [_assistant_tool_call(status=500)]
        assert _collect_failed_tool_statuses(msgs) == []

    def test_malformed_inputs_are_tolerated(self):
        """Malformed inputs are skipped; valid blocks among them still count."""
        # Non-list input
        assert _collect_failed_tool_statuses(None) == []
        assert _collect_failed_tool_statuses("not a list") == []
        # List with non-dict items + dicts with non-list content
        msgs = [
            "string entry",
            42,
            {"role": "assistant"},  # no content
            {"role": "assistant", "content": "not a list"},
            {"role": "assistant", "content": [None, 1, "x", _assistant_tool_call(status="failed")["content"][0]]},
        ]
        assert _collect_failed_tool_statuses(msgs) == ["failed"]


# ---------------------------------------------------------------------------
# _do_eval short-circuit
# ---------------------------------------------------------------------------


@pytest.mark.usefixtures("mock_model_config")
@pytest.mark.unittest
class TestToolCallSuccessShortCircuit:
    """Tests for the status-based short-circuit in ``_ToolCallSuccessEvaluator``.

    Every test swaps the evaluator's ``_flow`` for a mock that would grade
    the response as a pass, then checks whether that mock was invoked.
    """

    @staticmethod
    def _evaluate(model_config, response):
        """Run a flow-mocked evaluator on ``response``.

        :return: The evaluator (so ``_flow`` can be inspected) and its result.
        :rtype: tuple
        """
        evaluator = _ToolCallSuccessEvaluator(model_config=model_config)
        evaluator._flow = MagicMock(side_effect=_flow_pass)
        outcome = evaluator(response=response)
        return evaluator, outcome

    def test_short_circuits_on_failed_tool_call_status(self, mock_model_config):
        evaluator, result = self._evaluate(
            mock_model_config,
            [_assistant_tool_call(status="failed"), _tool_result()],
        )

        evaluator._flow.assert_not_called()
        expected_values = {
            "tool_call_success": 0.0,
            "tool_call_success_score": 0.0,
        }
        for key, value in expected_values.items():
            assert result[key] == value
        assert result["tool_call_success_passed"] is False
        assert result["tool_call_success_result"] == "fail"
        assert result["tool_call_success_status"] == "completed"
        assert "failed" in result["tool_call_success_reason"]
        properties = result["tool_call_success_properties"]
        assert properties["short_circuit"] == "tool_status"
        assert properties["failed_statuses"] == ["failed"]

    def test_short_circuits_on_failed_tool_result_status(self, mock_model_config):
        evaluator, result = self._evaluate(
            mock_model_config,
            [_assistant_tool_call(), _tool_result(status="error")],
        )

        evaluator._flow.assert_not_called()
        assert result["tool_call_success_result"] == "fail"
        assert result["tool_call_success_properties"]["failed_statuses"] == ["error"]

    def test_dedupes_and_sorts_failed_statuses_in_reason(self, mock_model_config):
        conversation = [
            _assistant_tool_call(tool_call_id="a", status="failed"),
            _tool_result(tool_call_id="a", status="failed"),
            _assistant_tool_call(tool_call_id="b", status="error"),
        ]
        evaluator, result = self._evaluate(mock_model_config, conversation)

        evaluator._flow.assert_not_called()
        # Reason joins deduped, sorted statuses
        assert "error, failed" in result["tool_call_success_reason"]
        assert result["tool_call_success_properties"]["failed_statuses"] == ["error", "failed"]

    def test_no_short_circuit_when_all_statuses_completed(self, mock_model_config):
        conversation = [
            _assistant_tool_call(status="completed"),
            _tool_result(status="completed"),
        ]
        evaluator, result = self._evaluate(mock_model_config, conversation)

        evaluator._flow.assert_called_once()  # Goes to LLM
        assert result["tool_call_success_passed"] is True

    def test_no_short_circuit_when_status_absent(self, mock_model_config):
        """Back-compat: traces produced by converters that do not preserve
        ``status`` continue to be graded by the LLM as before."""
        evaluator, result = self._evaluate(
            mock_model_config,
            [_assistant_tool_call(), _tool_result()],
        )

        evaluator._flow.assert_called_once()
        assert result["tool_call_success_passed"] is True
Loading