Azure · mmkawale · Jun 5, 2026 · Jun 5, 2026 · Jun 8, 2026 · Jun 8, 2026
@@ -1,5 +1,13 @@
 # Release History
 
+## 1.17.1 (Unreleased)
+
+### Features Added
+
+- Enabled `ToolCallAccuracyEvaluator`, `_ToolInputAccuracyEvaluator`, and `_ToolCallSuccessEvaluator` to run on conversations that include built-in restricted tools (`bing_grounding`, `bing_custom_search`, `azure_ai_search`, `azure_fabric`, `sharepoint_grounding`). These three evaluators grade the agent's tool selection, input arguments, and call status — none of which requires the (redacted) tool output body — so the previous unconditional rejection of conversations containing restricted tools is now lifted. This is done by setting `check_for_unsupported_tools=False` on each evaluator's input validator. `GroundednessEvaluator` and `_ToolOutputUtilizationEvaluator` continue to reject restricted tools because they consume the tool output body.
+- Exported `_ToolInputAccuracyEvaluator` from the top-level `azure.ai.evaluation` namespace so consumers no longer need to reach into the private `_evaluators._tool_input_accuracy` submodule. The other tool evaluators were already exposed there, so all four (`ToolCallAccuracyEvaluator`, `_ToolCallSuccessEvaluator`, `_ToolInputAccuracyEvaluator`, `_ToolOutputUtilizationEvaluator`) can now be imported from the same place.
+- `_ToolCallSuccessEvaluator` now deterministically returns `fail` (`tool_call_success` = `0.0`, `tool_call_success_passed` = `False`) without invoking the LLM when any `tool_call` or `tool_result` content block in a list-form response carries a known-failure `status` (`failed`, `error`, `incomplete`, `cancelled`/`canceled`; matched case-insensitively). This matches the evaluator's binary contract ("FALSE: at least one tool call failed") and prevents the prompty rubric — which doesn't see the `status` field — from mis-grading conversations whose only failure signal is the runtime-reported execution status. Behavior is unchanged for responses in which no `status` is populated or every `status` is outside that failure set.
+
 ## 1.17.0 (2026-06-03)
 
 ### Breaking Changes

@@ -34,6 +34,7 @@
 from ._evaluators._document_retrieval import DocumentRetrievalEvaluator
 from ._evaluators._tool_output_utilization import _ToolOutputUtilizationEvaluator
 from ._evaluators._tool_call_success import _ToolCallSuccessEvaluator
+from ._evaluators._tool_input_accuracy import _ToolInputAccuracyEvaluator
 from ._model_configurations import (
     AzureAIProject,
     AzureOpenAIModelConfiguration,
@@ -135,6 +136,7 @@ def lazy_import():
     "ToolCallAccuracyEvaluator",
     "_ToolOutputUtilizationEvaluator",
     "_ToolCallSuccessEvaluator",
+    "_ToolInputAccuracyEvaluator",
     "AzureOpenAIGrader",
     "AzureOpenAILabelGrader",
     "AzureOpenAIStringCheckGrader",

@@ -103,7 +103,7 @@ def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE,
         # Initialize input validator
         self._validator = ToolCallsValidator(
             error_target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
-            check_for_unsupported_tools=True,
+            check_for_unsupported_tools=False,
         )
 
         super().__init__(

@@ -78,7 +78,7 @@ def __init__(self, model_config, *, credential=None, **kwargs):
         self._validator = ToolDefinitionsValidator(
             error_target=ErrorTarget.TOOL_CALL_SUCCESS_EVALUATOR,
             requires_query=False,
-            check_for_unsupported_tools=True,
+            check_for_unsupported_tools=False,
         )
 
         super().__init__(
@@ -179,6 +179,37 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]:  # t
                 target=ErrorTarget.TOOL_CALL_SUCCESS_EVALUATOR,
             )
 
+        # Short-circuit: if the agent runtime already reported a failed tool
+        # execution via a known-failure ``status`` (e.g. "failed", "error",
+        # "incomplete"), deterministically return ``fail`` without calling the
+        # LLM. The evaluator's scoring contract is binary -- "FALSE: at least
+        # one tool call failed" -- and the prompty rubric doesn't see the
+        # ``status`` field, so it would otherwise grade only the (typically
+        # empty) result body and frequently mis-score the conversation as a
+        # pass. ``status`` is only populated by upstream converters that
+        # preserve it; absent ``status``, behavior is unchanged.
+        if isinstance(eval_input.get("response"), list):
+            failed_statuses = _collect_failed_tool_statuses(eval_input["response"])
+            if failed_statuses:
+                reason = (
+                    "Detected failed tool execution(s) with status "
+                    + ", ".join(sorted(set(failed_statuses)))
+                    + ". Marked as fail without LLM grading."
+                )
+                return {
+                    self._result_key: 0.0,
+                    f"{self._result_key}_score": 0.0,
+                    f"{self._result_key}_passed": False,
+                    f"{self._result_key}_result": "fail",
+                    f"{self._result_key}_reason": reason,
+                    f"{self._result_key}_status": "completed",
+                    f"{self._result_key}_threshold": self._threshold,
+                    f"{self._result_key}_properties": {
+                        "short_circuit": "tool_status",
+                        "failed_statuses": sorted(set(failed_statuses)),
+                    },
+                }
+
         if isinstance(eval_input.get("response"), list):
             eval_input["response"] = _preprocess_messages(eval_input["response"])
             eval_input["tool_calls"] = _reformat_tool_calls_results(eval_input["response"], logger)
@@ -271,6 +302,43 @@ def _filter_to_used_tools(tool_definitions, msgs_list, logger=None):
         return tool_definitions
 
 
+_FAILED_TOOL_STATUSES = frozenset({"failed", "error", "incomplete", "cancelled", "canceled"})
+
+
+def _collect_failed_tool_statuses(agent_response_msgs):
+    """Return the failure statuses found on any ``tool_call`` or
+    ``tool_result`` content block in ``agent_response_msgs``.
+
+    Malformed input is tolerated: a non-list argument, non-dict messages,
+    non-list ``content`` and non-dict blocks are skipped instead of raising.
+    Matching against ``_FAILED_TOOL_STATUSES`` is case-insensitive.
+
+    :param agent_response_msgs: The agent's response message list. Any
+        non-list value yields an empty result.
+    :type agent_response_msgs: list
+    :return: Lowercased failure status strings in encounter order, with
+        duplicates preserved. An empty list means no failure status was found.
+    :rtype: list[str]
+    """
+    found = []
+    if not isinstance(agent_response_msgs, list):
+        return found
+    for msg in agent_response_msgs:
+        if not isinstance(msg, dict):
+            continue
+        content = msg.get("content")
+        if not isinstance(content, list):  # covers both missing and plain-string content
+            continue
+        for block in content:
+            if not isinstance(block, dict):
+                continue
+            if block.get("type") in ("tool_call", "tool_result"):  # other block types are never inspected
+                status = block.get("status")
+                if isinstance(status, str) and status.lower() in _FAILED_TOOL_STATUSES:  # non-str status is ignored
+                    found.append(status.lower())
+    return found
+
+
 def _get_tool_calls_results(agent_response_msgs):
     """Extract formatted agent tool calls and results from response."""
     agent_response_text = []

@@ -92,7 +92,7 @@ def __init__(
         self._validator = ToolDefinitionsValidator(
             error_target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR,
             optional_tool_definitions=False,
-            check_for_unsupported_tools=True,
+            check_for_unsupported_tools=False,
         )
 
         super().__init__(

@@ -3,4 +3,4 @@
 # ---------------------------------------------------------
 # represents upcoming version
 
-VERSION = "1.17.0"
+VERSION = "1.17.1"
@@ -0,0 +1,172 @@
+from unittest.mock import MagicMock
+
+import pytest
+
+from azure.ai.evaluation import _ToolCallSuccessEvaluator
+from azure.ai.evaluation._evaluators._tool_call_success._tool_call_success import (
+    _collect_failed_tool_statuses,
+)
+
+
+# Default prompty mock that always grades as PASS. Tests that exercise the
+# deterministic short-circuit path rely on this mock NOT being called.
+async def _flow_pass(timeout, **kwargs):  # pylint: disable=unused-argument
+    return {
+        "llm_output": {
+            "reason": "All tool calls completed successfully.",
+            "score": 1,  # the non-short-circuit tests below expect this to surface as passed=True
+            "properties": {},
+        }
+    }
+
+
+def _assistant_tool_call(tool_call_id="call_1", name="get_weather", arguments=None, status=None):
+    block = {
+        "type": "tool_call",
+        "tool_call_id": tool_call_id,
+        "name": name,
+        "arguments": arguments or {"location": "NYC"},  # NOTE: any falsy value (None or {}) gets the default
+    }
+    if status is not None:  # omit the key entirely so "status absent" traces can be built
+        block["status"] = status
+    return {"role": "assistant", "content": [block]}  # one assistant message wrapping a single tool_call block
+
+
+def _tool_result(tool_call_id="call_1", result="72F sunny", status=None):
+    block = {"type": "tool_result", "tool_result": result}
+    if status is not None:  # omit the key entirely so "status absent" traces can be built
+        block["status"] = status
+    return {"role": "tool", "tool_call_id": tool_call_id, "content": [block]}  # id sits on the message, not the block
+
+
+# ---------------------------------------------------------------------------
+# _collect_failed_tool_statuses
+# ---------------------------------------------------------------------------
+
+
+class TestCollectFailedToolStatuses:  # exercises the helper directly; no evaluator instance is built
+    def test_no_status_returns_empty(self):
+        msgs = [_assistant_tool_call(), _tool_result()]
+        assert _collect_failed_tool_statuses(msgs) == []
+
+    def test_completed_status_returns_empty(self):
+        msgs = [
+            _assistant_tool_call(status="completed"),
+            _tool_result(status="completed"),
+        ]
+        assert _collect_failed_tool_statuses(msgs) == []
+
+    @pytest.mark.parametrize("status", ["failed", "error", "incomplete", "cancelled", "canceled"])
+    def test_known_failure_status_on_tool_call_is_collected(self, status):
+        msgs = [_assistant_tool_call(status=status)]
+        assert _collect_failed_tool_statuses(msgs) == [status]
+
+    @pytest.mark.parametrize("status", ["FAILED", "Error", "Incomplete"])
+    def test_failure_status_is_case_insensitive(self, status):
+        msgs = [_assistant_tool_call(status=status)]
+        assert _collect_failed_tool_statuses(msgs) == [status.lower()]  # returned value is lowercased
+
+    def test_failure_status_on_tool_result_is_collected(self):
+        msgs = [_assistant_tool_call(), _tool_result(status="failed")]
+        assert _collect_failed_tool_statuses(msgs) == ["failed"]
+
+    def test_unknown_status_string_is_ignored(self):
+        msgs = [_assistant_tool_call(status="something_weird")]
+        assert _collect_failed_tool_statuses(msgs) == []
+
+    def test_non_string_status_is_ignored(self):
+        msgs = [_assistant_tool_call(status=500)]  # an int status must not raise on .lower()
+        assert _collect_failed_tool_statuses(msgs) == []
+
+    def test_malformed_inputs_are_tolerated(self):
+        # Non-list input
+        assert _collect_failed_tool_statuses(None) == []
+        assert _collect_failed_tool_statuses("not a list") == []
+        # List with non-dict items + dicts with non-list content
+        msgs = [
+            "string entry",
+            42,
+            {"role": "assistant"},  # no content
+            {"role": "assistant", "content": "not a list"},
+            {"role": "assistant", "content": [None, 1, "x", _assistant_tool_call(status="failed")["content"][0]]},
+        ]
+        assert _collect_failed_tool_statuses(msgs) == ["failed"]  # only the one valid block is picked up
+
+
+# ---------------------------------------------------------------------------
+# _do_eval short-circuit
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.usefixtures("mock_model_config")
+@pytest.mark.unittest
+class TestToolCallSuccessShortCircuit:
+    def test_short_circuits_on_failed_tool_call_status(self, mock_model_config):
+        evaluator = _ToolCallSuccessEvaluator(model_config=mock_model_config)
+        evaluator._flow = MagicMock(side_effect=_flow_pass)  # would grade "pass" if it were ever reached
+
+        response = [_assistant_tool_call(status="failed"), _tool_result()]
+        result = evaluator(response=response)
+
+        evaluator._flow.assert_not_called()  # the LLM flow must be bypassed entirely
+        assert result["tool_call_success"] == 0.0
+        assert result["tool_call_success_score"] == 0.0
+        assert result["tool_call_success_passed"] is False
+        assert result["tool_call_success_result"] == "fail"
+        assert result["tool_call_success_status"] == "completed"  # evaluation completed; only the grade is "fail"
+        assert "failed" in result["tool_call_success_reason"]
+        props = result["tool_call_success_properties"]
+        assert props["short_circuit"] == "tool_status"
+        assert props["failed_statuses"] == ["failed"]
+
+    def test_short_circuits_on_failed_tool_result_status(self, mock_model_config):
+        evaluator = _ToolCallSuccessEvaluator(model_config=mock_model_config)
+        evaluator._flow = MagicMock(side_effect=_flow_pass)
+
+        response = [_assistant_tool_call(), _tool_result(status="error")]  # failure only on the tool_result block
+        result = evaluator(response=response)
+
+        evaluator._flow.assert_not_called()
+        assert result["tool_call_success_result"] == "fail"
+        assert result["tool_call_success_properties"]["failed_statuses"] == ["error"]
+
+    def test_dedupes_and_sorts_failed_statuses_in_reason(self, mock_model_config):
+        evaluator = _ToolCallSuccessEvaluator(model_config=mock_model_config)
+        evaluator._flow = MagicMock(side_effect=_flow_pass)
+
+        response = [
+            _assistant_tool_call(tool_call_id="a", status="failed"),
+            _tool_result(tool_call_id="a", status="failed"),  # duplicate "failed" should collapse to one entry
+            _assistant_tool_call(tool_call_id="b", status="error"),
+        ]
+        result = evaluator(response=response)
+
+        evaluator._flow.assert_not_called()
+        # Reason joins deduped, sorted statuses
+        assert "error, failed" in result["tool_call_success_reason"]
+        assert result["tool_call_success_properties"]["failed_statuses"] == ["error", "failed"]
+
+    def test_no_short_circuit_when_all_statuses_completed(self, mock_model_config):
+        evaluator = _ToolCallSuccessEvaluator(model_config=mock_model_config)
+        evaluator._flow = MagicMock(side_effect=_flow_pass)
+
+        response = [
+            _assistant_tool_call(status="completed"),
+            _tool_result(status="completed"),
+        ]
+        result = evaluator(response=response)
+
+        evaluator._flow.assert_called_once()  # Goes to LLM
+        assert result["tool_call_success_passed"] is True
+
+    def test_no_short_circuit_when_status_absent(self, mock_model_config):
+        """Back-compat: traces produced by converters that do not preserve
+        ``status`` continue to be graded by the LLM as before."""
+        evaluator = _ToolCallSuccessEvaluator(model_config=mock_model_config)
+        evaluator._flow = MagicMock(side_effect=_flow_pass)
+
+        response = [_assistant_tool_call(), _tool_result()]  # neither block carries a status key
+        result = evaluator(response=response)
+
+        evaluator._flow.assert_called_once()
+        assert result["tool_call_success_passed"] is True