From 7ac46e230ddbfeb47a84f8f0e7831215001a856c Mon Sep 17 00:00:00 2001 From: philipph-askui Date: Wed, 24 Jun 2026 09:57:09 +0200 Subject: [PATCH 1/3] feat: add a zoom tool to facilitate interaction with small elements (modeled on Anthropic's enhanced computer-use actions: https://platform.claude.com/docs/en/agents-and-tools/tool-use/computer-use-tool#available-actions) --- src/askui/tools/agent_os.py | 7 +- src/askui/tools/askui/askui_controller.py | 5 +- src/askui/tools/computer_agent_os_facade.py | 17 +++- src/askui/tools/playwright/agent_os.py | 5 +- src/askui/tools/playwright/agent_os_facade.py | 4 +- .../store/computer/experimental/__init__.py | 2 + .../tools/store/computer/experimental/zoom.py | 99 +++++++++++++++++++ tests/unit/tools/test_zoom_tool.py | 78 +++++++++++++++ 8 files changed, 212 insertions(+), 5 deletions(-) create mode 100644 src/askui/tools/store/computer/experimental/zoom.py create mode 100644 tests/unit/tools/test_zoom_tool.py diff --git a/src/askui/tools/agent_os.py b/src/askui/tools/agent_os.py index 96ecc831..af9cc96d 100644 --- a/src/askui/tools/agent_os.py +++ b/src/askui/tools/agent_os.py @@ -263,13 +263,18 @@ def disconnect(self) -> None: """ @abstractmethod - def screenshot(self, report: bool = True) -> Image.Image: + def screenshot(self, report: bool = True, unscaled: bool = False) -> Image.Image: """ Captures a screenshot of the current display. Args: report (bool, optional): Whether to include the screenshot in reporting. Defaults to `True`. + unscaled (bool, optional): Whether to return the screenshot at its + full, real-screen resolution instead of the resolution shown to + the model. Only has an effect on scaling implementations (e.g. + `ComputerAgentOsFacade`); implementations that already return the + native resolution ignore it. Defaults to `False`. Returns: Image.Image: A PIL Image object containing the screenshot. 
diff --git a/src/askui/tools/askui/askui_controller.py b/src/askui/tools/askui/askui_controller.py index 26aeb5d0..2abd8083 100644 --- a/src/askui/tools/askui/askui_controller.py +++ b/src/askui/tools/askui/askui_controller.py @@ -349,13 +349,16 @@ def _stop_execution(self) -> None: @telemetry.record_call() @override - def screenshot(self, report: bool = True) -> Image.Image: + def screenshot(self, report: bool = True, unscaled: bool = False) -> Image.Image: """ Take a screenshot of the current screen. Args: report (bool, optional): Whether to include the screenshot in reporting. Defaults to `True`. + unscaled (bool, optional): Accepted for interface compatibility. This + client always returns the native screen resolution, so it has no + effect. Defaults to `False`. Returns: Image.Image: A PIL Image object containing the screenshot. diff --git a/src/askui/tools/computer_agent_os_facade.py b/src/askui/tools/computer_agent_os_facade.py index 676a6454..3a148462 100644 --- a/src/askui/tools/computer_agent_os_facade.py +++ b/src/askui/tools/computer_agent_os_facade.py @@ -66,10 +66,25 @@ def disconnect(self) -> None: self._agent_os.disconnect() self._scaler.real_screen_resolution = None - def screenshot(self, report: bool = True) -> Image.Image: + def screenshot(self, report: bool = True, unscaled: bool = False) -> Image.Image: screenshot = self._agent_os.screenshot(report=report) + if unscaled: + self._scaler.real_screen_resolution = screenshot.size + return screenshot return self._scaler.scale_screenshot(screenshot) + def scale_point_to_real_screen(self, x: float, y: float) -> tuple[int, int]: + """Map a point from the model coordinate space to real screen pixels. + + Args: + x (float): The horizontal coordinate in the model coordinate space. + y (float): The vertical coordinate in the model coordinate space. + + Returns: + tuple[int, int]: The corresponding `(x, y)` in real screen pixels. 
+ """ + return self._scaler.scale_coordinates(x, y) + def _take_silent_screenshot(self) -> Image.Image: return self.screenshot(report=False) diff --git a/src/askui/tools/playwright/agent_os.py b/src/askui/tools/playwright/agent_os.py index 6381be37..5f46e837 100644 --- a/src/askui/tools/playwright/agent_os.py +++ b/src/askui/tools/playwright/agent_os.py @@ -197,12 +197,15 @@ def disconnect(self) -> None: ) @override - def screenshot(self, report: bool = True) -> Image.Image: + def screenshot(self, report: bool = True, unscaled: bool = False) -> Image.Image: """Capture a screenshot of the current page. Args: report (bool, optional): Whether to include the screenshot in reporting. Defaults to `True`. + unscaled (bool, optional): Accepted for interface compatibility. This + agent OS always returns the native page resolution, so it has no + effect. Defaults to `False`. Returns: Image.Image: A PIL Image object containing the screenshot. diff --git a/src/askui/tools/playwright/agent_os_facade.py b/src/askui/tools/playwright/agent_os_facade.py index c6969fe4..1f68d61b 100644 --- a/src/askui/tools/playwright/agent_os_facade.py +++ b/src/askui/tools/playwright/agent_os_facade.py @@ -55,8 +55,10 @@ def disconnect(self) -> None: self._agent_os.disconnect() self._scaler.real_screen_resolution = None - def screenshot(self, report: bool = True) -> Image.Image: + def screenshot(self, report: bool = True, unscaled: bool = False) -> Image.Image: screenshot = self._agent_os.screenshot(report=report) + if unscaled: + return screenshot return self._scaler.scale_screenshot(screenshot) def mouse_move(self, x: float, y: float, duration: int = 500) -> None: diff --git a/src/askui/tools/store/computer/experimental/__init__.py b/src/askui/tools/store/computer/experimental/__init__.py index 43414e4b..c364ef26 100644 --- a/src/askui/tools/store/computer/experimental/__init__.py +++ b/src/askui/tools/store/computer/experimental/__init__.py @@ -8,6 +8,7 @@ ComputerSetProcessInFocusTool, 
ComputerSetWindowInFocusTool, ) +from .zoom import ComputerZoomTool __all__ = [ "ComputerGetFileNamesTool", @@ -18,4 +19,5 @@ "ComputerAddWindowAsVirtualDisplayTool", "ComputerSetWindowInFocusTool", "ComputerSetProcessInFocusTool", + "ComputerZoomTool", ] diff --git a/src/askui/tools/store/computer/experimental/zoom.py b/src/askui/tools/store/computer/experimental/zoom.py new file mode 100644 index 00000000..20baaf8e --- /dev/null +++ b/src/askui/tools/store/computer/experimental/zoom.py @@ -0,0 +1,99 @@ +from typing import cast + +from PIL import Image + +from askui.models.shared import ComputerBaseTool, ToolTags +from askui.tools.computer_agent_os_facade import ComputerAgentOsFacade + + +class ComputerZoomTool(ComputerBaseTool): + """ + Views a region of the screen at full resolution to inspect small details. + + Screenshots are downscaled before they reach the model, so small UI elements + (icons, tab titles, status-bar text, line numbers, tiny buttons) can become + illegible. This tool crops the requested region from the full-resolution + screenshot and returns it magnified. The returned image is only a magnified + view; coordinates for subsequent actions still use the original screen + coordinate space. + + Example: + ```python + from askui import ComputerAgent + from askui.tools.store.computer.experimental import ComputerZoomTool + + with ComputerAgent(act_tools=[ComputerZoomTool()]) as agent: + agent.act("Enable the tiny checkbox next to 'Advanced options'") + + with ComputerAgent() as agent: + agent.act( + "Enable the tiny checkbox next to 'Advanced options'", + tools=[ComputerZoomTool()], + ) + ``` + """ + + def __init__(self, agent_os: ComputerAgentOsFacade | None = None) -> None: + super().__init__( + name="zoom", + description=( + "View a specific region of the screen at full resolution. 
Use " + "this to read small text or to locate small UI elements (icons, " + "tab titles, status-bar text, line numbers, tiny buttons) that " + "are not legible in a normal screenshot. Provide the region as " + "[x1, y1, x2, y2], the top-left and bottom-right corners in the " + "same coordinates you use for clicking. The returned image is " + "only a magnified view; coordinates for subsequent actions still " + "use the original screen coordinate space." + ), + input_schema={ + "type": "object", + "properties": { + "region": { + "type": "array", + "description": ( + "The region to zoom into as [x1, y1, x2, y2]: the " + "top-left and bottom-right corners in screen " + "coordinates." + ), + "items": {"type": "integer"}, + "minItems": 4, + "maxItems": 4, + }, + }, + "required": ["region"], + }, + agent_os=agent_os, + required_tags=[ToolTags.SCALED_AGENT_OS.value], + ) + self.is_cacheable = True + + def __call__(self, region: list[int]) -> tuple[str, Image.Image]: + if len(region) != 4: # noqa: PLR2004 + error_msg = ( + f"region must contain exactly 4 values [x1, y1, x2, y2], " + f"got {len(region)}" + ) + raise ValueError(error_msg) + + agent_os = cast("ComputerAgentOsFacade", self.agent_os) + screenshot = agent_os.screenshot(unscaled=True) + + x1, y1, x2, y2 = region + left, top = agent_os.scale_point_to_real_screen(x1, y1) + right, bottom = agent_os.scale_point_to_real_screen(x2, y2) + + left, right = sorted((left, right)) + top, bottom = sorted((top, bottom)) + left = max(0, min(left, screenshot.width)) + right = max(left + 1, min(right, screenshot.width)) + top = max(0, min(top, screenshot.height)) + bottom = max(top + 1, min(bottom, screenshot.height)) + + crop = screenshot.crop((left, top, right, bottom)) + message = ( + f"Zoomed into region [{x1}, {y1}, {x2}, {y2}] shown at full " + "resolution. Coordinates for further actions remain in the original " + "screen coordinate space." 
+ ) + return message, crop diff --git a/tests/unit/tools/test_zoom_tool.py b/tests/unit/tools/test_zoom_tool.py new file mode 100644 index 00000000..cc6db9bd --- /dev/null +++ b/tests/unit/tools/test_zoom_tool.py @@ -0,0 +1,78 @@ +"""Tests for `ComputerZoomTool`. + +The zoom tool returns a magnified, full-resolution crop of a region the model +specifies in its (downscaled) coordinate space. The region is mapped back to +real screen pixels before cropping, so a small box in model space becomes a +larger, more legible crop. +""" + +from unittest.mock import MagicMock + +import pytest +from PIL import Image +from pytest_mock import MockerFixture + +from askui.models.shared.coordinate_space import PixelCoordinateSpace +from askui.models.shared.image_scaler import ContainedImageScaler +from askui.tools.computer_agent_os_facade import ComputerAgentOsFacade +from askui.tools.store.computer.experimental import ComputerZoomTool + + +def _make_facade(real_size: tuple[int, int]) -> ComputerAgentOsFacade: + """Create a facade wrapping a mocked agent OS with a known screen size.""" + mock_os = MagicMock() + mock_os.tags = [] + mock_os.screenshot.return_value = Image.new("RGB", real_size) + return ComputerAgentOsFacade( + mock_os, + coordinate_space=PixelCoordinateSpace(), + image_scaler=ContainedImageScaler(), + ) + + +class TestComputerZoomTool: + """A 2048x1536 screen maps 1:2 onto the 1024x768 model space (no padding).""" + + def test_crops_region_mapped_to_real_resolution(self) -> None: + facade = _make_facade((2048, 1536)) + tool = ComputerZoomTool(agent_os=facade) + + message, crop = tool(region=[100, 100, 200, 200]) + + # 100px box in model space -> 200px box at full resolution + assert crop.size == (200, 200) + assert "[100, 100, 200, 200]" in message + + def test_requests_unscaled_screenshot(self, mocker: MockerFixture) -> None: + facade = _make_facade((2048, 1536)) + spy = mocker.spy(facade, "screenshot") + tool = ComputerZoomTool(agent_os=facade) + + tool(region=[0, 
0, 100, 100]) + + assert any(call.kwargs.get("unscaled") is True for call in spy.call_args_list) + + def test_normalizes_unordered_corners(self) -> None: + facade = _make_facade((2048, 1536)) + tool = ComputerZoomTool(agent_os=facade) + + _, crop = tool(region=[200, 200, 100, 100]) + + assert crop.size == (200, 200) + + def test_rejects_region_with_wrong_length(self) -> None: + facade = _make_facade((2048, 1536)) + tool = ComputerZoomTool(agent_os=facade) + + with pytest.raises(ValueError, match="exactly 4 values"): + tool(region=[100, 100, 200]) + + def test_returns_text_and_image(self) -> None: + facade = _make_facade((2048, 1536)) + tool = ComputerZoomTool(agent_os=facade) + + result = tool(region=[10, 20, 110, 120]) + + message, crop = result + assert isinstance(message, str) + assert isinstance(crop, Image.Image) From 6d90f93cd4f6b4cc8f72a3cc3d966fe40fa7f484 Mon Sep 17 00:00:00 2001 From: philipph-askui Date: Wed, 24 Jun 2026 11:03:19 +0200 Subject: [PATCH 2/3] feat: add auto-scaling to max size for cropped screenshots --- src/askui/tools/computer_agent_os_facade.py | 16 ++++++++++++++++ .../tools/store/computer/experimental/zoom.py | 1 + tests/unit/tools/test_zoom_tool.py | 14 +++++++++++++- 3 files changed, 30 insertions(+), 1 deletion(-) diff --git a/src/askui/tools/computer_agent_os_facade.py b/src/askui/tools/computer_agent_os_facade.py index 3a148462..c1090c6c 100644 --- a/src/askui/tools/computer_agent_os_facade.py +++ b/src/askui/tools/computer_agent_os_facade.py @@ -50,6 +50,7 @@ def __init__( image_scaler: ImageScaler, ) -> None: self._agent_os = agent_os + self._image_scaler = image_scaler self._scaler = CoordinateScaler( coordinate_space=coordinate_space, image_scaler=image_scaler, @@ -73,6 +74,21 @@ def screenshot(self, report: bool = True, unscaled: bool = False) -> Image.Image return screenshot return self._scaler.scale_screenshot(screenshot) + def scale_image_for_model(self, image: Image.Image) -> Image.Image: + """Apply the same scaling 
screenshots receive, without recording state. + + Unlike `screenshot`, this does not update the coordinate scaler's + recorded resolutions, so it is safe to call on arbitrary images (e.g. a + cropped region) without corrupting coordinate mapping. + + Args: + image (Image.Image): The image to scale for model consumption. + + Returns: + Image.Image: The scaled image. + """ + return self._image_scaler(image) + def scale_point_to_real_screen(self, x: float, y: float) -> tuple[int, int]: """Map a point from the model coordinate space to real screen pixels. diff --git a/src/askui/tools/store/computer/experimental/zoom.py b/src/askui/tools/store/computer/experimental/zoom.py index 20baaf8e..5ab0de02 100644 --- a/src/askui/tools/store/computer/experimental/zoom.py +++ b/src/askui/tools/store/computer/experimental/zoom.py @@ -91,6 +91,7 @@ def __call__(self, region: list[int]) -> tuple[str, Image.Image]: bottom = max(top + 1, min(bottom, screenshot.height)) crop = screenshot.crop((left, top, right, bottom)) + crop = agent_os.scale_image_for_model(crop) message = ( f"Zoomed into region [{x1}, {y1}, {x2}, {y2}] shown at full " "resolution. Coordinates for further actions remain in the original " diff --git a/tests/unit/tools/test_zoom_tool.py b/tests/unit/tools/test_zoom_tool.py index cc6db9bd..4920489a 100644 --- a/tests/unit/tools/test_zoom_tool.py +++ b/tests/unit/tools/test_zoom_tool.py @@ -39,10 +39,22 @@ def test_crops_region_mapped_to_real_resolution(self) -> None: message, crop = tool(region=[100, 100, 200, 200]) - # 100px box in model space -> 200px box at full resolution + # 100px box in model space -> 200px box at full resolution. + # The model-image scaler only downscales, so a crop within bounds + # passes through unchanged. 
assert crop.size == (200, 200) assert "[100, 100, 200, 200]" in message + def test_oversized_crop_is_scaled_to_model_bounds(self) -> None: + facade = _make_facade((2048, 1536)) + tool = ComputerZoomTool(agent_os=facade) + + # Full model space -> full 2048x1536 real region, larger than the + # 1024x768 scaler bounds, so it is downscaled like a screenshot. + _, crop = tool(region=[0, 0, 1024, 768]) + + assert crop.size == (1024, 768) + def test_requests_unscaled_screenshot(self, mocker: MockerFixture) -> None: facade = _make_facade((2048, 1536)) spy = mocker.spy(facade, "screenshot") From b5927a0466353f12f4b6fdc2ff06c1569303f04d Mon Sep 17 00:00:00 2001 From: philipph-askui Date: Wed, 24 Jun 2026 12:10:12 +0200 Subject: [PATCH 3/3] fix: report zoomed image and screen coordinates from zoom tool --- src/askui/tools/askui/askui_controller.py | 3 +- src/askui/tools/computer_agent_os_facade.py | 11 ++- .../tools/store/computer/experimental/zoom.py | 72 ++++++++++++++----- tests/unit/tools/test_zoom_tool.py | 66 ++++++++++++++++- 4 files changed, 129 insertions(+), 23 deletions(-) diff --git a/src/askui/tools/askui/askui_controller.py b/src/askui/tools/askui/askui_controller.py index 2abd8083..4e2f8c4f 100644 --- a/src/askui/tools/askui/askui_controller.py +++ b/src/askui/tools/askui/askui_controller.py @@ -378,7 +378,8 @@ def screenshot(self, report: bool = True, unscaled: bool = False) -> Image.Image screenResponse.bitmap.data, ).split() image = Image.merge("RGB", (b, g, r)) - self._reporter.add_message("AgentOS", "screenshot()", image) + if report: + self._reporter.add_message("AgentOS", "screenshot()", image) return image @telemetry.record_call() diff --git a/src/askui/tools/computer_agent_os_facade.py b/src/askui/tools/computer_agent_os_facade.py index c1090c6c..57c7efa4 100644 --- a/src/askui/tools/computer_agent_os_facade.py +++ b/src/askui/tools/computer_agent_os_facade.py @@ -89,17 +89,24 @@ def scale_image_for_model(self, image: Image.Image) -> Image.Image: """ 
return self._image_scaler(image) - def scale_point_to_real_screen(self, x: float, y: float) -> tuple[int, int]: + def scale_point_to_real_screen( + self, x: float, y: float, check_coordinates_in_bounds: bool = True + ) -> tuple[int, int]: """Map a point from the model coordinate space to real screen pixels. Args: x (float): The horizontal coordinate in the model coordinate space. y (float): The vertical coordinate in the model coordinate space. + check_coordinates_in_bounds (bool, optional): Whether to raise if the + mapped coordinate falls outside the screen. Set to `False` when the + caller clamps the result itself. Defaults to `True`. Returns: tuple[int, int]: The corresponding `(x, y)` in real screen pixels. """ - return self._scaler.scale_coordinates(x, y) + return self._scaler.scale_coordinates( + x, y, check_coordinates_in_bounds=check_coordinates_in_bounds + ) def _take_silent_screenshot(self) -> Image.Image: return self.screenshot(report=False) diff --git a/src/askui/tools/store/computer/experimental/zoom.py b/src/askui/tools/store/computer/experimental/zoom.py index 5ab0de02..cae772ab 100644 --- a/src/askui/tools/store/computer/experimental/zoom.py +++ b/src/askui/tools/store/computer/experimental/zoom.py @@ -3,6 +3,7 @@ from PIL import Image from askui.models.shared import ComputerBaseTool, ToolTags +from askui.reporting import NULL_REPORTER, Reporter from askui.tools.computer_agent_os_facade import ComputerAgentOsFacade @@ -17,6 +18,13 @@ class ComputerZoomTool(ComputerBaseTool): view; coordinates for subsequent actions still use the original screen coordinate space. + Args: + agent_os (`ComputerAgentOsFacade`, optional): The agent OS facade. Injected + automatically when the tool is registered with an agent. + reporter (`Reporter`, optional): Reporter used to show the cropped image + (the exact image handed to the model) in the report. Defaults to a + null reporter that discards messages. 
+ Example: ```python from askui import ComputerAgent @@ -33,18 +41,32 @@ class ComputerZoomTool(ComputerBaseTool): ``` """ - def __init__(self, agent_os: ComputerAgentOsFacade | None = None) -> None: + def __init__( + self, + agent_os: ComputerAgentOsFacade | None = None, + reporter: Reporter = NULL_REPORTER, + ) -> None: super().__init__( name="zoom", description=( - "View a specific region of the screen at full resolution. Use " - "this to read small text or to locate small UI elements (icons, " - "tab titles, status-bar text, line numbers, tiny buttons) that " - "are not legible in a normal screenshot. Provide the region as " - "[x1, y1, x2, y2], the top-left and bottom-right corners in the " - "same coordinates you use for clicking. The returned image is " - "only a magnified view; coordinates for subsequent actions still " - "use the original screen coordinate space." + "View a specific region of the screen at full resolution. This " + "is a last resort for reading content that is genuinely too small " + "to make out in the normal screenshot (e.g. tiny text, icons, " + "status-bar text, line numbers) when that detail is required to " + "decide your next action.\n" + "Use it sparingly. Before zooming, rely on the normal screenshot " + "you already have. Do NOT use this tool when:\n" + "- the relevant text or element is already legible in the normal " + "screenshot;\n" + "- you only need to locate or click an element (the normal " + "screenshot coordinates are sufficient for that);\n" + "- you have already zoomed into this region — do not re-zoom the " + "same area.\n" + "Provide the region as [x1, y1, x2, y2], the top-left and " + "bottom-right corners in the same coordinates you use for " + "clicking. The returned image is only a magnified view; " + "coordinates for subsequent actions still use the original screen " + "coordinate space." 
), input_schema={ "type": "object", @@ -56,7 +78,7 @@ def __init__(self, agent_os: ComputerAgentOsFacade | None = None) -> None: "top-left and bottom-right corners in screen " "coordinates." ), - "items": {"type": "integer"}, + "items": {"type": "number"}, "minItems": 4, "maxItems": 4, }, @@ -67,8 +89,9 @@ def __init__(self, agent_os: ComputerAgentOsFacade | None = None) -> None: required_tags=[ToolTags.SCALED_AGENT_OS.value], ) self.is_cacheable = True + self._reporter = reporter - def __call__(self, region: list[int]) -> tuple[str, Image.Image]: + def __call__(self, region: list[float]) -> tuple[str, Image.Image]: if len(region) != 4: # noqa: PLR2004 error_msg = ( f"region must contain exactly 4 values [x1, y1, x2, y2], " @@ -77,24 +100,39 @@ def __call__(self, region: list[int]) -> tuple[str, Image.Image]: raise ValueError(error_msg) agent_os = cast("ComputerAgentOsFacade", self.agent_os) - screenshot = agent_os.screenshot(unscaled=True) + # Suppress reporting of the uncropped screenshot; we report the crop below. + screenshot = agent_os.screenshot(unscaled=True, report=False) + # Map the model-space corners to real screen pixels. Skip the mapper's + # bounds check; we clamp to the screenshot below so a slightly oversized + # region from the model crops to the edge instead of erroring. 
x1, y1, x2, y2 = region - left, top = agent_os.scale_point_to_real_screen(x1, y1) - right, bottom = agent_os.scale_point_to_real_screen(x2, y2) + left, top = agent_os.scale_point_to_real_screen( + x1, y1, check_coordinates_in_bounds=False + ) + right, bottom = agent_os.scale_point_to_real_screen( + x2, y2, check_coordinates_in_bounds=False + ) left, right = sorted((left, right)) top, bottom = sorted((top, bottom)) - left = max(0, min(left, screenshot.width)) + left = max(0, min(left, screenshot.width - 1)) right = max(left + 1, min(right, screenshot.width)) - top = max(0, min(top, screenshot.height)) + top = max(0, min(top, screenshot.height - 1)) bottom = max(top + 1, min(bottom, screenshot.height)) crop = screenshot.crop((left, top, right, bottom)) crop = agent_os.scale_image_for_model(crop) + # Report the region in real screen pixels (where the crop was actually + # taken), not the raw coordinates the model passed. + self._reporter.add_message( + "AgentOS", f"zoom([{left}, {top}, {right}, {bottom}])", crop + ) message = ( f"Zoomed into region [{x1}, {y1}, {x2}, {y2}] shown at full " "resolution. Coordinates for further actions remain in the original " - "screen coordinate space." + "screen coordinate space. Now proceed with the next action (e.g. " + "move/click) using those coordinates; do not zoom again unless a " + "different region is still too small to read." ) return message, crop diff --git a/tests/unit/tools/test_zoom_tool.py b/tests/unit/tools/test_zoom_tool.py index 4920489a..5dd88972 100644 --- a/tests/unit/tools/test_zoom_tool.py +++ b/tests/unit/tools/test_zoom_tool.py @@ -6,26 +6,34 @@ larger, more legible crop. 
""" +from typing import cast from unittest.mock import MagicMock import pytest from PIL import Image from pytest_mock import MockerFixture -from askui.models.shared.coordinate_space import PixelCoordinateSpace +from askui.models.shared.coordinate_space import ( + NormalizedCoordinateSpace, + PixelCoordinateSpace, + VlmCoordinateSpace, +) from askui.models.shared.image_scaler import ContainedImageScaler from askui.tools.computer_agent_os_facade import ComputerAgentOsFacade from askui.tools.store.computer.experimental import ComputerZoomTool -def _make_facade(real_size: tuple[int, int]) -> ComputerAgentOsFacade: +def _make_facade( + real_size: tuple[int, int], + coordinate_space: VlmCoordinateSpace | None = None, +) -> ComputerAgentOsFacade: """Create a facade wrapping a mocked agent OS with a known screen size.""" mock_os = MagicMock() mock_os.tags = [] mock_os.screenshot.return_value = Image.new("RGB", real_size) return ComputerAgentOsFacade( mock_os, - coordinate_space=PixelCoordinateSpace(), + coordinate_space=coordinate_space or PixelCoordinateSpace(), image_scaler=ContainedImageScaler(), ) @@ -79,6 +87,58 @@ def test_rejects_region_with_wrong_length(self) -> None: with pytest.raises(ValueError, match="exactly 4 values"): tool(region=[100, 100, 200]) + def test_reports_the_cropped_image_not_the_full_screenshot(self) -> None: + reporter = MagicMock() + facade = _make_facade((2048, 1536)) + tool = ComputerZoomTool(agent_os=facade, reporter=reporter) + + _, crop = tool(region=[100, 100, 200, 200]) + + # The underlying screenshot is never fetched with reporting enabled, so + # the uncropped image is not shown in the report. + screenshot_mock = cast("MagicMock", facade._agent_os).screenshot + assert screenshot_mock.call_count >= 1 + assert all( + call.kwargs.get("report") is False + for call in screenshot_mock.call_args_list + ) + # Exactly the cropped image handed to the model is reported. 
+ reporter.add_message.assert_called_once() + reported_image = reporter.add_message.call_args.args[2] + assert reported_image is crop + + def test_reports_scaled_back_coordinates_not_model_coordinates(self) -> None: + reporter = MagicMock() + facade = _make_facade((2048, 1536)) + tool = ComputerZoomTool(agent_os=facade, reporter=reporter) + + tool(region=[100, 100, 200, 200]) + + # Model coords [100, 100, 200, 200] map 1:2 onto the real screen. + reported_message = reporter.add_message.call_args.args[1] + assert "200, 200, 400, 400" in reported_message + assert "100, 100, 200, 200" not in reported_message + + def test_out_of_bounds_region_is_clamped_not_rejected(self) -> None: + facade = _make_facade((2048, 1536)) + tool = ComputerZoomTool(agent_os=facade) + + # Region extends well past the model bounds; it must clamp to the screen + # edge and crop the whole screen instead of raising. + _, crop = tool(region=[0, 0, 5000, 5000]) + + assert crop.size == (1024, 768) + + def test_accepts_normalized_float_region(self) -> None: + facade = _make_facade((1920, 1080), NormalizedCoordinateSpace()) + tool = ComputerZoomTool(agent_os=facade) + + # Kimi-style 0.0-1.0 coordinates: 0.4..0.6 spans 20% of each axis. + _, crop = tool(region=[0.4, 0.4, 0.6, 0.6]) + + # 0.2 * 1920 = 384 wide, 0.2 * 1080 = 216 tall. + assert crop.size == (384, 216) + def test_returns_text_and_image(self) -> None: facade = _make_facade((2048, 1536)) tool = ComputerZoomTool(agent_os=facade)