Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion src/askui/tools/agent_os.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,13 +263,18 @@ def disconnect(self) -> None:
"""

@abstractmethod
def screenshot(self, report: bool = True) -> Image.Image:
def screenshot(self, report: bool = True, unscaled: bool = False) -> Image.Image:
"""
Captures a screenshot of the current display.

Args:
report (bool, optional): Whether to include the screenshot in
reporting. Defaults to `True`.
unscaled (bool, optional): Whether to return the screenshot at its
full, real-screen resolution instead of the resolution shown to
the model. Only has an effect on scaling implementations (e.g.
`ComputerAgentOsFacade`); implementations that already return the
native resolution ignore it. Defaults to `False`.

Returns:
Image.Image: A PIL Image object containing the screenshot.
Expand Down
8 changes: 6 additions & 2 deletions src/askui/tools/askui/askui_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,13 +349,16 @@ def _stop_execution(self) -> None:

@telemetry.record_call()
@override
def screenshot(self, report: bool = True) -> Image.Image:
def screenshot(self, report: bool = True, unscaled: bool = False) -> Image.Image:
"""
Take a screenshot of the current screen.

Args:
report (bool, optional): Whether to include the screenshot in reporting.
Defaults to `True`.
unscaled (bool, optional): Accepted for interface compatibility. This
client always returns the native screen resolution, so it has no
effect. Defaults to `False`.

Returns:
Image.Image: A PIL Image object containing the screenshot.
Expand All @@ -375,7 +378,8 @@ def screenshot(self, report: bool = True) -> Image.Image:
screenResponse.bitmap.data,
).split()
image = Image.merge("RGB", (b, g, r))
self._reporter.add_message("AgentOS", "screenshot()", image)
if report:
self._reporter.add_message("AgentOS", "screenshot()", image)
return image

@telemetry.record_call()
Expand Down
40 changes: 39 additions & 1 deletion src/askui/tools/computer_agent_os_facade.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ def __init__(
image_scaler: ImageScaler,
) -> None:
self._agent_os = agent_os
self._image_scaler = image_scaler
self._scaler = CoordinateScaler(
coordinate_space=coordinate_space,
image_scaler=image_scaler,
Expand All @@ -66,10 +67,47 @@ def disconnect(self) -> None:
self._agent_os.disconnect()
self._scaler.real_screen_resolution = None

def screenshot(self, report: bool = True) -> Image.Image:
def screenshot(self, report: bool = True, unscaled: bool = False) -> Image.Image:
    """Capture a screenshot through the wrapped agent OS.

    Args:
        report (bool, optional): Forwarded to the wrapped agent OS
            `screenshot` call. Defaults to `True`.
        unscaled (bool, optional): When `True`, the captured image is
            returned as-is and its size is stored as the coordinate
            scaler's real screen resolution. When `False`, the captured
            image is passed through the coordinate scaler's
            `scale_screenshot`. Defaults to `False`.

    Returns:
        Image.Image: The captured image, either untouched or scaled.
    """
    captured = self._agent_os.screenshot(report=report)
    if not unscaled:
        return self._scaler.scale_screenshot(captured)
    # Keep the scaler informed about the real screen size even though the
    # image itself bypasses scaling.
    self._scaler.real_screen_resolution = captured.size
    return captured

def scale_image_for_model(self, image: Image.Image) -> Image.Image:
    """Run an arbitrary image through the facade's image scaler.

    The image is handed directly to the image scaler this facade was
    constructed with; the coordinate scaler is not involved, so nothing it
    has recorded is touched. That makes this suitable for images that are
    not full screenshots (e.g. a cropped region).

    Args:
        image (Image.Image): The image to prepare for model consumption.

    Returns:
        Image.Image: Whatever the image scaler produces for `image`.
    """
    scaled_image = self._image_scaler(image)
    return scaled_image

def scale_point_to_real_screen(
    self, x: float, y: float, check_coordinates_in_bounds: bool = True
) -> tuple[int, int]:
    """Translate a model-space point into real screen pixels.

    Delegates to the coordinate scaler's `scale_coordinates`.

    Args:
        x (float): Horizontal position in the model coordinate space.
        y (float): Vertical position in the model coordinate space.
        check_coordinates_in_bounds (bool, optional): Forwarded to the
            coordinate scaler. Pass `False` when the caller clamps the
            result itself. Defaults to `True`.

    Returns:
        tuple[int, int]: The `(x, y)` position in real screen pixels.
    """
    real_point = self._scaler.scale_coordinates(
        x,
        y,
        check_coordinates_in_bounds=check_coordinates_in_bounds,
    )
    return real_point

def _take_silent_screenshot(self) -> Image.Image:
return self.screenshot(report=False)

Expand Down
5 changes: 4 additions & 1 deletion src/askui/tools/playwright/agent_os.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,12 +197,15 @@ def disconnect(self) -> None:
)

@override
def screenshot(self, report: bool = True) -> Image.Image:
def screenshot(self, report: bool = True, unscaled: bool = False) -> Image.Image:
"""Capture a screenshot of the current page.

Args:
report (bool, optional): Whether to include the screenshot in
reporting. Defaults to `True`.
unscaled (bool, optional): Accepted for interface compatibility. This
agent OS always returns the native page resolution, so it has no
effect. Defaults to `False`.

Returns:
Image.Image: A PIL Image object containing the screenshot.
Expand Down
4 changes: 3 additions & 1 deletion src/askui/tools/playwright/agent_os_facade.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,10 @@ def disconnect(self) -> None:
self._agent_os.disconnect()
self._scaler.real_screen_resolution = None

def screenshot(self, report: bool = True) -> Image.Image:
def screenshot(self, report: bool = True, unscaled: bool = False) -> Image.Image:
    """Capture a screenshot through the wrapped agent OS.

    Args:
        report (bool, optional): Forwarded to the wrapped agent OS
            `screenshot` call. Defaults to `True`.
        unscaled (bool, optional): When `True`, the captured image is
            returned without scaling. When `False`, it is passed through
            the coordinate scaler's `scale_screenshot`. Defaults to `False`.

    Returns:
        Image.Image: The captured image, either untouched or scaled.
    """
    screenshot = self._agent_os.screenshot(report=report)
    if unscaled:
        # Record the real resolution on the unscaled path too, mirroring
        # `ComputerAgentOsFacade.screenshot`, so the scaler's view of the
        # screen size does not depend on which kind of screenshot was
        # requested.
        self._scaler.real_screen_resolution = screenshot.size
        return screenshot
    return self._scaler.scale_screenshot(screenshot)

def mouse_move(self, x: float, y: float, duration: int = 500) -> None:
Expand Down
2 changes: 2 additions & 0 deletions src/askui/tools/store/computer/experimental/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
ComputerSetProcessInFocusTool,
ComputerSetWindowInFocusTool,
)
from .zoom import ComputerZoomTool

__all__ = [
"ComputerGetFileNamesTool",
Expand All @@ -18,4 +19,5 @@
"ComputerAddWindowAsVirtualDisplayTool",
"ComputerSetWindowInFocusTool",
"ComputerSetProcessInFocusTool",
"ComputerZoomTool",
]
138 changes: 138 additions & 0 deletions src/askui/tools/store/computer/experimental/zoom.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
from typing import cast

from PIL import Image

from askui.models.shared import ComputerBaseTool, ToolTags
from askui.reporting import NULL_REPORTER, Reporter
from askui.tools.computer_agent_os_facade import ComputerAgentOsFacade


class ComputerZoomTool(ComputerBaseTool):
    """
    Lets the model inspect a rectangular part of the screen in detail.

    The model normally works from downscaled screenshots, in which small UI
    elements (icons, tab titles, status-bar text, line numbers, tiny buttons)
    may be unreadable. This tool takes an unscaled screenshot, cuts out the
    requested rectangle and hands that crop back to the model. The crop is
    purely a visual aid: follow-up actions keep using the original screen
    coordinate space.

    Args:
        agent_os (`ComputerAgentOsFacade`, optional): The agent OS facade.
            Injected automatically when the tool is registered with an agent.
        reporter (`Reporter`, optional): Receives the cropped image (the exact
            image handed to the model) for the report. Defaults to a null
            reporter that discards messages.

    Example:
        ```python
        from askui import ComputerAgent
        from askui.tools.store.computer.experimental import ComputerZoomTool

        with ComputerAgent(act_tools=[ComputerZoomTool()]) as agent:
            agent.act("Enable the tiny checkbox next to 'Advanced options'")

        with ComputerAgent() as agent:
            agent.act(
                "Enable the tiny checkbox next to 'Advanced options'",
                tools=[ComputerZoomTool()],
            )
        ```
    """

    def __init__(
        self,
        agent_os: ComputerAgentOsFacade | None = None,
        reporter: Reporter = NULL_REPORTER,
    ) -> None:
        # Prompt text shown to the model; it deliberately discourages
        # unnecessary or repeated zooming.
        tool_description = (
            "View a specific region of the screen at full resolution. This "
            "is a last resort for reading content that is genuinely too small "
            "to make out in the normal screenshot (e.g. tiny text, icons, "
            "status-bar text, line numbers) when that detail is required to "
            "decide your next action.\n"
            "Use it sparingly. Before zooming, rely on the normal screenshot "
            "you already have. Do NOT use this tool when:\n"
            "- the relevant text or element is already legible in the normal "
            "screenshot;\n"
            "- you only need to locate or click an element (the normal "
            "screenshot coordinates are sufficient for that);\n"
            "- you have already zoomed into this region — do not re-zoom the "
            "same area.\n"
            "Provide the region as [x1, y1, x2, y2], the top-left and "
            "bottom-right corners in the same coordinates you use for "
            "clicking. The returned image is only a magnified view; "
            "coordinates for subsequent actions still use the original screen "
            "coordinate space."
        )
        region_property = {
            "type": "array",
            "description": (
                "The region to zoom into as [x1, y1, x2, y2]: the "
                "top-left and bottom-right corners in screen "
                "coordinates."
            ),
            "items": {"type": "number"},
            "minItems": 4,
            "maxItems": 4,
        }
        super().__init__(
            name="zoom",
            description=tool_description,
            input_schema={
                "type": "object",
                "properties": {"region": region_property},
                "required": ["region"],
            },
            agent_os=agent_os,
            required_tags=[ToolTags.SCALED_AGENT_OS.value],
        )
        self.is_cacheable = True
        self._reporter = reporter

    @staticmethod
    def _clamp_span(start: int, end: int, limit: int) -> tuple[int, int]:
        """Order two pixel positions and clamp them to a non-empty span.

        Args:
            start (int): One edge of the span, in pixels.
            end (int): The other edge of the span, in pixels.
            limit (int): Size of the image along this axis.

        Returns:
            tuple[int, int]: `(low, high)` with `0 <= low <= limit - 1` and
            `high` at least `low + 1`.
        """
        low, high = (start, end) if start <= end else (end, start)
        low = max(0, min(low, limit - 1))
        high = max(low + 1, min(high, limit))
        return low, high

    def __call__(self, region: list[float]) -> tuple[str, Image.Image]:
        """Crop the requested region from an unscaled screenshot.

        Args:
            region (list[float]): `[x1, y1, x2, y2]`, two opposite corners of
                the region in the model coordinate space.

        Returns:
            tuple[str, Image.Image]: A guidance message for the model and the
            cropped image after model scaling.

        Raises:
            ValueError: If `region` does not contain exactly four values.
        """
        value_count = len(region)
        if value_count != 4:  # noqa: PLR2004
            error_msg = (
                f"region must contain exactly 4 values [x1, y1, x2, y2], "
                f"got {value_count}"
            )
            raise ValueError(error_msg)

        facade = cast("ComputerAgentOsFacade", self.agent_os)
        # The full frame is not reported; only the crop is reported further
        # down.
        full_frame = facade.screenshot(unscaled=True, report=False)

        # Convert both model-space corners to real screen pixels. The
        # mapper's bounds check is disabled because the span is clamped to
        # the screenshot afterwards, so a slightly oversized region crops to
        # the edge instead of raising.
        x1, y1, x2, y2 = region
        first_x, first_y = facade.scale_point_to_real_screen(
            x1, y1, check_coordinates_in_bounds=False
        )
        second_x, second_y = facade.scale_point_to_real_screen(
            x2, y2, check_coordinates_in_bounds=False
        )
        left, right = self._clamp_span(first_x, second_x, full_frame.width)
        top, bottom = self._clamp_span(first_y, second_y, full_frame.height)

        cropped = full_frame.crop((left, top, right, bottom))
        cropped = facade.scale_image_for_model(cropped)

        # The report shows the real-pixel box the crop was taken from rather
        # than the raw coordinates supplied by the model.
        self._reporter.add_message(
            "AgentOS", f"zoom([{left}, {top}, {right}, {bottom}])", cropped
        )
        message = (
            f"Zoomed into region [{x1}, {y1}, {x2}, {y2}] shown at full "
            "resolution. Coordinates for further actions remain in the original "
            "screen coordinate space. Now proceed with the next action (e.g. "
            "move/click) using those coordinates; do not zoom again unless a "
            "different region is still too small to read."
        )
        return message, cropped
Loading
Loading