Skip to content

Commit 8c2d469

Browse files
Add tests
1 parent a22eaed commit 8c2d469

1 file changed

Lines changed: 285 additions & 0 deletions

File tree

tests/benchmarks/test_osworld.py

Lines changed: 285 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,285 @@
1+
2+
import tempfile
3+
from pathlib import Path
4+
from unittest.mock import patch
5+
6+
import pytest
7+
8+
from agentlab.benchmarks.osworld import (
9+
OSWorldActionSet,
10+
OsworldEnvArgs,
11+
OsworldGym,
12+
)
13+
14+
15+
16+
def mock_task_config() -> dict:
    """Build a sample OSWorld task definition for the tests in this module.

    The dict is assembled from scratch on every call, so each caller gets
    its own independent copy.
    """
    # Setup step: start Chrome with a remote-debugging port.
    launch_step = {
        "type": "launch",
        "parameters": {"command": ["google-chrome", "--remote-debugging-port=1337"]},
    }
    # Success criterion: the default search engine must be one of the Bing names.
    evaluator_spec = {
        "func": "match_in_list",
        "result": {"type": "default_search_engine"},
        "expected": {"type": "rule", "rules": {"expected": ["Microsoft Bing", "Bing"]}},
    }
    task_config = {
        "id": "bb5e4c0d-f964-439c-97b6-bdb9747de3f4",
        "snapshot": "chrome",
        "instruction": "Can you make Bing the main search thingy when I look stuff up on the internet?",
        "source": "https://support.google.com/chrome/answer/95426",
        "config": [launch_step],
        "trajectory": "trajectories/",
        "related_apps": ["chrome"],
        "evaluator": evaluator_spec,
        "proxy": False,
    }
    return task_config
38+
39+
40+
class TestOSWorldActionSet:
    """Checks for the OSWorld action set and its tool-description export."""

    def test_action_set_creation(self):
        """The action space given to the constructor is readable on the instance."""
        created = OSWorldActionSet(action_space="computer_13")
        assert created.action_space == "computer_13"

    def test_to_tool_description_openai(self):
        """The OpenAI export is a non-empty list whose first entry is a function tool."""
        descriptions = OSWorldActionSet(action_space="computer_13").to_tool_description(
            api="openai"
        )

        assert isinstance(descriptions, list)
        assert len(descriptions) > 0

        # Only the first tool is inspected for the expected keys.
        first = descriptions[0]
        for required_key in ("type", "name", "description", "parameters"):
            assert required_key in first
        assert first["type"] == "function"

    def test_to_tool_description_anthropic(self):
        """The Anthropic export is a non-empty list whose first entry has an input schema."""
        descriptions = OSWorldActionSet(action_space="computer_13").to_tool_description(
            api="anthropic"
        )

        assert isinstance(descriptions, list)
        assert len(descriptions) > 0

        # Only the first tool is inspected; the Anthropic format has no "type" field,
        # so none is checked here.
        first = descriptions[0]
        for required_key in ("name", "description", "input_schema"):
            assert required_key in first

    def test_unsupported_action_space(self):
        """Exporting tools for an action space other than computer_13 raises ValueError."""
        unsupported = OSWorldActionSet(action_space="pyautogui")
        expected_message = "Only 'computer_13' action space is currently supported"
        with pytest.raises(ValueError, match=expected_message):
            unsupported.to_tool_description()
86+
87+
88+
class TestOsworldEnvArgs:
    """Checks for the OSWorld environment-argument container."""

    def test_env_args_creation(self):
        """Explicit fields are stored and the remaining ones fall back to defaults."""
        task_cfg = mock_task_config()
        args = OsworldEnvArgs(task=task_cfg, task_name="test_task", max_steps=10)

        # Values passed in explicitly.
        assert args.task == task_cfg
        assert args.task_name == "test_task"
        assert args.max_steps == 10
        # Values left at their defaults.
        assert args.action_space == "computer_13"
        assert args.provider_name == "docker"

    def test_env_args_custom_config(self):
        """Every overridden option is reflected on the resulting object."""
        overrides = {
            "action_space": "computer_13",
            "provider_name": "vmware",
            "headless": True,
            "screen_size": (1280, 720),
            "max_steps": 25,
        }
        args = OsworldEnvArgs(task=mock_task_config(), task_name="custom_task", **overrides)

        assert args.action_space == "computer_13"
        assert args.provider_name == "vmware"
        assert args.headless is True
        assert args.screen_size == (1280, 720)
        assert args.max_steps == 25

    @patch("agentlab.benchmarks.osworld.OsworldGym")
    def test_make_env(self, mock_gym_class):
        """make_env constructs the (patched) gym once, forwarding task and exp_dir."""
        task_cfg = mock_task_config()
        args = OsworldEnvArgs(task=task_cfg, task_name="test_task")

        with tempfile.TemporaryDirectory() as scratch:
            exp_dir = Path(scratch)
            args.make_env(exp_dir)

            # The gym class is replaced by a mock, so inspect how it was invoked.
            mock_gym_class.assert_called_once()
            passed_kwargs = mock_gym_class.call_args.kwargs
            assert passed_kwargs["task"] == task_cfg
            assert passed_kwargs["exp_dir"] == exp_dir
136+
137+
138+
class TestOsworldGym:
    """Test cases for OSWorld gym functionality."""

    @staticmethod
    def _make_gym(task, exp_dir):
        """Construct an OsworldGym with the fixed settings shared by the tests below.

        Both tests that use this helper patch
        ``agentlab.benchmarks.osworld.DesktopEnv`` before calling it
        (presumably so that no real desktop environment is started).

        Args:
            task: Task configuration dict handed to the gym.
            exp_dir: Experiment directory handed to the gym.

        Returns:
            The newly constructed gym instance.
        """
        return OsworldGym(
            task=task,
            provider_name="docker",
            region=None,
            path_to_vm=None,
            snapshot_name="init_state",
            action_space="computer_13",
            cache_dir="cache",
            screen_size=(1920, 1080),
            headless=True,
            require_a11y_tree=True,
            require_terminal=False,
            os_type="Ubuntu",
            enable_proxy=False,
            max_steps=50,
            exp_dir=exp_dir,
        )

    # Each case is its own parametrized test so that one failing input does not
    # hide the results of the remaining ones.
    @pytest.mark.parametrize(
        "action_str, expected",
        [
            # Basic actions
            ("wait()", ("wait", [], {})),
            ("done()", ("done", [], {})),
            ("move_to(x=100, y=200)", ("move_to", [], {"x": 100, "y": 200})),
            ('typing(text="hello world")', ("typing", [], {"text": "hello world"})),
            ("hotkey(keys=['ctrl', 'c'])", ("hotkey", [], {"keys": ["ctrl", "c"]})),
            # Edge cases with strings
            ('typing(text="")', ("typing", [], {"text": ""})),  # Empty string
            ('typing(text="line1\\nline2")', ("typing", [], {"text": "line1\nline2"})),  # Newlines
            ('typing(text="tab\\there")', ("typing", [], {"text": "tab\there"})),  # Tabs
            (
                'typing(text="quote\\"test")',
                ("typing", [], {"text": 'quote"test'}),
            ),  # Escaped quotes
            (
                'typing(text="single\'quote")',
                ("typing", [], {"text": "single'quote"}),
            ),  # Single quotes
            ('typing(text="unicode: café")', ("typing", [], {"text": "unicode: café"})),  # Unicode
            # Edge cases with coordinates
            ("move_to(x=0, y=0)", ("move_to", [], {"x": 0, "y": 0})),  # Zero coordinates
            (
                "move_to(x=-10, y=-20)",
                ("move_to", [], {"x": -10, "y": -20}),
            ),  # Negative coordinates
            (
                "move_to(x=9999, y=9999)",
                ("move_to", [], {"x": 9999, "y": 9999}),
            ),  # Large coordinates
            # Edge cases with lists
            ("hotkey(keys=[])", ("hotkey", [], {"keys": []})),  # Empty list
            ("hotkey(keys=['ctrl'])", ("hotkey", [], {"keys": ["ctrl"]})),  # Single key
            (
                "hotkey(keys=['ctrl', 'shift', 'alt', 'a'])",
                ("hotkey", [], {"keys": ["ctrl", "shift", "alt", "a"]}),
            ),  # Multiple keys
            # Actions combining string and integer keyword arguments
            ("scroll(direction='up', clicks=3)", ("scroll", [], {"direction": "up", "clicks": 3})),
            (
                "click(x=100, y=200, button='left')",
                ("click", [], {"x": 100, "y": 200, "button": "left"}),
            ),
            # Edge cases with mixed parameter types (string, int, bool, list)
            (
                "complex_action(text='test', x=50, enabled=True, items=['a', 'b'])",
                (
                    "complex_action",
                    [],
                    {"text": "test", "x": 50, "enabled": True, "items": ["a", "b"]},
                ),
            ),
            # Edge cases with whitespace
            (" wait() ", ("wait", [], {})),  # Leading/trailing spaces
            (
                "move_to( x=100 , y=200 )",
                ("move_to", [], {"x": 100, "y": 200}),
            ),  # Spaces around params
            # Edge cases with special characters in strings
            (
                'typing(text="@#$%^&*()+={}[]|\\:;\'<>?,./")',
                ("typing", [], {"text": "@#$%^&*()+={}[]|\\:;'<>?,./"}),
            ),
        ],
    )
    def test_gym_action_parsing(self, action_str, expected):
        """Test that an action string parses to the expected (name, args, kwargs) tuple.

        Args:
            action_str: Action call string handed to the parser.
            expected: Tuple of function name, positional args, and keyword args
                the parser should return for ``action_str``.
        """
        result = OsworldGym.parse_agentlab_action_str_to_func_args(action_str)
        assert result == expected, f"Failed parsing: {action_str}"

    @patch("agentlab.benchmarks.osworld.DesktopEnv")
    def test_gym_creation(self, mock_desktop_env):
        """Test OSWorld gym creation with DesktopEnv patched out."""
        task = mock_task_config()

        with tempfile.TemporaryDirectory() as tmp_dir:
            exp_dir = Path(tmp_dir)
            gym = self._make_gym(task, exp_dir)

            assert gym.task == task
            assert gym._step_count == 0
            assert gym.max_steps == 50
            assert gym.exp_dir == exp_dir

    def test_convert_agentlab_action_to_computer_13(self):
        """Test action conversion from AgentLab to Computer 13 format."""
        task = mock_task_config()

        with tempfile.TemporaryDirectory() as tmp_dir, patch(
            "agentlab.benchmarks.osworld.DesktopEnv"
        ):
            gym = self._make_gym(task, Path(tmp_dir))

            # Test simple action: converted to a bare string
            result = gym.convert_agentlab_action_to_computer_13("wait()")
            assert result == "WAIT"

            # Test action with parameters: converted to an action dict
            result = gym.convert_agentlab_action_to_computer_13("move_to(x=100, y=200)")
            expected = {"action_type": "MOVE_TO", "parameters": {"x": 100, "y": 200}}
            assert result == expected

            # Test typing action
            result = gym.convert_agentlab_action_to_computer_13('typing(text="hello")')
            expected = {"action_type": "TYPING", "parameters": {"text": "hello"}}
            assert result == expected
285+

0 commit comments

Comments
 (0)