|
| 1 | + |
| 2 | +import tempfile |
| 3 | +from pathlib import Path |
| 4 | +from unittest.mock import patch |
| 5 | + |
| 6 | +import pytest |
| 7 | + |
| 8 | +from agentlab.benchmarks.osworld import ( |
| 9 | + OSWorldActionSet, |
| 10 | + OsworldEnvArgs, |
| 11 | + OsworldGym, |
| 12 | +) |
| 13 | + |
| 14 | + |
| 15 | + |
def mock_task_config() -> dict:
    """Build a sample OSWorld task dictionary for use by the tests.

    Every call constructs a brand-new dictionary (including its nested
    lists and dicts), so a caller may mutate the result without affecting
    other tests.

    Returns:
        The task description: identifier, snapshot name, instruction text,
        source URL, setup steps, trajectory path, related apps, evaluator
        specification and proxy flag.
    """
    # Single setup step: start Chrome with remote debugging enabled.
    launch_chrome = {
        "type": "launch",
        "parameters": {"command": ["google-chrome", "--remote-debugging-port=1337"]},
    }

    # Evaluator specification: function name, result descriptor and the
    # accepted search-engine names.
    evaluator = {
        "func": "match_in_list",
        "result": {"type": "default_search_engine"},
        "expected": {"type": "rule", "rules": {"expected": ["Microsoft Bing", "Bing"]}},
    }

    task = {
        "id": "bb5e4c0d-f964-439c-97b6-bdb9747de3f4",
        "snapshot": "chrome",
        "instruction": "Can you make Bing the main search thingy when I look stuff up on the internet?",
        "source": "https://support.google.com/chrome/answer/95426",
        "config": [launch_chrome],
        "trajectory": "trajectories/",
        "related_apps": ["chrome"],
        "evaluator": evaluator,
        "proxy": False,
    }
    return task
| 38 | + |
| 39 | + |
class TestOSWorldActionSet:
    """Checks for OSWorldActionSet construction and tool-description export."""

    def test_action_set_creation(self):
        """The action space given to the constructor is exposed as an attribute."""
        created = OSWorldActionSet(action_space="computer_13")
        assert created.action_space == "computer_13"

    def test_to_tool_description_openai(self):
        """The OpenAI export is a non-empty list of function-typed tool entries."""
        descriptions = OSWorldActionSet(action_space="computer_13").to_tool_description(
            api="openai"
        )

        assert isinstance(descriptions, list)
        assert len(descriptions) > 0

        # Only the first entry is inspected for the expected structure.
        first = descriptions[0]
        for field in ("type", "name", "description", "parameters"):
            assert field in first
        assert first["type"] == "function"

    def test_to_tool_description_anthropic(self):
        """The Anthropic export is a non-empty list of entries with an input schema."""
        descriptions = OSWorldActionSet(action_space="computer_13").to_tool_description(
            api="anthropic"
        )

        assert isinstance(descriptions, list)
        assert len(descriptions) > 0

        # Only the first entry is inspected; no "type" field is expected in
        # this format, so none is checked for.
        first = descriptions[0]
        for field in ("name", "description", "input_schema"):
            assert field in first

    def test_unsupported_action_space(self):
        """Exporting tools for an unsupported action space raises ValueError."""
        expected_message = "Only 'computer_13' action space is currently supported"
        # Construction is outside the raises block: the error must come from
        # the export call, not from the constructor.
        unsupported = OSWorldActionSet(action_space="pyautogui")
        with pytest.raises(ValueError, match=expected_message):
            unsupported.to_tool_description()
| 86 | + |
| 87 | + |
class TestOsworldEnvArgs:
    """Checks for the OSWorld environment-argument container."""

    def test_env_args_creation(self):
        """Explicit fields are stored and omitted ones take the expected defaults."""
        sample_task = mock_task_config()
        args = OsworldEnvArgs(task=sample_task, task_name="test_task", max_steps=10)

        # Values passed explicitly.
        assert args.task == sample_task
        assert args.task_name == "test_task"
        assert args.max_steps == 10
        # Values left unspecified above.
        assert args.action_space == "computer_13"
        assert args.provider_name == "docker"

    def test_env_args_custom_config(self):
        """Non-default settings passed to the constructor are kept as given."""
        custom = {
            "action_space": "computer_13",
            "provider_name": "vmware",
            "screen_size": (1280, 720),
            "max_steps": 25,
        }
        args = OsworldEnvArgs(
            task=mock_task_config(),
            task_name="custom_task",
            headless=True,
            **custom,
        )

        assert args.action_space == custom["action_space"]
        assert args.provider_name == custom["provider_name"]
        # Identity check: the flag must be the boolean True itself.
        assert args.headless is True
        assert args.screen_size == custom["screen_size"]
        assert args.max_steps == custom["max_steps"]

    @patch("agentlab.benchmarks.osworld.OsworldGym")
    def test_make_env(self, mock_gym_class):
        """make_env instantiates OsworldGym once with the task and experiment dir."""
        sample_task = mock_task_config()
        args = OsworldEnvArgs(task=sample_task, task_name="test_task")

        with tempfile.TemporaryDirectory() as scratch:
            experiment_dir = Path(scratch)
            args.make_env(experiment_dir)

            # The gym class is mocked, so inspect how it was invoked.
            mock_gym_class.assert_called_once()
            _positional, keywords = mock_gym_class.call_args
            assert keywords["task"] == sample_task
            assert keywords["exp_dir"] == experiment_dir
| 136 | + |
| 137 | + |
class TestOsworldGym:
    """Test cases for OSWorld gym functionality."""

    @staticmethod
    def _make_gym(task: dict, exp_dir: Path) -> "OsworldGym":
        """Construct an OsworldGym with the settings shared by the tests below.

        Callers are expected to have patched
        ``agentlab.benchmarks.osworld.DesktopEnv`` before calling this.

        Args:
            task: Task dictionary handed to the gym.
            exp_dir: Experiment directory handed to the gym.

        Returns:
            The newly constructed gym instance.
        """
        return OsworldGym(
            task=task,
            provider_name="docker",
            region=None,
            path_to_vm=None,
            snapshot_name="init_state",
            action_space="computer_13",
            cache_dir="cache",
            screen_size=(1920, 1080),
            headless=True,
            require_a11y_tree=True,
            require_terminal=False,
            os_type="Ubuntu",
            enable_proxy=False,
            max_steps=50,
            exp_dir=exp_dir,
        )

    def test_gym_action_parsing(self):
        """Test parsing of action strings into (name, args, kwargs) tuples."""
        # Each entry pairs an action string with the tuple the parser is
        # expected to return for it.
        test_cases = [
            # Basic actions
            ("wait()", ("wait", [], {})),
            ("done()", ("done", [], {})),
            ("move_to(x=100, y=200)", ("move_to", [], {"x": 100, "y": 200})),
            ('typing(text="hello world")', ("typing", [], {"text": "hello world"})),
            ("hotkey(keys=['ctrl', 'c'])", ("hotkey", [], {"keys": ["ctrl", "c"]})),
            # Edge cases with strings
            ('typing(text="")', ("typing", [], {"text": ""})),  # Empty string
            ('typing(text="line1\\nline2")', ("typing", [], {"text": "line1\nline2"})),  # Newlines
            ('typing(text="tab\\there")', ("typing", [], {"text": "tab\there"})),  # Tabs
            (
                'typing(text="quote\\"test")',
                ("typing", [], {"text": 'quote"test'}),
            ),  # Escaped quotes
            (
                'typing(text="single\'quote")',
                ("typing", [], {"text": "single'quote"}),
            ),  # Single quotes
            ('typing(text="unicode: café")', ("typing", [], {"text": "unicode: café"})),  # Unicode
            # Edge cases with coordinates
            ("move_to(x=0, y=0)", ("move_to", [], {"x": 0, "y": 0})),  # Zero coordinates
            (
                "move_to(x=-10, y=-20)",
                ("move_to", [], {"x": -10, "y": -20}),
            ),  # Negative coordinates
            (
                "move_to(x=9999, y=9999)",
                ("move_to", [], {"x": 9999, "y": 9999}),
            ),  # Large coordinates
            # Edge cases with lists
            ("hotkey(keys=[])", ("hotkey", [], {"keys": []})),  # Empty list
            ("hotkey(keys=['ctrl'])", ("hotkey", [], {"keys": ["ctrl"]})),  # Single key
            (
                "hotkey(keys=['ctrl', 'shift', 'alt', 'a'])",
                ("hotkey", [], {"keys": ["ctrl", "shift", "alt", "a"]}),
            ),  # Multiple keys
            # Actions combining string and integer parameters
            ("scroll(direction='up', clicks=3)", ("scroll", [], {"direction": "up", "clicks": 3})),
            (
                "click(x=100, y=200, button='left')",
                ("click", [], {"x": 100, "y": 200, "button": "left"}),
            ),
            # Edge cases with mixed parameter types (string, int, bool, list)
            (
                "complex_action(text='test', x=50, enabled=True, items=['a', 'b'])",
                (
                    "complex_action",
                    [],
                    {"text": "test", "x": 50, "enabled": True, "items": ["a", "b"]},
                ),
            ),
            # Edge cases with whitespace
            ("  wait()  ", ("wait", [], {})),  # Leading/trailing spaces
            (
                "move_to( x=100 , y=200 )",
                ("move_to", [], {"x": 100, "y": 200}),
            ),  # Spaces around params
            # Edge cases with special characters in strings
            (
                'typing(text="@#$%^&*()+={}[]|\\:;\'<>?,./")',
                ("typing", [], {"text": "@#$%^&*()+={}[]|\\:;'<>?,./"}),
            ),
        ]

        for action_str, expected in test_cases:
            result = OsworldGym.parse_agentlab_action_str_to_func_args(action_str)
            assert result == expected, f"Failed parsing: {action_str}"

    @patch("agentlab.benchmarks.osworld.DesktopEnv")
    def test_gym_creation(self, mock_desktop_env):
        """Test that a newly created gym stores its configuration."""
        task = mock_task_config()

        with tempfile.TemporaryDirectory() as tmp_dir:
            exp_dir = Path(tmp_dir)
            gym = self._make_gym(task, exp_dir)

            assert gym.task == task
            assert gym._step_count == 0
            assert gym.max_steps == 50
            assert gym.exp_dir == exp_dir

    def test_convert_agentlab_action_to_computer_13(self):
        """Test action conversion from AgentLab to Computer 13 format."""
        task = mock_task_config()

        with tempfile.TemporaryDirectory() as tmp_dir:
            exp_dir = Path(tmp_dir)

            # DesktopEnv is replaced by a mock while the gym is built and used.
            with patch("agentlab.benchmarks.osworld.DesktopEnv"):
                gym = self._make_gym(task, exp_dir)

                # Test simple action: expected to convert to a bare string
                result = gym.convert_agentlab_action_to_computer_13("wait()")
                assert result == "WAIT"

                # Test action with parameters
                result = gym.convert_agentlab_action_to_computer_13("move_to(x=100, y=200)")
                expected = {"action_type": "MOVE_TO", "parameters": {"x": 100, "y": 200}}
                assert result == expected

                # Test typing action
                result = gym.convert_agentlab_action_to_computer_13('typing(text="hello")')
                expected = {"action_type": "TYPING", "parameters": {"text": "hello"}}
                assert result == expected
| 285 | + |
0 commit comments