11import asyncio
22import json
33import os
4- from pathlib import Path
5- import subprocess
64import shutil
5+ import subprocess
76import time
87from collections import deque
8+ from pathlib import Path
99
1010from aiohttp import web
1111
1818
1919
2020def install (ctx ):
21-
2221 # Check for agent-browser binary
2322 if not shutil .which ("agent-browser" ):
2423 ctx .log ("agent-browser not found. See https://agent-browser.dev/installation to use the browser extension." )
@@ -29,6 +28,10 @@ def ensure_dir(path):
2928 os .makedirs (path , exist_ok = True )
3029 return path
3130
def get_browser_dir(req=None):
    """Return the "browser" directory under the user path, creating it if needed.

    When ``req`` is truthy the username is resolved from it with
    ``ctx.get_username``; otherwise ``user=None`` is passed to
    ``ctx.get_user_path``.
    """
    if req:
        username = ctx.get_username(req)
    else:
        username = None
    user_root = ctx.get_user_path(user=username)
    return ensure_dir(os.path.join(user_root, "browser"))
34+
def get_script_dir(req):
    """Return the requesting user's "browser/scripts" directory, creating it if needed."""
    username = ctx.get_username(req)
    scripts_path = os.path.join(ctx.get_user_path(user=username), "browser", "scripts")
    return ensure_dir(scripts_path)
3437
@@ -54,7 +57,7 @@ def _add_debug_log(cmd_str, result, duration):
5457 }
5558 )
5659
57- async def run_browser_cmd_async (* args , timeout = 30 , env = None ):
60+ async def run_browser_cmd_async (* args , timeout = 30 , env = None , record = True ):
5861 """Run agent-browser command asynchronously."""
5962 cmd = ["agent-browser" ] + list (args )
6063 cmd_str = " " .join (cmd )
@@ -79,28 +82,33 @@ async def run_browser_cmd_async(*args, timeout=30, env=None):
7982 ret = {"success" : False , "error" : "Command timed out" }
8083 except Exception as e :
8184 ret = {"success" : False , "error" : str (e )}
82- _add_debug_log (cmd_str , ret , time .monotonic () - t0 )
85+ if record :
86+ _add_debug_log (cmd_str , ret , time .monotonic () - t0 )
8387 return ret
84-
88+
8589 # =========================================================================
8690 # Status & Screenshot Endpoints
8791 # =========================================================================
8892
async def get_status_object():
    """Ask the browser for the current page's title and URL.

    Runs an ``eval`` command (not recorded in the debug log) that yields an
    object with ``title`` and ``url``.

    Returns:
        A dict with the evaluated fields plus ``"running": True``, or ``None``
        when the command fails, its output is empty or not a JSON object, or
        the URL is missing or ``about:blank``.
    """
    result = await run_browser_cmd_async(
        "eval", "({title:document.title,url:location.href})", record=False
    )
    if not result["success"]:
        return None

    stdout = (result.get("stdout") or "").strip()
    if not stdout:
        return None

    try:
        status = json.loads(stdout)
    except json.JSONDecodeError:
        # Non-JSON output means we cannot tell what page is loaded; report
        # "no status" instead of propagating a decode error to every caller.
        return None
    if not isinstance(status, dict):
        return None

    url = status.get("url", "")
    if not url or url == "about:blank":
        return None

    status["running"] = True
    return status
105+
async def get_status(req):
    """Get current browser status including URL and title."""
    status = await get_status_object()
    if status:
        return web.json_response(status)
    # No usable status: report the browser as not running.
    return web.json_response({"running": False, "url": None, "title": None})
104112
105113 ctx .add_get ("/browser/status" , get_status )
106114
@@ -123,47 +131,67 @@ async def clear_debug_log(request):
123131
async def get_screenshot(req):
    """Capture and return current screenshot.

    Runs the ``screenshot`` and ``snapshot`` commands concurrently (neither is
    recorded in the debug log). When the snapshot command prints JSON, it is
    merged with the current status object and written to ``snapshot.json`` in
    the user's browser directory.

    Returns:
        The captured PNG when both commands succeeded and the file exists;
        otherwise the bundled ``ui/connecting.svg`` placeholder.
    """
    # get_browser_dir creates the directory, so the snapshot write below
    # cannot fail merely because the directory is missing.
    browser_dir = get_browser_dir(req)
    screenshot_path = os.path.join(browser_dir, "screenshot.png")
    snapshot_path = os.path.join(browser_dir, "snapshot.json")

    screenshot_result, snapshot_result = await asyncio.gather(
        run_browser_cmd_async("screenshot", screenshot_path, record=False),
        run_browser_cmd_async("snapshot", "-i", "--json", snapshot_path, record=False),
    )

    # Require the screenshot command itself to succeed; otherwise a stale
    # screenshot.png left over from an earlier capture would be served.
    success = screenshot_result["success"] and snapshot_result["success"]
    snapshot_stdout = snapshot_result.get("stdout", "")
    if success and snapshot_stdout.strip():
        # write output to snapshot_path for next time
        try:
            snapshot = json.loads(snapshot_stdout)
            status = await get_status_object()
            if status:
                snapshot.update(status)
            Path(snapshot_path).write_text(json.dumps(snapshot))
        except Exception as e:
            ctx.err("Failed to parse snapshot JSON\n" + snapshot_stdout, e)
            success = False

    if success and os.path.exists(screenshot_path):
        return web.FileResponse(
            screenshot_path,
            headers={"Content-Type": "image/png", "Cache-Control": "no-cache"},
        )

    placeholder_path = os.path.join(os.path.dirname(__file__), "ui", "connecting.svg")
    return web.FileResponse(
        placeholder_path,
        # "image/svg+xml" is the registered SVG media type; browsers may
        # refuse to render an image served as "image/svg".
        headers={"Content-Type": "image/svg+xml", "Cache-Control": "no-cache"},
    )
135160
136161 ctx .add_get ("/browser/screenshot" , get_screenshot )
137162
async def get_snapshot(req):
    """Get interactive elements snapshot with refs.

    Query parameters:
        force: ``"true"`` to run a fresh snapshot even if ``snapshot.json``
            already exists.
        cursor: ``"true"`` to append ``-C`` to the snapshot command. Only
            applies when a fresh snapshot is run; a cached file is returned
            as-is.

    Returns:
        A JSON response with the snapshot (merged with the status object when
        one is available), or ``{}`` when no snapshot can be read.

    Raises:
        Exception: when a fresh snapshot command fails.
    """
    # get_browser_dir creates the directory so the cache file can be written.
    browser_dir = get_browser_dir(req)
    snapshot_path = os.path.join(browser_dir, "snapshot.json")
    force = req.query.get("force", "false") == "true"
    include_cursor = req.query.get("cursor", "false") == "true"
    args = ["snapshot", "-i", "--json"]
    if include_cursor:
        args.append("-C")

    parsed = None
    if force or not os.path.exists(snapshot_path):
        result = await run_browser_cmd_async(*args)
        if not result["success"]:
            # Timeouts and spawn errors are reported under "error", not "stderr".
            raise Exception(result.get("stderr") or result.get("error") or "Snapshot failed")
        stdout = result.get("stdout", "")
        # write output to snapshot_path for next time
        if stdout.strip():
            try:
                parsed = json.loads(stdout)
                # get_status_object returns None when no page is loaded;
                # dict.update(None) would raise and skip writing the cache.
                status = await get_status_object()
                if status and isinstance(parsed, dict):
                    parsed.update(status)
                Path(snapshot_path).write_text(json.dumps(parsed))
            except Exception as e:
                ctx.err("Failed to parse snapshot JSON\n" + stdout, e)

    if parsed is None:
        try:
            snapshot_json = Path(snapshot_path).read_text()
            parsed = json.loads(snapshot_json) if snapshot_json.strip() else {}
        except (OSError, json.JSONDecodeError):
            # Missing or corrupt cache file: answer with an empty snapshot.
            parsed = {}

    return web.json_response(parsed)
167195
168196 ctx .add_get ("/browser/snapshot" , get_snapshot )
169197
@@ -179,7 +207,10 @@ async def browser_open(req):
179207 if not url :
180208 return web .json_response ({"error" : "URL required" }, status = 400 )
181209
182- _browser_env = {"AGENT_BROWSER_PROFILE" : get_profile_dir (req ), "AGENT_BROWSER_USER_AGENT" : AGENT_BROWSER_USER_AGENT }
210+ _browser_env = {
211+ "AGENT_BROWSER_PROFILE" : get_profile_dir (req ),
212+ "AGENT_BROWSER_USER_AGENT" : AGENT_BROWSER_USER_AGENT ,
213+ }
183214 result = await run_browser_cmd_async ("open" , url , timeout = 60 , env = _browser_env )
184215 ctx .log (
185216 f"browser_open: Open result: success={ result ['success' ]} , stdout={ result .get ('stdout' , '' )[:100 ]} , stderr={ result .get ('stderr' , '' )[:100 ]} "
@@ -432,7 +463,13 @@ async def run_script(req):
432463 return web .json_response (result )
433464 except asyncio .TimeoutError :
434465 proc .kill ()
435- result = {"success" : False , "error" : "Script execution timed out" , "returncode" : - 1 , "stdout" : "" , "stderr" : "Script execution timed out" }
466+ result = {
467+ "success" : False ,
468+ "error" : "Script execution timed out" ,
469+ "returncode" : - 1 ,
470+ "stdout" : "" ,
471+ "stderr" : "Script execution timed out" ,
472+ }
436473 _add_debug_log (f"bash { name } " , result , time .monotonic () - t0 )
437474 return web .json_response ({"error" : "Script execution timed out" }, status = 500 )
438475 except Exception as e :
@@ -456,36 +493,31 @@ async def generate_script(req):
456493 if not prompt :
457494 return web .json_response ({"error" : "Prompt required" }, status = 400 )
458495
459- system_prompt = """You are an expert at browser automation using the agent-browser CLI tool.
460- Generate a bash script that accomplishes the user's task.
461-
462- Key commands:
463- - agent-browser open <url> - Navigate to URL
464- - agent-browser snapshot -i - Get interactive elements with refs (@e1, @e2, etc.)
465- - agent-browser click @e1 - Click element by ref
466- - agent-browser fill @e1 "text" - Fill input field
467- - agent-browser type @e1 "text" - Type without clearing
468- - agent-browser press Enter - Press key
469- - agent-browser wait --load networkidle - Wait for page load
470- - agent-browser wait @e1 - Wait for element
471- - agent-browser get text @e1 - Get element text
472- - agent-browser screenshot output.png - Take screenshot
473-
474- Always:
475- 1. Start with #!/bin/bash and set -euo pipefail
476- 2. Use snapshot -i after navigation to get element refs
477- 3. Wait for page loads with --load networkidle
478- 4. Add comments explaining each step
479-
480- Output ONLY the bash script, no explanations."""
496+ system_prompt = None
497+ candidate_paths = [
498+ Path (os .path .join (get_browser_dir (req ), name )),
499+ Path (os .path .join (get_browser_dir (), name )),
500+ Path (__file__ ).parent / "ui" / "generate-script.txt" ,
501+ ]
502+
503+ for path in candidate_paths :
504+ if path .exists ():
505+ system_prompt = path .read_text ()
506+ break
507+
508+ if not system_prompt :
509+ raise Exception ("generate-script.txt system prompt template not found." )
481510
482511 if existing_script .strip ():
483512 user_message = f"Here is an existing browser automation script:\n \n ```bash\n { existing_script } \n ```\n \n Modify this script to: { prompt } \n \n Output the complete updated script."
484513 else :
485514 user_message = f"Create a browser automation script that: { prompt } "
486515
516+ BROWSER_MODEL = os .getenv ("BROWSER_MODEL" , ctx .config .get ("defaults" , {}).get ("text" , {}).get ("model" ))
517+ if not BROWSER_MODEL :
518+ raise Exception ("No model specified for browser script generation. Set BROWSER_MODEL environment variable or configure a default text model in llms.json." )
487519 chat_request = {
488- "model" : ctx . config . get ( "defaults" , {}). get ( "text" , {}). get ( "model" , "MiniMax-M2.1" ) ,
520+ "model" : BROWSER_MODEL ,
489521 "messages" : [
490522 {"role" : "system" , "content" : system_prompt },
491523 {"role" : "user" , "content" : user_message },
0 commit comments