Skip to content

Commit 3dd8808

Browse files
committed
ui + server updates
1 parent 0eb1dc2 commit 3dd8808

5 files changed

Lines changed: 453 additions & 95 deletions

File tree

llms/extensions/browser/__init__.py

Lines changed: 99 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
import asyncio
22
import json
33
import os
4-
from pathlib import Path
5-
import subprocess
64
import shutil
5+
import subprocess
76
import time
87
from collections import deque
8+
from pathlib import Path
99

1010
from aiohttp import web
1111

@@ -18,7 +18,6 @@
1818

1919

2020
def install(ctx):
21-
2221
# Check for agent-browser binary
2322
if not shutil.which("agent-browser"):
2423
ctx.log("agent-browser not found. See https://agent-browser.dev/installation to use the browser extension.")
@@ -29,6 +28,10 @@ def ensure_dir(path):
2928
os.makedirs(path, exist_ok=True)
3029
return path
3130

31+
def get_browser_dir(req=None):
    """Return the "browser" directory under the user's path, creating it if needed.

    The username is resolved from `req` when one is given; otherwise None is
    passed as the user.
    """
    if req:
        user = ctx.get_username(req)
    else:
        user = None
    browser_path = os.path.join(ctx.get_user_path(user=user), "browser")
    return ensure_dir(browser_path)
34+
3235
def get_script_dir(req):
    """Return the requesting user's "browser/scripts" directory, creating it if needed."""
    username = ctx.get_username(req)
    scripts_path = os.path.join(ctx.get_user_path(user=username), "browser", "scripts")
    return ensure_dir(scripts_path)
3437

@@ -54,7 +57,7 @@ def _add_debug_log(cmd_str, result, duration):
5457
}
5558
)
5659

57-
async def run_browser_cmd_async(*args, timeout=30, env=None):
60+
async def run_browser_cmd_async(*args, timeout=30, env=None, record=True):
5861
"""Run agent-browser command asynchronously."""
5962
cmd = ["agent-browser"] + list(args)
6063
cmd_str = " ".join(cmd)
@@ -79,28 +82,33 @@ async def run_browser_cmd_async(*args, timeout=30, env=None):
7982
ret = {"success": False, "error": "Command timed out"}
8083
except Exception as e:
8184
ret = {"success": False, "error": str(e)}
82-
_add_debug_log(cmd_str, ret, time.monotonic() - t0)
85+
if record:
86+
_add_debug_log(cmd_str, ret, time.monotonic() - t0)
8387
return ret
84-
88+
8589
# =========================================================================
8690
# Status & Screenshot Endpoints
8791
# =========================================================================
8892

93+
async def get_status_object():
    """Return the active page's title and URL, or None when no page is loaded.

    Runs a single `agent-browser eval` call (not recorded in the debug log)
    that evaluates `{title, url}` in the page and reads the JSON from stdout.

    Returns:
        dict | None: the parsed object with "running": True added, or None
        when the command fails, its output is not a JSON object, or the URL
        is empty or "about:blank".
    """
    result = await run_browser_cmd_async("eval", "({title:document.title,url:location.href})", record=False)
    if not result["success"]:
        return None

    stdout = (result.get("stdout") or "").strip()
    try:
        status = json.loads(stdout) if stdout else {}
    except json.JSONDecodeError:
        # Unexpected non-JSON output: report "no page" instead of raising into callers
        return None
    if not isinstance(status, dict):
        return None

    url = status.get("url", "")
    if not url or url == "about:blank":
        return None
    status["running"] = True
    return status
105+
89106
async def get_status(req):
    """Report whether a page is open in the browser, with its URL and title."""
    status = await get_status_object()
    payload = status if status else {"running": False, "url": None, "title": None}
    return web.json_response(payload)
104112

105113
ctx.add_get("/browser/status", get_status)
106114

@@ -123,47 +131,67 @@ async def clear_debug_log(request):
123131

124132
async def get_screenshot(req):
    """Capture the current page and return it as a PNG.

    Takes a screenshot and an interactive-element snapshot concurrently
    (neither is recorded in the debug log). A successfully parsed snapshot is
    merged with the current page status and cached as snapshot.json in the
    user's browser directory.

    Returns:
        The fresh screenshot when both captures succeeded; otherwise the
        bundled "connecting" placeholder image.
    """
    browser_dir = os.path.join(ctx.get_user_path(user=ctx.get_username(req)), "browser")
    # The directory may not exist on first use; both output files are written into it
    os.makedirs(browser_dir, exist_ok=True)
    screenshot_path = os.path.join(browser_dir, "screenshot.png")
    snapshot_path = os.path.join(browser_dir, "snapshot.json")

    screenshot_result, snapshot_result = await asyncio.gather(
        run_browser_cmd_async("screenshot", screenshot_path, record=False),
        run_browser_cmd_async("snapshot", "-i", "--json", snapshot_path, record=False),
    )

    success = snapshot_result["success"]
    if success and snapshot_result.get("stdout", "").strip():
        # write output to snapshot_path for next time
        try:
            snapshot = json.loads(snapshot_result["stdout"])
            status = await get_status_object()
            if status:
                snapshot.update(status)
            Path(snapshot_path).write_text(json.dumps(snapshot))
        except Exception as e:
            ctx.err("Failed to parse snapshot JSON\n" + snapshot_result["stdout"], e)
            success = False

    # Require the screenshot command itself to have succeeded: the file check
    # alone would serve a stale screenshot.png left over from an earlier call
    if success and screenshot_result["success"] and os.path.exists(screenshot_path):
        return web.FileResponse(screenshot_path, headers={"Content-Type": "image/png", "Cache-Control": "no-cache"})

    placeholder = os.path.join(os.path.dirname(__file__), "ui", "connecting.svg")
    # "image/svg+xml" is the registered SVG media type; browsers may not render "image/svg"
    return web.FileResponse(placeholder, headers={"Content-Type": "image/svg+xml", "Cache-Control": "no-cache"})
135160

136161
ctx.add_get("/browser/screenshot", get_screenshot)
137162

138163
async def get_snapshot(req):
    """Get interactive elements snapshot with refs.

    Serves the cached snapshot.json from the user's browser directory unless
    it is missing or `?force=true` is given. In that case a fresh snapshot is
    taken (`?cursor=true` adds the -C flag), merged with the current page
    status and written back to the cache.

    Raises:
        Exception: when the snapshot command reports failure.
    """
    browser_dir = os.path.join(ctx.get_user_path(user=ctx.get_username(req)), "browser")
    snapshot_path = os.path.join(browser_dir, "snapshot.json")
    force = req.query.get("force", "false") == "true"
    include_cursor = req.query.get("cursor", "false") == "true"
    args = ["snapshot", "-i", "--json"]
    if include_cursor:
        args.append("-C")

    parsed = None
    if force or not os.path.exists(snapshot_path):
        result = await run_browser_cmd_async(*args)
        if not result["success"]:
            raise Exception(result.get("stderr", "Snapshot failed"))
        # write output to snapshot_path for next time
        if result.get("stdout", "").strip():
            try:
                parsed = json.loads(result["stdout"])
                # get_status_object() returns None when no page is loaded;
                # passing None to update() raises TypeError and skips the cache write
                status = await get_status_object()
                if status:
                    parsed.update(status)
                os.makedirs(browser_dir, exist_ok=True)
                Path(snapshot_path).write_text(json.dumps(parsed))
            except Exception as e:
                ctx.err("Failed to parse snapshot JSON\n" + result["stdout"], e)

    if not parsed:
        try:
            snapshot_json = Path(snapshot_path).read_text()
            parsed = json.loads(snapshot_json) if snapshot_json.strip() else {}
        except (OSError, json.JSONDecodeError):
            # No usable cache (missing/unreadable file or corrupt JSON): respond with an empty object
            parsed = {}

    return web.json_response(parsed)
167195

168196
ctx.add_get("/browser/snapshot", get_snapshot)
169197

@@ -179,7 +207,10 @@ async def browser_open(req):
179207
if not url:
180208
return web.json_response({"error": "URL required"}, status=400)
181209

182-
_browser_env = {"AGENT_BROWSER_PROFILE": get_profile_dir(req), "AGENT_BROWSER_USER_AGENT": AGENT_BROWSER_USER_AGENT}
210+
_browser_env = {
211+
"AGENT_BROWSER_PROFILE": get_profile_dir(req),
212+
"AGENT_BROWSER_USER_AGENT": AGENT_BROWSER_USER_AGENT,
213+
}
183214
result = await run_browser_cmd_async("open", url, timeout=60, env=_browser_env)
184215
ctx.log(
185216
f"browser_open: Open result: success={result['success']}, stdout={result.get('stdout', '')[:100]}, stderr={result.get('stderr', '')[:100]}"
@@ -432,7 +463,13 @@ async def run_script(req):
432463
return web.json_response(result)
433464
except asyncio.TimeoutError:
434465
proc.kill()
435-
result = {"success": False, "error": "Script execution timed out", "returncode": -1, "stdout": "", "stderr": "Script execution timed out"}
466+
result = {
467+
"success": False,
468+
"error": "Script execution timed out",
469+
"returncode": -1,
470+
"stdout": "",
471+
"stderr": "Script execution timed out",
472+
}
436473
_add_debug_log(f"bash {name}", result, time.monotonic() - t0)
437474
return web.json_response({"error": "Script execution timed out"}, status=500)
438475
except Exception as e:
@@ -456,36 +493,31 @@ async def generate_script(req):
456493
if not prompt:
457494
return web.json_response({"error": "Prompt required"}, status=400)
458495

459-
system_prompt = """You are an expert at browser automation using the agent-browser CLI tool.
460-
Generate a bash script that accomplishes the user's task.
461-
462-
Key commands:
463-
- agent-browser open <url> - Navigate to URL
464-
- agent-browser snapshot -i - Get interactive elements with refs (@e1, @e2, etc.)
465-
- agent-browser click @e1 - Click element by ref
466-
- agent-browser fill @e1 "text" - Fill input field
467-
- agent-browser type @e1 "text" - Type without clearing
468-
- agent-browser press Enter - Press key
469-
- agent-browser wait --load networkidle - Wait for page load
470-
- agent-browser wait @e1 - Wait for element
471-
- agent-browser get text @e1 - Get element text
472-
- agent-browser screenshot output.png - Take screenshot
473-
474-
Always:
475-
1. Start with #!/bin/bash and set -euo pipefail
476-
2. Use snapshot -i after navigation to get element refs
477-
3. Wait for page loads with --load networkidle
478-
4. Add comments explaining each step
479-
480-
Output ONLY the bash script, no explanations."""
496+
system_prompt = None
497+
candidate_paths = [
498+
Path(os.path.join(get_browser_dir(req), name)),
499+
Path(os.path.join(get_browser_dir(), name)),
500+
Path(__file__).parent / "ui" / "generate-script.txt",
501+
]
502+
503+
for path in candidate_paths:
504+
if path.exists():
505+
system_prompt = path.read_text()
506+
break
507+
508+
if not system_prompt:
509+
raise Exception("generate-script.txt system prompt template not found.")
481510

482511
if existing_script.strip():
483512
user_message = f"Here is an existing browser automation script:\n\n```bash\n{existing_script}\n```\n\nModify this script to: {prompt}\n\nOutput the complete updated script."
484513
else:
485514
user_message = f"Create a browser automation script that: {prompt}"
486515

516+
BROWSER_MODEL = os.getenv("BROWSER_MODEL", ctx.config.get("defaults", {}).get("text", {}).get("model"))
517+
if not BROWSER_MODEL:
518+
raise Exception("No model specified for browser script generation. Set BROWSER_MODEL environment variable or configure a default text model in llms.json.")
487519
chat_request = {
488-
"model": ctx.config.get("defaults", {}).get("text", {}).get("model", "MiniMax-M2.1"),
520+
"model": BROWSER_MODEL,
489521
"messages": [
490522
{"role": "system", "content": system_prompt},
491523
{"role": "user", "content": user_message},
Lines changed: 79 additions & 0 deletions
Loading

0 commit comments

Comments
 (0)