Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .importlinter
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ source_modules =
aai_cli.agent_cascade
aai_cli.auth
aai_cli.code_gen
aai_cli.control
aai_cli.init
aai_cli.onboard
aai_cli.streaming
Expand Down
93 changes: 93 additions & 0 deletions aai_cli/commands/control/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
from __future__ import annotations

import typer

from aai_cli import command_registry, help_panels, options
from aai_cli.app.context import run_with_options
from aai_cli.commands.control import _exec as control_exec
from aai_cli.core import llm
from aai_cli.ui.help_text import examples_epilog

# Typer application for this command module; the `control` command defined
# below is registered on it through `@app.command`.
app = typer.Typer()

# Registry entry describing this module: it exposes the single `control`
# command, grouped under the TRANSCRIPTION help panel at rank 47.
# NOTE(review): how `command_registry` consumes `order` is not visible here; the
# inline pragma states ranks are sparse, so only relative position should matter.
SPEC = command_registry.CommandModuleSpec(
    panel=help_panels.TRANSCRIPTION,
    order=47,  # pragma: no mutate -- sparse rank; a +-1 shift is order-equivalent
    commands=("control",),
)


@app.command(
    rich_help_panel=help_panels.TRANSCRIPTION,
    epilog=examples_epilog(
        [
            ("Control your Mac hands-free by voice", "assembly control"),
            ("Preview actions without touching the UI", "assembly control --dry-run"),
            ("Use a more capable model for the agent", "assembly control --model claude-opus-4-7"),
            ("Emit the loop as newline-delimited JSON", "assembly control --json"),
        ]
    ),
)
def control(
    ctx: typer.Context,
    device: int | None = typer.Option(
        None,
        "--device",
        help="Microphone device index",
        rich_help_panel=help_panels.OPT_CAPTURE,
    ),
    sample_rate: int | None = typer.Option(
        None,
        "--sample-rate",
        help="Microphone capture rate in Hz (default: device native)",
        min=1,
        rich_help_panel=help_panels.OPT_CAPTURE,
    ),
    model: str = typer.Option(
        llm.DEFAULT_MODEL,
        "--model",
        help="LLM Gateway model that decides the actions",
        rich_help_panel=help_panels.OPT_LLM,
        autocompletion=llm.complete_model,
    ),
    max_tokens: int = typer.Option(
        llm.DEFAULT_MAX_TOKENS,
        "--max-tokens",
        help="Max tokens per agent step",
        min=1,
        rich_help_panel=help_panels.OPT_LLM,
    ),
    max_steps: int = typer.Option(
        10,
        "--max-steps",
        help="Max action steps the agent may take per spoken instruction",
        min=1,
        rich_help_panel=help_panels.OPT_LLM,
    ),
    dry_run: bool = typer.Option(
        False,
        "--dry-run",
        help="Plan and observe only: refuse every UI-changing action",
    ),
    json_out: bool = options.json_option("Emit newline-delimited JSON events"),
) -> None:
    """Drive your Mac hands-free: speak an instruction, an agent acts on the UI

    Each spoken instruction is transcribed with Streaming STT and handed to an
    LLM agent that decides which UI actions to take — typing, key chords,
    clicking accessibility elements, launching apps — and performs them through a
    bundled native macOS helper, then speaks back a short confirmation.

    macOS only: the helper needs Apple's Swift compiler and the Accessibility +
    Microphone permissions granted to your terminal. Use --dry-run to watch the
    agent plan without it touching anything.
    """
    # The docstring above doubles as the command's `--help` text, so it is
    # user-facing output rather than internal documentation.
    #
    # This function only parses: every flag is packed into a plain
    # `ControlOptions` value and the real work happens in
    # `control_exec.run_control`.
    # NOTE(review): `run_with_options` is defined elsewhere; presumably it supplies
    # the app state and forwards `json=` as `json_mode` — confirm in
    # `aai_cli.app.context`.
    run_with_options(
        ctx,
        control_exec.run_control,
        control_exec.ControlOptions(
            device=device,
            sample_rate=sample_rate,
            model=model,
            max_tokens=max_tokens,
            max_steps=max_steps,
            dry_run=dry_run,
        ),
        json=json_out,
    )
95 changes: 95 additions & 0 deletions aai_cli/commands/control/_exec.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
"""Run logic for `assembly control`: a gh-style options/run split.

The command module parses argv into a :class:`ControlOptions` and hands it to
:func:`run_control`. The three external legs — mic Streaming STT, the LLM
Gateway, and the native UI helper — are bundled in :class:`ControlDeps` with
real-implementation defaults, so a test drives the whole session by passing
fakes to :func:`_run_control` with no microphone, network, subprocess, or macOS.
"""

from __future__ import annotations

from collections.abc import Callable, Iterable
from dataclasses import dataclass

from aai_cli.app.context import AppState
from aai_cli.control import bridge, engine, prompt
from aai_cli.control import listen as listen_mod
from aai_cli.control.helper import UiHelper
from aai_cli.control.render import ControlRenderer
from aai_cli.core import signals


@dataclass(frozen=True)
class ControlOptions:
    """Every `assembly control` flag as plain data.

    Frozen, so one parsed invocation can be handed between the run helpers in
    this module without being modified along the way.
    """

    # Microphone device index, forwarded to the listener as `device`.
    device: int | None
    # Microphone capture rate, forwarded to the listener as `sample_rate`.
    sample_rate: int | None
    # Model name, forwarded to `bridge.build_responder` as `model`.
    model: str
    # Token budget, forwarded to `bridge.build_responder` as `max_tokens`.
    max_tokens: int
    # Step limit, forwarded to `engine.run_session` as `max_steps`.
    max_steps: int
    # When True the session is started with `allow_mutate=False`.
    dry_run: bool


def _default_transcripts(api_key: str, opts: ControlOptions) -> Iterable[str]:
    """Production utterance source: the real mic→utterance leg.

    Picks the two capture settings out of *opts* and hands them, together with
    the API key, to ``listen_mod.listen``; that call's result is returned as is.
    """
    device, rate = opts.device, opts.sample_rate
    return listen_mod.listen(api_key, device=device, sample_rate=rate)


def _default_responder(api_key: str, opts: ControlOptions) -> engine.Responder:
    """Production model leg: the real LLM-Gateway responder.

    Picks the model name and token budget out of *opts* and hands them, together
    with the API key, to ``bridge.build_responder``; its result is returned as is.
    """
    model, budget = opts.model, opts.max_tokens
    return bridge.build_responder(api_key, model=model, max_tokens=budget)


def _default_helper() -> UiHelper:
    """Real native-helper leg (compiles + spawns the Swift binary on first action).

    Takes no arguments: unlike the other two legs it needs neither the API key
    nor the parsed options, it simply constructs a :class:`UiHelper`.
    """
    return UiHelper()


@dataclass(frozen=True)
class ControlDeps:
    """The three external legs, injectable so the session is exercised with fakes.

    Every field is a factory rather than a ready-made object, so nothing is
    built until :func:`_run_control` calls it. The defaults are the real
    implementations defined earlier in this module.
    """

    # (api_key, opts) -> iterable of utterance strings fed to the engine.
    transcripts: Callable[[str, ControlOptions], Iterable[str]] = _default_transcripts
    # (api_key, opts) -> the engine `Responder` used for each model turn.
    responder: Callable[[str, ControlOptions], engine.Responder] = _default_responder
    # () -> the UI helper; the session uses its `execute` and `close` methods.
    helper: Callable[[], UiHelper] = _default_helper


# The production wiring: a `ControlDeps` built with every field left at its
# default, i.e. the real listener, responder and helper factories. Used by
# `run_control`; tests pass their own `ControlDeps` to `_run_control` instead.
_DEFAULT_DEPS = ControlDeps()


def _run_control(
    opts: ControlOptions,
    state: AppState,
    *,
    json_mode: bool,
    deps: ControlDeps,
) -> None:
    """Run a single hands-free control session using the legs supplied by *deps*.

    *opts* carries the parsed flags, *state* resolves the API key, *json_mode*
    selects the renderer's output mode, and *deps* provides the factories for
    the transcript stream, the model responder and the UI helper. The helper is
    always closed before this function returns or raises.
    """
    # Order matters here. The native helper is created before anything else so
    # that an unsupported (non-macOS) host fails fast with the "macOS only"
    # message instead of first prompting the user to authenticate. From that
    # point on everything sits inside try/finally, so the helper's child is
    # closed on every exit path.
    ui_helper = deps.helper()
    try:
        key = state.resolve_api_key()
        model_turn = deps.responder(key, opts)
        utterances = deps.transcripts(key, opts)
        view = ControlRenderer(json_mode=json_mode)
        # NOTE(review): judging by its name this context manager makes a
        # termination signal behave like Ctrl-C for the duration of the
        # session — confirm in `aai_cli.core.signals`.
        with signals.terminate_as_interrupt():
            engine.run_session(
                utterances,
                system=prompt.system_prompt(),
                respond=model_turn,
                execute=ui_helper.execute,
                renderer=view,
                max_steps=opts.max_steps,
                # --dry-run is expressed to the engine as "mutation not allowed".
                allow_mutate=not opts.dry_run,
            )
    finally:
        ui_helper.close()


def run_control(opts: ControlOptions, state: AppState, /, *, json_mode: bool) -> None:
    """Execute one `assembly control` invocation from already-parsed flags.

    This is the production entry point: it delegates to :func:`_run_control`
    with ``_DEFAULT_DEPS``, i.e. the real listener, responder and helper.
    *opts* and *state* are positional-only; *json_mode* is keyword-only.
    """
    _run_control(opts, state, json_mode=json_mode, deps=_DEFAULT_DEPS)
20 changes: 20 additions & 0 deletions aai_cli/control/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
"""Voice-controlled computer use: `assembly control`.

A local agent loop that turns spoken instructions into real macOS UI actions —
the "voice-in, hands-on-the-machine" tool that a browser/web service can't be,
because it drives the actual desktop (keystrokes, clicks, app focus) through a
native Swift helper.

The slice is split so every external leg is an injectable seam and the loop
itself is pure:

- `actions` — the action vocabulary the helper understands (pure data).
- `tools` — those actions as OpenAI function-calling tool definitions.
- `prompt` — the system prompt that briefs the model on the loop.
- `engine` — the observe/act loop over a transcript stream (no I/O of its own).
- `bridge` — adapts the LLM Gateway into the engine's `Responder` seam.
- `helper` — spawns and talks JSON to the native `macos_ui_control.swift` helper.
- `listen` — adapts mic Streaming STT into a stream of finalized utterances.
"""

from __future__ import annotations
72 changes: 72 additions & 0 deletions aai_cli/control/actions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
"""The action protocol: the vocabulary the LLM "brain" uses to drive the macOS
"hands" helper.

An :class:`Action` is one tool call the model emitted — a name plus JSON
arguments. :func:`validate` checks the name is known and the required arguments
are present, turning a raw model tool call into a request the Swift helper
understands. Everything here is pure data, so the engine is exercised without a
model, a microphone, or macOS.
"""

from __future__ import annotations

from dataclasses import dataclass

# Action name -> the argument names it requires. The Swift helper understands
# exactly these actions; a tool call for any other name is rejected back to the
# model and never executed (see :func:`validate`).
#
# An empty tuple means :func:`validate` demands no argument for that action. It
# does not mean the action accepts none: arguments that are not listed here are
# passed through to the helper unchecked.
# NOTE(review): `click` lists no required argument — presumably its target can be
# given in more than one optional form; confirm against the tool definitions.
ACTION_SPECS: dict[str, tuple[str, ...]] = {
    "type_text": ("text",),
    "key_combo": ("keys",),
    "click": (),
    "launch_app": ("name",),
    "focus_app": ("name",),
    "get_ui_tree": (),
    "screenshot": (),
}

# Actions that only read the screen and never change UI state. `--dry-run`
# executes these for real (so the model can still "see") but refuses every
# other, UI-mutating action.
# `Action.is_observe` is a membership test against this set, so any action not
# named here counts as mutating.
OBSERVE_ACTIONS = frozenset({"get_ui_tree", "screenshot"})


class InvalidAction(Exception):
    """A model tool call that names an unknown action or omits a required argument.

    Raised by :func:`validate`. It is meant to be surfaced back to the model as
    a failed tool result rather than crashing the session, so the model can
    correct itself on the next step. The exception message is written to be
    readable by the model: it names the offending action and, where relevant,
    the missing argument(s).
    """


@dataclass(frozen=True)
class Action:
    """One validated UI action: a known name plus its JSON arguments.

    ``name`` is the single source of truth for which action runs; ``arguments``
    is the model-supplied payload and is treated as untrusted data.
    """

    name: str
    arguments: dict[str, object]

    def is_observe(self) -> bool:
        """Return True for read-only actions (screen observation), which `--dry-run` allows."""
        return self.name in OBSERVE_ACTIONS

    def request(self) -> dict[str, object]:
        """Return the JSON object sent to the Swift helper: the action name plus its arguments.

        The ``"action"`` key always carries ``self.name``. The arguments come
        from the model, so they may themselves contain an ``"action"`` entry; if
        that entry were allowed to win, the helper would be asked to run a
        different action from the one that was validated and that
        :meth:`is_observe` classified for `--dry-run`.
        """
        payload: dict[str, object] = {"action": self.name, **self.arguments}
        # Re-assert the validated name after merging: an "action" key inside the
        # arguments must never override it. Assigning to the existing key keeps
        # "action" as the first entry of the request.
        payload["action"] = self.name
        return payload


def validate(name: str, arguments: dict[str, object]) -> Action:
    """Check a model's tool call and wrap it as an :class:`Action`.

    The call is accepted only if *name* is a key of ``ACTION_SPECS`` and every
    argument name that entry requires is present in *arguments*. Otherwise
    :class:`InvalidAction` is raised with a message naming the problem, so the
    helper is only ever handed a request it can execute.
    """
    if name not in ACTION_SPECS:
        raise InvalidAction(f"Unknown action {name!r}.")

    # Presence is all that is checked; argument values are not inspected.
    absent = [needed for needed in ACTION_SPECS[name] if needed not in arguments]
    if not absent:
        return Action(name=name, arguments=arguments)

    raise InvalidAction(
        f"Action {name!r} is missing required argument(s): {', '.join(absent)}."
    )
87 changes: 87 additions & 0 deletions aai_cli/control/bridge.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
"""Adapt the LLM Gateway into the engine's :data:`~aai_cli.control.engine.Responder`.

The gateway is OpenAI-compatible, so one chat-completions call with the control
``tools`` is a single model turn. This converts the SDK response into the
engine's plain :class:`~aai_cli.control.engine.Reply` — parsing each tool call's
JSON arguments — so the loop never touches the OpenAI types. The underlying
:func:`aai_cli.core.llm.complete` is injected so the adapter is unit-tested
against a fake completer with no network.
"""

from __future__ import annotations

import json
from collections.abc import Callable
from typing import TYPE_CHECKING

from aai_cli.control import engine, tools
from aai_cli.control.engine import Reply, ToolCall
from aai_cli.core import jsonshape, llm

if TYPE_CHECKING:
from openai.types.chat import ChatCompletion

from aai_cli.control.engine import Message

# The completer seam: same shape as ``llm.complete``'s keyword call below.
# Typed loosely (``Callable[...]``), so any callable that accepts the call made
# inside ``build_responder`` and returns a chat-completions response fits.
# ``ChatCompletion`` is imported only under ``TYPE_CHECKING``; that is safe at
# runtime because a ``type`` statement evaluates its value lazily.
type Completer = Callable[..., ChatCompletion]


def _parse_arguments(raw: str | None) -> dict[str, object]:
"""Parse a tool call's JSON ``arguments`` string into a dict.

A model occasionally emits empty or malformed arguments; treat those as no
arguments so validation (not a JSON crash) reports the real problem.
"""
if not raw:
return {}
try:
parsed = json.loads(raw)
except json.JSONDecodeError:
return {}
return jsonshape.as_mapping(parsed) or {}


def _reply_of(response: ChatCompletion) -> Reply:
    """Translate a chat-completions response into the engine's :class:`Reply`.

    Only the first choice is read. Its text becomes ``content`` (an empty string
    when the model sent none) and each function tool call becomes a
    :class:`ToolCall` with its JSON arguments already parsed.
    """
    first = response.choices[0].message
    # The SDK's tool-call union also admits custom (non-function) calls. Only
    # function tools are ever requested, so anything else is filtered out using
    # the `type` discriminant.
    function_calls = tuple(
        ToolCall(
            id=item.id,
            name=item.function.name,
            arguments=_parse_arguments(item.function.arguments),
        )
        for item in first.tool_calls or []
        if item.type == "function"
    )
    return Reply(content=first.content or "", tool_calls=function_calls)


def build_responder(
    api_key: str,
    *,
    model: str,
    max_tokens: int,
    complete: Completer = llm.complete,
) -> engine.Responder:
    """Build a :data:`Responder` that performs one gateway turn with the control tools.

    The returned function closes over the API key, model name and token budget.
    *complete* is the call that actually reaches the gateway; it defaults to
    ``llm.complete`` and exists as a parameter so a fake can stand in for it.

    The tool definitions and ``tool_choice`` ride in ``extra`` (merged into the
    request body), since the gateway accepts the OpenAI tool-calling fields.
    """

    def respond(messages: list[Message]) -> Reply:
        # One call per turn; the tool definitions are fetched afresh each time.
        return _reply_of(
            complete(
                api_key,
                model=model,
                messages=messages,
                max_tokens=max_tokens,
                extra={"tools": tools.tool_definitions(), "tool_choice": "auto"},
            )
        )

    return respond
Loading
Loading