AssemblyAI · alexkroman · Jun 23, 2026 · Jun 23, 2026
diff --git a/.importlinter b/.importlinter
@@ -35,6 +35,7 @@ source_modules =
     aai_cli.agent_cascade
     aai_cli.auth
     aai_cli.code_gen
+    aai_cli.control
     aai_cli.init
     aai_cli.onboard
     aai_cli.streaming

diff --git a/aai_cli/commands/control/__init__.py b/aai_cli/commands/control/__init__.py
@@ -0,0 +1,93 @@
+from __future__ import annotations
+
+import typer
+
+from aai_cli import command_registry, help_panels, options
+from aai_cli.app.context import run_with_options
+from aai_cli.commands.control import _exec as control_exec
+from aai_cli.core import llm
+from aai_cli.ui.help_text import examples_epilog
+
+# Typer application that the `control` command below is registered on.
+app = typer.Typer()
+
+# Registry entry for this module: it contributes the single `control` command,
+# listed in the same Transcription help panel the command decorator names.
+SPEC = command_registry.CommandModuleSpec(
+    panel=help_panels.TRANSCRIPTION,
+    order=47,  # pragma: no mutate -- sparse rank; a +-1 shift is order-equivalent
+    commands=("control",),
+)
+
+
+# CLI surface only: this function declares the flags, then hands the parsed
+# values to the run layer in `_exec`. It contains no session logic of its own.
+# NOTE: Typer renders the function docstring as the command's help text, so its
+# wording is user-facing output rather than internal documentation.
+@app.command(
+    rich_help_panel=help_panels.TRANSCRIPTION,
+    epilog=examples_epilog(
+        [
+            ("Control your Mac hands-free by voice", "assembly control"),
+            ("Preview actions without touching the UI", "assembly control --dry-run"),
+            ("Use a more capable model for the agent", "assembly control --model claude-opus-4-7"),
+            ("Emit the loop as newline-delimited JSON", "assembly control --json"),
+        ]
+    ),
+)
+def control(
+    ctx: typer.Context,
+    device: int | None = typer.Option(
+        None,
+        "--device",
+        help="Microphone device index",
+        rich_help_panel=help_panels.OPT_CAPTURE,
+    ),
+    sample_rate: int | None = typer.Option(
+        None,
+        "--sample-rate",
+        help="Microphone capture rate in Hz (default: device native)",
+        min=1,
+        rich_help_panel=help_panels.OPT_CAPTURE,
+    ),
+    model: str = typer.Option(
+        llm.DEFAULT_MODEL,
+        "--model",
+        help="LLM Gateway model that decides the actions",
+        rich_help_panel=help_panels.OPT_LLM,
+        autocompletion=llm.complete_model,
+    ),
+    max_tokens: int = typer.Option(
+        llm.DEFAULT_MAX_TOKENS,
+        "--max-tokens",
+        help="Max tokens per agent step",
+        min=1,
+        rich_help_panel=help_panels.OPT_LLM,
+    ),
+    max_steps: int = typer.Option(
+        10,
+        "--max-steps",
+        help="Max action steps the agent may take per spoken instruction",
+        min=1,
+        rich_help_panel=help_panels.OPT_LLM,
+    ),
+    dry_run: bool = typer.Option(
+        False,
+        "--dry-run",
+        help="Plan and observe only: refuse every UI-changing action",
+    ),
+    json_out: bool = options.json_option("Emit newline-delimited JSON events"),
+) -> None:
+    """Drive your Mac hands-free: speak an instruction, an agent acts on the UI
+
+    Each spoken instruction is transcribed with Streaming STT and handed to an
+    LLM agent that decides which UI actions to take — typing, key chords,
+    clicking accessibility elements, launching apps — and performs them through a
+    bundled native macOS helper, then speaks back a short confirmation.
+
+    macOS only: the helper needs Apple's Swift compiler and the Accessibility +
+    Microphone permissions granted to your terminal. Use --dry-run to watch the
+    agent plan without it touching anything.
+    """
+    # Freeze the parsed flags into plain data. `json_out` is deliberately not part
+    # of the options object: it travels separately as the `json` keyword below.
+    opts = control_exec.ControlOptions(
+        device=device,
+        sample_rate=sample_rate,
+        model=model,
+        max_tokens=max_tokens,
+        max_steps=max_steps,
+        dry_run=dry_run,
+    )
+    run_with_options(ctx, control_exec.run_control, opts, json=json_out)
diff --git a/aai_cli/commands/control/_exec.py b/aai_cli/commands/control/_exec.py
@@ -0,0 +1,95 @@
+"""Run logic for `assembly control`: a gh-style options/run split.
+
+The command module parses argv into a :class:`ControlOptions` and hands it to
+:func:`run_control`. The three external legs — mic Streaming STT, the LLM
+Gateway, and the native UI helper — are bundled in :class:`ControlDeps` with
+real-implementation defaults, so a test drives the whole session by passing
+fakes to :func:`_run_control` with no microphone, network, subprocess, or macOS.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Callable, Iterable
+from dataclasses import dataclass
+
+from aai_cli.app.context import AppState
+from aai_cli.control import bridge, engine, prompt
+from aai_cli.control import listen as listen_mod
+from aai_cli.control.helper import UiHelper
+from aai_cli.control.render import ControlRenderer
+from aai_cli.core import signals
+
+
+@dataclass(frozen=True)
+class ControlOptions:
+    """Every `assembly control` flag as plain data.
+
+    Frozen, so one parsed invocation can be handed to each dependency factory
+    without any of them rebinding a field. The JSON output switch is not stored
+    here; the run functions take it as a separate ``json_mode`` argument.
+    """
+
+    # --device: microphone device index; None when the flag is not given.
+    device: int | None
+    # --sample-rate: capture rate in Hz; None leaves it at the device's native rate.
+    sample_rate: int | None
+    # --model: the LLM Gateway model that decides the actions.
+    model: str
+    # --max-tokens: token budget for a single agent step.
+    max_tokens: int
+    # --max-steps: cap on action steps per spoken instruction.
+    max_steps: int
+    # --dry-run: when True, UI-changing actions are refused.
+    dry_run: bool
+
+
+def _default_transcripts(api_key: str, opts: ControlOptions) -> Iterable[str]:
+    """Default ``transcripts`` leg: utterances produced by ``listen_mod.listen``.
+
+    Forwards the API key together with the microphone settings (``device`` and
+    ``sample_rate``) carried on ``opts``.
+    """
+    device, sample_rate = opts.device, opts.sample_rate
+    return listen_mod.listen(api_key, device=device, sample_rate=sample_rate)
+
+
+def _default_responder(api_key: str, opts: ControlOptions) -> engine.Responder:
+    """Default ``responder`` leg: a gateway-backed responder from ``bridge``.
+
+    Forwards the API key together with the model name and per-step token budget
+    carried on ``opts``.
+    """
+    model, max_tokens = opts.model, opts.max_tokens
+    return bridge.build_responder(api_key, model=model, max_tokens=max_tokens)
+
+
+def _default_helper() -> UiHelper:
+    """Real native-helper leg (compiles + spawns the Swift binary on first action).
+
+    Takes no arguments, unlike the other two legs: ``_run_control`` builds the
+    helper before it resolves the API key, so neither the key nor the options
+    are passed in.
+    """
+    return UiHelper()
+
+
+@dataclass(frozen=True)
+class ControlDeps:
+    """The three external legs, injectable so the session is exercised with fakes.
+
+    Each field is a factory rather than a ready-made object, so nothing is built
+    until ``_run_control`` calls it. The defaults are the real implementations.
+    """
+
+    # (api_key, opts) -> the stream of utterance strings the session consumes.
+    transcripts: Callable[[str, ControlOptions], Iterable[str]] = _default_transcripts
+    # (api_key, opts) -> the engine ``Responder`` handed to the session loop.
+    responder: Callable[[str, ControlOptions], engine.Responder] = _default_responder
+    # () -> the UI helper; called before the API key is resolved, hence no arguments.
+    helper: Callable[[], UiHelper] = _default_helper
+
+
+# Production wiring: every leg left at its real-implementation default. One
+# shared instance is enough because ``ControlDeps`` is frozen.
+_DEFAULT_DEPS = ControlDeps()
+
+
+def _run_control(
+    opts: ControlOptions,
+    state: AppState,
+    *,
+    json_mode: bool,
+    deps: ControlDeps,
+) -> None:
+    """Drive one hands-free control session with the given dependencies.
+
+    ``opts`` carries the parsed flags, ``state`` is used here only to resolve the
+    API key, ``json_mode`` is passed straight to :class:`ControlRenderer`, and
+    ``deps`` supplies the factories for the helper, the responder, and the
+    transcript stream (tests pass fakes). Returns nothing; exceptions from any
+    step propagate after the helper has been closed.
+    """
+    # Build the native helper first: on a non-macOS host this fails fast with the
+    # "macOS only" message, before the user is ever asked to authenticate. Once it
+    # exists, everything else runs under try/finally so the child is always closed.
+    hands = deps.helper()
+    try:
+        api_key = state.resolve_api_key()
+        respond = deps.responder(api_key, opts)
+        transcripts = deps.transcripts(api_key, opts)
+        renderer = ControlRenderer(json_mode=json_mode)
+        # NOTE(review): presumably routes a termination signal through the same
+        # interrupt path as Ctrl-C so the session unwinds — confirm in core.signals.
+        with signals.terminate_as_interrupt():
+            engine.run_session(
+                transcripts,
+                system=prompt.system_prompt(),
+                respond=respond,
+                execute=hands.execute,
+                renderer=renderer,
+                max_steps=opts.max_steps,
+                # --dry-run is the inverse of the engine's permission to mutate.
+                allow_mutate=not opts.dry_run,
+            )
+    finally:
+        # Reached on every exit path once the helper exists, including a failure
+        # to resolve the API key or to build the other two legs.
+        hands.close()
+
+
+def run_control(opts: ControlOptions, state: AppState, /, *, json_mode: bool) -> None:
+    """Execute one `assembly control` invocation from already-parsed flags.
+
+    Production entry point: delegates to ``_run_control`` with ``_DEFAULT_DEPS``,
+    i.e. the real microphone, gateway, and native-helper legs. ``opts`` and
+    ``state`` are positional-only and ``json_mode`` is keyword-only.
+    """
+    _run_control(opts, state, json_mode=json_mode, deps=_DEFAULT_DEPS)
diff --git a/aai_cli/control/__init__.py b/aai_cli/control/__init__.py
@@ -0,0 +1,20 @@
+"""Voice-controlled computer use: `assembly control`.
+
+A local agent loop that turns spoken instructions into real macOS UI actions —
+the "voice-in, hands-on-the-machine" tool that a browser/web service can't be,
+because it drives the actual desktop (keystrokes, clicks, app focus) through a
+native Swift helper.
+
+The slice is split so every external leg is an injectable seam and the loop
+itself is pure:
+
+- `actions` — the action vocabulary the helper understands (pure data).
+- `tools` — those actions as OpenAI function-calling tool definitions.
+- `prompt` — the system prompt that briefs the model on the loop.
+- `engine` — the observe/act loop over a transcript stream (no I/O of its own).
+- `bridge` — adapts the LLM Gateway into the engine's `Responder` seam.
+- `helper` — spawns and talks JSON to the native `macos_ui_control.swift` helper.
+- `listen` — adapts mic Streaming STT into a stream of finalized utterances.
+"""
+
+from __future__ import annotations
diff --git a/aai_cli/control/actions.py b/aai_cli/control/actions.py
@@ -0,0 +1,72 @@
+"""The action protocol: the vocabulary the LLM "brain" uses to drive the macOS
+"hands" helper.
+
+An :class:`Action` is one tool call the model emitted — a name plus JSON
+arguments. :func:`validate` checks the name is known and the required arguments
+are present, turning a raw model tool call into a request the Swift helper
+understands. Everything here is pure data, so the engine is exercised without a
+model, a microphone, or macOS.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+# Action name -> the argument names it requires. The Swift helper understands
+# exactly these actions; a tool call for any other name is rejected back to the
+# model and never executed (see :func:`validate`).
+# An empty tuple means no argument is *required*; :func:`validate` checks only
+# that required names are present and does not restrict additional arguments.
+ACTION_SPECS: dict[str, tuple[str, ...]] = {
+    "type_text": ("text",),
+    "key_combo": ("keys",),
+    "click": (),
+    "launch_app": ("name",),
+    "focus_app": ("name",),
+    "get_ui_tree": (),
+    "screenshot": (),
+}
+
+# Actions that only read the screen and never change UI state. `--dry-run`
+# executes these for real (so the model can still "see") but refuses every
+# other, UI-mutating action.
+# Both names are also keys of ACTION_SPECS; keep the two in step when adding an
+# observe-only action.
+OBSERVE_ACTIONS = frozenset({"get_ui_tree", "screenshot"})
+
+
+class InvalidAction(Exception):
+    """A model tool call that names an unknown action or omits a required argument.
+
+    Raised by :func:`validate`; the exception message names the unknown action
+    or lists the missing argument names.
+
+    Surfaced back to the model as a failed tool result rather than crashing the
+    session — the model can correct itself on the next step.
+    """
+
+
+@dataclass(frozen=True)
+class Action:
+    """One validated UI action: a known name plus its JSON arguments.
+
+    ``frozen=True`` blocks rebinding ``name`` and ``arguments``; the ``arguments``
+    dict itself is not copied, so it is still the object the caller passed in.
+    """
+
+    name: str
+    arguments: dict[str, object]
+
+    def is_observe(self) -> bool:
+        """True for read-only actions (screen observation), which `--dry-run` allows."""
+        return self.name in OBSERVE_ACTIONS
+
+    def request(self) -> dict[str, object]:
+        """The JSON object sent to the Swift helper: the action name plus its arguments.
+
+        The ``"action"`` key always carries :attr:`name`. A model-supplied
+        argument that is itself called ``"action"`` is dropped rather than being
+        allowed to overwrite it; otherwise a call validated and gated under one
+        name (for example an observe-only action during ``--dry-run``) could
+        reach the helper as a different action.
+        """
+        payload: dict[str, object] = {"action": self.name}
+        payload.update(
+            (key, value) for key, value in self.arguments.items() if key != "action"
+        )
+        return payload
+
+
+def validate(name: str, arguments: dict[str, object]) -> Action:
+    """Validate a model tool call and wrap it in an :class:`Action`.
+
+    Raises :class:`InvalidAction` when ``name`` is not a key of
+    :data:`ACTION_SPECS`, or when any argument that action requires is absent
+    from ``arguments``. Only the presence of required keys is checked, not their
+    values.
+    """
+    if name not in ACTION_SPECS:
+        raise InvalidAction(f"Unknown action {name!r}.")
+    absent = ", ".join(arg for arg in ACTION_SPECS[name] if arg not in arguments)
+    if absent:
+        raise InvalidAction(f"Action {name!r} is missing required argument(s): {absent}.")
+    return Action(name=name, arguments=arguments)
diff --git a/aai_cli/control/bridge.py b/aai_cli/control/bridge.py
@@ -0,0 +1,87 @@
+"""Adapt the LLM Gateway into the engine's :data:`~aai_cli.control.engine.Responder`.
+
+The gateway is OpenAI-compatible, so one chat-completions call with the control
+``tools`` is a single model turn. This converts the SDK response into the
+engine's plain :class:`~aai_cli.control.engine.Reply` — parsing each tool call's
+JSON arguments — so the loop never touches the OpenAI types. The underlying
+:func:`aai_cli.core.llm.complete` is injected so the adapter is unit-tested
+against a fake completer with no network.
+"""
+
+from __future__ import annotations
+
+import json
+from collections.abc import Callable
+from typing import TYPE_CHECKING
+
+from aai_cli.control import engine, tools
+from aai_cli.control.engine import Reply, ToolCall
+from aai_cli.core import jsonshape, llm
+
+if TYPE_CHECKING:
+    from openai.types.chat import ChatCompletion
+
+    from aai_cli.control.engine import Message
+
+# The completer seam: same shape as ``llm.complete``'s keyword call below.
+# ``ChatCompletion`` is imported only under ``TYPE_CHECKING``; that is safe here
+# because a ``type`` statement evaluates its right-hand side lazily.
+type Completer = Callable[..., ChatCompletion]
+
+
+def _parse_arguments(raw: str | None) -> dict[str, object]:
+    """Decode a tool call's raw JSON ``arguments`` payload into a dict.
+
+    A missing or empty payload, and text that is not valid JSON, both yield an
+    empty dict. A decoded value is passed through ``jsonshape.as_mapping``, and
+    a falsy result from that becomes ``{}`` as well. Malformed input therefore
+    shows up later as a validation failure instead of a JSON crash here.
+    """
+    if not raw:
+        return {}
+    try:
+        decoded = json.loads(raw)
+    except json.JSONDecodeError:
+        return {}
+    else:
+        mapping = jsonshape.as_mapping(decoded)
+        return mapping if mapping else {}
+
+
+def _reply_of(response: ChatCompletion) -> Reply:
+    """Flatten the first choice of a chat-completions response into a :class:`Reply`.
+
+    ``None`` content becomes ``""``, and every function tool call becomes a
+    :class:`ToolCall` whose arguments have already been parsed into a dict.
+    """
+    first = response.choices[0].message
+    # Only function tools are requested, but the SDK's union type also admits a
+    # custom (non-function) tool call; filter on the type discriminant to narrow.
+    function_calls = tuple(
+        ToolCall(
+            id=item.id,
+            name=item.function.name,
+            arguments=_parse_arguments(item.function.arguments),
+        )
+        for item in first.tool_calls or []
+        if item.type == "function"
+    )
+    return Reply(content=first.content or "", tool_calls=function_calls)
+
+
+def build_responder(
+    api_key: str,
+    *,
+    model: str,
+    max_tokens: int,
+    complete: Completer = llm.complete,
+) -> engine.Responder:
+    """Build a :data:`Responder` that performs one gateway turn per call.
+
+    Every call sends the conversation to ``complete`` and converts the response
+    with :func:`_reply_of`. The control tools and ``tool_choice`` travel in
+    ``extra`` (merged into the request body), because the gateway accepts the
+    OpenAI tool-calling fields.
+    """
+
+    def respond(messages: list[Message]) -> Reply:
+        tool_fields = {"tools": tools.tool_definitions(), "tool_choice": "auto"}
+        return _reply_of(
+            complete(
+                api_key,
+                model=model,
+                messages=messages,
+                max_tokens=max_tokens,
+                extra=tool_fields,
+            )
+        )
+
+    return respond