diff --git a/scripts/run_anvil.py b/scripts/run_anvil.py index a684679..3e60950 100644 --- a/scripts/run_anvil.py +++ b/scripts/run_anvil.py @@ -14,21 +14,25 @@ between Make and `docker run`. It discovers tong definitions across the four layers using the pure core in `tongs.py`, then runs the anvil. -Shared tongs ------------- +Tong lifecycles +--------------- When a tong is discovered, the launcher starts it before the anvil, waits for it -to report ready, makes it reachable from the anvil, runs the anvil in the -foreground, and leaves the tong running afterwards. A `shared` tong is one -long-lived container keyed by a stable name: a running one whose config-hash -label still matches is reused untouched, and a missing/stopped/stale one is -(re)started. A `port` tong's reachability is injected into the anvil as -environment; a `none` tong is started but has no anvil-facing surface. - -The launcher starts only `shared` tongs reached over the network (`port`) or with -no anvil-facing surface (`none`), carrying no secret references. A `session` -lifecycle, a secret reference, an `mcp` or `volume` interface, or a `shared` tong -that mounts the workspace is refused with a clear message rather than started -half-wired. +to report ready, makes it reachable from the anvil, then runs the anvil in the +foreground. A `shared` tong is one long-lived container keyed by a stable name: a +running one whose config-hash label still matches is reused untouched, a +missing/stopped/stale one is (re)started, and it is left running afterwards. A +`session` tong is per-session: when any exists the launcher creates a per-session +network, starts the `session` tongs on it under their canonical aliases, connects +each network-facing `shared` tong to it, and joins the anvil to it (plus the base +`NETWORK=` network). On exit -- including SIGINT -- the `session` tongs and the +per-session network are torn down (and the connected `shared` tongs disconnected) +while the long-lived `shared` tongs keep running. A `port` tong's reachability is +injected into the anvil as environment; a `none` tong has no anvil-facing surface. + +The launcher starts `shared` and `session` tongs reached over the network +(`port`) or with no anvil-facing surface (`none`), carrying no secret references. +A secret reference, an `mcp` or `volume` interface, or a `shared` tong that mounts +the workspace is refused with a clear message rather than started half-wired. First-run approval ------------------ @@ -419,6 +423,37 @@ def tcp_probe(self, network, host, port, image): "--entrypoint", "python3", image, "-c", script, host, str(port)] return self._quiet(argv) == 0 + def ensure_network(self, name): + """Create the per-session docker network unless it already exists. + + Mirrors the Makefile's inspect-or-create so a leftover network from a + crashed session (whose teardown never ran) is reused rather than failing + the launch. + """ + if self._quiet(["docker", "network", "inspect", name]) == 0: + return + self._checked(["docker", "network", "create", name]) + + def network_connect(self, network, container, alias=None): + """Attach a running container to `network`, optionally under `alias`. + + Used to connect a long-lived `shared` tong to a session network under its + canonical alias, so the session reaches it without the tong having to live + on the session network permanently. + """ + argv = ["docker", "network", "connect"] + if alias: + argv += ["--alias", alias] + self._checked(argv + [network, container]) + + def network_disconnect(self, network, container): + """Detach a container from a network (best-effort, for teardown).""" + self._quiet(["docker", "network", "disconnect", network, container]) + + def network_rm(self, network): + """Remove a network (best-effort, for teardown).""" + self._quiet(["docker", "network", "rm", network]) + def run_foreground(self, argv): """Run the anvil in the foreground and return its exit code. @@ -427,6 +462,34 @@ def run_foreground(self, argv): through the controlling terminal's process group; the anvil handles it and exits, we reap it, and the KeyboardInterrupt propagates to the caller. """ + return self._wait_foreground(argv) + + def run_foreground_multi(self, argv, extra_networks, container): + """Create the anvil, join the extra networks, then start it attached. + + `docker run` attaches only one network at creation, so an anvil that joins + both its per-session network and a pre-existing `NETWORK=` network is + created on its primary (per-session) network, connected to each extra + network, then started in the foreground. Returns the anvil's exit code. + The container is left for the caller's teardown to remove, so a created + container is not orphaned if `connect` or `start` fails before its `--rm` + could fire. + """ + self._checked(tongs.to_create_argv(argv)) + for network in extra_networks: + self._checked(["docker", "network", "connect", network, container]) + return self._wait_foreground( + ["docker", "start", "--attach", "--interactive", container] + ) + + def _wait_foreground(self, argv): + """Run a foreground command, reaping it on Ctrl-C before re-raising. + + Popen + wait (rather than exec) so the launcher regains control after the + process exits. On Ctrl-C the SIGINT reaches both this process and the child + through the controlling terminal's process group; the child handles it and + exits, we reap it, and the KeyboardInterrupt propagates to the caller. + """ try: proc = subprocess.Popen(argv) except OSError as exc: @@ -515,10 +578,10 @@ def _mounts_workspace(defn): def unsupported_tong_reasons(merged): """Reasons each discovered tong is outside what the launcher can start. - The launcher starts only `shared`, network-or-nothing tongs that hold no - secret. Refused here: + The launcher starts `shared` and `session` tongs reached over the network + (`port`) or with no anvil-facing surface (`none`), holding no secret. Refused + here: - * a `session` lifecycle -- it needs a per-session network; * a secret reference -- it needs tmpfs delivery; * an `mcp` interface -- it needs generated MCP config; * a `volume` interface -- a shared named volume has no consumer yet, so it @@ -536,11 +599,6 @@ def unsupported_tong_reasons(merged): for name in sorted(merged): defn = merged[name]["definition"] kind = (defn.get("interface") or {}).get("kind") - if defn.get("lifecycle") == "session": - reasons.append( - "tong '%s' is a 'session' tong, which this launcher does not " - "start (only 'shared' tongs are supported)" % name - ) if tongs.find_secret_refs(defn): reasons.append( "tong '%s' references a secret, which this launcher does not " @@ -603,6 +661,26 @@ def _ensure_shared_tong(docker, name, defn, *, container, network, alias, ) +def _start_session_tong(docker, name, defn, *, container, network, alias, workspace): + """Start one `session` tong container detached on the per-session network. + + A `session` tong is per-session and torn down with the anvil, so it gets a + session-suffixed container name and lives only on the per-session network + under its canonical alias. The launcher only reaches here for a secret-less + tong, so the definition's `env` is passed straight through as `-e`; any + leftover container of the same name (from a crashed prior session) is removed + first so it is replaced cleanly. + """ + argv = tongs.tong_run_argv( + name, defn, + container_name=container, network=network, alias=alias, + env=defn.get("env") or {}, label_hash=tongs.config_hash(defn), + workspace=workspace, + ) + docker.rm_force(container) + docker.run_detached(argv) + + def _injection_pre_image_args(injection): """`-e`/`-v` options the discovered tongs add to the anvil before the image. @@ -620,49 +698,126 @@ def _injection_pre_image_args(injection): def run_with_tongs(merged, anvil_cmd, opts, *, docker, sleep=time.sleep, monotonic=time.monotonic): - """Start the discovered `shared` tongs, run the anvil, and leave them running. + """Start the discovered tongs, run the anvil, and tear down session state. Only reached when at least one tong was discovered and every tong is startable (the empty case stays a direct exec; unsupported tongs are refused earlier). - Sequence: ensure each `shared` tong is up on the anvil's base network - (reusing a running one whose config hash still matches), probe each tong's - readiness, inject `port` reachability into the anvil argv, then run the anvil - in the foreground. `shared` tongs are long-lived, so nothing is torn down when - the anvil exits. + + `shared` tongs are ensured on the anvil's base network, reusing a running one + whose config hash still matches. When any `session` tong exists a per-session + network is created: the `session` tongs start on it under their canonical + aliases, each network-facing `shared` tong is connected to it for this session, + and the anvil joins it plus the base network (the `NETWORK=` escape hatch). + With no `session` tong the anvil keeps using the base network exactly as + before. Each tong's readiness is probed on the network the anvil will use, + `port` reachability is injected into the anvil argv, then the anvil runs in the + foreground. + + On exit -- including SIGINT -- the `session` tongs and the per-session network + are torn down (and the connected `shared` tongs disconnected) while the + long-lived `shared` tongs are left running. Returns the anvil's exit code. Raises `OrchestrationError` if a tong never - becomes ready -- the anvil does not run against a half-up environment. + becomes ready (the anvil does not run against a half-up environment) or a + `session` tong is discovered with no anvil `--name` to key the session by. """ base_network = tongs.anvil_option_value(anvil_cmd, "--network") + session_id = tongs.anvil_option_value(anvil_cmd, "--name") + + has_session = any( + merged[name]["definition"].get("lifecycle") == "session" for name in merged + ) + if has_session and not session_id: + # The per-session network and container names key off the anvil --name. + # The Makefile always passes it, so its absence is a launch-shape bug -- + # stop rather than build an unnamed session network. (Checked before + # plan_network, which needs the handle to derive the network name.) + raise OrchestrationError( + "session tongs require the anvil '--name' as a session handle" + ) + + plan = tongs.plan_network(merged, base_network, session_id) # Only `port`/`none` tongs reach this path (`mcp`/`volume` are refused # upstream), so the MCP emitter is unused and the injection is `port` env only. injection = tongs.plan_injection(merged, None) - ready_checks = [] - for name in sorted(merged): - defn = merged[name]["definition"] - alias = tongs.canonical_alias(name, defn) - label_hash = tongs.config_hash(defn) - container = tongs.shared_container_name(name) - _ensure_shared_tong( - docker, name, defn, - container=container, network=base_network, alias=alias, - workspace=opts.workspace, label_hash=label_hash, + created_network = None + started_sessions = [] + connected_shared = [] + try: + if plan["create"]: + docker.ensure_network(plan["create"]) + created_network = plan["create"] + + ready_checks = [] + for name in sorted(merged): + defn = merged[name]["definition"] + alias = tongs.canonical_alias(name, defn) + if defn.get("lifecycle") == "session": + container = tongs.session_container_name(session_id, name) + _start_session_tong( + docker, name, defn, + container=container, network=plan["network"], alias=alias, + workspace=opts.workspace, + ) + started_sessions.append(container) + else: + container = tongs.shared_container_name(name) + _ensure_shared_tong( + docker, name, defn, + container=container, network=base_network, alias=alias, + workspace=opts.workspace, label_hash=tongs.config_hash(defn), + ) + ready_checks.append((name, defn, alias, container)) + + # Attach each network-facing `shared` tong to the per-session network under + # its canonical alias, so the anvil reaches it there without the long-lived + # tong having to live on the session network permanently. (The session-tong + # start loop above iterates the whole merged set, not plan["session_aliases"], + # because a `none` session tong with no alias must still be started; only the + # network-facing `shared` tongs in plan["shared_connect"] are connected here.) + for name, alias in plan["shared_connect"]: + container = tongs.shared_container_name(name) + # ensure_network may have reused a network left by a hard-killed prior + # session whose teardown never ran, with this shared tong still attached; + # a stale endpoint would make connect fail. Clear it first -- best-effort, + # a no-op when the tong is not attached -- so the connect is idempotent. + docker.network_disconnect(plan["network"], container) + docker.network_connect(plan["network"], container, alias=alias) + connected_shared.append((plan["network"], container)) + + # Probe readiness on the network the anvil will use, so a `shared` tong is + # checked at the alias the anvil dials (connected to that network above). + for name, defn, alias, container in ready_checks: + if not wait_ready( + docker, container, defn, alias, plan["network"], + anvil_image=opts.anvil_image, sleep=sleep, monotonic=monotonic, + ): + raise OrchestrationError("tong '%s' did not become ready in time" % name) + + injected = tongs.inject_anvil_argv( + anvil_cmd, network=plan["network"], + pre_image_args=_injection_pre_image_args(injection), ) - ready_checks.append((name, defn, alias, base_network, container)) - - for name, defn, alias, probe_net, container in ready_checks: - if not wait_ready( - docker, container, defn, alias, probe_net, - anvil_image=opts.anvil_image, sleep=sleep, monotonic=monotonic, - ): - raise OrchestrationError("tong '%s' did not become ready in time" % name) - - injected = tongs.inject_anvil_argv( - anvil_cmd, network=base_network, - pre_image_args=_injection_pre_image_args(injection), - ) - return docker.run_foreground(injected) + if plan["extra_networks"]: + # The anvil joins more than one network, which docker run cannot do at + # creation, so create -> connect the extras -> start it attached. + return docker.run_foreground_multi( + injected, plan["extra_networks"], session_id + ) + return docker.run_foreground(injected) + finally: + # Tear down per-session state, leaving the long-lived `shared` tongs + # running. Order matters: remove the `session` tongs and the anvil, then + # disconnect the `shared` tongs, before removing the network -- docker + # refuses to delete a network while endpoints remain. + for container in started_sessions: + docker.rm_force(container) + if created_network: + docker.rm_force(session_id) + for network, container in connected_shared: + docker.network_disconnect(network, container) + docker.network_rm(created_network) def exec_anvil(anvil_cmd): @@ -721,10 +876,10 @@ def main(argv): return 1 # Refuse anything this launcher cannot start (see unsupported_tong_reasons: - # a session lifecycle, a secret reference, an MCP or volume interface, or a - # shared tong mounting the workspace) rather than starting it half-wired. - # Every remaining tong is a `port`/`none` tong, whose canonical alias is its - # unique filename, so no two can claim the same network alias. + # a secret reference, an MCP or volume interface, or a shared tong mounting + # the workspace) rather than starting it half-wired. Every remaining tong has + # a `port`/`none` interface, whose canonical alias is its unique filename, so + # no two can claim the same network alias. unsupported = unsupported_tong_reasons(merged) if unsupported: for reason in unsupported: diff --git a/scripts/test_run_anvil.py b/scripts/test_run_anvil.py index 138cf0a..95ffdd8 100644 --- a/scripts/test_run_anvil.py +++ b/scripts/test_run_anvil.py @@ -446,15 +446,16 @@ def test_no_prompt_unapproved_does_not_forward_anvil(self): def test_approved_workspace_tong_passes_gate_then_refused_as_unsupported(self): # Approval is no longer the only gate: an approved (and otherwise valid) - # workspace tong clears the approval prompt but, being a `session` tong, is - # then refused as unsupported -- proving the gate passed without the anvil - # ever running. + # workspace tong clears the approval prompt but, having an `mcp` interface + # this launcher cannot wire up yet, is then refused as unsupported -- + # proving the gate passed without the anvil ever running. with tempfile.TemporaryDirectory() as tmp: tongs_dir = os.path.join(tmp, "tongs") os.makedirs(tongs_dir) with open(os.path.join(tongs_dir, "gh.yaml"), "w") as handle: handle.write( - "lifecycle: session\nimage: x\ninterface:\n kind: none\n" + "lifecycle: shared\nimage: x\ninterface:\n" + " kind: mcp\n name: github\n port: 8080\n" "readiness:\n mode: none\n" ) defn = tongs.load_tong_file(os.path.join(tongs_dir, "gh.yaml")) @@ -471,7 +472,7 @@ def test_approved_workspace_tong_passes_gate_then_refused_as_unsupported(self): ) self.assertEqual(completed.returncode, 1) self.assertEqual(completed.stdout, "") # anvil never ran - self.assertIn("session", completed.stderr) + self.assertIn("mcp", completed.stderr) self.assertNotIn("fails closed", completed.stderr) def test_invalid_tong_returns_one_without_exec(self): @@ -485,22 +486,6 @@ def test_invalid_tong_returns_one_without_exec(self): self.assertEqual(completed.returncode, 1) self.assertEqual(completed.stdout, "") # anvil never ran - def test_session_tong_refused_without_exec(self): - # A `session` tong is beyond the shared-only launch path, so it is refused - # before any docker call -- the anvil never runs. - with tempfile.TemporaryDirectory() as tmp: - tongs_dir = os.path.join(tmp, "tongs") - os.makedirs(tongs_dir) - with open(os.path.join(tongs_dir, "ship.yaml"), "w") as handle: - handle.write( - "lifecycle: session\nimage: x\ninterface:\n kind: none\n" - "readiness:\n mode: none\n" - ) - completed = _run_launcher_raw(["--repo-tongs", tongs_dir]) - self.assertEqual(completed.returncode, 1) - self.assertEqual(completed.stdout, "") - self.assertIn("session", completed.stderr) - def test_secret_tong_refused_without_exec(self): # A shared tong that references a secret cannot be delivered here, so it # is refused before docker. @@ -602,10 +587,18 @@ def test_startable_port_tong_has_no_reasons(self): def test_startable_none_tong_has_no_reasons(self): self.assertEqual(self._reasons(SHARED_NONE), []) - def test_session_secret_mcp_volume_each_refused(self): - self.assertTrue(self._reasons( - {"lifecycle": "session", "image": "x", "interface": {"kind": "none"}, - "readiness": {"mode": "none"}})) + def test_startable_session_tong_has_no_reasons(self): + # A `session` tong reached over the network (or with no surface) is now + # startable -- it runs on a per-session network. + self.assertEqual( + self._reasons({ + "lifecycle": "session", "image": "x", + "interface": {"kind": "port", "port": 5432}, "readiness": {"mode": "none"}, + }), + [], + ) + + def test_secret_mcp_volume_each_refused(self): self.assertTrue(self._reasons( {"lifecycle": "shared", "image": "x", "env": {"T": "${secret:op:r}"}, "interface": {"kind": "none"}, "readiness": {"mode": "none"}})) @@ -618,6 +611,17 @@ def test_session_secret_mcp_volume_each_refused(self): "interface": {"kind": "volume", "volume": "v", "mountpoint": "/m"}, "readiness": {"mode": "none"}})) + def test_session_tong_with_secret_or_mcp_still_refused(self): + # Lifting the session guard does not lift the secret/mcp guards; a session + # tong that needs either is still refused (delivered in a later phase). + self.assertTrue(self._reasons( + {"lifecycle": "session", "image": "x", "env": {"T": "${secret:op:r}"}, + "interface": {"kind": "none"}, "readiness": {"mode": "none"}})) + self.assertTrue(self._reasons( + {"lifecycle": "session", "image": "x", + "interface": {"kind": "mcp", "name": "g", "port": 8080}, + "readiness": {"mode": "none"}})) + def test_shared_workspace_mount_refused_but_docker_socket_allowed(self): # A shared tong that mounts the workspace leaks it across sessions, so it # is refused; the docker-socket mount (the broker pattern) is not. @@ -636,17 +640,16 @@ def test_shared_workspace_mount_refused_but_docker_socket_allowed(self): ) def test_workspace_refusal_is_shared_scoped(self): - # The workspace-mount leak is a `shared`-reuse hazard, so a non-shared tong - # that mounts the workspace must NOT be refused for the workspace -- only a - # `shared` one. (A session+workspace watcher is legitimate; it lands when - # session tongs turn on.) A session tong here is refused for being session, - # not for the workspace mount. - session_reasons = self._reasons({ - "lifecycle": "session", "image": "x", "mounts": ["workspace:ro"], - "interface": {"kind": "none"}, "readiness": {"mode": "none"}, - }) - self.assertTrue(any("session" in r for r in session_reasons)) - self.assertFalse(any("workspace" in r for r in session_reasons)) + # The workspace-mount leak is a `shared`-reuse hazard, so a `session` tong + # that mounts the workspace is legitimate (it is torn down with the anvil) + # and must NOT be refused -- only a `shared` one is. + self.assertEqual( + self._reasons({ + "lifecycle": "session", "image": "x", "mounts": ["workspace:ro"], + "interface": {"kind": "none"}, "readiness": {"mode": "none"}, + }), + [], + ) class FakeDocker: @@ -659,7 +662,8 @@ def __init__(self, states=None, ready=True, anvil_rc=0): self._ready = ready self._anvil_rc = anvil_rc self.run_argvs = [] # detached `docker run` argvs - self.anvil_argv = None # set when the anvil runs via run_foreground + self.anvil_argv = None # set when the anvil runs + self.anvil_extra_networks = None # extra networks the anvil joined def rm_force(self, container): self.calls.append(("rm_force", container)) @@ -667,6 +671,24 @@ def rm_force(self, container): def run_detached(self, argv): self.run_argvs.append(argv) + def ensure_network(self, name): + self.calls.append(("ensure_network", name)) + + def network_connect(self, network, container, alias=None): + self.calls.append(("network_connect", network, container, alias)) + + def network_disconnect(self, network, container): + self.calls.append(("network_disconnect", network, container)) + + def network_rm(self, network): + self.calls.append(("network_rm", network)) + + def run_foreground_multi(self, argv, extra_networks, container): + self.anvil_argv = argv + self.anvil_extra_networks = list(extra_networks) + self.calls.append(("run_foreground_multi", argv, tuple(extra_networks), container)) + return self._anvil_rc + def inspect_state(self, container): return self._states.get(container) @@ -720,6 +742,97 @@ def __call__(self): "readiness": {"mode": "none"}, } +# A per-session network service (a throwaway fixture DB) reached by host+port. +SESSION_PORT = { + "lifecycle": "session", + "image": "fixture-pg", + "interface": {"kind": "port", "port": 5432}, + "readiness": {"mode": "none"}, +} + + +class _RecordingRun: + """A subprocess.run stand-in that records argvs and returns canned codes. + + `codes` maps the first three argv tokens to a return code (default 0), so a + test can make one docker subcommand "fail" while the rest succeed. + """ + + def __init__(self, codes=None): + self.argvs = [] + self._codes = codes or {} + + def __call__(self, argv, **kwargs): + self.argvs.append(list(argv)) + return subprocess.CompletedProcess(argv, self._codes.get(tuple(argv[:3]), 0)) + + +class DockerCLITests(unittest.TestCase): + """The network seam used by the session-network launch path.""" + + def test_ensure_network_creates_when_absent(self): + rec = _RecordingRun({("docker", "network", "inspect"): 1}) + run_anvil.DockerCLI(run=rec).ensure_network("sess-net") + self.assertEqual(rec.argvs[0][:4], ["docker", "network", "inspect", "sess-net"]) + self.assertIn(["docker", "network", "create", "sess-net"], rec.argvs) + + def test_ensure_network_reuses_existing(self): + rec = _RecordingRun() # inspect returns 0 => already present + run_anvil.DockerCLI(run=rec).ensure_network("sess-net") + self.assertNotIn(["docker", "network", "create", "sess-net"], rec.argvs) + + def test_ensure_network_raises_when_create_fails(self): + rec = _RecordingRun( + {("docker", "network", "inspect"): 1, ("docker", "network", "create"): 1} + ) + with self.assertRaises(run_anvil.DockerError): + run_anvil.DockerCLI(run=rec).ensure_network("sess-net") + + def test_network_connect_passes_alias(self): + rec = _RecordingRun() + run_anvil.DockerCLI(run=rec).network_connect("net", "ctr", alias="gh") + self.assertEqual( + rec.argvs[-1], ["docker", "network", "connect", "--alias", "gh", "net", "ctr"] + ) + + def test_network_connect_without_alias(self): + rec = _RecordingRun() + run_anvil.DockerCLI(run=rec).network_connect("net", "ctr") + self.assertEqual(rec.argvs[-1], ["docker", "network", "connect", "net", "ctr"]) + + def test_network_connect_raises_on_failure(self): + rec = _RecordingRun({("docker", "network", "connect"): 1}) + with self.assertRaises(run_anvil.DockerError): + run_anvil.DockerCLI(run=rec).network_connect("net", "ctr") + + def test_network_disconnect_and_rm_are_best_effort(self): + # Teardown must not raise even when the network or endpoint is already gone. + rec = _RecordingRun( + {("docker", "network", "disconnect"): 1, ("docker", "network", "rm"): 1} + ) + cli = run_anvil.DockerCLI(run=rec) + cli.network_disconnect("net", "ctr") + cli.network_rm("net") + self.assertIn(["docker", "network", "disconnect", "net", "ctr"], rec.argvs) + self.assertIn(["docker", "network", "rm", "net"], rec.argvs) + + def test_run_foreground_multi_creates_connects_then_starts(self): + rec = _RecordingRun() + cli = run_anvil.DockerCLI(run=rec) + argv = ["docker", "run", "-it", "--name", "anvil", "--network", "sess", "img"] + with mock.patch.object(run_anvil.subprocess, "Popen") as popen: + popen.return_value.wait.return_value = 7 + rc = cli.run_foreground_multi(argv, ["base-net"], "anvil") + self.assertEqual(rc, 7) + # Created on its primary (session) network... + self.assertEqual(rec.argvs[0][:2], ["docker", "create"]) + self.assertEqual(rec.argvs[0][rec.argvs[0].index("--network") + 1], "sess") + # ...connected to the extra network, then started attached. + self.assertIn(["docker", "network", "connect", "base-net", "anvil"], rec.argvs) + popen.assert_called_once_with( + ["docker", "start", "--attach", "--interactive", "anvil"] + ) + class RunWithTongsTests(unittest.TestCase): def _run(self, docker, merged, anvil=None, workspace=None): @@ -852,6 +965,120 @@ def test_no_anvil_image_degrades_tcp_to_running_check(self): self.assertEqual(rc, 0) self.assertNotIn("tcp_probe", [c[0] for c in docker.calls]) + # --- Session lifecycle + per-session networks --------------------------- + + def test_shared_only_keeps_base_network_and_plain_run(self): + # No `session` tong => no per-session network is created and the anvil runs + # on the base network through the plain (single-network) foreground path. + docker = FakeDocker() + self._run(docker, _merged("ollama", SHARED_OLLAMA, source=tongs.REPO)) + kinds = [c[0] for c in docker.calls] + self.assertNotIn("ensure_network", kinds) + self.assertNotIn("network_rm", kinds) + self.assertNotIn("run_foreground_multi", kinds) + self.assertIn("run_foreground", kinds) + self.assertEqual( + docker.anvil_argv[docker.anvil_argv.index("--network") + 1], "opencode-net" + ) + self.assertIsNone(docker.anvil_extra_networks) + + def test_session_tong_creates_network_starts_on_it_and_tears_down(self): + docker = FakeDocker() + rc = self._run(docker, _merged("pg", SESSION_PORT, source=tongs.REPO)) + self.assertEqual(rc, 0) + net = tongs.session_network_name("claude-myproject") + self.assertIn(("ensure_network", net), docker.calls) + # The session tong is started on the per-session network under its alias. + self.assertEqual(len(docker.run_argvs), 1) + started = docker.run_argvs[0] + self.assertIn("claude-myproject-tong-pg", started) + self.assertEqual(started[started.index("--network") + 1], net) + self.assertEqual(started[started.index("--network-alias") + 1], "pg") + # The anvil joined the session network (primary) and the base network + # (extra) via the create -> connect -> start path, and got the port env. + self.assertEqual(docker.anvil_argv[docker.anvil_argv.index("--network") + 1], net) + self.assertEqual(docker.anvil_extra_networks, ["opencode-net"]) + self.assertIn("SWARMFORGE_TONG_PG_HOST=pg", docker.anvil_argv) + # Teardown removes the session tong and the anvil, then the network -- the + # network rm must come after its endpoints are gone or docker refuses it. + self.assertIn(("rm_force", "claude-myproject-tong-pg"), docker.calls) + self.assertIn(("rm_force", "claude-myproject"), docker.calls) + self.assertIn(("network_rm", net), docker.calls) + self.assertLess( + docker.calls.index(("rm_force", "claude-myproject")), + docker.calls.index(("network_rm", net)), + ) + self.assertLess( + docker.calls.index(("rm_force", "claude-myproject-tong-pg")), + docker.calls.index(("network_rm", net)), + ) + + def test_shared_tong_connected_to_session_network_and_left_running(self): + # A `shared` tong alongside a `session` tong is ensured on the base network, + # then connected to the per-session network for the anvil to reach; on + # teardown it is disconnected but never removed. + docker = FakeDocker() + merged = { + "pg": {"source": tongs.REPO, "definition": SESSION_PORT}, + "ollama": {"source": tongs.REPO, "definition": SHARED_OLLAMA}, + } + self._run(docker, merged) + net = tongs.session_network_name("claude-myproject") + self.assertIn( + ("network_connect", net, "swarmforge-shared-ollama", "ollama"), docker.calls + ) + self.assertIn( + ("network_disconnect", net, "swarmforge-shared-ollama"), docker.calls + ) + # The connect is idempotent against a reused network: a best-effort + # disconnect precedes it (a no-op when the tong is not already attached). + self.assertLess( + docker.calls.index(("network_disconnect", net, "swarmforge-shared-ollama")), + docker.calls.index(("network_connect", net, "swarmforge-shared-ollama", "ollama")), + ) + # The shared tong is rm_force'd only once -- when (re)started to clear a + # leftover -- never as part of teardown, so it is left running. + self.assertEqual( + docker.calls.count(("rm_force", "swarmforge-shared-ollama")), 1 + ) + + def test_session_tong_readiness_probes_on_session_network(self): + docker = FakeDocker() + defn = { + "lifecycle": "session", "image": "pg", + "interface": {"kind": "port", "port": 5432}, "readiness": {"mode": "tcp"}, + } + self._run(docker, _merged("pg", defn, source=tongs.REPO)) + net = tongs.session_network_name("claude-myproject") + self.assertIn(("tcp_probe", net, "pg", 5432, "anvil:img"), docker.calls) + + def test_session_teardown_runs_on_keyboard_interrupt(self): + # Ctrl-C mid-session must still tear down the session tong and network so an + # interrupted run leaks neither. + docker = FakeDocker() + + def interrupt(argv, extra_networks, container): + docker.calls.append(("run_foreground_multi", argv, tuple(extra_networks), container)) + raise KeyboardInterrupt + + docker.run_foreground_multi = interrupt + net = tongs.session_network_name("claude-myproject") + with self.assertRaises(KeyboardInterrupt): + self._run(docker, _merged("pg", SESSION_PORT, source=tongs.REPO)) + self.assertIn(("rm_force", "claude-myproject-tong-pg"), docker.calls) + self.assertIn(("rm_force", "claude-myproject"), docker.calls) + self.assertIn(("network_rm", net), docker.calls) + + def test_session_tong_without_anvil_name_raises_before_any_docker_call(self): + docker = FakeDocker() + anvil = ["docker", "run", "-it", "--rm", "--network", "opencode-net", "img"] + with self.assertRaises(run_anvil.OrchestrationError): + run_anvil.run_with_tongs( + _merged("pg", SESSION_PORT, source=tongs.REPO), anvil, _opts(), + docker=docker, sleep=lambda _s: None, monotonic=_Clock(), + ) + self.assertEqual(docker.calls, []) # nothing created => nothing to tear down + if __name__ == "__main__": unittest.main(verbosity=2) diff --git a/scripts/test_tongs.py b/scripts/test_tongs.py index 7679f23..18668c4 100644 --- a/scripts/test_tongs.py +++ b/scripts/test_tongs.py @@ -993,6 +993,31 @@ def test_inject_non_docker_run_raises_when_splicing(self): with self.assertRaises(ValueError): tongs.inject_anvil_argv(["podman", "ps"], pre_image_args=["-e", "A=1"]) + def test_to_create_argv_swaps_run_for_create(self): + out = tongs.to_create_argv(ANVIL_ARGV) + self.assertEqual(out[:2], ["docker", "create"]) + # Everything else is preserved byte-for-byte. + self.assertEqual(out[2:], ANVIL_ARGV[2:]) + + def test_to_create_argv_does_not_mutate_input(self): + original = list(ANVIL_ARGV) + tongs.to_create_argv(ANVIL_ARGV) + self.assertEqual(ANVIL_ARGV, original) + + def test_to_create_argv_leaves_a_create_argv_unchanged(self): + argv = ["docker", "create", "--rm", "img"] + self.assertEqual(tongs.to_create_argv(argv), argv) + + def test_to_create_argv_does_not_rewrite_a_harness_run_arg(self): + # Only the subcommand is swapped; a later 'run' token (e.g. a harness arg) + # is left alone. + argv = ["docker", "run", "img", "run"] + self.assertEqual(tongs.to_create_argv(argv), ["docker", "create", "img", "run"]) + + def test_to_create_argv_non_docker_run_raises(self): + with self.assertRaises(ValueError): + tongs.to_create_argv(["podman", "ps"]) + class AliasCollisionTests(unittest.TestCase): def _m(self, **defs): diff --git a/scripts/tongs.py b/scripts/tongs.py index e5e3547..daee351 100644 --- a/scripts/tongs.py +++ b/scripts/tongs.py @@ -1224,6 +1224,25 @@ def _replace_network(argv, network): return out[:insert_at] + ["--network", network] + out[insert_at:] +def to_create_argv(anvil_argv): + """Rewrite a `docker run ...` argv into the equivalent `docker create ...`. + + `docker run` attaches only one network when it creates the container, so an + anvil that must join more than one network (its per-session network plus the + pre-existing `NETWORK=` network) is instead created, connected to the extra + networks, then started. Only the `run` subcommand token is swapped for + `create`; every other token (flags, image, harness args) is preserved, so the + created container is byte-for-byte what `docker run` would have made. Returns a + new argv (the input is never mutated). Raises `ValueError` if the argv is not a + docker run/create command. + """ + out = list(anvil_argv) + # _docker_run_index points just past the run/create subcommand, so the token + # before it is the subcommand to rewrite (already 'create' is left as-is). + out[_docker_run_index(out) - 1] = "create" + return out + + def inject_anvil_argv(anvil_argv, network=None, pre_image_args=(), post_image_args=()): """Rewrite the anvil's docker-run argv to reach the discovered tongs.