diff --git a/2.0/README.md b/2.0/README.md index 7d2eb1a3..b414435d 100644 --- a/2.0/README.md +++ b/2.0/README.md @@ -39,6 +39,14 @@ This variant keeps the same SIFT1M-scale service contract and recall target as offline indexing strategies are more viable. Its problem ID is `vector_db_ann_relaxed`. +## Generals.io Bot Arena + +This game-playing problem asks agents to improve a patch-based bot for a local +Generals.io-style simulator. Its problem ID is `generals_io_bot`. The judge +applies the submitted patch to a clean skeleton, runs a hidden arena against +multiple baseline bot families, and scores by mean baseline win rate (75%) +plus a faster-win bonus (25%). The online generals.io service is not used. + ## BBOPlace ISPD2005 This VLSI placement problem asks agents to generate macro placement candidates diff --git a/2.0/problems/generals_io_bot/config.yaml b/2.0/problems/generals_io_bot/config.yaml new file mode 100644 index 00000000..acb60a55 --- /dev/null +++ b/2.0/problems/generals_io_bot/config.yaml @@ -0,0 +1,47 @@ +tag: games +runtime: + language: patch + timeout_seconds: 10800 + environment: "Generals.io bot patch; local generals-bots simulator arena" + apt_packages: + - bash + - ca-certificates + - git + - python3 + - python3-pip + judge_apt_packages: + - bash + - ca-certificates + - git + - python3 + - python3-pip + docker: + image: frontiercs/generals-io-bot-agent:experimental-c2b77bf + judge_image: frontiercs/generals-io-bot-judge:experimental-c2b77bf +environment: + cpus: 4 + memory_mb: 8192 + storage_mb: 8192 + build_timeout_seconds: 1800 +evaluation: + generals_bots_commit: "c2b77bf72812ec91fb2024d80d90112b961dfa7e" + arena_seed: 20260608 + games_per_matchup: 1 + async_start_method: spawn + max_eval_seconds: 240 + truncation: 180 + pool_size: 2 + speed_weight: 0.25 + grid_sizes: + - 10 + baselines: + - random_low_split + - expander + - strongest_frontier + - hunter + - fast_pathing + - flobot_fast +submission: + kind: file + path: /app/solution.patch 
+ allow_empty: true diff --git a/2.0/problems/generals_io_bot/docker/README.md b/2.0/problems/generals_io_bot/docker/README.md new file mode 100644 index 00000000..460991b3 --- /dev/null +++ b/2.0/problems/generals_io_bot/docker/README.md @@ -0,0 +1,14 @@ +# Generals.io Bot Images + +Build the task-specific Harbor images before running a local Harbor trial: + +```bash +bash 2.0/problems/generals_io_bot/docker/build_images.sh +``` + +The images install `strakam/generals-bots` from pinned commit +`c2b77bf72812ec91fb2024d80d90112b961dfa7e` plus explicit CPU `jax/jaxlib` +dependencies used by the simulator. + +- Agent image: exposes `/app/generals_agent` as a git checkout for the agent. +- Judge image: keeps a clean copy at `/opt/generals-agent-clean` for patch application. diff --git a/2.0/problems/generals_io_bot/docker/agent/Dockerfile b/2.0/problems/generals_io_bot/docker/agent/Dockerfile new file mode 100644 index 00000000..1054a7c8 --- /dev/null +++ b/2.0/problems/generals_io_bot/docker/agent/Dockerfile @@ -0,0 +1,27 @@ +# syntax=docker/dockerfile:1.7 +FROM python:3.11-slim + +ARG GENERALS_BOTS_COMMIT=c2b77bf72812ec91fb2024d80d90112b961dfa7e + +ENV XLA_FLAGS="--xla_cpu_multi_thread_eigen=false intra_op_parallelism_threads=1" \ + OMP_NUM_THREADS=1 \ + OPENBLAS_NUM_THREADS=1 \ + MKL_NUM_THREADS=1 \ + NUMEXPR_NUM_THREADS=1 + +RUN apt-get update && \ + apt-get install -y --no-install-recommends bash ca-certificates git && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --no-cache-dir \ + "git+https://github.com/strakam/generals-bots.git@${GENERALS_BOTS_COMMIT}" \ + "jax[cpu]>=0.4.30" \ + "jaxlib>=0.4.30" + +WORKDIR /app +COPY harbor/app/generals_agent /app/generals_agent +RUN git -C /app/generals_agent init -q && \ + git -C /app/generals_agent config user.email frontier-cs@example.invalid && \ + git -C /app/generals_agent config user.name "Frontier-CS" && \ + git -C /app/generals_agent add . 
&& \ + git -C /app/generals_agent commit -q -m base diff --git a/2.0/problems/generals_io_bot/docker/build_images.sh b/2.0/problems/generals_io_bot/docker/build_images.sh new file mode 100755 index 00000000..8eb1ebae --- /dev/null +++ b/2.0/problems/generals_io_bot/docker/build_images.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../../.." && pwd)" +TASK_DIR="$ROOT/2.0/problems/generals_io_bot" +COMMIT="${GENERALS_BOTS_COMMIT:-c2b77bf72812ec91fb2024d80d90112b961dfa7e}" +SHORT="${COMMIT:0:7}" +AGENT_TAG="${AGENT_TAG:-frontiercs/generals-io-bot-agent:experimental-${SHORT}}" +JUDGE_TAG="${JUDGE_TAG:-frontiercs/generals-io-bot-judge:experimental-${SHORT}}" + +docker build \ + --build-arg GENERALS_BOTS_COMMIT="$COMMIT" \ + -f "$TASK_DIR/docker/agent/Dockerfile" \ + -t "$AGENT_TAG" \ + "$TASK_DIR" + +docker build \ + --build-arg GENERALS_BOTS_COMMIT="$COMMIT" \ + -f "$TASK_DIR/docker/judge/Dockerfile" \ + -t "$JUDGE_TAG" \ + "$TASK_DIR" diff --git a/2.0/problems/generals_io_bot/docker/judge/Dockerfile b/2.0/problems/generals_io_bot/docker/judge/Dockerfile new file mode 100644 index 00000000..547024dc --- /dev/null +++ b/2.0/problems/generals_io_bot/docker/judge/Dockerfile @@ -0,0 +1,28 @@ +# syntax=docker/dockerfile:1.7 +FROM python:3.11-slim + +ARG GENERALS_BOTS_COMMIT=c2b77bf72812ec91fb2024d80d90112b961dfa7e + +ENV XLA_FLAGS="--xla_cpu_multi_thread_eigen=false intra_op_parallelism_threads=1" \ + OMP_NUM_THREADS=1 \ + OPENBLAS_NUM_THREADS=1 \ + MKL_NUM_THREADS=1 \ + NUMEXPR_NUM_THREADS=1 + +RUN apt-get update && \ + apt-get install -y --no-install-recommends bash ca-certificates git && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --no-cache-dir \ + "git+https://github.com/strakam/generals-bots.git@${GENERALS_BOTS_COMMIT}" \ + "jax[cpu]>=0.4.30" \ + "jaxlib>=0.4.30" + +COPY harbor/app/generals_agent /opt/generals-agent-clean +RUN git -C /opt/generals-agent-clean init -q && \ + git -C 
/opt/generals-agent-clean config user.email frontier-cs@example.invalid && \ + git -C /opt/generals-agent-clean config user.name "Frontier-CS Judge" && \ + git -C /opt/generals-agent-clean add . && \ + git -C /opt/generals-agent-clean commit -q -m base + +WORKDIR /judge diff --git a/2.0/problems/generals_io_bot/evaluate.sh b/2.0/problems/generals_io_bot/evaluate.sh new file mode 100755 index 00000000..9ae52639 --- /dev/null +++ b/2.0/problems/generals_io_bot/evaluate.sh @@ -0,0 +1,3 @@ +#!/usr/bin/env bash +set -euo pipefail +python3 "$(dirname "$0")/evaluator.py" "$1" diff --git a/2.0/problems/generals_io_bot/evaluator.py b/2.0/problems/generals_io_bot/evaluator.py new file mode 100644 index 00000000..11135760 --- /dev/null +++ b/2.0/problems/generals_io_bot/evaluator.py @@ -0,0 +1,802 @@ +"""Evaluator for the Frontier-CS 2.0 Generals.io bot arena task.""" + +from __future__ import annotations + +import importlib.util +import ast +import json +import os +import re +import shutil +import signal +import subprocess +import sys +import tempfile +from functools import partial +from pathlib import Path +from typing import Any + +import jax +import jax.numpy as jnp +import jax.random as jrandom +from generals import GeneralsEnv, get_observation +from generals.agents import Agent, ExpanderAgent, HunterAgent, RandomAgent +from generals.core.action import compute_valid_move_mask_obs + +_DIRECTIONS = jnp.array([[-1, 0], [1, 0], [0, -1], [0, 1]], dtype=jnp.int32) + +MAX_PATCH_BYTES = 500_000 +MAX_CHANGED_FILES = 20 +TASK_CONFIG_PATH = Path("/judge/task_config.json") +DEFAULT_CLEAN_SOURCE = Path("/opt/generals-agent-clean") +LOCAL_CLEAN_SOURCE = Path(__file__).parent / "harbor" / "app" / "generals_agent" + +ALLOWED_FILES = { + "bot.py", + "strategy.py", + "utils.py", +} +DENIED_TOKENS = ( + "import os", + "from os", + "import sys", + "from sys", + "subprocess", + "import socket", + "from socket", + "socket", + "import requests", + "from requests", + "requests", + "import 
urllib", + "from urllib", + "urllib", + "urllib3", + "httpx", + "aiohttp", + "websocket", + "socketio", + "generals.remote", + "http.client", + "ftplib", + "open(", + "io.", + "import pathlib", + "from pathlib", + "pathlib", + "Path(", + "read_text", + "read_bytes", + "write_text", + "write_bytes", + "os.environ", + "os.getenv", + "__import__", + "importlib", + "eval(", + "exec(", + "compile(", +) +DENIED_IMPORT_ROOTS = { + "builtins", + "ftplib", + "http", + "httpx", + "importlib", + "io", + "os", + "pathlib", + "requests", + "socket", + "subprocess", + "sys", + "urllib", + "urllib3", + "websocket", +} +DENIED_IMPORT_PREFIXES = { + "generals.remote", +} +DENIED_CALL_NAMES = { + "__import__", + "breakpoint", + "compile", + "delattr", + "dir", + "eval", + "exec", + "getattr", + "globals", + "help", + "input", + "locals", + "open", + "setattr", + "vars", +} +DENIED_ATTR_NAMES = { + "environ", + "getenv", + "popen", + "read_bytes", + "read_text", + "remove", + "rename", + "replace", + "rmdir", + "system", + "unlink", + "write_bytes", + "write_text", +} +DENIED_DUNDER_ATTRS = { + "__bases__", + "__class__", + "__code__", + "__dict__", + "__getattribute__", + "__globals__", + "__mro__", + "__subclasses__", +} + +DEFAULT_BASELINES = ( + "random_low_split", + "expander", + "strongest_frontier", + "hunter", + "fast_pathing", + "flobot_fast", +) + + +class _EvaluationTimeout(Exception): + pass + + +def _load_task_config() -> dict[str, Any]: + try: + payload = json.loads(TASK_CONFIG_PATH.read_text(encoding="utf-8")) + except Exception: + return {} + return payload if isinstance(payload, dict) else {} + + +TASK_CONFIG = _load_task_config() +EVALUATION_CONFIG = ( + TASK_CONFIG.get("evaluation", {}) + if isinstance(TASK_CONFIG.get("evaluation"), dict) + else {} +) + + +def _config_int(name: str, default: int) -> int: + try: + return int(EVALUATION_CONFIG.get(name, default)) + except Exception: + return default + + +def _config_float(name: str, default: float) -> float: + try: + 
return float(EVALUATION_CONFIG.get(name, default)) + except Exception: + return default + + +def _config_tuple_int(name: str, default: tuple[int, ...]) -> tuple[int, ...]: + raw = EVALUATION_CONFIG.get(name, default) + if isinstance(raw, list): + return tuple(int(x) for x in raw) + if isinstance(raw, tuple): + return tuple(int(x) for x in raw) + return default + + +def _config_tuple_str(name: str, default: tuple[str, ...]) -> tuple[str, ...]: + raw = EVALUATION_CONFIG.get(name, default) + if isinstance(raw, list): + return tuple(str(x) for x in raw) + if isinstance(raw, tuple): + return tuple(str(x) for x in raw) + return default + + +def _clean_source_dir() -> Path: + if DEFAULT_CLEAN_SOURCE.exists(): + return DEFAULT_CLEAN_SOURCE + return LOCAL_CLEAN_SOURCE + + +def _run(cmd: list[str], *, cwd: Path) -> subprocess.CompletedProcess[str]: + return subprocess.run( + cmd, + cwd=cwd, + text=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + timeout=60, + check=False, + ) + + +def _changed_files(patch_path: Path) -> list[str]: + """Return the distinct "+++ b/<path>" targets of a patch, in first-seen order.""" + # Pure text scan of the patch; "+++ /dev/null" (deletion) headers do not match the prefix. + text = patch_path.read_text(encoding="utf-8", errors="replace") + paths: list[str] = [] + for line in text.splitlines(): + if line.startswith("+++ b/"): + path = line.removeprefix("+++ b/").strip() + if path != "/dev/null" and path not in paths: + paths.append(path) + return paths + + +def _validate_patch(patch_path: Path) -> tuple[bool, str, list[str]]: + try: + data = patch_path.read_bytes() + except Exception as exc: + return False, f"could not read patch: {exc}", [] + if not data.strip(): + return True, "baseline skeleton", [] + if len(data) > MAX_PATCH_BYTES: + return False, f"patch too large: {len(data)} bytes > {MAX_PATCH_BYTES}", [] + if b"\x00" in data: + return False, "binary patches are not allowed", [] + + text = data.decode("utf-8", errors="replace") + if re.search(r"^GIT binary patch$", text, 
flags=re.MULTILINE): + return False, "binary patches are not allowed", [] + + paths = _changed_files(patch_path) + if not paths: + return False, "patch does not modify any tracked file", [] + if len(paths) > MAX_CHANGED_FILES: + return False, f"too many changed files: {len(paths)} > {MAX_CHANGED_FILES}", paths + for path in paths: + normalized = Path(path) + if normalized.is_absolute() or ".." in normalized.parts: + return False, f"unsafe patch path: {path}", paths + if path not in ALLOWED_FILES: + return False, f"patch may only modify {sorted(ALLOWED_FILES)}; got {path}", paths + + lowered = text.lower() + for token in DENIED_TOKENS: + if token.lower() in lowered: + return False, f"patch contains denied token: {token}", paths + return True, "ok", paths + + +def _validate_candidate_source(work: Path) -> tuple[bool, str]: + for filename in sorted(ALLOWED_FILES): + path = work / filename + if not path.exists(): + continue + try: + tree = ast.parse(path.read_text(encoding="utf-8"), filename=str(path)) + except SyntaxError: + return False, f"{filename} has invalid Python syntax" + for node in ast.walk(tree): + if isinstance(node, ast.Import): + for alias in node.names: + root = alias.name.split(".", 1)[0] + if root in DENIED_IMPORT_ROOTS or any( + alias.name == prefix or alias.name.startswith(prefix + ".") + for prefix in DENIED_IMPORT_PREFIXES + ): + return False, f"{filename} imports denied module: {alias.name}" + elif isinstance(node, ast.ImportFrom): + module = node.module or "" + root = module.split(".", 1)[0] + if root in DENIED_IMPORT_ROOTS or any( + module == prefix or module.startswith(prefix + ".") + for prefix in DENIED_IMPORT_PREFIXES + ): + return False, f"{filename} imports denied module: {module}" + elif isinstance(node, ast.Name): + if node.id == "__builtins__" or (node.id.startswith("__") and node.id.endswith("__")): + return False, f"{filename} uses denied dynamic name: {node.id}" + elif isinstance(node, ast.Attribute): + if node.attr in 
DENIED_ATTR_NAMES or node.attr in DENIED_DUNDER_ATTRS: + return False, f"{filename} uses denied attribute: {node.attr}" + elif isinstance(node, ast.Call): + func = node.func + if isinstance(func, ast.Name) and func.id in DENIED_CALL_NAMES: + return False, f"{filename} calls denied function: {func.id}" + if isinstance(func, ast.Attribute) and func.attr in DENIED_ATTR_NAMES: + return False, f"{filename} calls denied method: {func.attr}" + return True, "ok" + + +def _prepare_candidate(patch_path: Path) -> tuple[Path | None, str]: + ok, message, _ = _validate_patch(patch_path) + if not ok: + return None, message + + clean_source = _clean_source_dir() + if not clean_source.exists(): + return None, f"clean source not found: {clean_source}" + + tmp = Path(tempfile.mkdtemp(prefix="frontier-generals-")) + work = tmp / "generals_agent" + shutil.copytree(clean_source, work, ignore=shutil.ignore_patterns(".git", "__pycache__")) + _run(["git", "init", "-q"], cwd=work) + _run(["git", "config", "user.email", "frontier-cs@example.invalid"], cwd=work) + _run(["git", "config", "user.name", "Frontier-CS Judge"], cwd=work) + _run(["git", "add", "."], cwd=work) + _run(["git", "commit", "-q", "-m", "base"], cwd=work) + + if patch_path.read_text(encoding="utf-8", errors="replace").strip(): + check = _run(["git", "apply", "--check", str(patch_path)], cwd=work) + if check.returncode != 0: + return None, "patch failed to apply" + apply = _run(["git", "apply", str(patch_path)], cwd=work) + if apply.returncode != 0: + return None, "patch failed to apply" + ok, message = _validate_candidate_source(work) + if not ok: + return None, message + return work, "ok" + + +def _load_candidate_factory(work: Path): + sys.path.insert(0, str(work)) + try: + spec = importlib.util.spec_from_file_location("frontier_candidate_bot", work / "bot.py") + if spec is None or spec.loader is None: + raise RuntimeError("could not load bot.py") + module = importlib.util.module_from_spec(spec) + 
spec.loader.exec_module(module) + cls = getattr(module, "FrontierAgent") + return cls + finally: + try: + sys.path.remove(str(work)) + except ValueError: + pass + + +def _shift_grid(values, fill_value, step: int, axis: int): + shifted = jnp.roll(values, step, axis) + edge = 0 if step == 1 else -1 + if axis == 0: + return shifted.at[edge, :].set(fill_value) + return shifted.at[:, edge].set(fill_value) + + +def _bfs_distance(passable, sources): + h, w = passable.shape + inf = jnp.int32(h * w + 7) + + def relax(_, dist): + neighbors = jnp.minimum( + jnp.minimum(_shift_grid(dist, inf, 1, 0), _shift_grid(dist, inf, -1, 0)), + jnp.minimum(_shift_grid(dist, inf, 1, 1), _shift_grid(dist, inf, -1, 1)), + ) + return jnp.where( + sources, + 0, + jnp.where(passable, jnp.minimum(dist, neighbors + 1), inf), + ) + + return jax.lax.fori_loop(0, h * w, relax, jnp.where(sources, 0, inf)) + + +def _best_direction_toward(field, passable): + inf = jnp.int32(field.size + 13) + values = jnp.stack( + [ + jnp.where(_shift_grid(passable, False, 1, 0), _shift_grid(field, inf, 1, 0), inf), + jnp.where(_shift_grid(passable, False, -1, 0), _shift_grid(field, inf, -1, 0), inf), + jnp.where(_shift_grid(passable, False, 1, 1), _shift_grid(field, inf, 1, 1), inf), + jnp.where(_shift_grid(passable, False, -1, 1), _shift_grid(field, inf, -1, 1), inf), + ], + axis=0, + ) + return jnp.argmin(values, axis=0).astype(jnp.int32), jnp.min(values, axis=0) + + +class _PathingBaselineAgent(Agent): + """Judge-only pathing baseline with tuned hunting and exploration pressure.""" + + def __init__( + self, + *, + id: str = "Pathing", + release_threshold: int = 6, + scout_far: bool = True, + block_neutral_cities: bool = True, + convoy_weight: float = 100.0, + ): + super().__init__(id=id) + self.release_threshold = int(release_threshold) + self.scout_far = bool(scout_far) + self.block_neutral_cities = bool(block_neutral_cities) + self.convoy_weight = float(convoy_weight) + + @partial(jax.jit, static_argnums=0) + def 
act(self, observation, key): + del key + army = observation.armies + mine = observation.owned_cells + h, w = army.shape + reach = jnp.int32(h * w + 7) + city_block = observation.cities & ~mine if self.block_neutral_cities else jnp.zeros_like(mine) + passable = ~(observation.mountains | observation.structures_in_fog | city_block) + movable = mine & (army > 1) + mine_army = jnp.where(mine, army, 0) + + own_general = mine & observation.generals + own_general_army = jnp.sum(jnp.where(own_general, army, 0)) + own_general_idx = jnp.argmax(own_general.reshape(-1).astype(jnp.int32)) + dist_from_general = _bfs_distance(passable, own_general) + + enemy_general = observation.opponent_cells & observation.generals + enemy_land = observation.opponent_cells & ~observation.cities + fog = observation.fog_cells & passable & (dist_from_general < reach) + open_land = passable & ~mine & (dist_from_general < reach) + + def farthest(mask): + return mask & (dist_from_general == jnp.max(jnp.where(mask, dist_from_general, -1))) + + def nearest(mask): + return mask & (dist_from_general == jnp.min(jnp.where(mask, dist_from_general, reach))) + + fog_goal = farthest(fog) if self.scout_far else nearest(fog) + open_goal = farthest(open_land) if self.scout_far else nearest(open_land) + goal = jnp.where( + jnp.any(enemy_general), + enemy_general, + jnp.where(jnp.any(enemy_land), enemy_land, jnp.where(jnp.any(fog), fog_goal, open_goal)), + ) + + dist_to_goal = _bfs_distance(passable, goal) + direction, neighbor_dist = _best_direction_toward(dist_to_goal, passable) + advances = neighbor_dist < dist_to_goal + flat_direction = direction.reshape(-1) + + enemy_general_army = jnp.sum(jnp.where(enemy_general, army, 0)) + killing_move = ( + jnp.any(enemy_general) + & movable + & (dist_to_goal == 1) + & advances + & (army - 1 > enemy_general_army) + ) + kill_idx = jnp.argmax(jnp.where(killing_move, mine_army, -1).reshape(-1)) + + feed_from_general = (own_general_army >= self.release_threshold) & 
advances.reshape(-1)[own_general_idx] + convoy_move = movable & ~own_general & advances + convoy_score = mine_army.astype(jnp.float32) * self.convoy_weight - dist_to_goal.astype(jnp.float32) + convoy_idx = jnp.argmax(jnp.where(convoy_move, convoy_score, -1.0).reshape(-1)) + + do_kill = jnp.any(killing_move) + do_feed = (~do_kill) & feed_from_general + do_convoy = (~do_kill) & (~do_feed) & jnp.any(convoy_move) + idx = jnp.where(do_kill, kill_idx, jnp.where(do_feed, own_general_idx, convoy_idx)) + should_pass = ~(do_kill | do_feed | do_convoy) + return jnp.array( + [should_pass, idx // w, idx % w, flat_direction[idx], do_feed], + dtype=jnp.int32, + ) + + +class _FlobotStyleAgent(Agent): + """Judge-only Flobot-inspired baseline: spread, infiltrate, then end-game push.""" + + def __init__(self, *, id: str = "FlobotStyle", early_threshold: int = 12): + super().__init__(id=id) + self.early_threshold = int(early_threshold) + + @partial(jax.jit, static_argnums=0) + def act(self, observation, key): + del key + army = observation.armies + mine = observation.owned_cells + h, w = army.shape + reach = jnp.int32(h * w + 7) + passable = ~(observation.mountains | observation.structures_in_fog) + movable = mine & (army > 1) + mine_army = jnp.where(mine, army, 0) + + own_general = mine & observation.generals + own_general_idx = jnp.argmax(own_general.reshape(-1).astype(jnp.int32)) + own_general_army = jnp.sum(jnp.where(own_general, army, 0)) + dist_from_general = _bfs_distance(passable, own_general) + + enemy_general = observation.opponent_cells & observation.generals + enemy_land = observation.opponent_cells & ~observation.cities + visible_enemy = enemy_general | enemy_land + neutral_city = observation.cities & ~mine & ~observation.opponent_cells + fog = observation.fog_cells & passable & (dist_from_general < reach) + open_land = passable & ~mine & (dist_from_general < reach) + + far_fog = fog & (dist_from_general == jnp.max(jnp.where(fog, dist_from_general, -1))) + border_target = 
visible_enemy | (fog & (dist_from_general == jnp.min(jnp.where(fog, dist_from_general, reach)))) + economy_target = neutral_city & (dist_from_general == jnp.min(jnp.where(neutral_city, dist_from_general, reach))) + spread_target = open_land & (dist_from_general == jnp.max(jnp.where(open_land, dist_from_general, -1))) + goal = jnp.where( + jnp.any(enemy_general), + enemy_general, + jnp.where( + jnp.any(enemy_land), + border_target, + jnp.where(jnp.any(neutral_city) & (own_general_army >= self.early_threshold), economy_target, jnp.where(jnp.any(fog), far_fog, spread_target)), + ), + ) + + dist_to_goal = _bfs_distance(passable, goal) + direction, neighbor_dist = _best_direction_toward(dist_to_goal, passable) + advances = neighbor_dist < dist_to_goal + flat_direction = direction.reshape(-1) + + enemy_general_army = jnp.sum(jnp.where(enemy_general, army, 0)) + can_end = ( + jnp.any(enemy_general) + & movable + & advances + & ((army - 1) > (enemy_general_army + dist_to_goal)) + ) + end_idx = jnp.argmax(jnp.where(can_end, mine_army - dist_to_goal, -1).reshape(-1)) + + early_launch = (own_general_army >= self.early_threshold) & advances.reshape(-1)[own_general_idx] + spread = movable & advances + border = mine & (dist_to_goal <= 2) + spread_score = mine_army.astype(jnp.float32) * jnp.where(border, 2.0, 1.0) - dist_to_goal.astype(jnp.float32) + spread_idx = jnp.argmax(jnp.where(spread, spread_score, -1.0).reshape(-1)) + + do_end = jnp.any(can_end) + do_early = (~do_end) & early_launch + do_spread = (~do_end) & (~do_early) & jnp.any(spread) + idx = jnp.where(do_end, end_idx, jnp.where(do_early, own_general_idx, spread_idx)) + should_pass = ~(do_end | do_early | do_spread) + return jnp.array( + [should_pass, idx // w, idx % w, flat_direction[idx], do_early], + dtype=jnp.int32, + ) + + +class _HiddenFrontierAgent(Agent): + def __init__(self, id: str = "Frontier"): + super().__init__(id=id) + + def act(self, observation, key): + del key + valid = 
compute_valid_move_mask_obs(observation) + h, w = observation.armies.shape + positions = jnp.argwhere(valid, size=h * w * 4, fill_value=-1) + num_valid = jnp.sum(jnp.all(positions >= 0, axis=-1)) + + def score_move(idx): + move = positions[idx] + ok = jnp.all(move >= 0) + r, c, d = move[0], move[1], move[2] + dr = jnp.array([-1, 1, 0, 0], dtype=jnp.int32)[d] + dc = jnp.array([0, 0, -1, 1], dtype=jnp.int32)[d] + nr = jnp.clip(r + dr, 0, h - 1) + nc = jnp.clip(c + dc, 0, w - 1) + dest_owned = observation.owned_cells[nr, nc] + dest_opponent = observation.opponent_cells[nr, nc] + dest_neutral = observation.neutral_cells[nr, nc] + dest_general = observation.generals[nr, nc] & dest_opponent + source_army = observation.armies[r, c] + dest_army = observation.armies[nr, nc] + can_capture = source_army > dest_army + 1 + value = source_army.astype(jnp.float32) + value += jnp.where(dest_neutral, 20.0, 0.0) + value += jnp.where(dest_opponent, 60.0, 0.0) + value += jnp.where(dest_general, 10000.0, 0.0) + value = jnp.where(dest_owned, value * 0.1, value) + value = jnp.where(can_capture & ok, value, -1.0) + return value + + scores = jax.vmap(score_move)(jnp.arange(h * w * 4)) + best = jnp.argmax(scores) + move = positions[best] + should_pass = (num_valid == 0) | (scores[best] < 0) + return jnp.array([should_pass, move[0], move[1], move[2], 0], dtype=jnp.int32) + + +def _make_hidden_baseline(name: str, *, player: int): + if name == "random_low_split": + return RandomAgent(id=f"RandomLowSplit-{player}", idle_prob=0.03, split_prob=0.10) + if name == "random_high_split": + return RandomAgent(id=f"RandomHighSplit-{player}", idle_prob=0.08, split_prob=0.45) + if name == "expander": + return ExpanderAgent(id=f"Expander-{player}") + if name in {"hunter", "hunter_pressure", "hunter_mirror"}: + return HunterAgent(id=f"Hunter-{player}") + if name == "fast_pathing": + return _PathingBaselineAgent(id=f"FastPathing-{player}", release_threshold=6, scout_far=True) + if name == 
"near_scout_pathing": + return _PathingBaselineAgent(id=f"NearScoutPathing-{player}", release_threshold=8, scout_far=False) + if name == "flobot_style": + return _FlobotStyleAgent(id=f"FlobotStyle-{player}", early_threshold=12) + if name == "flobot_fast": + return _FlobotStyleAgent(id=f"FlobotFast-{player}", early_threshold=8) + if name == "strongest_frontier": + return _HiddenFrontierAgent(id=f"Frontier-{player}") + raise ValueError(f"unknown baseline {name!r}") + + +def _play_hidden_game( + candidate_factory, + *, + baseline_name: str, + candidate_player: int, + seed: int, + grid_size: int, + truncation: int, + pool_size: int, +) -> tuple[bool, int]: + env = GeneralsEnv(grid_dims=(grid_size, grid_size), truncation=truncation, pool_size=pool_size) + key = jrandom.PRNGKey(seed) + pool, state = env.reset(key) + candidate = candidate_factory() + baseline = _make_hidden_baseline(baseline_name, player=1 - candidate_player) + + agents = [None, None] + agents[candidate_player] = candidate + agents[1 - candidate_player] = baseline + for agent in agents: + reset = getattr(agent, "reset", None) + if callable(reset): + reset() + + terminated = truncated = False + turns = 0 + while not (terminated or truncated): + obs_0 = get_observation(state, 0) + obs_1 = get_observation(state, 1) + key, k0, k1 = jrandom.split(key, 3) + actions = jnp.stack([agents[0].act(obs_0, k0), agents[1].act(obs_1, k1)]) + timestep, state = env.step(state, actions, pool) + terminated = bool(timestep.terminated) + truncated = bool(timestep.truncated) + turns += 1 + + return int(timestep.info.winner) == candidate_player, turns + + +def _evaluate_hidden_agent( + candidate_factory, + *, + games_per_matchup: int, + seed: int, + grid_sizes: tuple[int, ...], + truncation: int, + baselines: tuple[str, ...], + pool_size: int, + speed_weight: float, +) -> dict[str, Any]: + total = 0 + wins = 0 + speed_credit = 0.0 + by_baseline: dict[str, dict[str, float]] = {} + + for baseline in baselines: + b_total = 0 + 
b_wins = 0 + for grid_size in grid_sizes: + for game_idx in range(games_per_matchup): + for candidate_player in (0, 1): + baseline_offset = sum((idx + 1) * ord(ch) for idx, ch in enumerate(baseline)) + game_seed = seed + 100003 * game_idx + 1009 * grid_size + 17 * candidate_player + game_seed += 7919 * baseline_offset + candidate_won, turns = _play_hidden_game( + candidate_factory, + baseline_name=baseline, + candidate_player=candidate_player, + seed=game_seed, + grid_size=grid_size, + truncation=truncation, + pool_size=pool_size, + ) + total += 1 + b_total += 1 + if candidate_won: + wins += 1 + b_wins += 1 + speed_credit += max(0.0, (truncation - turns) / truncation) + by_baseline[baseline] = { + "games": float(b_total), + "wins": float(b_wins), + "win_rate": float(b_wins / b_total if b_total else 0.0), + } + + win_rate = wins / total if total else 0.0 + speed_tiebreak = speed_credit / total if total else 0.0 + mean_baseline_win_rate = ( + sum(item["win_rate"] for item in by_baseline.values()) / len(by_baseline) + if by_baseline + else 0.0 + ) + speed_weight = max(0.0, min(1.0, speed_weight)) + win_weight = 1.0 - speed_weight + score = 100.0 * (win_weight * mean_baseline_win_rate + speed_weight * speed_tiebreak) + return { + "score": score, + "win_rate": win_rate, + "mean_baseline_win_rate": mean_baseline_win_rate, + "speed_tiebreak": speed_tiebreak, + "win_weight": win_weight, + "speed_weight": speed_weight, + "games": float(total), + "wins": float(wins), + "by_baseline": by_baseline, + } + + +def evaluate(solution_path: str) -> tuple[float, float, str]: + patch_path = Path(solution_path).resolve() + work, prep_message = _prepare_candidate(patch_path) + if work is None: + message = json.dumps({"status": "invalid", "reason": prep_message}, sort_keys=True) + return 0.0, 0.0, message + + try: + candidate_factory = _load_candidate_factory(work) + timeout_seconds = max(1, _config_int("max_eval_seconds", 240)) + + def _raise_timeout(_signum, _frame): + raise 
_EvaluationTimeout() + + previous_handler = signal.getsignal(signal.SIGALRM) + previous_timer = signal.setitimer(signal.ITIMER_REAL, timeout_seconds) + try: + signal.signal(signal.SIGALRM, _raise_timeout) + metrics = _evaluate_hidden_agent( + candidate_factory, + games_per_matchup=max(1, _config_int("games_per_matchup", 1)), + seed=_config_int("arena_seed", 20260608), + grid_sizes=_config_tuple_int("grid_sizes", (10,)), + truncation=_config_int("truncation", 180), + baselines=_config_tuple_str("baselines", DEFAULT_BASELINES), + pool_size=max(2, _config_int("pool_size", 2)), + speed_weight=_config_float("speed_weight", 0.25), + ) + finally: + signal.setitimer(signal.ITIMER_REAL, *previous_timer) + signal.signal(signal.SIGALRM, previous_handler) + except _EvaluationTimeout: + message = json.dumps( + { + "status": "timeout", + "reason": "evaluation exceeded task time budget", + }, + sort_keys=True, + ) + return 0.0, 0.0, message + except Exception as exc: + message = json.dumps( + {"status": "error", "reason": type(exc).__name__}, + sort_keys=True, + ) + return 0.0, 0.0, message + + score = max(0.0, min(100.0, float(metrics["score"]))) + public_metrics = { + "status": "scored", + "score": score, + "win_rate": round(float(metrics["win_rate"]), 4), + "mean_baseline_win_rate": round(float(metrics["mean_baseline_win_rate"]), 4), + "speed_tiebreak": round(float(metrics["speed_tiebreak"]), 4), + "speed_weight": round(float(metrics["speed_weight"]), 4), + "games": int(metrics["games"]), + "wins": int(metrics["wins"]), + } + return score, score, json.dumps(public_metrics, sort_keys=True) + + +def main() -> int: + if len(sys.argv) != 2: + print("usage: evaluator.py /path/to/solution.patch", file=sys.stderr) + return 2 + score, score_unbounded, message = evaluate(sys.argv[1]) + print(json.dumps({"score": score, "score_unbounded": score_unbounded, "message": message})) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git 
a/2.0/problems/generals_io_bot/harbor/app/LICENSE.generals-bots b/2.0/problems/generals_io_bot/harbor/app/LICENSE.generals-bots new file mode 100644 index 00000000..27f2f9fc --- /dev/null +++ b/2.0/problems/generals_io_bot/harbor/app/LICENSE.generals-bots @@ -0,0 +1,23 @@ +MIT License + +Copyright (c) 2024, Matej Straka + +Author: Matej Straka + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/2.0/problems/generals_io_bot/harbor/app/README.md b/2.0/problems/generals_io_bot/harbor/app/README.md new file mode 100644 index 00000000..d925183c --- /dev/null +++ b/2.0/problems/generals_io_bot/harbor/app/README.md @@ -0,0 +1,22 @@ +# Generals.io Bot Arena + +Work in `/app/generals_agent`, then run: + +```bash +bash /app/make_submission.sh +bash /app/submit.sh +``` + +Submit the baseline skeleton once before running long local experiments, then +submit every meaningful improvement. 
Local simulations are useful only for tiny +sanity checks; the black-box judge is the scoring feedback for this task, and +submissions run asynchronously while you keep improving. + +Submission is asynchronous. Use: + +```bash +bash /app/submissions.sh +bash /app/wait_submission.sh +``` + +to inspect judge results while continuing to improve the bot. diff --git a/2.0/problems/generals_io_bot/harbor/app/generals_agent/README.md b/2.0/problems/generals_io_bot/harbor/app/generals_agent/README.md new file mode 100644 index 00000000..47854f1f --- /dev/null +++ b/2.0/problems/generals_io_bot/harbor/app/generals_agent/README.md @@ -0,0 +1,46 @@ +# Generals Agent Skeleton + +Edit `bot.py` and implement `FrontierAgent`. + +The judge-side arena is black-box. This workspace contains the bot skeleton and +the public `generals-bots` package, but no Frontier-CS evaluation harness, +baseline ensemble, seeds, or match runner. + +Useful imports: + +```python +import jax.numpy as jnp +from generals.core.action import create_action, compute_valid_move_mask_obs +from generals.agents import Agent +``` + +Action format: + +```text +[pass, row, col, direction, split] +``` + +Directions: + +```text +0 up, 1 down, 2 left, 3 right +``` + +Create a patch submission: + +```bash +bash /app/make_submission.sh +bash /app/submit.sh +``` + +Submit the baseline skeleton once before running long local experiments, then +submit every meaningful improvement. Local simulations are useful only for tiny +sanity checks; the black-box judge is the scoring feedback for this task, and +submissions run asynchronously while you keep improving. + +The judge accepts patches touching only `bot.py`, `strategy.py`, and `utils.py`. +Do not read files, launch subprocesses, open network sockets, or inspect +environment variables; the evaluator rejects these patterns. + +The simulator is JAX-based. Compact array logic usually runs faster than large +Python-heavy policies. 
diff --git a/2.0/problems/generals_io_bot/harbor/app/generals_agent/bot.py b/2.0/problems/generals_io_bot/harbor/app/generals_agent/bot.py new file mode 100644 index 00000000..d611e9b8 --- /dev/null +++ b/2.0/problems/generals_io_bot/harbor/app/generals_agent/bot.py @@ -0,0 +1,12 @@ +from generals.agents import ExpanderAgent + + +class FrontierAgent(ExpanderAgent): + """Baseline expanding bot. + + Improve this class or replace it with your own implementation. The judge + instantiates FrontierAgent() and calls act(observation, key) every turn. + """ + + def __init__(self, id: str = "FrontierAgent"): + super().__init__(id=id) diff --git a/2.0/problems/generals_io_bot/harbor/app/make_submission.sh b/2.0/problems/generals_io_bot/harbor/app/make_submission.sh new file mode 100755 index 00000000..b503257a --- /dev/null +++ b/2.0/problems/generals_io_bot/harbor/app/make_submission.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash +# Write the agent's edits to bot.py, strategy.py, and utils.py as a single patch. +# Usage: make_submission.sh [OUT] (OUT defaults to /app/solution.patch). +set -euo pipefail + +REPO="${GENERALS_AGENT_DIR:-/app/generals_agent}" +OUT="${1:-/app/solution.patch}" +FILES=(bot.py strategy.py utils.py) + +if [[ ! -d "$REPO/.git" ]]; then + echo "Generals agent checkout not found at $REPO" >&2 + exit 2 +fi + +# Plain `git diff` skips untracked files, so a newly created strategy.py or +# utils.py would be silently left out of the patch; mark them intent-to-add. +for path in "${FILES[@]}"; do + if [[ -e "$REPO/$path" ]] && ! git -C "$REPO" ls-files --error-unmatch -- "$path" >/dev/null 2>&1; then + git -C "$REPO" add --intent-to-add -- "$path" + fi +done + +# Diff against HEAD (not the index) so edits already staged with `git add` are included. +git -C "$REPO" diff --binary HEAD -- "${FILES[@]}" > "$OUT" +bytes=$(wc -c < "$OUT" | tr -d ' ') +echo "Wrote $OUT ($bytes bytes)" diff --git a/2.0/problems/generals_io_bot/harbor/app/solution.patch b/2.0/problems/generals_io_bot/harbor/app/solution.patch new file mode 100644 index 00000000..d406071d --- /dev/null +++ b/2.0/problems/generals_io_bot/harbor/app/solution.patch @@ -0,0 +1,12 @@ +diff --git a/bot.py b/bot.py +index d611e9b..cd85a3a 100644 +--- a/bot.py ++++ b/bot.py +@@ -2,6 +2,7 @@ from generals.agents import ExpanderAgent + + + class FrontierAgent(ExpanderAgent): ++ # Reference patch keeps the public baseline behavior. + """Baseline expanding bot. + + Improve this class or replace it with your own implementation.
The judge diff --git a/2.0/problems/generals_io_bot/readme b/2.0/problems/generals_io_bot/readme new file mode 100644 index 00000000..91ae0b66 --- /dev/null +++ b/2.0/problems/generals_io_bot/readme @@ -0,0 +1,113 @@ +# Generals.io Bot Arena + +## Problem + +Implement a bot for a local Generals.io-style arena. Your bot plays repeated +two-player games against fixed baseline bots in the `generals-bots` simulator. + +Each game is played on a square grid with fog of war. A player wins by capturing +the opponent's general. If no general is captured before the truncation limit, +the game is scored as a draw for win-rate purposes. + +The environment is the local `generals-bots` simulator, not the online +generals.io service. Each turn your bot receives an observation containing: + +```text +armies, generals, cities, mountains, neutral_cells, owned_cells, +opponent_cells, fog_cells, structures_in_fog, owned/opponent land and army +counts, and timestep +``` + +Fog hides cells outside the visibility radius around your territory. Mountains +are impassable. Cities and generals produce armies over time. A valid move sends +army from one owned cell to an adjacent passable cell; moving into enemy or +neutral territory captures it only when the moving army is larger than the +defending army. + +## Submission + +Submit a patch against the public `generals_agent` skeleton. In Harbor, edit the +repository under: + +```text +/app/generals_agent +``` + +Then run: + +```bash +bash /app/make_submission.sh +bash /app/submit.sh +``` + +Start by submitting the baseline skeleton once before running long local +experiments. This establishes black-box feedback early; later submissions can +replace it as you improve the bot. + +The patch must produce a Python module with: + +```python +class FrontierAgent: + def act(self, observation, key): + ... 
+``` + +`act` must return a `generals-bots` action array: + +```text +[pass, row, col, direction, split] +``` + +where `direction` is `0=up`, `1=down`, `2=left`, `3=right`, and `split` +selects whether to move half the army instead of all-but-one. + +Patches may modify only these files: + +```text +bot.py +strategy.py +utils.py +``` + +The judge rejects binary patches, oversized patches, path traversal, and common +file/network/process access tokens. This is a bot-policy benchmark, not an +environment inspection task. + +The agent workspace intentionally does not include a Frontier-CS match runner, +baseline ensemble, hidden seeds, or evaluator implementation. Use the black-box +submission interface for scoring feedback. + +## Scoring + +Every submission is evaluated against the same baseline families used by final +verification. These include random, expansion, hunting/pathing, and +strategy-inspired rule-based opponents, so exploiting only one weak bot is not +enough for a high score. Faster wins also matter: the score gives substantial +credit for capturing the enemy general in fewer turns. + +The default Harbor configuration is intentionally lightweight so agents can +iterate quickly: it uses one game per matchup and an internal evaluator time +budget. Increase `games_per_matchup`, `grid_sizes`, `truncation`, `pool_size`, +and `max_eval_seconds` together in `config.yaml` for a heavier run. Adjust +`speed_weight` if you want fast wins to matter more or less relative to raw win +rate. + +Practical tip: the simulator is JAX-based. Simple array programs compile and +run much faster than large Python control-flow policies, so keep `act` compact +and vectorized when possible. + +The reported score is scaled to `[0, 100]`: + +```text +score = 100 * ((1 - speed_weight) * mean_baseline_win_rate + speed_weight * mean_baseline_speed_tiebreak) +``` + +The default `speed_weight` is `0.25`. 
The speed credit is only earned on games +that your bot wins and is larger for earlier captures. + +## Notes + +- The online generals.io service is not used. +- The hidden evaluator and hidden seeds are not visible in the agent workspace. +- The task uses `strakam/generals-bots` at pinned commit + `c2b77bf72812ec91fb2024d80d90112b961dfa7e` under the MIT license. diff --git a/2.0/problems/generals_io_bot/reference.patch b/2.0/problems/generals_io_bot/reference.patch new file mode 100644 index 00000000..d406071d --- /dev/null +++ b/2.0/problems/generals_io_bot/reference.patch @@ -0,0 +1,12 @@ +diff --git a/bot.py b/bot.py +index d611e9b..cd85a3a 100644 +--- a/bot.py ++++ b/bot.py +@@ -2,6 +2,7 @@ from generals.agents import ExpanderAgent + + + class FrontierAgent(ExpanderAgent): ++ # Reference patch keeps the public baseline behavior. + """Baseline expanding bot. + + Improve this class or replace it with your own implementation. The judge diff --git a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/Dockerfile b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/Dockerfile index 8f3f6233..2cc631d4 100644 --- a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/Dockerfile +++ b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/Dockerfile @@ -30,4 +30,16 @@ COPY readme config.yaml task_config.json submission_config.json AGENT.md \ wait_submission.py wait_submission.sh cancel_submission.py cancel_submission.sh /app/ COPY harbor_app/ /app/ {visible_input_copies} +RUN if command -v git >/dev/null 2>&1; then \ + find /app -mindepth 2 -maxdepth 4 -type d -name .git -print | \ + while read -r gitdir; do \ + repo="$(dirname "$gitdir")"; \ + git -C "$repo" config user.email frontier-cs@example.invalid; \ + git -C "$repo" config user.name "Frontier-CS"; \ + git -C "$repo" add -A; \ + if ! 
git -C "$repo" diff --cached --quiet; then \ + git -C "$repo" commit -q --amend --no-edit || git -C "$repo" commit -q -m base; \ + fi; \ + done; \ + fi RUN chmod +x /app/submit.sh /app/submissions.sh /app/wait_submission.sh /app/cancel_submission.sh diff --git a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/judge_server.py b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/judge_server.py index 83cb02a2..a26c9506 100644 --- a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/judge_server.py +++ b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/judge_server.py @@ -72,6 +72,28 @@ def configured_max_queue_size() -> int: return DEFAULT_MAX_QUEUE_SIZE +def configured_allow_empty_submission() -> bool: + config = load_task_config() + submission = config.get("submission", {}) + return bool(isinstance(submission, dict) and submission.get("allow_empty")) + + +def configured_async_start_method() -> str: + config = load_task_config() + evaluation = config.get("evaluation", {}) + configured = None + if isinstance(evaluation, dict): + configured = evaluation.get("async_start_method") + method = str( + os.environ.get("FRONTIER_ASYNC_EVAL_START_METHOD") + or configured + or "fork" + ) + if method not in multiprocessing.get_all_start_methods(): + return "fork" + return method + + MAX_QUEUE_SIZE = configured_max_queue_size() @@ -341,7 +363,7 @@ def validate_payload(payload: dict[str, Any], *, allow_final: bool, role_token: raise ValueError("directory submission must include archive_b64") else: code = payload.get("code") - if not isinstance(code, str) or not code.strip(): + if not isinstance(code, str) or (not configured_allow_empty_submission() and not code.strip()): raise ValueError("file submission must include non-empty string field 'code'") submission_kind = "file" return submission_uuid, submission_role, submission_kind @@ -359,7 +381,34 @@ def run_payload(payload: dict[str, Any], *, 
submission_role: str) -> dict[str, A if not acquired: raise TimeoutError("timed out waiting for evaluator lock") try: - return evaluate_payload_direct(payload, submission_role=submission_role) + ctx = multiprocessing.get_context(configured_async_start_method()) + with tempfile.TemporaryDirectory(prefix="frontier_cs_2_0_final_result_") as tmp: + result_path = Path(tmp) / "result.json" + process = ctx.Process( + target=_async_evaluate_child, + args=(payload, submission_role, str(result_path)), + ) + process.start() + deadline = time.time() + EVALUATION_LOCK_TIMEOUT_SECONDS + while process.is_alive(): + process.join(timeout=0.2) + if time.time() >= deadline: + terminate_process_group(process) + raise TimeoutError("final evaluation timed out") + + if not result_path.exists(): + raise RuntimeError( + f"evaluation process exited without a result (exitcode={process.exitcode})" + ) + output = json.loads(result_path.read_text(encoding="utf-8")) + if not isinstance(output, dict): + raise RuntimeError("evaluation process returned invalid output") + if not output.get("ok"): + raise RuntimeError(str(output.get("detail") or "evaluation failed")) + result = output.get("result") + if not isinstance(result, dict): + raise RuntimeError("evaluation process returned invalid result") + return result finally: EVALUATION_LOCK.release() @@ -369,12 +418,15 @@ def _async_evaluate_child( submission_role: str, result_path: str, ) -> None: + global EVALUATOR if os.name == "posix": try: os.setsid() except OSError: pass try: + if EVALUATOR is None: + EVALUATOR = load_problem_evaluator() result = evaluate_payload_direct(payload, submission_role=submission_role) output = {"ok": True, "result": result} except BaseException: @@ -390,7 +442,7 @@ def terminate_process_group(process: multiprocessing.Process) -> None: try: os.killpg(process.pid, signal.SIGTERM) except ProcessLookupError: - pass + process.terminate() except OSError: process.terminate() else: @@ -404,7 +456,7 @@ def 
terminate_process_group(process: multiprocessing.Process) -> None: try: os.killpg(process.pid, signal.SIGKILL) except ProcessLookupError: - pass + process.kill() except OSError: process.kill() else: @@ -431,7 +483,7 @@ def run_async_payload( if submission_is_cancelling(submission_uuid): raise SubmissionCancelled() - ctx = multiprocessing.get_context("fork") + ctx = multiprocessing.get_context(configured_async_start_method()) with tempfile.TemporaryDirectory(prefix="frontier_cs_2_0_async_result_") as tmp: result_path = Path(tmp) / "result.json" process = ctx.Process( diff --git a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/submit.py b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/submit.py index 3fca5b46..311f818d 100644 --- a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/submit.py +++ b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/environment/submit.py @@ -148,6 +148,7 @@ def main() -> int: default_path = str(config.get("path") or SOLUTION_PATH) solution_path = Path(sys.argv[1] if len(sys.argv) > 1 else default_path) exclude = list(config.get("exclude", []) or []) + allow_empty = bool(config.get("allow_empty", False)) sub_uuid = str(uuid.uuid4()) code_chars = 0 file_count = 0 @@ -208,7 +209,7 @@ def main() -> int: else: code = solution_path.read_text(encoding="utf-8") code_chars = len(code) - if not code.strip(): + if not allow_empty and not code.strip(): msg = f"Solution file {solution_path} is empty" print(f"[submit] ERROR: {msg}", file=sys.stderr) log_record( diff --git a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/tests/evaluate.py b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/tests/evaluate.py index 4e91d8db..eef882fe 100644 --- a/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/tests/evaluate.py +++ b/adapters/frontier-cs-2.0/src/frontier_cs_2_0/task-template/tests/evaluate.py @@ -34,50 +34,11 @@ FINAL_ROLE_TOKEN = 
"{verifier_token}" -def submission_reward(record: dict) -> float | None: - try: - return float(record.get("score", 0.0)) / 100.0 - except (TypeError, ValueError): - return None - - def result_score_key(record: dict) -> tuple[float, float]: score = float(record.get("score", 0.0)) return (score, float(record.get("score_unbounded", score))) -def best_submission() -> dict | None: - submissions_log = ( - VERIFIER_SUBMISSIONS_LOG - if VERIFIER_SUBMISSIONS_LOG.exists() - else JUDGE_SUBMISSIONS_LOG - ) - if not submissions_log.exists(): - return None - - best: dict | None = None - for line in submissions_log.read_text(encoding="utf-8").splitlines(): - if not line.strip(): - continue - try: - record = json.loads(line) - reward = submission_reward(record) - if reward is None: - continue - except json.JSONDecodeError: - continue - if record.get("submission_role", "agent") != "agent": - continue - if record.get("status") != "done": - continue - metrics = record.get("metrics", {}) - if isinstance(metrics, dict) and metrics.get("evaluation_scope") == "quick_feedback": - continue - if best is None or result_score_key(record) > result_score_key(best): - best = record - return best - - def write_reward(reward: float, detail: str = "", extra: dict | None = None) -> None: REWARD_TXT.parent.mkdir(parents=True, exist_ok=True) reward = max(0.0, min(1.0, float(reward))) @@ -391,7 +352,8 @@ def try_write_best_final_result(reason: str, final_key: tuple[float, float] | No return write_reward(0.0, f"{solution_path} not found") return - if solution_path.is_file() and not solution_path.read_text(encoding="utf-8").strip(): + allow_empty = bool(config.get("allow_empty", False)) + if solution_path.is_file() and not allow_empty and not solution_path.read_text(encoding="utf-8").strip(): print(f"ERROR: {solution_path} is empty") if try_write_best_final_result(f"{solution_path} is empty"): return