|
| 1 | +#!/usr/bin/env python3 |
| 2 | + |
| 3 | +import argparse |
| 4 | +import csv |
| 5 | +import datetime as dt |
| 6 | +import os |
| 7 | +import re |
| 8 | +import subprocess |
| 9 | +import sys |
| 10 | +import tempfile |
| 11 | +from typing import Dict, List |
| 12 | + |
| 13 | + |
# Compiled regexes keyed by metric name. Each one matches a single line of
# `/usr/bin/time -v` output (leading/trailing whitespace tolerated) and
# captures the reported value in group 1.
TIME_PATTERNS = {
    metric: re.compile(expression)
    for metric, expression in (
        ("elapsed_raw", r"^\s*Elapsed \(wall clock\) time \(h:mm:ss or m:ss\):\s*(.+?)\s*$"),
        ("user_seconds", r"^\s*User time \(seconds\):\s*([0-9]*\.?[0-9]+)\s*$"),
        ("system_seconds", r"^\s*System time \(seconds\):\s*([0-9]*\.?[0-9]+)\s*$"),
        ("max_rss_kb", r"^\s*Maximum resident set size \(kbytes\):\s*([0-9]+)\s*$"),
    )
}
| 20 | + |
| 21 | + |
def parse_elapsed_to_seconds(raw: str) -> float:
    """Convert an "m:ss" or "h:mm:ss" elapsed-time string into seconds.

    Examples of accepted input: "0:03.12", "12:34.56", "1:02:03.45".
    The last field may carry a fractional part; the fields before it are
    parsed as integers.

    Raises:
        ValueError: if the string does not have exactly two or three
            colon-separated fields, or a field is not numeric.
    """
    fields = raw.strip().split(":")
    if len(fields) not in (2, 3):
        raise ValueError(f"Unrecognized elapsed time format: {raw!r}")

    *larger_units, seconds_text = fields
    # Fold hours/minutes into a whole number of minutes, most significant first.
    whole_minutes = 0
    for unit_text in larger_units:
        whole_minutes = whole_minutes * 60 + int(unit_text)
    return whole_minutes * 60 + float(seconds_text)
| 35 | + |
| 36 | + |
def parse_time_v(stderr_text: str) -> Dict[str, float]:
    """Extract timing and memory metrics from captured stderr text.

    Every line is tested against each regex in TIME_PATTERNS. Matches for
    "user_seconds" and "system_seconds" are stored as floats and
    "max_rss_kb" as an int; if a metric appears more than once, the last
    occurrence wins. The elapsed wall-clock string is converted with
    parse_elapsed_to_seconds and stored under "wall_seconds".

    Returns:
        A dict holding only the metrics that were found; it is empty when
        no line matches.
    """
    converters = {"max_rss_kb": int}
    metrics: Dict[str, float] = {}
    wall_clock_text = None

    for report_line in stderr_text.splitlines():
        for field, regex in TIME_PATTERNS.items():
            hit = regex.match(report_line)
            if hit is None:
                continue
            captured = hit.group(1)
            if field == "elapsed_raw":
                # Kept as text for now; converted once after the scan.
                wall_clock_text = captured
            else:
                metrics[field] = converters.get(field, float)(captured)

    if wall_clock_text is not None:
        metrics["wall_seconds"] = parse_elapsed_to_seconds(wall_clock_text)
    return metrics
| 54 | + |
| 55 | + |
def run_one(
    vg_bin: str,
    mode: str,
    input_path: str,
    threads: int,
    keep_output: bool,
) -> Dict[str, object]:
    """Run one `vg convert` under `/usr/bin/time -v` and collect its metrics.

    The converter's stdout is written to a fresh temporary ".vg" file and its
    stderr (which also carries the `time -v` report) is captured and parsed
    with parse_time_v.

    Args:
        vg_bin: Path to the vg executable.
        mode: "gfa" (passes -g) or "gfaz" (passes -z).
        input_path: Input file handed to `vg convert`.
        threads: Value passed to the `-t` option.
        keep_output: When false, the temporary output file is deleted before
            returning and "output_path" in the result is empty.

    Returns:
        A dict with the keys mode, threads, input_path, exit_code,
        wall_seconds, user_seconds, system_seconds, max_rss_kb, output_path
        and error. Metrics missing from the report are None; "error" holds
        the flattened stderr text when the exit code is non-zero and is
        empty otherwise.

    Raises:
        ValueError: if mode is not "gfa" or "gfaz".
    """
    mode_flags = {"gfa": "-g", "gfaz": "-z"}
    if mode not in mode_flags:
        raise ValueError(f"Unsupported mode: {mode}")
    mode_flag = mode_flags[mode]

    # mkstemp returns an already-open descriptor together with the path; keep
    # it so it can be reused and closed instead of leaking one per run.
    out_fd, output_path = tempfile.mkstemp(
        prefix=f"bench_{mode}_t{threads}_", suffix=".vg"
    )
    cmd = ["/usr/bin/time", "-v", vg_bin, "convert", mode_flag, input_path, "-p", "-t", str(threads)]

    try:
        with os.fdopen(out_fd, "wb") as out_f:
            # errors="replace" keeps an undecodable byte on stderr from
            # aborting the whole benchmark with UnicodeDecodeError.
            proc = subprocess.run(
                cmd,
                stdout=out_f,
                stderr=subprocess.PIPE,
                text=True,
                errors="replace",
            )
        stderr_text = proc.stderr or ""
        metrics = parse_time_v(stderr_text)
        result: Dict[str, object] = {
            "mode": mode,
            "threads": threads,
            "input_path": input_path,
            "exit_code": proc.returncode,
            "wall_seconds": metrics.get("wall_seconds"),
            "user_seconds": metrics.get("user_seconds"),
            "system_seconds": metrics.get("system_seconds"),
            "max_rss_kb": metrics.get("max_rss_kb"),
            "output_path": output_path if keep_output else "",
        }
        if proc.returncode != 0:
            # Flatten to a single line so the message fits in one CSV cell.
            result["error"] = stderr_text.strip().replace("\n", " | ")
        else:
            result["error"] = ""
        return result
    finally:
        if not keep_output and os.path.exists(output_path):
            os.remove(output_path)
| 96 | + |
| 97 | + |
def parse_threads(raw: str) -> List[int]:
    """Parse a comma-separated list of thread counts.

    Whitespace around each entry is ignored and empty entries (for example
    from a trailing comma) are skipped. The order of the input is preserved.

    Args:
        raw: Text such as "1,4,8,16".

    Returns:
        The thread counts as positive integers.

    Raises:
        ValueError: if an entry is not an integer, if an entry is smaller
            than 1, or if no entries remain after skipping empty ones.
    """
    values: List[int] = []
    for token in raw.split(","):
        token = token.strip()
        if not token:
            continue
        count = int(token)
        # A zero or negative thread count is not a meaningful benchmark
        # configuration; reject it here rather than handing it to the tool.
        if count < 1:
            raise ValueError(f"Thread count must be >= 1, got {count}")
        values.append(count)
    if not values:
        raise ValueError("No thread values provided")
    return values
| 108 | + |
| 109 | + |
def main() -> int:
    """Command-line entry point: benchmark `vg convert` and write a CSV.

    For every repeat, thread count and mode ("gfa", then "gfaz") one run is
    executed through run_one. Each result row is appended to the CSV and
    flushed immediately, so measurements already taken survive an
    interruption or a later failure, and an unwritable CSV path is reported
    before any benchmark is started.

    Returns:
        0 when all runs were attempted and the CSV was written, 2 when the
        arguments are invalid (missing files, --repeats < 1, bad --threads).
        Failed runs are recorded in the CSV and reported as warnings; they do
        not change the return value.
    """
    parser = argparse.ArgumentParser(
        description="Benchmark vg convert performance for GFA (-g) vs GFAZ (-z) with /usr/bin/time -v."
    )
    parser.add_argument("--vg", default="./bin/vg", help="Path to vg binary (default: ./bin/vg)")
    parser.add_argument("--gfa", required=True, help="Input GFA path")
    parser.add_argument("--gfaz", required=True, help="Input GFAZ path")
    parser.add_argument(
        "--threads",
        default="1,4,8,16",
        help="Comma-separated thread counts to test (default: 1,4,8,16)",
    )
    parser.add_argument("--csv", required=True, help="Output CSV path")
    parser.add_argument(
        "--keep-output",
        action="store_true",
        help="Keep generated .vg outputs (off by default)",
    )
    parser.add_argument(
        "--repeats",
        type=int,
        default=1,
        help="Repeats per mode/thread combination (default: 1)",
    )
    args = parser.parse_args()

    vg_bin = os.path.abspath(args.vg)
    gfa_path = os.path.abspath(args.gfa)
    gfaz_path = os.path.abspath(args.gfaz)
    csv_path = os.path.abspath(args.csv)

    # Validate everything up front so no benchmark time is spent on a run
    # that cannot succeed.
    if not os.path.exists(vg_bin):
        print(f"error: vg binary not found: {vg_bin}", file=sys.stderr)
        return 2
    if not os.path.exists(gfa_path):
        print(f"error: gfa input not found: {gfa_path}", file=sys.stderr)
        return 2
    if not os.path.exists(gfaz_path):
        print(f"error: gfaz input not found: {gfaz_path}", file=sys.stderr)
        return 2
    if args.repeats < 1:
        print("error: --repeats must be >= 1", file=sys.stderr)
        return 2

    try:
        threads = parse_threads(args.threads)
    except ValueError as e:
        print(f"error: invalid --threads value: {e}", file=sys.stderr)
        return 2

    os.makedirs(os.path.dirname(csv_path) or ".", exist_ok=True)

    fieldnames = [
        "timestamp",
        "repeat",
        "mode",
        "threads",
        "input_path",
        "exit_code",
        "wall_seconds",
        "user_seconds",
        "system_seconds",
        "max_rss_kb",
        "output_path",
        "error",
    ]
    rows_written = 0
    with open(csv_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        f.flush()

        start_ts = dt.datetime.now().isoformat(timespec="seconds")
        print(f"[{start_ts}] benchmarking started")

        for repeat in range(1, args.repeats + 1):
            for t in threads:
                for mode, path in (("gfa", gfa_path), ("gfaz", gfaz_path)):
                    print(f"running mode={mode} threads={t} repeat={repeat}")
                    row = run_one(
                        vg_bin=vg_bin,
                        mode=mode,
                        input_path=path,
                        threads=t,
                        keep_output=args.keep_output,
                    )
                    row["repeat"] = repeat
                    row["timestamp"] = dt.datetime.now().isoformat(timespec="seconds")
                    # Persist each row right away; long runs should not lose
                    # finished measurements if a later run is interrupted.
                    writer.writerow(row)
                    f.flush()
                    rows_written += 1
                    if row["exit_code"] != 0:
                        print(
                            f"warning: run failed mode={mode} threads={t} repeat={repeat} "
                            f"exit={row['exit_code']}",
                            file=sys.stderr,
                        )

    end_ts = dt.datetime.now().isoformat(timespec="seconds")
    print(f"[{end_ts}] benchmarking complete, wrote {rows_written} rows to {csv_path}")
    return 0
| 209 | + |
| 210 | + |
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(main())
0 commit comments