Skip to content

Commit c41797c

Browse files
committed
Integrate GFAz submodule for graph conversion
Add GFAz integration roundtrip test. Add benchmark script for vg convert.
1 parent 0302370 commit c41797c

9 files changed

Lines changed: 838 additions & 19 deletions

File tree

.gitmodules

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,3 +133,6 @@
133133
[submodule "deps/mimalloc"]
134134
path = deps/mimalloc
135135
url = https://github.com/microsoft/mimalloc.git
136+
[submodule "deps/GFAz"]
137+
path = deps/GFAz
138+
url = https://github.com/babyplutokurt/GFAz.git

Makefile

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ INCLUDE_FLAGS :=-I$(CWD)/$(INC_DIR) -I. -I$(CWD)/$(SRC_DIR) -I$(CWD)/$(UNITTEST_
7474
# These need to come before library search paths from LDFLAGS or we won't
7575
# prefer linking vg-installed dependencies over system ones.
7676
LD_LIB_DIR_FLAGS := -L$(CWD)/$(LIB_DIR)
77-
LD_LIB_FLAGS := -lvcflib -lwfa2 -ltabixpp -lgssw -lssw -lsublinearLS -lpthread -lncurses -lgcsa2 -lgbwtgraph -lgbwt -lkff -ldivsufsort -ldivsufsort64 -lvcfh -lraptor2 -lpinchesandcacti -l3edgeconnected -lsonlib -lfml -lstructures -lbdsg -lxg -lsdsl -lzstd -lhandlegraph -lcrypto
77+
LD_LIB_FLAGS := -lvcflib -lwfa2 -ltabixpp -lgssw -lssw -lsublinearLS -lpthread -lncurses -lgcsa2 -lgbwtgraph -lgbwt -lkff -ldivsufsort -ldivsufsort64 -lvcfh -lraptor2 -lpinchesandcacti -l3edgeconnected -lsonlib -lfml -lstructures -lbdsg -lxg -lsdsl -lzstd -lhandlegraph -lgfa_compression_core -lcrypto
7878
# We omit Boost Program Options for now; we find it in a platform-dependent way.
7979
# By default it has no suffix
8080
BOOST_SUFFIX=""
@@ -372,6 +372,7 @@ IPS4O_DIR=deps/ips4o
372372
BBHASH_DIR=deps/BBHash
373373
MIO_DIR=deps/mio
374374
ATOMIC_QUEUE_DIR=deps/atomic_queue
375+
GFAz_DIR=deps/GFAz
375376

376377
# Dependencies that go into libvg's archive
377378
# These go in libvg but come from dependencies
@@ -408,6 +409,7 @@ LIB_DEPS += $(LIB_DIR)/libvgio.a
408409
LIB_DEPS += $(LIB_DIR)/libhandlegraph.a
409410
LIB_DEPS += $(LIB_DIR)/libbdsg.a
410411
LIB_DEPS += $(LIB_DIR)/libxg.a
412+
LIB_DEPS += $(LIB_DIR)/libgfa_compression_core.a
411413
ifneq ($(shell uname -s),Darwin)
412414
# On non-Mac (i.e. Linux), where ELF binaries are used, pull in libdw which
413415
# backward-cpp will use.
@@ -518,6 +520,7 @@ DEPS += $(INC_DIR)/raptor2/raptor2.h
518520
DEPS += $(INC_DIR)/BooPHF.h
519521
DEPS += $(INC_DIR)/mio/mmap.hpp
520522
DEPS += $(INC_DIR)/atomic_queue.h
523+
DEPS += $(INC_DIR)/GFAz/gfa_parser.hpp
521524

522525
.PHONY: clean clean-tests get-deps deps lint test set-path objs static static-docker docs man .pre-build version
523526

@@ -918,6 +921,16 @@ $(INC_DIR)/mio/mmap.hpp: $(MIO_DIR)/include/mio/*
918921
$(INC_DIR)/atomic_queue.h: $(ATOMIC_QUEUE_DIR)/include/*
919922
+cp -r $(ATOMIC_QUEUE_DIR)/include/atomic_queue/* $(CWD)/$(INC_DIR)/
920923

924+
$(INC_DIR)/GFAz/gfa_parser.hpp: $(LIB_DIR)/libgfa_compression_core.a
925+
926+
$(LIB_DIR)/libgfa_compression_core.a: $(GFAz_DIR)/CMakeLists.txt $(wildcard $(GFAz_DIR)/src/*) $(wildcard $(GFAz_DIR)/src/gpu/*) $(wildcard $(GFAz_DIR)/include/*) $(wildcard $(GFAz_DIR)/include/gpu/*)
927+
+rm -f $(CWD)/$(LIB_DIR)/libgfa_compression_core.a
928+
+rm -Rf $(CWD)/$(INC_DIR)/GFAz
929+
+cd $(GFAz_DIR) && rm -Rf build && mkdir build && cd build && cmake -DCMAKE_C_COMPILER="$(CC)" -DCMAKE_CXX_COMPILER="$(CXX)" -DCMAKE_CXX_FLAGS="-fPIC $(CXXFLAGS) $(CPPFLAGS)" -DBUILD_PYTHON_BINDINGS=OFF -DBUILD_CLI=OFF -DBUILD_BENCHMARKS=OFF -DBUILD_TESTS=OFF .. && $(MAKE) $(FILTER) gfa_compression_core && cp libgfa_compression_core.a $(CWD)/$(LIB_DIR)/
930+
+mkdir -p $(CWD)/$(INC_DIR)/GFAz
931+
+cp -r $(GFAz_DIR)/include/* $(CWD)/$(INC_DIR)/GFAz/
932+
933+
921934
$(INC_DIR)/mmmultiset.hpp: $(MMMULTIMAP_DIR)/src/mmmultiset.hpp $(INC_DIR)/mmmultimap.hpp
922935
$(INC_DIR)/mmmultimap.hpp: $(MMMULTIMAP_DIR)/src/mmmultimap.hpp $(MMMULTIMAP_DIR)/src/mmmultiset.hpp $(INC_DIR)/mio/mmap.hpp $(INC_DIR)/atomic_queue.h
923936
+cp $(MMMULTIMAP_DIR)/src/mmmultimap.hpp $(MMMULTIMAP_DIR)/src/mmmultiset.hpp $(CWD)/$(INC_DIR)/

deps/GFAz

Submodule GFAz added at d0b330c
Lines changed: 212 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,212 @@
1+
#!/usr/bin/env python3
2+
3+
import argparse
4+
import csv
5+
import datetime as dt
6+
import os
7+
import re
8+
import subprocess
9+
import sys
10+
import tempfile
11+
from typing import Dict, List
12+
13+
14+
# Regexes keyed by metric name, matched line-by-line against the stderr report
# of `/usr/bin/time -v`. "elapsed_raw" captures the h:mm:ss / m:ss string
# verbatim (converted to seconds later); the other patterns capture plain
# decimal values.
TIME_PATTERNS = {
    "elapsed_raw": re.compile(r"^\s*Elapsed \(wall clock\) time \(h:mm:ss or m:ss\):\s*(.+?)\s*$"),
    "user_seconds": re.compile(r"^\s*User time \(seconds\):\s*([0-9]*\.?[0-9]+)\s*$"),
    "system_seconds": re.compile(r"^\s*System time \(seconds\):\s*([0-9]*\.?[0-9]+)\s*$"),
    "max_rss_kb": re.compile(r"^\s*Maximum resident set size \(kbytes\):\s*([0-9]+)\s*$"),
}
20+
21+
22+
def parse_elapsed_to_seconds(raw: str) -> float:
    """Convert a GNU time elapsed string to seconds.

    Accepts the two formats `/usr/bin/time -v` emits: "m:ss" (e.g. "0:03.12",
    "12:34.56") and "h:mm:ss" (e.g. "1:02:03.45").

    Raises:
        ValueError: If the string has neither two nor three colon-separated
            fields.
    """
    fields = raw.strip().split(":")
    if len(fields) == 2:
        mm, ss = fields
        return int(mm) * 60 + float(ss)
    if len(fields) == 3:
        hh, mm, ss = fields
        return int(hh) * 3600 + int(mm) * 60 + float(ss)
    raise ValueError(f"Unrecognized elapsed time format: {raw!r}")
35+
36+
37+
def parse_time_v(stderr_text: str) -> Dict[str, float]:
    """Extract timing and memory metrics from `/usr/bin/time -v` stderr output.

    Scans every line against TIME_PATTERNS and returns a dict that may contain
    "wall_seconds", "user_seconds", "system_seconds" (floats) and "max_rss_kb"
    (int). Keys whose pattern never matched are simply absent.
    """
    metrics: Dict[str, float] = {}
    raw_elapsed = None
    for text_line in stderr_text.splitlines():
        for name, pattern in TIME_PATTERNS.items():
            match = pattern.match(text_line)
            if match is None:
                continue
            if name == "elapsed_raw":
                # Keep the raw string; it is converted to seconds at the end.
                raw_elapsed = match.group(1)
            elif name == "max_rss_kb":
                metrics[name] = int(match.group(1))
            else:
                metrics[name] = float(match.group(1))
    if raw_elapsed is not None:
        metrics["wall_seconds"] = parse_elapsed_to_seconds(raw_elapsed)
    return metrics
54+
55+
56+
def run_one(
    vg_bin: str,
    mode: str,
    input_path: str,
    threads: int,
    keep_output: bool,
) -> Dict[str, object]:
    """Run one `vg convert` benchmark under `/usr/bin/time -v` and collect metrics.

    Converts `input_path` to .vg using `-g` (GFA) or `-z` (GFAZ), capturing
    wall/user/system time and peak RSS from GNU time's stderr report.

    Args:
        vg_bin: Path to the vg binary.
        mode: "gfa" or "gfaz"; anything else raises ValueError.
        input_path: Graph file handed to `vg convert`.
        threads: Value passed to `vg convert -t`.
        keep_output: If False, the temporary .vg output is deleted afterwards.

    Returns:
        A flat dict of run metadata and metrics suitable for one CSV row;
        "error" holds the flattened stderr when the run exited non-zero.

    Raises:
        ValueError: If `mode` is not one of the supported values.
    """
    if mode == "gfa":
        mode_flag = "-g"
    elif mode == "gfaz":
        mode_flag = "-z"
    else:
        raise ValueError(f"Unsupported mode: {mode}")

    # mkstemp returns an *open* file descriptor as well as the path; the
    # original code discarded the fd (mkstemp(...)[1]), leaking one descriptor
    # per run. Close it immediately — the path is reopened below for stdout.
    out_fd, output_path = tempfile.mkstemp(prefix=f"bench_{mode}_t{threads}_", suffix=".vg")
    os.close(out_fd)
    cmd = ["/usr/bin/time", "-v", vg_bin, "convert", mode_flag, input_path, "-p", "-t", str(threads)]

    try:
        with open(output_path, "wb") as out_f:
            # stderr carries both vg's progress output and GNU time's report.
            proc = subprocess.run(cmd, stdout=out_f, stderr=subprocess.PIPE, text=True)
        metrics = parse_time_v(proc.stderr or "")
        result: Dict[str, object] = {
            "mode": mode,
            "threads": threads,
            "input_path": input_path,
            "exit_code": proc.returncode,
            "wall_seconds": metrics.get("wall_seconds"),
            "user_seconds": metrics.get("user_seconds"),
            "system_seconds": metrics.get("system_seconds"),
            "max_rss_kb": metrics.get("max_rss_kb"),
            "output_path": output_path if keep_output else "",
        }
        if proc.returncode != 0:
            # Flatten multi-line stderr so it fits one CSV cell.
            result["error"] = (proc.stderr or "").strip().replace("\n", " | ")
        else:
            result["error"] = ""
        return result
    finally:
        if not keep_output and os.path.exists(output_path):
            os.remove(output_path)
96+
97+
98+
def parse_threads(raw: str) -> List[int]:
    """Parse a comma-separated string of thread counts into a list of ints.

    Blank entries (e.g. from stray or trailing commas) are skipped.

    Raises:
        ValueError: If no usable values remain, or a token is not an integer.
    """
    counts = [int(piece) for piece in (tok.strip() for tok in raw.split(",")) if piece]
    if not counts:
        raise ValueError("No thread values provided")
    return counts
108+
109+
110+
def main() -> int:
    """Benchmark `vg convert` over every mode/thread/repeat combination.

    Validates inputs, runs each combination under `/usr/bin/time -v` via
    run_one(), and writes one CSV row per run. Returns 0 on success, 2 on
    invalid arguments or missing inputs.
    """
    parser = argparse.ArgumentParser(
        description="Benchmark vg convert performance for GFA (-g) vs GFAZ (-z) with /usr/bin/time -v."
    )
    parser.add_argument("--vg", default="./bin/vg", help="Path to vg binary (default: ./bin/vg)")
    parser.add_argument("--gfa", required=True, help="Input GFA path")
    parser.add_argument("--gfaz", required=True, help="Input GFAZ path")
    parser.add_argument(
        "--threads",
        default="1,4,8,16",
        help="Comma-separated thread counts to test (default: 1,4,8,16)",
    )
    parser.add_argument("--csv", required=True, help="Output CSV path")
    parser.add_argument(
        "--keep-output",
        action="store_true",
        help="Keep generated .vg outputs (off by default)",
    )
    parser.add_argument(
        "--repeats",
        type=int,
        default=1,
        help="Repeats per mode/thread combination (default: 1)",
    )
    args = parser.parse_args()

    vg_bin = os.path.abspath(args.vg)
    gfa_path = os.path.abspath(args.gfa)
    gfaz_path = os.path.abspath(args.gfaz)
    csv_path = os.path.abspath(args.csv)

    # Fail fast on missing paths before any benchmarking work starts.
    for label, candidate in (
        ("vg binary", vg_bin),
        ("gfa input", gfa_path),
        ("gfaz input", gfaz_path),
    ):
        if not os.path.exists(candidate):
            print(f"error: {label} not found: {candidate}", file=sys.stderr)
            return 2
    if args.repeats < 1:
        print("error: --repeats must be >= 1", file=sys.stderr)
        return 2

    try:
        threads = parse_threads(args.threads)
    except Exception as e:
        print(f"error: invalid --threads value: {e}", file=sys.stderr)
        return 2

    os.makedirs(os.path.dirname(csv_path) or ".", exist_ok=True)

    records: List[Dict[str, object]] = []
    started = dt.datetime.now().isoformat(timespec="seconds")
    print(f"[{started}] benchmarking started")

    for rep in range(1, args.repeats + 1):
        for thread_count in threads:
            # Run GFA then GFAZ at each thread count so the pair is adjacent.
            for mode, graph_path in (("gfa", gfa_path), ("gfaz", gfaz_path)):
                print(f"running mode={mode} threads={thread_count} repeat={rep}")
                record = run_one(
                    vg_bin=vg_bin,
                    mode=mode,
                    input_path=graph_path,
                    threads=thread_count,
                    keep_output=args.keep_output,
                )
                record["repeat"] = rep
                record["timestamp"] = dt.datetime.now().isoformat(timespec="seconds")
                records.append(record)
                if record["exit_code"] != 0:
                    # A failed run is recorded in the CSV but does not abort
                    # the remaining combinations.
                    print(
                        f"warning: run failed mode={mode} threads={thread_count} repeat={rep} "
                        f"exit={record['exit_code']}",
                        file=sys.stderr,
                    )

    fieldnames = [
        "timestamp",
        "repeat",
        "mode",
        "threads",
        "input_path",
        "exit_code",
        "wall_seconds",
        "user_seconds",
        "system_seconds",
        "max_rss_kb",
        "output_path",
        "error",
    ]
    with open(csv_path, "w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=fieldnames)
        writer.writeheader()
        for record in records:
            writer.writerow(record)

    finished = dt.datetime.now().isoformat(timespec="seconds")
    print(f"[{finished}] benchmarking complete, wrote {len(records)} rows to {csv_path}")
    return 0
209+
210+
211+
if __name__ == "__main__":
    # sys.exit raises SystemExit with main()'s return code, same as the
    # explicit `raise SystemExit(main())`.
    sys.exit(main())

0 commit comments

Comments
 (0)