Skip to content

Commit c41797c

Browse files
committed
Integrate GFAz submodule for graph conversion
Add GFAz integration roundtrip test. Add benchmark script for vg convert.
1 parent 0302370 commit c41797c

9 files changed

Lines changed: 838 additions & 19 deletions

File tree

.gitmodules

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,3 +133,6 @@
133133
[submodule "deps/mimalloc"]
134134
path = deps/mimalloc
135135
url = https://github.com/microsoft/mimalloc.git
136+
[submodule "deps/GFAz"]
137+
path = deps/GFAz
138+
url = https://github.com/babyplutokurt/GFAz.git

Makefile

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ INCLUDE_FLAGS :=-I$(CWD)/$(INC_DIR) -I. -I$(CWD)/$(SRC_DIR) -I$(CWD)/$(UNITTEST_
7474
# These need to come before library search paths from LDFLAGS or we won't
7575
# prefer linking vg-installed dependencies over system ones.
7676
LD_LIB_DIR_FLAGS := -L$(CWD)/$(LIB_DIR)
77-
LD_LIB_FLAGS := -lvcflib -lwfa2 -ltabixpp -lgssw -lssw -lsublinearLS -lpthread -lncurses -lgcsa2 -lgbwtgraph -lgbwt -lkff -ldivsufsort -ldivsufsort64 -lvcfh -lraptor2 -lpinchesandcacti -l3edgeconnected -lsonlib -lfml -lstructures -lbdsg -lxg -lsdsl -lzstd -lhandlegraph -lcrypto
77+
LD_LIB_FLAGS := -lvcflib -lwfa2 -ltabixpp -lgssw -lssw -lsublinearLS -lpthread -lncurses -lgcsa2 -lgbwtgraph -lgbwt -lkff -ldivsufsort -ldivsufsort64 -lvcfh -lraptor2 -lpinchesandcacti -l3edgeconnected -lsonlib -lfml -lstructures -lbdsg -lxg -lsdsl -lzstd -lhandlegraph -lgfa_compression_core -lcrypto
7878
# We omit Boost Program Options for now; we find it in a platform-dependent way.
7979
# By default it has no suffix
8080
BOOST_SUFFIX=""
@@ -372,6 +372,7 @@ IPS4O_DIR=deps/ips4o
372372
BBHASH_DIR=deps/BBHash
373373
MIO_DIR=deps/mio
374374
ATOMIC_QUEUE_DIR=deps/atomic_queue
375+
GFAz_DIR=deps/GFAz
375376

376377
# Dependencies that go into libvg's archive
377378
# These go in libvg but come from dependencies
@@ -408,6 +409,7 @@ LIB_DEPS += $(LIB_DIR)/libvgio.a
408409
LIB_DEPS += $(LIB_DIR)/libhandlegraph.a
409410
LIB_DEPS += $(LIB_DIR)/libbdsg.a
410411
LIB_DEPS += $(LIB_DIR)/libxg.a
412+
LIB_DEPS += $(LIB_DIR)/libgfa_compression_core.a
411413
ifneq ($(shell uname -s),Darwin)
412414
# On non-Mac (i.e. Linux), where ELF binaries are used, pull in libdw which
413415
# backward-cpp will use.
@@ -518,6 +520,7 @@ DEPS += $(INC_DIR)/raptor2/raptor2.h
518520
DEPS += $(INC_DIR)/BooPHF.h
519521
DEPS += $(INC_DIR)/mio/mmap.hpp
520522
DEPS += $(INC_DIR)/atomic_queue.h
523+
DEPS += $(INC_DIR)/GFAz/gfa_parser.hpp
521524

522525
.PHONY: clean clean-tests get-deps deps lint test set-path objs static static-docker docs man .pre-build version
523526

@@ -918,6 +921,16 @@ $(INC_DIR)/mio/mmap.hpp: $(MIO_DIR)/include/mio/*
918921
$(INC_DIR)/atomic_queue.h: $(ATOMIC_QUEUE_DIR)/include/*
919922
+cp -r $(ATOMIC_QUEUE_DIR)/include/atomic_queue/* $(CWD)/$(INC_DIR)/
920923

924+
$(INC_DIR)/GFAz/gfa_parser.hpp: $(LIB_DIR)/libgfa_compression_core.a
925+
926+
$(LIB_DIR)/libgfa_compression_core.a: $(GFAz_DIR)/CMakeLists.txt $(wildcard $(GFAz_DIR)/src/*) $(wildcard $(GFAz_DIR)/src/gpu/*) $(wildcard $(GFAz_DIR)/include/*) $(wildcard $(GFAz_DIR)/include/gpu/*)
927+
+rm -f $(CWD)/$(LIB_DIR)/libgfa_compression_core.a
928+
+rm -Rf $(CWD)/$(INC_DIR)/GFAz
929+
+cd $(GFAz_DIR) && rm -Rf build && mkdir build && cd build && cmake -DCMAKE_C_COMPILER="$(CC)" -DCMAKE_CXX_COMPILER="$(CXX)" -DCMAKE_CXX_FLAGS="-fPIC $(CXXFLAGS) $(CPPFLAGS)" -DBUILD_PYTHON_BINDINGS=OFF -DBUILD_CLI=OFF -DBUILD_BENCHMARKS=OFF -DBUILD_TESTS=OFF .. && $(MAKE) $(FILTER) gfa_compression_core && cp libgfa_compression_core.a $(CWD)/$(LIB_DIR)/
930+
+mkdir -p $(CWD)/$(INC_DIR)/GFAz
931+
+cp -r $(GFAz_DIR)/include/* $(CWD)/$(INC_DIR)/GFAz/
932+
933+
921934
$(INC_DIR)/mmmultiset.hpp: $(MMMULTIMAP_DIR)/src/mmmultiset.hpp $(INC_DIR)/mmmultimap.hpp
922935
$(INC_DIR)/mmmultimap.hpp: $(MMMULTIMAP_DIR)/src/mmmultimap.hpp $(MMMULTIMAP_DIR)/src/mmmultiset.hpp $(INC_DIR)/mio/mmap.hpp $(INC_DIR)/atomic_queue.h
923936
+cp $(MMMULTIMAP_DIR)/src/mmmultimap.hpp $(MMMULTIMAP_DIR)/src/mmmultiset.hpp $(CWD)/$(INC_DIR)/

deps/GFAz

Submodule GFAz added at d0b330c
Lines changed: 212 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,212 @@
1+
#!/usr/bin/env python3
2+
3+
import argparse
4+
import csv
5+
import datetime as dt
6+
import os
7+
import re
8+
import subprocess
9+
import sys
10+
import tempfile
11+
from typing import Dict, List
12+
13+
14+
# Regexes keyed by metric name, matched line-by-line against the stderr report
# of `/usr/bin/time -v`. "elapsed_raw" captures the h:mm:ss / m:ss string
# verbatim (converted to seconds later); the other patterns capture plain
# decimal values.
TIME_PATTERNS = {
    "elapsed_raw": re.compile(r"^\s*Elapsed \(wall clock\) time \(h:mm:ss or m:ss\):\s*(.+?)\s*$"),
    "user_seconds": re.compile(r"^\s*User time \(seconds\):\s*([0-9]*\.?[0-9]+)\s*$"),
    "system_seconds": re.compile(r"^\s*System time \(seconds\):\s*([0-9]*\.?[0-9]+)\s*$"),
    "max_rss_kb": re.compile(r"^\s*Maximum resident set size \(kbytes\):\s*([0-9]+)\s*$"),
}
20+
21+
22+
def parse_elapsed_to_seconds(raw: str) -> float:
    """Convert a GNU time elapsed string to seconds.

    Accepts the two formats `/usr/bin/time -v` emits: "m:ss" (e.g. "0:03.12",
    "12:34.56") and "h:mm:ss" (e.g. "1:02:03.45").

    Raises:
        ValueError: If the string has neither two nor three colon-separated
            fields.
    """
    fields = raw.strip().split(":")
    if len(fields) == 2:
        mm, ss = fields
        return int(mm) * 60 + float(ss)
    if len(fields) == 3:
        hh, mm, ss = fields
        return int(hh) * 3600 + int(mm) * 60 + float(ss)
    raise ValueError(f"Unrecognized elapsed time format: {raw!r}")
35+
36+
37+
def parse_time_v(stderr_text: str) -> Dict[str, float]:
    """Extract timing and memory metrics from `/usr/bin/time -v` stderr output.

    Scans every line against TIME_PATTERNS and returns a dict that may contain
    "wall_seconds", "user_seconds", "system_seconds" (floats) and "max_rss_kb"
    (int). Keys whose pattern never matched are simply absent.
    """
    metrics: Dict[str, float] = {}
    raw_elapsed = None
    for text_line in stderr_text.splitlines():
        for name, pattern in TIME_PATTERNS.items():
            match = pattern.match(text_line)
            if match is None:
                continue
            if name == "elapsed_raw":
                # Keep the raw string; it is converted to seconds at the end.
                raw_elapsed = match.group(1)
            elif name == "max_rss_kb":
                metrics[name] = int(match.group(1))
            else:
                metrics[name] = float(match.group(1))
    if raw_elapsed is not None:
        metrics["wall_seconds"] = parse_elapsed_to_seconds(raw_elapsed)
    return metrics
54+
55+
56+
def run_one(
    vg_bin: str,
    mode: str,
    input_path: str,
    threads: int,
    keep_output: bool,
) -> Dict[str, object]:
    """Run one `vg convert` benchmark under `/usr/bin/time -v` and collect metrics.

    Converts `input_path` to .vg using `-g` (GFA) or `-z` (GFAZ), capturing
    wall/user/system time and peak RSS from GNU time's stderr report.

    Args:
        vg_bin: Path to the vg binary.
        mode: "gfa" or "gfaz"; anything else raises ValueError.
        input_path: Graph file handed to `vg convert`.
        threads: Value passed to `vg convert -t`.
        keep_output: If False, the temporary .vg output is deleted afterwards.

    Returns:
        A flat dict of run metadata and metrics suitable for one CSV row;
        "error" holds the flattened stderr when the run exited non-zero.

    Raises:
        ValueError: If `mode` is not one of the supported values.
    """
    if mode == "gfa":
        mode_flag = "-g"
    elif mode == "gfaz":
        mode_flag = "-z"
    else:
        raise ValueError(f"Unsupported mode: {mode}")

    # mkstemp returns an *open* file descriptor as well as the path; the
    # original code discarded the fd (mkstemp(...)[1]), leaking one descriptor
    # per run. Close it immediately — the path is reopened below for stdout.
    out_fd, output_path = tempfile.mkstemp(prefix=f"bench_{mode}_t{threads}_", suffix=".vg")
    os.close(out_fd)
    cmd = ["/usr/bin/time", "-v", vg_bin, "convert", mode_flag, input_path, "-p", "-t", str(threads)]

    try:
        with open(output_path, "wb") as out_f:
            # stderr carries both vg's progress output and GNU time's report.
            proc = subprocess.run(cmd, stdout=out_f, stderr=subprocess.PIPE, text=True)
        metrics = parse_time_v(proc.stderr or "")
        result: Dict[str, object] = {
            "mode": mode,
            "threads": threads,
            "input_path": input_path,
            "exit_code": proc.returncode,
            "wall_seconds": metrics.get("wall_seconds"),
            "user_seconds": metrics.get("user_seconds"),
            "system_seconds": metrics.get("system_seconds"),
            "max_rss_kb": metrics.get("max_rss_kb"),
            "output_path": output_path if keep_output else "",
        }
        if proc.returncode != 0:
            # Flatten multi-line stderr so it fits one CSV cell.
            result["error"] = (proc.stderr or "").strip().replace("\n", " | ")
        else:
            result["error"] = ""
        return result
    finally:
        if not keep_output and os.path.exists(output_path):
            os.remove(output_path)
96+
97+
98+
def parse_threads(raw: str) -> List[int]:
    """Parse a comma-separated string of thread counts into a list of ints.

    Blank entries (e.g. from stray or trailing commas) are skipped.

    Raises:
        ValueError: If no usable values remain, or a token is not an integer.
    """
    counts = [int(piece) for piece in (tok.strip() for tok in raw.split(",")) if piece]
    if not counts:
        raise ValueError("No thread values provided")
    return counts
108+
109+
110+
def main() -> int:
    """Benchmark `vg convert` over every mode/thread/repeat combination.

    Validates inputs, runs each combination under `/usr/bin/time -v` via
    run_one(), and writes one CSV row per run. Returns 0 on success, 2 on
    invalid arguments or missing inputs.
    """
    parser = argparse.ArgumentParser(
        description="Benchmark vg convert performance for GFA (-g) vs GFAZ (-z) with /usr/bin/time -v."
    )
    parser.add_argument("--vg", default="./bin/vg", help="Path to vg binary (default: ./bin/vg)")
    parser.add_argument("--gfa", required=True, help="Input GFA path")
    parser.add_argument("--gfaz", required=True, help="Input GFAZ path")
    parser.add_argument(
        "--threads",
        default="1,4,8,16",
        help="Comma-separated thread counts to test (default: 1,4,8,16)",
    )
    parser.add_argument("--csv", required=True, help="Output CSV path")
    parser.add_argument(
        "--keep-output",
        action="store_true",
        help="Keep generated .vg outputs (off by default)",
    )
    parser.add_argument(
        "--repeats",
        type=int,
        default=1,
        help="Repeats per mode/thread combination (default: 1)",
    )
    args = parser.parse_args()

    vg_bin = os.path.abspath(args.vg)
    gfa_path = os.path.abspath(args.gfa)
    gfaz_path = os.path.abspath(args.gfaz)
    csv_path = os.path.abspath(args.csv)

    # Fail fast on missing paths before any benchmarking work starts.
    for label, candidate in (
        ("vg binary", vg_bin),
        ("gfa input", gfa_path),
        ("gfaz input", gfaz_path),
    ):
        if not os.path.exists(candidate):
            print(f"error: {label} not found: {candidate}", file=sys.stderr)
            return 2
    if args.repeats < 1:
        print("error: --repeats must be >= 1", file=sys.stderr)
        return 2

    try:
        threads = parse_threads(args.threads)
    except Exception as e:
        print(f"error: invalid --threads value: {e}", file=sys.stderr)
        return 2

    os.makedirs(os.path.dirname(csv_path) or ".", exist_ok=True)

    records: List[Dict[str, object]] = []
    started = dt.datetime.now().isoformat(timespec="seconds")
    print(f"[{started}] benchmarking started")

    for rep in range(1, args.repeats + 1):
        for thread_count in threads:
            # Run GFA then GFAZ at each thread count so the pair is adjacent.
            for mode, graph_path in (("gfa", gfa_path), ("gfaz", gfaz_path)):
                print(f"running mode={mode} threads={thread_count} repeat={rep}")
                record = run_one(
                    vg_bin=vg_bin,
                    mode=mode,
                    input_path=graph_path,
                    threads=thread_count,
                    keep_output=args.keep_output,
                )
                record["repeat"] = rep
                record["timestamp"] = dt.datetime.now().isoformat(timespec="seconds")
                records.append(record)
                if record["exit_code"] != 0:
                    # A failed run is recorded in the CSV but does not abort
                    # the remaining combinations.
                    print(
                        f"warning: run failed mode={mode} threads={thread_count} repeat={rep} "
                        f"exit={record['exit_code']}",
                        file=sys.stderr,
                    )

    fieldnames = [
        "timestamp",
        "repeat",
        "mode",
        "threads",
        "input_path",
        "exit_code",
        "wall_seconds",
        "user_seconds",
        "system_seconds",
        "max_rss_kb",
        "output_path",
        "error",
    ]
    with open(csv_path, "w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=fieldnames)
        writer.writeheader()
        for record in records:
            writer.writerow(record)

    finished = dt.datetime.now().isoformat(timespec="seconds")
    print(f"[{finished}] benchmarking complete, wrote {len(records)} rows to {csv_path}")
    return 0
209+
210+
211+
if __name__ == "__main__":
    # sys.exit raises SystemExit with main()'s return code, same as the
    # explicit `raise SystemExit(main())`.
    sys.exit(main())

0 commit comments

Comments
 (0)