Skip to content

Commit 2ebf9ad

Browse files
committed
Preprocessor to add GitHub links to HTML output
1 parent e152918 commit 2ebf9ad

1 file changed

Lines changed: 217 additions & 0 deletions

File tree

Lines changed: 217 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,217 @@
1+
#!/usr/bin/env python3
2+
import argparse
3+
import os
4+
from pathlib import Path
5+
6+
7+
def build_github_url(owner: str, repo: str, branch: str, root: Path, file_path: Path, line: int | None) -> str:
8+
# "root" should be the repository root; we build a URL
9+
# relative to that so it matches the layout on GitHub.
10+
rel = file_path.relative_to(root).as_posix()
11+
base = f"https://github.com/{owner}/{repo}/blob/{branch}/{rel}"
12+
if line is not None:
13+
return f"{base}#L{line}"
14+
return base
15+
16+
17+
def is_github_source_url_line(line: str, repo: str) -> bool:
    """Tell whether *line* is a "Source on GitHub" <url> line written by this tool.

    Such lines exist only in the local copy and are never pushed to GitHub,
    so they have to be left out when line numbers for GitHub URLs are computed.
    """
    text = line.strip()
    if not text.startswith("<url"):
        return False
    # Every one of these fragments has to appear somewhere in the line.
    required = ("github.com", repo, "Source on GitHub")
    return all(fragment in text for fragment in required)
30+
31+
32+
def find_exercise_open_line(lines: list[str], close_index: int) -> int | None:
33+
"""
34+
Walk backward from close_index to find the line where the matching
35+
<exercise ...> tag begins. Returns a 0-based index, or None if not found.
36+
"""
37+
for idx in range(close_index, -1, -1):
38+
stripped = lines[idx].lstrip()
39+
if stripped.startswith("<exercise") and not stripped.startswith("</exercise"):
40+
return idx
41+
return None
42+
43+
44+
def _line_ending(line: str, default: str) -> str:
    """Return the line terminator of *line*, or *default* if it has none."""
    for ending in ("\r\n", "\n", "\r"):
        if line.endswith(ending):
            return ending
    return default


def process_file(path: Path, root: Path, owner: str, repo: str, branch: str, dry_run: bool) -> int:
    """Insert or refresh a "Source on GitHub" <url> line after every </exercise>.

    Each <url> links to *path* on GitHub, anchored at the line where the
    matching <exercise> tag opens. Line numbers are computed as if the
    tool's own <url> lines were absent, since those lines never exist on
    GitHub. Existing <url> lines are regenerated in place, directly after
    their </exercise> line.

    Args:
        path: XML file to process; must be located under *root*.
        root: Repository root used to build the relative URL path.
        owner: GitHub owner name.
        repo: GitHub repository name.
        branch: Branch the links point at.
        dry_run: If True, compute the result but do not write the file.

    Returns:
        The number of <url> lines newly inserted. Lines that already
        existed and were only refreshed are not counted.
    """
    # newline="" disables newline translation, so "\r\n" files keep their
    # endings on the way in and out. readlines() breaks only on "\n", "\r"
    # and "\r\n"; str.splitlines would also split on form feeds, U+2028 and
    # similar characters, which would skew the line numbers.
    with path.open("r", encoding="utf-8", newline="") as handle:
        lines = handle.readlines()

    # First pass: map physical line numbers to "virtual" GitHub line numbers,
    # ignoring any existing GitHub source <url> lines (which never exist on GitHub).
    phys_to_virtual: list[int | None] = []
    virtual_line = 0
    for line in lines:
        if is_github_source_url_line(line, repo):
            phys_to_virtual.append(None)
        else:
            virtual_line += 1
            phys_to_virtual.append(virtual_line)

    # Second pass: locate every exercise, its opening line, and any existing
    # GitHub source <url> right after it.
    exercises: list[dict] = []
    for close_idx, line in enumerate(lines):
        if "</exercise>" not in line:
            continue

        open_idx = find_exercise_open_line(lines, close_idx)
        exercise_line_number = None if open_idx is None else phys_to_virtual[open_idx]

        # Look ahead, past blank lines, for an existing GitHub source <url> line.
        j = close_idx + 1
        while j < len(lines) and lines[j].strip() == "":
            j += 1
        existing_url_index = None
        if j < len(lines) and is_github_source_url_line(lines[j], repo):
            existing_url_index = j

        exercises.append(
            {
                "close_idx": close_idx,
                "open_idx": open_idx,
                "exercise_line_number": exercise_line_number,
                "existing_url_index": existing_url_index,
            }
        )

    if not exercises:
        return 0

    # Build fast lookup tables for rewriting.
    close_to_ex = {e["close_idx"]: e for e in exercises}
    stale_url_indices = {
        e["existing_url_index"] for e in exercises if e["existing_url_index"] is not None
    }

    # Terminator to use when a </exercise> line has none of its own (it is
    # the last line of a file without a trailing newline).
    fallback_newline = _line_ending(lines[0], "\n")

    # Third pass: rebuild the file contents with corrected URLs.
    new_lines: list[str] = []
    insert_count = 0

    for idx, line in enumerate(lines):
        # Skip existing GitHub source URL lines; they are regenerated right
        # after the matching </exercise> line.
        if idx in stale_url_indices:
            continue

        if idx not in close_to_ex:
            new_lines.append(line)
            continue

        ex = close_to_ex[idx]
        newline = _line_ending(line, fallback_newline)
        if not line.endswith(("\n", "\r")):
            # Terminate the closing line first; otherwise the <url> would be
            # glued onto the same physical line as </exercise>.
            line += newline

        # Always keep the closing </exercise> line.
        new_lines.append(line)

        # The <url> line copies the indentation of the </exercise> line.
        stripped = line.lstrip(" \t")
        indent = line[: len(line) - len(stripped)]

        github_url = build_github_url(owner, repo, branch, root, path, ex["exercise_line_number"])
        new_lines.append(f'{indent}<url href="{github_url}">Source on GitHub</url>{newline}')

        if ex["existing_url_index"] is None:
            insert_count += 1

    if not dry_run and new_lines != lines:
        with path.open("w", encoding="utf-8", newline="") as handle:
            handle.writelines(new_lines)

    return insert_count
136+
137+
138+
def main():
    """Command-line entry point.

    Parses the arguments, then processes either the single file given with
    --file or every ``*.xml`` file found under --root, printing one line per
    updated file and a final summary.

    Raises:
        SystemExit: If --file does not name an existing file, or names a
            file outside --root. The message goes to stderr and the exit
            status is non-zero.
    """
    parser = argparse.ArgumentParser(
        description="Add a GitHub <url> after every </exercise> in XML files, including a line-number anchor."
    )
    parser.add_argument(
        "--root",
        type=str,
        default=".",
        help="Root of the repository (default: current directory).",
    )
    parser.add_argument(
        "--owner",
        type=str,
        default="PreTeXtBook",
        help="GitHub owner/user name (default: PreTeXtBook).",
    )
    parser.add_argument(
        "--repo",
        type=str,
        default="pretext",
        help="GitHub repository name (default: pretext).",
    )
    parser.add_argument(
        "--branch",
        type=str,
        default="master",
        help="Git branch name for links (default: master).",
    )
    parser.add_argument(
        "--file",
        type=str,
        help="Single XML file to process (relative to --root or absolute). "
        "If omitted, all XML files under --root are processed.",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Do not modify files, just report what would change.",
    )
    args = parser.parse_args()

    root = Path(args.root).resolve()
    prefix = "[DRY-RUN] " if args.dry_run else ""

    if args.file:
        target = Path(args.file)
        if not target.is_absolute():
            target = root / target
        # Resolve absolute paths too: root is resolved, so an unresolved
        # target (symlinks, "..") could fail the containment check below.
        target = target.resolve()

        if not target.is_file():
            raise SystemExit(f"Error: file not found: {target}")
        try:
            # The URL is built from the path relative to root, so a file
            # outside root cannot be linked; report it instead of crashing.
            target.relative_to(root)
        except ValueError:
            raise SystemExit(f"Error: file is not inside --root ({root}): {target}") from None
        if target.suffix != ".xml":
            print(f"Warning: file does not have .xml suffix: {target}")

        inserts = process_file(target, root, args.owner, args.repo, args.branch, args.dry_run)
        print(f"{prefix}Updated {target}: added {inserts} <url> tag(s)")
        print(f"Done. Files changed: {1 if inserts else 0}, <url> tags added: {inserts}")
        return

    # No --file: process all XML files under root
    total_files = 0
    total_inserts = 0
    for dirpath, dirnames, filenames in os.walk(root):
        # Sorting in place makes the traversal, and so the output, deterministic.
        dirnames.sort()
        for fname in sorted(filenames):
            if not fname.endswith(".xml"):
                continue
            fpath = Path(dirpath, fname)
            inserts = process_file(fpath, root, args.owner, args.repo, args.branch, args.dry_run)
            if inserts:
                total_files += 1
                total_inserts += inserts
                print(f"{prefix}Updated {fpath}: added {inserts} <url> tag(s)")

    print(f"Done. Files changed: {total_files}, <url> tags added: {total_inserts}")
214+
215+
216+
# Script entry point: run the command-line interface only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)