1+ #!/usr/bin/env python3
import argparse
import os
import sys
from pathlib import Path
from urllib.parse import quote
5+
6+
7+ def build_github_url (owner : str , repo : str , branch : str , root : Path , file_path : Path , line : int | None ) -> str :
8+ # "root" should be the repository root; we build a URL
9+ # relative to that so it matches the layout on GitHub.
10+ rel = file_path .relative_to (root ).as_posix ()
11+ base = f"https://github.com/{ owner } /{ repo } /blob/{ branch } /{ rel } "
12+ if line is not None :
13+ return f"{ base } #L{ line } "
14+ return base
15+
16+
def is_github_source_url_line(line: str, repo: str) -> bool:
    """Return True if this line is one of *our* GitHub source <url> lines.

    These lines are local-only and never pushed to GitHub, so they must not
    be counted when computing line numbers for GitHub URLs.

    A line qualifies when, ignoring surrounding whitespace, it starts with
    ``<url`` and also contains ``github.com``, the *repo* name, and the link
    text ``Source on GitHub``.

    Args:
        line: One line of the XML file, with or without its line terminator.
        repo: Repository name that must appear somewhere in the line.
    """
    stripped = line.strip()
    return (
        stripped.startswith("<url")
        and "github.com" in stripped
        and repo in stripped
        and "Source on GitHub" in stripped
    )
30+
31+
32+ def find_exercise_open_line (lines : list [str ], close_index : int ) -> int | None :
33+ """
34+ Walk backward from close_index to find the line where the matching
35+ <exercise ...> tag begins. Returns a 0-based index, or None if not found.
36+ """
37+ for idx in range (close_index , - 1 , - 1 ):
38+ stripped = lines [idx ].lstrip ()
39+ if stripped .startswith ("<exercise" ) and not stripped .startswith ("</exercise" ):
40+ return idx
41+ return None
42+
43+
def process_file(path: Path, root: Path, owner: str, repo: str, branch: str, dry_run: bool) -> int:
    """Insert or refresh a "Source on GitHub" <url> line after each </exercise>.

    For every line containing ``</exercise>``, a ``<url>`` line is written
    directly after it, indented like the closing line. The link points at the
    line where the exercise opens, numbered as the file appears on GitHub
    (that is, without counting our own local-only ``<url>`` lines). An
    existing GitHub source ``<url>`` that follows the closing tag (blank lines
    in between are allowed) is replaced rather than duplicated.

    Args:
        path: XML file to process.
        root: Repository root used to build the relative URL path.
        owner: GitHub owner used in the URL.
        repo: GitHub repository name used in the URL and to recognise
            existing source ``<url>`` lines.
        branch: Branch name used in the URL.
        dry_run: When true, compute the result but do not write the file.

    Returns:
        The number of ``<url>`` lines newly inserted; existing lines that
        were only rewritten are not counted.
    """
    # newline="" turns off newline translation so CRLF files keep their
    # terminators, and readlines() splits only on real line endings (unlike
    # str.splitlines, which also splits on form feeds, U+2028, ...).
    with path.open(encoding="utf-8", newline="") as fh:
        lines = fh.readlines()

    # First pass: map physical line numbers to "virtual" GitHub line numbers,
    # ignoring any existing GitHub source <url> lines (which never exist on GitHub).
    phys_to_virtual: list[int | None] = []
    virtual_line = 0
    for line in lines:
        if is_github_source_url_line(line, repo):
            phys_to_virtual.append(None)
        else:
            virtual_line += 1
            phys_to_virtual.append(virtual_line)

    # Second pass: locate every exercise, its opening line, and any existing
    # GitHub source <url> right after it.
    exercises: list[dict] = []
    for close_idx, line in enumerate(lines):
        if "</exercise>" not in line:
            continue

        open_idx = find_exercise_open_line(lines, close_idx)
        exercise_line_number = None if open_idx is None else phys_to_virtual[open_idx]

        # Look ahead, past blank lines, for an existing GitHub source <url> line.
        j = close_idx + 1
        while j < len(lines) and lines[j].strip() == "":
            j += 1

        existing_url_index = None
        if j < len(lines) and is_github_source_url_line(lines[j], repo):
            existing_url_index = j

        exercises.append(
            {
                "close_idx": close_idx,
                "open_idx": open_idx,
                "exercise_line_number": exercise_line_number,
                "existing_url_index": existing_url_index,
            }
        )

    if not exercises:
        return 0

    # Build fast lookup tables for rewriting.
    close_to_ex = {e["close_idx"]: e for e in exercises}
    url_to_ex = {
        e["existing_url_index"]: e
        for e in exercises
        if e["existing_url_index"] is not None
    }

    # Terminator to use when a closing line has none of its own (it is the
    # last line of a file that does not end with a newline).
    default_eol = "\r\n" if lines[0].endswith("\r\n") else "\n"

    # Third pass: rebuild the file contents with corrected URLs and without
    # counting our local-only <url> lines toward GitHub line numbers.
    new_lines: list[str] = []
    insert_count = 0

    for idx, line in enumerate(lines):
        # Skip existing GitHub source URL lines; we'll reinsert/update them at
        # the matching </exercise> line.
        if idx in url_to_ex:
            continue

        if idx not in close_to_ex:
            new_lines.append(line)
            continue

        ex = close_to_ex[idx]

        # Reuse this line's own terminator for the inserted line.
        eol = line[len(line.rstrip("\r\n")):]
        if not eol:
            # Terminate the closing line first so the <url> starts on a new line.
            eol = default_eol
            line = line + eol

        # Always keep the closing </exercise> line.
        new_lines.append(line)

        # Indent the <url> line exactly like the closing tag.
        indent = line[: len(line) - len(line.lstrip(" \t"))]

        github_url = build_github_url(
            owner, repo, branch, root, path, ex["exercise_line_number"]
        )
        new_lines.append(f'{indent}<url href="{github_url}">Source on GitHub</url>{eol}')

        if ex["existing_url_index"] is None:
            insert_count += 1

    if not dry_run and new_lines != lines:
        # newline="" again: write the terminators exactly as assembled above.
        with path.open("w", encoding="utf-8", newline="") as fh:
            fh.writelines(new_lines)

    return insert_count
136+
137+
def main():
    """Command-line entry point.

    Parses the options, then processes either the single file given with
    ``--file`` or every ``*.xml`` file found under ``--root``, printing a
    summary of how many files and ``<url>`` tags were added.

    Raises:
        SystemExit: With an error message (exit status 1) when ``--file``
            does not exist or is not located inside ``--root``.
    """
    parser = argparse.ArgumentParser(
        description="Add a GitHub <url> after every </exercise> in XML files, including a line-number anchor."
    )
    parser.add_argument(
        "--root",
        type=str,
        default=".",
        help="Root of the repository (default: current directory).",
    )
    parser.add_argument(
        "--owner",
        type=str,
        default="PreTeXtBook",
        help="GitHub owner/user name (default: PreTeXtBook).",
    )
    parser.add_argument(
        "--repo",
        type=str,
        default="pretext",
        help="GitHub repository name (default: pretext).",
    )
    parser.add_argument(
        "--branch",
        type=str,
        default="master",
        help="Git branch name for links (default: master).",
    )
    parser.add_argument(
        "--file",
        type=str,
        help="Single XML file to process (relative to --root or absolute). "
        "If omitted, all XML files under --root are processed.",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Do not modify files, just report what would change.",
    )
    args = parser.parse_args()

    root = Path(args.root).resolve()
    prefix = "[DRY-RUN] " if args.dry_run else ""

    if args.file:
        target = Path(args.file)
        if not target.is_absolute():
            target = (root / target).resolve()
        if not target.exists():
            # SystemExit with a message prints it to stderr and exits with 1.
            raise SystemExit(f"Error: file not found: {target}")
        if not target.is_relative_to(root):
            # The URL is built from the path relative to root; fail clearly
            # here instead of with a ValueError traceback later.
            raise SystemExit(f"Error: file is not inside --root ({root}): {target}")
        if target.suffix != ".xml":
            print(f"Warning: file does not have .xml suffix: {target}", file=sys.stderr)

        inserts = process_file(target, root, args.owner, args.repo, args.branch, args.dry_run)
        print(f"{prefix}Updated {target}: added {inserts} <url> tag(s)")
        print(f"Done. Files changed: {1 if inserts else 0}, <url> tags added: {inserts}")
        return

    # No --file: process all XML files under root
    total_files = 0
    total_inserts = 0
    for dirpath, _, filenames in os.walk(root):
        for fname in filenames:
            if not fname.endswith(".xml"):
                continue
            fpath = Path(dirpath, fname)
            try:
                inserts = process_file(
                    fpath, root, args.owner, args.repo, args.branch, args.dry_run
                )
            except (OSError, UnicodeDecodeError) as exc:
                # One unreadable or non-UTF-8 file should not abort the run.
                print(f"Warning: skipping {fpath}: {exc}", file=sys.stderr)
                continue
            if inserts:
                total_files += 1
                total_inserts += inserts
                print(f"{prefix}Updated {fpath}: added {inserts} <url> tag(s)")

    print(f"Done. Files changed: {total_files}, <url> tags added: {total_inserts}")
214+
215+
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()