@@ -15,22 +15,159 @@ fun preprocessMarkdown(markdown: String, baseUrl: String): String {
1515 }
1616
/**
 * Reports whether [url] looks like it points at an SVG resource.
 *
 * The comparison is case-insensitive. A URL qualifies when it ends in `.svg`,
 * or when it contains `.svg?`, `.svg#`, `/svg-badge` or `badge.svg` anywhere.
 */
fun isSvgUrl(url: String): Boolean {
    val normalized = url.lowercase()
    if (normalized.endsWith(".svg")) return true

    // Substrings that mark an SVG even when the URL does not end in ".svg".
    val svgMarkers = listOf(".svg?", ".svg#", "/svg-badge", "badge.svg")
    return svgMarkers.any { marker -> marker in normalized }
}
25+
/**
 * Reports whether [url] looks like a status/CI badge image.
 *
 * The comparison is case-insensitive. A URL qualifies when it contains one of
 * the known badge-service markers below, or when it contains `/badge` and is
 * also recognised by [isSvgUrl].
 */
fun isBadgeUrl(url: String): Boolean {
    val normalized = url.lowercase()

    // Substrings identifying badge providers; a hit anywhere in the URL counts.
    val badgeMarkers = listOf(
        "img.shields.io",
        "shields.io/badge",
        "badge.fury.io",
        "badgen.net",
        "repology.org/badge",
        "hosted.weblate.org/widget",
        "codecov.io",
        "coveralls.io",
        "travis-ci.",
        "circleci.com",
        "github.com/workflows",
    )
    if (badgeMarkers.any { marker -> marker in normalized }) return true

    // Generic fallback: any "/badge" path that is also an SVG.
    return "/badge" in normalized && isSvgUrl(normalized)
}
41+
/** Reports whether the image at [url] should be skipped: true for SVGs ([isSvgUrl]) and badges ([isBadgeUrl]). */
fun shouldSkipImage(url: String): Boolean = isSvgUrl(url) || isBadgeUrl(url)
2245
/**
 * Resolves an image or link target found in the markdown to the URL that should be emitted.
 *
 * - `http://`, `https://` and `data:` targets (scheme matched case-insensitively) are passed
 *   through `normalizeGitHubUrl`.
 * - Protocol-relative targets (`//host/path`) are given an `https:` scheme and then normalized.
 * - Fragment-only targets (`#section`) and targets carrying any other URI scheme
 *   (`mailto:`, `tel:`, `ftp:`, ...) are returned unchanged; they are not paths inside the
 *   repository, so prefixing them with the base URL would corrupt them.
 * - Everything else is resolved against `normalizedBaseUrl`: leading `./` segments are dropped,
 *   each leading `../` removes one trailing path segment from the base (never climbing above
 *   `scheme://authority`), and a single leading `/` is treated as the base root.
 *
 * NOTE(review): plain concatenation assumes `normalizedBaseUrl` ends with `/` — confirm at its
 * definition, which is outside this view.
 *
 * @param path raw target as written in the document; surrounding whitespace is ignored.
 * @return the resolved URL, or [path] trimmed when it must not be rewritten.
 */
fun resolveUrl(path: String): String {
    val trimmed = path.trim()

    // URI schemes are case-insensitive (RFC 3986 §3.1), so "HTTPS://…" is absolute too.
    val isAbsolute = trimmed.startsWith("http://", ignoreCase = true) ||
        trimmed.startsWith("https://", ignoreCase = true) ||
        trimmed.startsWith("data:", ignoreCase = true)
    if (isAbsolute) return normalizeGitHubUrl(trimmed)

    // Protocol-relative reference: it names another host, not a path under the base.
    if (trimmed.startsWith("//")) return normalizeGitHubUrl("https:$trimmed")

    // Same-document anchors stay as they are.
    if (trimmed.startsWith("#")) return trimmed

    // Any other "scheme:" prefix (mailto:, tel:, ftp:, …) is already a complete URI.
    // At least two scheme characters are required so a one-letter prefix such as "c:"
    // still falls through to relative handling.
    val colon = trimmed.indexOf(':')
    val hasScheme = colon >= 2 &&
        (trimmed[0] in 'a'..'z' || trimmed[0] in 'A'..'Z') &&
        trimmed.substring(0, colon).all { c ->
            c in 'a'..'z' || c in 'A'..'Z' || c in '0'..'9' || c == '+' || c == '-' || c == '.'
        }
    if (hasScheme) return trimmed

    // "./" segments are no-ops; dropping them first lets "./../img.png" climb correctly.
    var rel = trimmed
    while (rel.startsWith("./")) {
        rel = rel.removePrefix("./")
    }

    return when {
        rel.startsWith("../") -> {
            var base = normalizedBaseUrl.trimEnd('/')
            // Index just past "scheme://"; slashes before it belong to the scheme separator
            // and must never be cut, otherwise excess "../" would eat the host.
            val floor = base.indexOf("://").let { if (it >= 0) it + 3 else 0 }
            while (rel.startsWith("../")) {
                val cut = base.lastIndexOf('/')
                if (cut >= floor) {
                    base = base.substring(0, cut)
                }
                rel = rel.removePrefix("../")
            }
            "$base/$rel"
        }

        rel.startsWith("/") -> "$normalizedBaseUrl${rel.removePrefix("/")}"

        else -> "$normalizedBaseUrl$rel"
    }
}
71+
72+ // ========================================================================
73+ // Phase 0: Handle reference-style markdown definitions and usages
74+ // ========================================================================
75+ // Reference definitions: [ref-name]: https://example.com/image.svg
76+ // Reference usages: ![alt][ref-name] or [![img-ref]][link-ref]
77+
// 0a. Parse all reference definitions
//   - `(?!\^)` skips footnote definitions (`[^note]: text`). They share the
//     `[label]:` shape but are not link references; treating them as such would
//     register their first word as a URL and let step 0g delete the footnote text.
//   - `<?` / `>?` unwrap angle-bracket destinations (`[ref]: <https://example.com>`)
//     so the captured URL never carries the brackets.
val refDefinitionRegex = Regex(
    """^\[(?!\^)([^\]]+)\]:\s*<?([^\s>]+)>?.*$""",
    RegexOption.MULTILINE
)
val referenceMap = mutableMapOf<String, String>()
for (match in refDefinitionRegex.findAll(processed)) {
    val refName = match.groupValues[1].lowercase()
    val url = match.groupValues[2]
    // When a label is defined more than once, the first definition wins (CommonMark).
    referenceMap.getOrPut(refName) { url }
}
89+
90+ // 0b. Identify which references point to SVGs/badges
91+ val skipRefNames = referenceMap.filter { (_, url) ->
92+ shouldSkipImage(resolveUrl(url))
93+ }.keys
94+
95+ // 0c. Remove reference-style image usages that point to SVGs: ![alt][svg-ref]
96+ if (skipRefNames.isNotEmpty()) {
97+ processed = processed.replace(
98+ Regex (""" !\[([^\]]*)\]\[([^\]]+)\]""" )
99+ ) { match ->
100+ val alt = match.groupValues[1 ]
101+ val refName = match.groupValues[2 ].lowercase()
102+ if (refName in skipRefNames) {
103+ if (alt.isNotEmpty()) " **$alt **" else " "
104+ } else {
105+ match.value
106+ }
107+ }
108+ }
109+
110+ // 0d. Resolve remaining reference-style images to inline format: ![alt][ref] → ![alt](url)
111+ processed = processed.replace(
112+ Regex (""" !\[([^\]]*)\]\[([^\]]+)\]""" )
113+ ) { match ->
114+ val alt = match.groupValues[1 ]
115+ val refName = match.groupValues[2 ].lowercase()
116+ val url = referenceMap[refName]
117+ if (url != null ) {
118+ val resolved = resolveUrl(url)
119+ " "
120+ } else {
121+ match.value
122+ }
123+ }
124+
125+ // 0e. Handle nested badge-as-link patterns: [![badge-ref]][link-ref]
126+ // After 0c strips the inner image, this can leave [**text**][link-ref] or [][link-ref]
127+ processed = processed.replace(
128+ Regex (""" \[(\*\*[^*]*\*\*)\]\[([^\]]+)\]""" )
129+ ) { match ->
130+ val boldText = match.groupValues[1 ]
131+ val refName = match.groupValues[2 ].lowercase()
132+ val url = referenceMap[refName]
133+ if (url != null ) {
134+ " [$boldText ](${resolveUrl(url)} )"
29135 } else {
30- val cleaned = path.trim().trimStart(' .' , ' /' )
31- " $normalizedBaseUrl$cleaned "
136+ boldText
32137 }
33138 }
139+ // Clean empty bracket patterns left from stripped badge images: [][ref]
140+ processed = processed.replace(
141+ Regex (""" \[\s*\]\[([^\]]+)\]""" ),
142+ " "
143+ )
144+
145+ // 0f. Handle reference-style links: [text][ref] → [text](url)
146+ processed = processed.replace(
147+ Regex (""" \[([^\]]+)\]\[([^\]]+)\]""" )
148+ ) { match ->
149+ val text = match.groupValues[1 ]
150+ val refName = match.groupValues[2 ].lowercase()
151+ val url = referenceMap[refName]
152+ // Don't convert if text looks like it was already an image (starts with !)
153+ if (url != null && ! text.startsWith(" !" )) {
154+ " [$text ](${resolveUrl(url)} )"
155+ } else {
156+ match.value
157+ }
158+ }
159+
160+ // 0g. Remove all reference definitions that were resolved
161+ processed = processed.replace(
162+ Regex (""" ^\[([^\]]+)\]:\s*\S+.*$""" , RegexOption .MULTILINE )
163+ ) { match ->
164+ val refName = match.groupValues[1 ].lowercase()
165+ if (refName in referenceMap) " " else match.value
166+ }
167+
168+ // ========================================================================
169+ // Phase 1: HTML → Markdown conversions
170+ // ========================================================================
34171
35172 // 1. Unwrap <picture> elements → keep only the <img> fallback
36173 processed = processed.replace(
@@ -57,25 +194,25 @@ fun preprocessMarkdown(markdown: String, baseUrl: String): String {
57194 match.groupValues[1 ]
58195 }
59196
60- // 3. Convert <img> tags → markdown images
197+ // 3. Convert <img> tags → markdown images (handles multiline img tags)
61198 processed = processed.replace(
62199 Regex (
63200 """ <img\s+([^>]*?)\s*/?>""" ,
64- RegexOption .IGNORE_CASE
201+ setOf ( RegexOption .IGNORE_CASE , RegexOption . DOT_MATCHES_ALL )
65202 )
66203 ) { imgMatch ->
67204 val imgTag = imgMatch.groupValues[1 ]
68205
69- val srcMatch = Regex (""" src= (["'])([^"']+)\1""" ).find(imgTag)
206+ val srcMatch = Regex (""" src\s*=\s* (["'])([^"']+)\1""" ).find(imgTag)
70207 val src = srcMatch?.groupValues?.get(2 ) ? : " "
71208
72- val altMatch = Regex (""" alt= (["'])([^"']*)\1""" ).find(imgTag)
209+ val altMatch = Regex (""" alt\s*=\s* (["'])([^"']*)\1""" ).find(imgTag)
73210 val alt = altMatch?.groupValues?.get(2 ) ? : " "
74211
75212 if (src.isNotEmpty()) {
76213 val normalizedSrc = resolveUrl(src)
77214
78- if (isSvgUrl (normalizedSrc)) {
215+ if (shouldSkipImage (normalizedSrc)) {
79216 if (alt.isNotEmpty()) " **$alt **" else " "
80217 } else {
81218 " "
@@ -90,10 +227,10 @@ fun preprocessMarkdown(markdown: String, baseUrl: String): String {
90227 Regex (""" !\[([^\]]*)\]\(([^)]+)\)""" )
91228 ) { match ->
92229 val alt = match.groupValues[1 ]
93- val originalPath = match.groupValues[2 ]
230+ val originalPath = match.groupValues[2 ].trim()
94231 val finalUrl = resolveUrl(originalPath)
95232
96- if (isSvgUrl (finalUrl)) {
233+ if (shouldSkipImage (finalUrl)) {
97234 if (alt.isNotEmpty()) " **$alt **" else " "
98235 } else {
99236 " "
@@ -108,7 +245,7 @@ fun preprocessMarkdown(markdown: String, baseUrl: String): String {
108245 )
109246 ) { match ->
110247 val src = match.groupValues[2 ]
111- " [Video]($src )"
248+ " [Video](${resolveUrl( src)} )"
112249 }
113250 // Video with <source> inside
114251 processed = processed.replace(
@@ -118,7 +255,7 @@ fun preprocessMarkdown(markdown: String, baseUrl: String): String {
118255 )
119256 ) { match ->
120257 val src = match.groupValues[2 ]
121- " [Video]($src )"
258+ " [Video](${resolveUrl( src)} )"
122259 }
123260
124261 // 6. Convert HTML headings <h1>–<h6> → markdown headings
@@ -186,7 +323,7 @@ fun preprocessMarkdown(markdown: String, baseUrl: String): String {
186323 // 9. Convert <a href="url">text</a> → [text](url) (non-image links)
187324 processed = processed.replace(
188325 Regex (
189- """ <a\s+[^>]*?href= (["'])([^"']+)\1[^>]*>(.*?)</a>""" ,
326+ """ <a\s+[^>]*?href\s*=\s* (["'])([^"']+)\1[^>]*>(.*?)</a>""" ,
190327 setOf (RegexOption .IGNORE_CASE , RegexOption .DOT_MATCHES_ALL )
191328 )
192329 ) { match ->
@@ -210,9 +347,9 @@ fun preprocessMarkdown(markdown: String, baseUrl: String): String {
210347 }
211348
212349 // 11. Strip remaining wrapper tags (keep content)
213- // <div align="center"> and </div>
350+ // <div> tags
214351 processed = processed.replace(
215- Regex (""" <div[^>]*?align=["']center["'][^>]*? >\s*""" , RegexOption .IGNORE_CASE ),
352+ Regex (""" <div[^>]*?>\s*""" , RegexOption .IGNORE_CASE ),
216353 " \n\n "
217354 )
218355 processed = processed.replace(
@@ -238,7 +375,10 @@ fun preprocessMarkdown(markdown: String, baseUrl: String): String {
238375 " \n "
239376 )
240377 processed = processed.replace(
241- Regex (""" <summary[^>]*?>(.*?)</summary>""" , setOf (RegexOption .IGNORE_CASE , RegexOption .DOT_MATCHES_ALL ))
378+ Regex (
379+ """ <summary[^>]*?>(.*?)</summary>""" ,
380+ setOf (RegexOption .IGNORE_CASE , RegexOption .DOT_MATCHES_ALL )
381+ )
242382 ) { match ->
243383 " **${match.groupValues[1 ].trim()} **\n "
244384 }
@@ -247,6 +387,14 @@ fun preprocessMarkdown(markdown: String, baseUrl: String): String {
247387 Regex (""" </?(?:span|sup|sub)[^>]*?>""" , RegexOption .IGNORE_CASE ),
248388 " "
249389 )
390+ // Strip other common straggler HTML tags
391+ processed = processed.replace(
392+ Regex (
393+ """ </?(?:center|font|u|section|article|header|footer|nav|main|aside|figure|figcaption)[^>]*?>""" ,
394+ RegexOption .IGNORE_CASE
395+ ),
396+ " \n "
397+ )
250398
251399 // 12. Decode common HTML entities
252400 processed = processed
@@ -257,6 +405,15 @@ fun preprocessMarkdown(markdown: String, baseUrl: String): String {
257405 .replace(" '" , " '" )
258406 .replace(" '" , " '" )
259407 .replace(" " , " " )
408+ // Numeric HTML entities
409+ processed = processed.replace(Regex (""" &#(\d+);""" )) { match ->
410+ val code = match.groupValues[1 ].toIntOrNull()
411+ if (code != null && code in 32 .. 126 ) {
412+ code.toChar().toString()
413+ } else {
414+ match.value
415+ }
416+ }
260417
261418 // 13. Clean up empty <p> tags and excess newlines
262419 processed = processed.replace(
@@ -274,5 +431,5 @@ fun preprocessMarkdown(markdown: String, baseUrl: String): String {
274431 " "
275432 )
276433
277- return processed
434+ return processed.trim()
278435}
0 commit comments