Skip to content

Commit 4e4c30e

Browse files
committed
Add multi-line comment syntax parser and tests
1 parent 4662adb commit 4e4c30e

5 files changed

Lines changed: 799 additions & 163 deletions

File tree

Lines changed: 366 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,366 @@
1+
import type { AnnotationComment, AnnotationTag, SourceRange } from '../../core/types'
2+
import type { ParseParentCommentOptions } from '../parent-comment'
3+
import { escapeRegExp } from '../../internal/escaping'
4+
import { getTextContentInLine } from '../text-content'
5+
6+
/**
 * Describes one supported multi-line comment syntax by its delimiters.
 */
interface MultiLineCommentSyntax {
	/** The character sequence that starts the comment, e.g. `/*`. */
	opening: string
	/** The character sequence that ends the comment, e.g. `*​/`. */
	closing: string
	/**
	 * Optional pattern that every line between the opening and the closing line
	 * has to match for this syntax to be considered valid.
	 */
	continuationLineStart?: RegExp | undefined
}
11+
12+
/**
 * Bookkeeping entry that tracks which delimiters of a single comment syntax
 * have been found around the annotation tag so far.
 */
interface MultiLineCommentSyntaxMatch {
	/** Range covering only the opening delimiter. */
	openingRange: SourceRange
	/** Range covering the opening delimiter plus the whitespace directly around it. */
	openingRangeWithWhitespace: SourceRange
	/** Range covering only the closing delimiter. */
	closingRange: SourceRange
	/** Range covering the closing delimiter plus the whitespace directly around it. */
	closingRangeWithWhitespace: SourceRange
	/**
	 * Becomes `true` during the search for an opening/closing pair as soon as
	 * a requirement of the syntax is violated on at least one scanned line,
	 * e.g. when a required `continuationLineStart` sequence is missing.
	 */
	invalid: boolean
}
24+
25+
/**
 * All multi-line comment syntaxes known to the parser.
 *
 * The position of an entry in this list is also used as the index
 * into the per-syntax match bookkeeping arrays.
 */
const multiLineCommentSyntaxes: MultiLineCommentSyntax[] = [
	// JSDoc & JavaDoc: every continuation line is expected to begin with a `*`
	// that does not belong to the comment content
	{
		opening: '/**',
		closing: '*/',
		continuationLineStart: /^\s*\*(?=\s|$)/,
	},
	// Block comments of JS, TS, CSS, Java, C, C++, C#, Rust, Go, SQL and similar languages
	{
		opening: '/*',
		closing: '*/',
	},
	// Markup comments (HTML, XML)
	{
		opening: '<!--',
		closing: '-->',
	},
	// JSX & TSX expression comments, without and with a space inside the braces
	{
		opening: '{/*',
		closing: '*/}',
	},
	{
		opening: '{ /*',
		closing: '*/ }',
	},
	// Pascal, ML, F# and similar languages
	{
		opening: '(*',
		closing: '*)',
	},
	// Lua
	{
		opening: '--[[',
		closing: ']]',
	},
]
40+
41+
/**
 * Global regular expression that matches the opening delimiter of any supported syntax.
 * It is built once at module load and shared by all searches.
 */
const multiLineCommentOpeningRegex: RegExp = createCommentDelimiterRegExp(multiLineCommentSyntaxes, 'opening')

/**
 * Global regular expression that matches the closing delimiter of any supported syntax.
 * It is built once at module load and shared by all searches.
 */
const multiLineCommentClosingRegex: RegExp = createCommentDelimiterRegExp(multiLineCommentSyntaxes, 'closing')
43+
44+
/**
 * Tries to locate the multi-line comment that surrounds the given annotation tag
 * and to parse it into an annotation comment.
 *
 * The search walks backwards from the tag line to collect opening delimiters. After the
 * first opening was found, it walks forwards from the tag line (only once) to collect
 * closing delimiters. Whenever new delimiters were recorded, the collected matches are
 * checked for a valid opening/closing pair, and the first resulting comment is returned.
 *
 * Returns `undefined` if no valid pair was found, or if the forward scan did not
 * find any closing delimiter at all.
 *
 * See {@link parseParentComment} for more information.
 */
export function parseMultiLineParentComment(options: ParseParentCommentOptions): AnnotationComment | undefined {
	const { codeLines, tag } = options

	const tagLineIndex = tag.range.start.line
	const tagLine = codeLines[tagLineIndex]
	// Missing columns mean that the tag touches the beginning or the end of its line
	const tagStartColumn = tag.range.start.column ?? 0
	const tagEndColumn = tag.range.end.column ?? tagLine.length

	// One (initially empty) bookkeeping entry per supported comment syntax
	const commentSyntaxMatches: Partial<MultiLineCommentSyntaxMatch>[] = multiLineCommentSyntaxes.map(() => ({}))

	// Builds the comment from the current bookkeeping state if it contains a valid pair
	const resolveMatchingPair = () => getCommentFromMatchingSyntaxPair({ codeLines, tag, commentSyntaxMatches })

	// Walks forwards from the tag line to record closing delimiters,
	// stopping at the first line that completes a valid pair
	const scanForwardForClosings = (): { comment: AnnotationComment | undefined; sawClosing: boolean } => {
		let sawClosing = false
		for (let closingLineIndex = tagLineIndex; closingLineIndex < codeLines.length; closingLineIndex++) {
			const recordedNewClosings = findCommentSyntaxMatches({
				type: 'closing',
				commentSyntaxMatches,
				codeLines,
				lineIndex: closingLineIndex,
				// On the tag line, only the part after the tag can contain a closing
				startColumn: closingLineIndex === tagLineIndex ? tagEndColumn : undefined,
			})
			if (!recordedNewClosings) continue
			sawClosing = true
			const comment = resolveMatchingPair()
			if (comment) return { comment, sawClosing }
		}
		return { comment: undefined, sawClosing }
	}

	let forwardScanPending = true

	// Walk backwards from the tag line to record opening delimiters
	for (let openingLineIndex = tagLineIndex; openingLineIndex >= 0; openingLineIndex--) {
		const recordedNewOpenings = findCommentSyntaxMatches({
			type: 'opening',
			commentSyntaxMatches,
			codeLines,
			lineIndex: openingLineIndex,
			// On the tag line, only the part before the tag can contain an opening
			endColumn: openingLineIndex === tagLineIndex ? tagStartColumn : undefined,
		})
		if (!recordedNewOpenings) continue

		// The new opening may complete a pair with a closing that is already known
		const comment = resolveMatchingPair()
		if (comment) return comment

		// The forward scan for closings only needs to run a single time
		if (!forwardScanPending) continue
		forwardScanPending = false

		const forwardScan = scanForwardForClosings()
		if (forwardScan.comment) return forwardScan.comment
		// Without any closing delimiter, no opening can ever be completed to a pair
		if (!forwardScan.sawClosing) return undefined
	}

	return undefined
}
114+
115+
/**
 * Scans a single line for opening or closing sequences of the supported multi-line
 * comment syntaxes and records them in the `commentSyntaxMatches` array.
 *
 * For every delimiter found in the line, all entries whose syntax uses this delimiter
 * receive the delimiter range (with and without surrounding whitespace), unless the entry
 * is already marked as invalid or already has a range of the requested type.
 * Opening delimiters are processed from right to left, closing delimiters from left to right,
 * so that the delimiter closest to the tag is recorded first.
 *
 * Afterwards, the line is checked against the `continuationLineStart` requirement of each
 * syntax. Entries whose requirement is not met on this line are marked as `invalid`,
 * except if the line is the opening or closing line recorded for that entry.
 *
 * Returns `true` if at least one new range was recorded, or `false` otherwise.
 */
function findCommentSyntaxMatches(options: {
	type: 'opening' | 'closing'
	commentSyntaxMatches: Partial<MultiLineCommentSyntaxMatch>[]
	codeLines: string[]
	lineIndex: number
	startColumn?: number | undefined
	endColumn?: number | undefined
}): boolean {
	const { type, commentSyntaxMatches, codeLines, lineIndex, startColumn = 0, endColumn } = options
	const line = codeLines[lineIndex]
	const isOpening = type === 'opening'

	// Names of the properties that receive the ranges for the requested delimiter type
	const delimiterProp = isOpening ? 'openingRange' : 'closingRange'
	const whitespaceProp = isOpening ? 'openingRangeWithWhitespace' : 'closingRangeWithWhitespace'

	// Collect all delimiters of the requested type in the line
	const sequences = findAllCommentDelimiters(isOpening ? multiLineCommentOpeningRegex : multiLineCommentClosingRegex, line, startColumn, endColumn)
	// Openings are searched before the tag, so the last one in the line is the closest one
	if (isOpening) sequences.reverse()

	let foundNewMatches = false
	for (let sequenceIndex = 0; sequenceIndex < sequences.length; sequenceIndex++) {
		const sequence = sequences[sequenceIndex]
		const delimiterStart = sequence.index
		const delimiterEnd = delimiterStart + sequence.delimiter.length

		for (let syntaxIndex = 0; syntaxIndex < commentSyntaxMatches.length; syntaxIndex++) {
			const match = commentSyntaxMatches[syntaxIndex]
			// Entries that are invalid or already have this range keep their current state
			if (match.invalid || match[delimiterProp]) continue
			// Entries of syntaxes that use a different delimiter are not affected
			if (multiLineCommentSyntaxes[syntaxIndex][type] !== sequence.delimiter) continue

			match[delimiterProp] = createRange({
				line,
				lineIndex,
				startColumn: delimiterStart,
				endColumn: delimiterEnd,
			})
			match[whitespaceProp] = createRange({
				line,
				lineIndex,
				startColumn: delimiterStart - sequence.leadingWhitespace.length,
				endColumn: delimiterEnd + sequence.trailingWhitespace.length,
			})
			foundNewMatches = true
		}
	}

	// Check the per-line requirements of all syntaxes that are still valid
	for (let syntaxIndex = 0; syntaxIndex < commentSyntaxMatches.length; syntaxIndex++) {
		const match = commentSyntaxMatches[syntaxIndex]
		if (match.invalid) continue

		const requiredLineStart = multiLineCommentSyntaxes[syntaxIndex].continuationLineStart
		if (!requiredLineStart) continue

		// The requirement does not apply to the opening and closing lines themselves
		if (lineIndex === match.openingRange?.start.line) continue
		if (lineIndex === match.closingRange?.start.line) continue

		if (!line.match(requiredLineStart)) match.invalid = true
	}

	return foundNewMatches
}
195+
196+
/**
 * Looks through the `commentSyntaxMatches` array for entries that form a complete and valid
 * pair of opening and closing delimiters, selects one of them, and builds the comment from it.
 *
 * If several entries qualify, a later entry replaces the current selection when
 * `compareRanges` returns `> 0` for the ends of the opening ranges, or `< 0` for the starts
 * of the closing ranges. The intent of this rule is to select the innermost pair.
 *
 * The comment contents are collected line by line, beginning on the line where the tag ends
 * and stopping on the line where the inner comment range ends. Empty lines at the beginning
 * and end of the collected contents are dropped.
 *
 * Returns `undefined` if no entry is a valid full match.
 */
function getCommentFromMatchingSyntaxPair(options: {
	codeLines: string[]
	tag: AnnotationTag
	commentSyntaxMatches: Partial<MultiLineCommentSyntaxMatch>[]
}): AnnotationComment | undefined {
	const { codeLines, tag, commentSyntaxMatches } = options

	// Determine the index of the pair to use (or -1 if there is none)
	let bestMatchIndex = -1
	for (let index = 0; index < commentSyntaxMatches.length; index++) {
		const candidate = commentSyntaxMatches[index]
		// Entries that are not a valid pair (yet) cannot be selected
		if (!isValidFullMatch(candidate)) continue

		// The first valid pair is selected unconditionally
		if (bestMatchIndex === -1) {
			bestMatchIndex = index
			continue
		}

		const currentBest = commentSyntaxMatches[bestMatchIndex] as MultiLineCommentSyntaxMatch
		const candidateIsBetter =
			// Intended rule: the candidate's opening sequence ends after the selected one,
			compareRanges(currentBest.openingRange, candidate.openingRange, 'end') > 0 ||
			// ...or its closing sequence starts before the selected one
			compareRanges(currentBest.closingRange, candidate.closingRange, 'start') < 0
		if (candidateIsBetter) bestMatchIndex = index
	}

	if (bestMatchIndex === -1) return undefined

	const match = commentSyntaxMatches[bestMatchIndex] as MultiLineCommentSyntaxMatch
	const syntax = multiLineCommentSyntaxes[bestMatchIndex]

	// A defined end column means that the closing delimiter is followed by more content
	const startsAndEndsOnSameLine = match.openingRange.start.line === match.closingRange.end.line
	const isOnSingleLineBeforeCode = startsAndEndsOnSameLine && Boolean(match.closingRangeWithWhitespace.end.column)

	const commentRange: SourceRange = {
		start: isOnSingleLineBeforeCode ? match.openingRange.start : match.openingRangeWithWhitespace.start,
		end: match.closingRangeWithWhitespace.end,
	}
	// The area between the delimiters, excluding the whitespace around them
	const innerRange: SourceRange = {
		start: match.openingRangeWithWhitespace.end,
		end: match.closingRangeWithWhitespace.start,
	}

	const collectedContents: string[] = []
	const collectedRanges: SourceRange[] = []
	const firstContentLineIndex = tag.range.end.line
	const lastContentLineIndex = innerRange.end.line

	for (let lineIndex = firstContentLineIndex; lineIndex <= lastContentLineIndex; lineIndex++) {
		const line = codeLines[lineIndex]

		let startColumn: number | undefined
		if (lineIndex === firstContentLineIndex) {
			// Content on the tag line begins right after the tag
			startColumn = tag.range.end.column
		} else if (lineIndex === innerRange.start.line) {
			startColumn = innerRange.start.column ?? line.length
		} else {
			startColumn = 0
		}

		let endColumn: number
		if (lineIndex === lastContentLineIndex) {
			endColumn = innerRange.end.column ?? 0
		} else {
			endColumn = line.length
		}

		const lineContent = getTextContentInLine({
			codeLines,
			lineIndex,
			startColumn,
			endColumn,
			continuationLineStart: syntax.continuationLineStart,
		})
		collectedContents.push(lineContent.content)
		collectedRanges.push(lineContent.contentRange)
	}

	// Determine the slice that remains after dropping empty lines at both ends
	let firstKept = 0
	while (firstKept < collectedContents.length && !collectedContents[firstKept].length) firstKept++
	let endKept = collectedContents.length
	while (endKept > firstKept && !collectedContents[endKept - 1].length) endKept--

	return {
		tag,
		contents: collectedContents.slice(firstKept, endKept),
		commentRange,
		contentRanges: collectedRanges.slice(firstKept, endKept),
		targetRanges: [],
	}
}
282+
283+
/**
 * Builds a global regular expression that matches any opening or closing delimiter
 * (depending on `delimiterType`) of the given comment syntaxes.
 *
 * A delimiter only matches if it is preceded by the beginning of the line or whitespace,
 * and followed by the end of the line or whitespace. The capture groups are:
 * 1. the whitespace before the delimiter (if any),
 * 2. the delimiter itself,
 * 3. the whitespace after the delimiter (if any).
 */
function createCommentDelimiterRegExp(syntaxes: MultiLineCommentSyntax[], delimiterType: 'opening' | 'closing') {
	// Escape the delimiters for use in a pattern and drop duplicates
	const escapedDelimiters = new Set<string>()
	for (const syntax of syntaxes) {
		escapedDelimiters.add(escapeRegExp(syntax[delimiterType]))
	}

	// Put longer alternatives first so that they take precedence over their shorter prefixes
	const alternatives = Array.from(escapedDelimiters)
		.sort((first, second) => second.length - first.length)
		.join('|')

	// Either the beginning of the line or required whitespace (captured)
	const leadingBoundary = '(?<=^|(\\s+))'
	// Either the end of the line or required whitespace (captured)
	const trailingBoundary = '(?=$|(\\s+))'

	return new RegExp(`${leadingBoundary}(${alternatives})${trailingBoundary}`, 'g')
}
298+
299+
/**
 * Finds all matches of the given comment delimiter regular expression in the given line,
 * including partially overlapping matches.
 *
 * @param regExp Regular expression with the `g` flag that captures the leading whitespace,
 * the delimiter and the trailing whitespace in its groups 1 to 3. Its `lastIndex` is
 * overwritten during the search and reset to `0` before returning.
 * @param line The line to search.
 * @param startColumn Column to start searching at. Defaults to `0` if undefined.
 * @param endColumn Column that no returned delimiter may extend beyond. If undefined,
 * the search covers the rest of the line. A value of `0` is a valid limit
 * and results in no matches.
 * @returns An array of match objects in the order of their occurrence that each contain
 * an index, the leading whitespace, the delimiter and the trailing whitespace.
 */
function findAllCommentDelimiters(regExp: RegExp, line: string, startColumn: number | undefined, endColumn: number | undefined) {
	const matches: { index: number; leadingWhitespace: string; delimiter: string; trailingWhitespace: string }[] = []
	let match: RegExpExecArray | null
	regExp.lastIndex = startColumn ?? 0
	while ((match = regExp.exec(line)) !== null) {
		// Groups that did not participate in the match are undefined
		const [, leadingWhitespace = '', delimiter = '', trailingWhitespace = ''] = match
		// Compare against `undefined` explicitly, as `0` is a valid limit
		// (e.g. when searching for openings before a tag located at the start of its line)
		if (endColumn !== undefined && match.index + delimiter.length > endColumn) break
		matches.push({
			index: match.index,
			leadingWhitespace,
			delimiter,
			trailingWhitespace,
		})
		// Continue right after the start of this match to also find overlapping matches
		regExp.lastIndex = match.index + 1
	}
	// Do not leave any search state behind on the (possibly shared) regular expression
	regExp.lastIndex = 0
	return matches
}
323+
324+
/**
 * Checks whether the given bookkeeping entry describes a complete and valid
 * opening/closing pair of a multi-line comment.
 *
 * An entry is rejected if it is marked as invalid, if its opening or closing range
 * is missing, or if it violates one of these placement rules:
 * - A comment that starts and ends on the same line must not have code on both sides.
 * - A comment that spans multiple lines must not have code before its opening
 *   or after its closing.
 */
function isValidFullMatch(match: Partial<MultiLineCommentSyntaxMatch>): match is MultiLineCommentSyntaxMatch {
	const { invalid, openingRange, closingRange, openingRangeWithWhitespace, closingRangeWithWhitespace } = match

	if (invalid) return false
	if (!openingRange || !closingRange) return false

	// Columns are only set on ranges if they differ from the beginning or end of the line,
	// so a defined column indicates that there is other content on that side
	const codeBeforeOpening = openingRangeWithWhitespace?.start.column !== undefined
	const codeAfterClosing = closingRangeWithWhitespace?.end.column !== undefined

	const isSingleLine = openingRange.start.line === closingRange.end.line
	return isSingleLine ? !(codeBeforeOpening && codeAfterClosing) : !(codeBeforeOpening || codeAfterClosing)
}
342+
343+
/**
 * Creates a source range located on a single line.
 *
 * Columns are only added to the range if they differ from the line boundaries:
 * The start column is omitted if it is not greater than `0`, and the end column
 * is omitted if it is not smaller than the length of the line.
 */
function createRange(options: { line: string; lineIndex: number; startColumn: number; endColumn: number }) {
	const { line, lineIndex, startColumn, endColumn } = options
	const startsAfterLineStart = startColumn > 0
	const endsBeforeLineEnd = endColumn < line.length

	const result: SourceRange = {
		start: { line: lineIndex },
		end: { line: lineIndex },
	}
	if (startsAfterLineStart) result.start.column = startColumn
	if (endsBeforeLineEnd) result.end.column = endColumn
	return result
}
353+
354+
/**
 * Compares two source ranges by their start or end locations.
 *
 * Lines are compared first. If both locations are on the same line, their columns are
 * compared. A missing column is treated as the beginning of the line when comparing
 * start locations, and as the end of the line when comparing end locations.
 *
 * @param a The first range.
 * @param b The second range.
 * @param prop Which location of both ranges to compare.
 *
 * Returns:
 * - `> 0` if the second location is **greater than** (comes after) the first,
 * - `< 0` if the second location is **smaller than** (comes before) the first, or
 * - `0` if they are equal.
 */
function compareRanges(a: SourceRange, b: SourceRange, prop: 'start' | 'end'): number {
	// Subtract the first from the second location so that the result is
	// positive if the second location comes after the first
	const lineDifference = b[prop].line - a[prop].line
	if (lineDifference !== 0) return lineDifference

	// Columns are only set if they differ from the respective line boundary
	const missingColumnValue = prop === 'start' ? 0 : Number.POSITIVE_INFINITY
	const aCol = a[prop].column ?? missingColumnValue
	const bCol = b[prop].column ?? missingColumnValue

	// Compare instead of subtracting to avoid `NaN` if both columns are missing
	if (aCol === bCol) return 0
	return bCol > aCol ? 1 : -1
}

0 commit comments

Comments
 (0)