expressive-code
diff --git a/‎packages/annotation-comments/src/parsers/comment-types/multi-line.ts‎
Lines changed: 366 additions & 0 deletions b/‎packages/annotation-comments/src/parsers/comment-types/multi-line.ts‎
Lines changed: 366 additions & 0 deletions
@@ -0,0 +1,366 @@
+import type { AnnotationComment, AnnotationTag, SourceRange } from '../../core/types'
+import type { ParseParentCommentOptions } from '../parent-comment'
+import { escapeRegExp } from '../../internal/escaping'
+import { getTextContentInLine } from '../text-content'
+
+type MultiLineCommentSyntax = {
+	opening: string
+	closing: string
+	continuationLineStart?: RegExp | undefined
+}
+
+type MultiLineCommentSyntaxMatch = {
+	openingRange: SourceRange
+	openingRangeWithWhitespace: SourceRange
+	closingRange: SourceRange
+	closingRangeWithWhitespace: SourceRange
+	/**
+	 * This flag gets set to `true` while looking for an opening/closing syntax pair
+	 * if the current syntax has requirements that were not met on at least one line,
+	 * e.g. a specific `continuationLineStart` syntax that was not found.
+	 */
+	invalid: boolean
+}
+
+const multiLineCommentSyntaxes: MultiLineCommentSyntax[] = [
+	// JSDoc, JavaDoc - a leading `*` that is not part of the content is expected on each new line
+	{ opening: '/**', closing: '*/', continuationLineStart: /^\s*\*(?=\s|$)/ },
+	// JS, TS, CSS, Java, C, C++, C#, Rust, Go, SQL, etc.
+	{ opening: '/*', closing: '*/' },
+	// HTML, XML
+	{ opening: '<!--', closing: '-->' },
+	// JSX, TSX
+	{ opening: '{/*', closing: '*/}' },
+	{ opening: '{ /*', closing: '*/ }' },
+	// Pascal, ML, F#, etc.
+	{ opening: '(*', closing: '*)' },
+	// Lua
+	{ opening: '--[[', closing: ']]' },
+]
+
+const multiLineCommentOpeningRegex = createCommentDelimiterRegExp(multiLineCommentSyntaxes, 'opening')
+const multiLineCommentClosingRegex = createCommentDelimiterRegExp(multiLineCommentSyntaxes, 'closing')
+
+/**
+ * Attempts to find and parse a multi-line comment that the given annotation tag is located in.
+ *
+ * See {@link parseParentComment} for more information.
+ */
+export function parseMultiLineParentComment(options: ParseParentCommentOptions): AnnotationComment | undefined {
+	const { codeLines, tag } = options
+
+	const tagLineIndex = tag.range.start.line
+	const tagLine = codeLines[tagLineIndex]
+	const tagStartColumn = tag.range.start.column ?? 0
+	const tagEndColumn = tag.range.end.column ?? tagLine.length
+
+	const commentSyntaxMatches: Partial<MultiLineCommentSyntaxMatch>[] = Array.from({ length: multiLineCommentSyntaxes.length }, () => ({}))
+	let scannedForClosings = false
+
+	// Check for a matching pair of beginning and ending multi-line comment syntaxes around the tag
+	// by first walking backwards from the tag to find potential opening sequences
+	// and then walking forwards to find matching closing sequences
+	for (let openingLineIndex = tagLineIndex; openingLineIndex >= 0; openingLineIndex--) {
+		if (
+			findCommentSyntaxMatches({
+				type: 'opening',
+				commentSyntaxMatches,
+				codeLines,
+				lineIndex: openingLineIndex,
+				endColumn: openingLineIndex === tagLineIndex ? tagStartColumn : undefined,
+			})
+		) {
+			// If we have a matching opening/closing pair now, return it
+			const comment = getCommentFromMatchingSyntaxPair({
+				codeLines,
+				tag,
+				commentSyntaxMatches,
+			})
+			if (comment) return comment
+
+			// Otherwise, walk forwards once to find all possible closing sequences,
+			// stopping early if a matching pair is found
+			if (!scannedForClosings) {
+				scannedForClosings = true
+				let foundAnyClosings = false
+				for (let closingLineIndex = tagLineIndex; closingLineIndex < codeLines.length; closingLineIndex++) {
+					if (
+						findCommentSyntaxMatches({
+							type: 'closing',
+							commentSyntaxMatches,
+							codeLines,
+							lineIndex: closingLineIndex,
+							startColumn: closingLineIndex === tagLineIndex ? tagEndColumn : undefined,
+						})
+					) {
+						foundAnyClosings = true
+						// If we have a matching opening/closing pair now, return it
+						const comment = getCommentFromMatchingSyntaxPair({
+							codeLines,
+							tag,
+							commentSyntaxMatches,
+						})
+						if (comment) return comment
+					}
+				}
+				// If we didn't find any closing sequences, there cannot be a matching pair
+				if (!foundAnyClosings) return undefined
+			}
+		}
+	}
+
+	return undefined
+}
+
+/**
+ * Searches the given line for any multi-line comment syntax opening or closing sequences,
+ * and updates the `commentSyntaxMatches` array accordingly.
+ *
+ * If any matches are found, checks the `commentSyntaxMatches` array for the matched syntax
+ * entries and adds the new opening or closing range in case it was undefined before.
+ * Matches are ordered to ensure the ones closest to the tag are processed first.
+ *
+ * Also validates the requirements of all syntaxes for the given line and sets the `invalid` flag
+ * if any requirements were not met (e.g. the `continuationLineStart` syntax was not found).
+ *
+ * Returns `true` if there were new matches, or `false` otherwise.
+ */
+function findCommentSyntaxMatches(options: {
+	type: 'opening' | 'closing'
+	commentSyntaxMatches: Partial<MultiLineCommentSyntaxMatch>[]
+	codeLines: string[]
+	lineIndex: number
+	startColumn?: number | undefined
+	endColumn?: number | undefined
+}): boolean {
+	const { type, commentSyntaxMatches, codeLines, lineIndex, startColumn = 0, endColumn } = options
+	const line = codeLines[lineIndex]
+
+	// Look for opening/closing sequences in the given line
+	const regex = type === 'opening' ? multiLineCommentOpeningRegex : multiLineCommentClosingRegex
+	const sequences = findAllCommentDelimiters(regex, line, startColumn, endColumn)
+	let foundNewMatches = false
+	if (sequences.length) {
+		// If we're looking for opening sequences, we need to reverse the matches
+		// to ensure we process the ones closest to the tag first
+		if (type === 'opening') sequences.reverse()
+
+		// Now go through the matches and update the `commentSyntaxMatches` array if needed
+		const delimiterProp = type === 'opening' ? 'openingRange' : 'closingRange'
+		const whitespaceProp = type === 'opening' ? 'openingRangeWithWhitespace' : 'closingRangeWithWhitespace'
+		sequences.forEach((sequence) => {
+			commentSyntaxMatches.forEach((match, index) => {
+				const syntax = multiLineCommentSyntaxes[index]
+				// Skip matches that are invalid or already have defined ranges
+				if (match.invalid || match[delimiterProp]) return
+				// Skip matches where the respective syntax differs from the current sequence
+				if (syntax[type] !== sequence.delimiter) return
+				// Otherwise, set the ranges and mark the array as updated
+				match[delimiterProp] = createRange({
+					line,
+					lineIndex,
+					startColumn: sequence.index,
+					endColumn: sequence.index + sequence.delimiter.length,
+				})
+				match[whitespaceProp] = createRange({
+					line,
+					lineIndex,
+					startColumn: sequence.index - sequence.leadingWhitespace.length,
+					endColumn: sequence.index + sequence.delimiter.length + sequence.trailingWhitespace.length,
+				})
+				foundNewMatches = true
+			})
+		})
+	}
+
+	// Validate all syntax requirements for the given line
+	commentSyntaxMatches.forEach((match, index) => {
+		const syntax = multiLineCommentSyntaxes[index]
+		if (
+			// Check matches that are still valid and that have a continuation line requirement
+			!match.invalid &&
+			syntax.continuationLineStart &&
+			// Only check the requirement on non-opening and non-closing lines
+			lineIndex !== match.openingRange?.start.line &&
+			lineIndex !== match.closingRange?.start.line &&
+			// If the line doesn't match the continuation syntax, mark the match as invalid
+			!line.match(syntax.continuationLineStart)
+		) {
+			match.invalid = true
+		}
+	})
+
+	return foundNewMatches
+}
+
+/**
+ * Checks the `commentSyntaxMatches` array for matching pairs of opening and closing
+ * multi-line comment syntaxes. If any pairs are found, determines the innermost one
+ * and returns the corresponding comment.
+ */
+function getCommentFromMatchingSyntaxPair(options: {
+	codeLines: string[]
+	tag: AnnotationTag
+	commentSyntaxMatches: Partial<MultiLineCommentSyntaxMatch>[]
+}): AnnotationComment | undefined {
+	const { codeLines, tag, commentSyntaxMatches } = options
+
+	const bestMatchIndex = commentSyntaxMatches.reduce((previousBestIndex, match, index) => {
+		// If the new match isn't a valid pair (yet?), skip it
+		if (!isValidFullMatch(match)) return previousBestIndex
+
+		// If we don't have a previous best pair yet, use the new match
+		if (previousBestIndex === -1) return index
+
+		const previousBestMatch = commentSyntaxMatches[previousBestIndex] as MultiLineCommentSyntaxMatch
+
+		// Check if the new match is a better pair than the previous one
+		if (
+			// It's better if its opening sequence ends after the previous one,
+			compareRanges(previousBestMatch.openingRange, match.openingRange, 'end') > 0 ||
+			// ...or if its closing sequence starts before the previous one
+			compareRanges(previousBestMatch.closingRange, match.closingRange, 'start') < 0
+		) {
+			return index
+		}
+
+		return previousBestIndex
+	}, -1)
+
+	if (bestMatchIndex > -1) {
+		// We found a matching opening/closing comment syntax pair,
+		// so build the AnnotationComment object and return it
+		const match = commentSyntaxMatches[bestMatchIndex] as MultiLineCommentSyntaxMatch
+		const syntax = multiLineCommentSyntaxes[bestMatchIndex]
+		const isOnSingleLineBeforeCode = match.openingRange.start.line === match.closingRange.end.line && match.closingRangeWithWhitespace.end.column
+		const commentRange: SourceRange = {
+			start: isOnSingleLineBeforeCode ? match.openingRange.start : match.openingRangeWithWhitespace.start,
+			end: match.closingRangeWithWhitespace.end,
+		}
+		const innerRange: SourceRange = {
+			start: match.openingRangeWithWhitespace.end,
+			end: match.closingRangeWithWhitespace.start,
+		}
+		const contents: string[] = []
+		const contentRanges: SourceRange[] = []
+
+		for (let lineIndex = tag.range.end.line; lineIndex <= innerRange.end.line; lineIndex++) {
+			const line = codeLines[lineIndex]
+			const startColumn = lineIndex === tag.range.end.line ? tag.range.end.column : lineIndex === innerRange.start.line ? (innerRange.start.column ?? line.length) : 0
+			const endColumn = lineIndex === innerRange.end.line ? (innerRange.end.column ?? 0) : line.length
+
+			const lineContent = getTextContentInLine({
+				codeLines,
+				lineIndex,
+				startColumn,
+				endColumn,
+				continuationLineStart: syntax.continuationLineStart,
+			})
+			contents.push(lineContent.content)
+			contentRanges.push(lineContent.contentRange)
+		}
+
+		// Remove empty lines from the beginning and end of the content arrays
+		while (contents.length && !contents[0].length) {
+			contents.shift()
+			contentRanges.shift()
+		}
+		while (contents.length && !contents[contents.length - 1].length) {
+			contents.pop()
+			contentRanges.pop()
+		}
+
+		return {
+			tag,
+			contents,
+			commentRange,
+			contentRanges,
+			targetRanges: [],
+		}
+	}
+}
+
+function createCommentDelimiterRegExp(syntaxes: MultiLineCommentSyntax[], delimiterType: 'opening' | 'closing') {
+	const sequences = syntaxes.map((syntax) => escapeRegExp(syntax[delimiterType]))
+	const uniqueSortedSequences = [...new Set(sequences)].sort((a, b) => b.length - a.length)
+	return new RegExp(
+		[
+			// Either the beginning of the line or required whitespace (captured)
+			'(?<=^|(\\s+))',
+			// Any of the supported multi-line comment opening sequences (captured)
+			`(${uniqueSortedSequences.join('|')})`,
+			// Either the end of the line or required whitespace (captured)
+			`(?=$|(\\s+))`,
+		].join(''),
+		'g'
+	)
+}
+
+/**
+ * Finds all matches of the given comment delimiter regular expression in the given line,
+ * including partially overlapping matches. Returns an array of match objects that
+ * each contain an index, the leading whitespace, the delimiter and the trailing whitespace.
+ */
+function findAllCommentDelimiters(regExp: RegExp, line: string, startColumn: number | undefined, endColumn: number | undefined) {
+	const matches: { index: number; leadingWhitespace: string; delimiter: string; trailingWhitespace: string }[] = []
+	let match: RegExpExecArray | null
+	regExp.lastIndex = startColumn ?? 0
+	while ((match = regExp.exec(line))) {
+		const leadingWhitespace = match[1] ?? ''
+		const delimiter = match[2] ?? ''
+		const trailingWhitespace = match[3] ?? ''
+		if (endColumn && match.index + delimiter.length > endColumn) break
+		matches.push({
+			index: match.index,
+			leadingWhitespace,
+			delimiter,
+			trailingWhitespace,
+		})
+		regExp.lastIndex = match.index + 1
+	}
+	return matches
+}
+
+function isValidFullMatch(match: Partial<MultiLineCommentSyntaxMatch>): match is MultiLineCommentSyntaxMatch {
+	// If the match is invalid or the opening/closing ranges are missing, it's not a full match
+	if (match.invalid || !match.openingRange || !match.closingRange) return false
+
+	// Validate multi-line comment rules
+	// (Note: Columns are only set if they don't match the beginning or end of the line)
+	const startsAndEndsOnSameLine = match.openingRange.start.line === match.closingRange.end.line
+	const hasCodeBeforeStart = match.openingRangeWithWhitespace?.start.column !== undefined
+	const hasCodeAfterEnd = match.closingRangeWithWhitespace?.end.column !== undefined
+
+	// If the comment starts and ends on the same line, it must not be surrounded by code
+	if (startsAndEndsOnSameLine && hasCodeBeforeStart && hasCodeAfterEnd) return false
+
+	// If the comment spans multiple lines, the opening and closing line may not contain code
+	if (!startsAndEndsOnSameLine && (hasCodeBeforeStart || hasCodeAfterEnd)) return false
+
+	return true
+}
+
+function createRange(options: { line: string; lineIndex: number; startColumn: number; endColumn: number }) {
+	const { line, lineIndex, startColumn, endColumn } = options
+	const range: SourceRange = {
+		start: { line: lineIndex },
+		end: { line: lineIndex },
+	}
+	if (startColumn > 0) range.start.column = startColumn
+	if (endColumn < line.length) range.end.column = endColumn
+	return range
+}
+
+/**
+ * Compares two source ranges by their start or end locations.
+ *
+ * Returns:
+ * - `> 0` if the second location is **greater than** (comes after) the first,
+ * - `< 0` if the second location is **smaller than** (comes before) the first, or
+ * - `0` if they are equal.
+ */
+function compareRanges(a: SourceRange, b: SourceRange, prop: 'start' | 'end'): number {
+	const aCol = a[prop].column ?? 0
+	const bCol = b[prop].column ?? 0
+	return a[prop].line - b[prop].line || aCol - bCol
+}