|
/**
 * Given a group of {@link RegExp}s, returns a {@code RegExp} that globally
 * matches the union of the sets of strings matched by the input RegExps.
 * Since it matches globally, if the input strings have a start-of-input
 * anchor (/^.../), it is ignored for the purposes of unioning.
 *
 * Each input is rewritten before being joined with '|':
 * <ul>
 *   <li>'^' anchors outside character sets are dropped,
 *   <li>capturing groups that no back-reference targets become
 *       non-capturing, and the remaining groups and their back-references
 *       are renumbered so they stay consistent in the combined pattern,
 *   <li>if case-sensitive and case-insensitive inputs are mixed, ASCII
 *       letters in the case-insensitive inputs are expanded to explicit
 *       sets (e.g. {@code a} to {@code [Aa]}) and the result is
 *       case-sensitive; the result carries the 'i' flag only when every
 *       input examined is case-insensitive.
 * </ul>
 *
 * Only '(?:', '(?=' and '(?!' are recognized as non-capturing group
 * openers; any other '(' is counted as a capturing group.
 *
 * @param {Array.<RegExp>} regexs non multiline, non-global regexs.
 * @return {RegExp} a global regex.
 * @throws {Error} if any input regex is global or multiline.
 */
function combinePrefixPatterns(regexs) {
  // Number of capturing groups emitted so far across all rewritten inputs.
  var capturedGroupIndex = 0;

  // Decide whether the combined regex can simply use the 'i' flag.  As soon
  // as a case-sensitive input that contains a letter (outside of escape
  // sequences) is seen, the flag cannot be used and the case-insensitive
  // inputs must have their letters expanded by hand instead.
  var needToFoldCase = false;
  var ignoreCase = false;
  for (var i = 0, n = regexs.length; i < n; ++i) {
    var regex = regexs[i];
    if (regex.ignoreCase) {
      ignoreCase = true;
    } else if (/[a-z]/i.test(regex.source.replace(
                   /\\u[0-9a-f]{4}|\\x[0-9a-f]{2}|\\[^u]/gi, ''))) {
      needToFoldCase = true;
      ignoreCase = false;
      break;
    }
  }

  /**
   * Converts one element of a character set (a literal character or an
   * escape sequence) to the code unit it denotes.
   * @param {string} charsetPart a single character or an escape sequence.
   * @return {number} the code unit.
   */
  function decodeEscape(charsetPart) {
    if (charsetPart.charAt(0) !== '\\') { return charsetPart.charCodeAt(0); }
    switch (charsetPart.charAt(1)) {
      case 'b': return 8;
      case 't': return 9;
      case 'n': return 0xa;
      case 'v': return 0xb;
      case 'f': return 0xc;
      case 'r': return 0xd;
      case 'u': case 'x':
        // A bare "\u" or "\x" (no hex digits) parses to NaN and denotes the
        // letter itself.  Test for NaN explicitly: a plain truthiness test
        // would also reject the valid value 0 ("\x00", "\u0000").
        var hexValue = parseInt(charsetPart.substring(2), 16);
        return isNaN(hexValue) ? charsetPart.charCodeAt(1) : hexValue;
      case '0': case '1': case '2': case '3': case '4':
      case '5': case '6': case '7':
        return parseInt(charsetPart.substring(1), 8);
      default: return charsetPart.charCodeAt(1);
    }
  }

  /**
   * Produces text for a code unit that is safe to place inside a character
   * set.
   * @param {number} charCode a code unit.
   * @return {string} the character, escaped where necessary.
   */
  function encodeEscape(charCode) {
    if (charCode < 0x20) {
      return (charCode < 0x10 ? '\\x0' : '\\x') + charCode.toString(16);
    }
    var ch = String.fromCharCode(charCode);
    // '^' must be escaped too: ranges are emitted in sorted order, so it can
    // land directly after '[' where it would negate the whole set.
    if (ch === '\\' || ch === '-' || ch === '[' || ch === ']' || ch === '^') {
      ch = '\\' + ch;
    }
    return ch;
  }

  /**
   * Rewrites a character set so that it matches both cases of every ASCII
   * letter it contains, without relying on the 'i' flag.
   * @param {string} charSet the text of a character set, including the
   *     enclosing square brackets.
   * @return {string} the text of an equivalent case-folded character set.
   */
  function caseFoldCharset(charSet) {
    // match() yields null for an empty set ("[]"); treat that as no parts.
    var charsetParts = charSet.substring(1, charSet.length - 1).match(
        new RegExp(
            '\\\\u[0-9A-Fa-f]{4}'
            + '|\\\\x[0-9A-Fa-f]{2}'
            + '|\\\\[0-3][0-7]{0,2}'
            + '|\\\\[0-7]{1,2}'
            + '|\\\\[\\s\\S]'
            + '|-'
            + '|[^-\\\\]',
            'g')) || [];
    var groups = [];
    var ranges = [];
    var inverse = charsetParts[0] === '^';
    for (var i = inverse ? 1 : 0, n = charsetParts.length; i < n; ++i) {
      var p = charsetParts[i];
      switch (p) {
        // Class escapes are copied to the output verbatim.
        case '\\B': case '\\b':
        case '\\D': case '\\d':
        case '\\S': case '\\s':
        case '\\W': case '\\w':
          groups.push(p);
          continue;
      }
      var start = decodeEscape(p);
      var end;
      if (i + 2 < n && '-' === charsetParts[i + 1]) {
        end = decodeEscape(charsetParts[i + 2]);
        i += 2;
      } else {
        end = start;
      }
      ranges.push([start, end]);
      // If the range might intersect letters, then expand it.
      if (!(end < 65 || start > 122)) {
        if (!(end < 65 || start > 90)) {
          // Lower-case counterpart of the part overlapping A-Z.
          ranges.push([Math.max(65, start) | 32, Math.min(end, 90) | 32]);
        }
        if (!(end < 97 || start > 122)) {
          // Upper-case counterpart of the part overlapping a-z.
          ranges.push([Math.max(97, start) & ~32, Math.min(end, 122) & ~32]);
        }
      }
    }

    // Merge overlapping and adjacent ranges, e.g.
    // [[1, 10], [3, 4], [8, 12], [14, 14], [16, 16], [17, 17]]
    // -> [[1, 12], [14, 14], [16, 17]]
    ranges.sort(function (a, b) { return (a[0] - b[0]) || (b[1] - a[1]); });
    var consolidatedRanges = [];
    // Comparisons against NaN are false, so the first range is always pushed.
    var lastRange = [NaN, NaN];
    for (var i = 0; i < ranges.length; ++i) {
      var range = ranges[i];
      if (range[0] <= lastRange[1] + 1) {
        lastRange[1] = Math.max(lastRange[1], range[1]);
      } else {
        consolidatedRanges.push(lastRange = range);
      }
    }

    var out = ['['];
    if (inverse) { out.push('^'); }
    out.push.apply(out, groups);
    for (var i = 0; i < consolidatedRanges.length; ++i) {
      var range = consolidatedRanges[i];
      out.push(encodeEscape(range[0]));
      if (range[1] > range[0]) {
        // Two adjacent code units need no dash between them.
        if (range[1] > range[0] + 1) { out.push('-'); }
        out.push(encodeEscape(range[1]));
      }
    }
    out.push(']');
    return out.join('');
  }

  /**
   * Rewrites the source of one input regex so that it can be joined with the
   * others: strips '^' anchors, renumbers capturing groups and
   * back-references, and folds case where required.
   * @param {RegExp} regex one of the inputs.
   * @return {string} the rewritten regex source.
   */
  function allowAnywhereFoldCaseAndRenumberGroups(regex) {
    // Split into character sets, escape sequences, punctuation strings
    // like ('(', '(?:', ')', '^'), and runs of characters that do not
    // include any of the above.
    var parts = regex.source.match(
        new RegExp(
            '(?:'
            + '\\[(?:[^\\x5C\\x5D]|\\\\[\\s\\S])*\\]'  // a character set
            + '|\\\\u[A-Fa-f0-9]{4}'  // a unicode escape
            + '|\\\\x[A-Fa-f0-9]{2}'  // a hex escape
            + '|\\\\[0-9]+'  // a back-reference or octal escape
            + '|\\\\[^ux0-9]'  // other escape sequence
            + '|\\(\\?[:!=]'  // start of a non-capturing group
            + '|[\\(\\)\\^]'  // start/end of a group, or line start
            + '|[^\\x5B\\x5C\\(\\)\\^]+'  // run of other characters
            + ')',
            'g')) || [];
    var n = parts.length;

    // Maps captured group numbers to the number they will occupy in
    // the output or to -1 if that has not been determined, or to
    // undefined if they need not be capturing in the output.
    var capturedGroups = [];

    // Walk over and identify back references to build the capturedGroups
    // mapping.
    var groupIndex;
    for (var i = 0, groupIndex = 0; i < n; ++i) {
      var p = parts[i];
      if (p === '(') {
        // groups are 1-indexed, so max group index is count of '('
        ++groupIndex;
      } else if ('\\' === p.charAt(0)) {
        // Non-numeric escapes convert to NaN (falsy) and are skipped.
        var decimalValue = +p.substring(1);
        if (decimalValue && decimalValue <= groupIndex) {
          capturedGroups[decimalValue] = -1;
        }
      }
    }

    // Renumber groups and reduce capturing groups to non-capturing groups
    // where possible.
    for (var i = 1; i < capturedGroups.length; ++i) {
      if (-1 === capturedGroups[i]) {
        capturedGroups[i] = ++capturedGroupIndex;
      }
    }
    for (var i = 0, groupIndex = 0; i < n; ++i) {
      var p = parts[i];
      if (p === '(') {
        ++groupIndex;
        if (capturedGroups[groupIndex] === undefined) {
          parts[i] = '(?:';
        }
      } else if ('\\' === p.charAt(0)) {
        var decimalValue = +p.substring(1);
        if (decimalValue && decimalValue <= groupIndex) {
          // Look up the group the back-reference points at, not the number
          // of groups opened so far.
          parts[i] = '\\' + capturedGroups[decimalValue];
        }
        // NOTE(review): numeric escapes that are not back-references are
        // left untouched here; in the combined pattern they could
        // presumably be read as references to groups of an earlier input.
      }
    }

    // Remove any prefix anchors so that the output will match anywhere.
    for (var i = 0; i < n; ++i) {
      if ('^' === parts[i]) { parts[i] = ''; }
    }

    // Expand letters to groups to handle mixing of case-sensitive and
    // case-insensitive patterns if necessary.
    if (regex.ignoreCase && needToFoldCase) {
      for (var i = 0; i < n; ++i) {
        var p = parts[i];
        var ch0 = p.charAt(0);
        if (p.length >= 2 && ch0 === '[') {
          parts[i] = caseFoldCharset(p);
        } else if (ch0 !== '\\') {
          // TODO: handle letters in numeric escapes.
          parts[i] = p.replace(
              /[a-zA-Z]/g,
              function (ch) {
                var cc = ch.charCodeAt(0);
                // Bit 0x20 distinguishes upper from lower case in ASCII.
                return '[' + String.fromCharCode(cc & ~32, cc | 32) + ']';
              });
        }
      }
    }

    return parts.join('');
  }

  var rewritten = [];
  for (var i = 0, n = regexs.length; i < n; ++i) {
    var regex = regexs[i];
    if (regex.global || regex.multiline) { throw new Error('' + regex); }
    rewritten.push(
        '(?:' + allowAnywhereFoldCaseAndRenumberGroups(regex) + ')');
  }

  return new RegExp(rewritten.join('|'), ignoreCase ? 'gi' : 'g');
}
0 commit comments