Skip to content
This repository was archived by the owner on Apr 22, 2020. It is now read-only.

Commit 6fb03b0

Browse files
author
mikesamuel@gmail.com
committed
added supporting file
1 parent c2dfea6 commit 6fb03b0

1 file changed

Lines changed: 228 additions & 0 deletions

File tree

src/combineRegexs.js

Lines changed: 228 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,228 @@
1+
/**
 * Given a group of {@link RegExp}s, returns a {@code RegExp} that globally
 * matches the union of the sets of strings matched by the input RegExps.
 * Since it matches globally, if the input strings have a start-of-input
 * anchor (/^.../), it is ignored for the purposes of unioning.
 *
 * Capturing groups that are never back-referenced are rewritten as
 * non-capturing groups, and the remaining groups (and the back-references to
 * them) are renumbered so that they stay consistent in the combined pattern.
 * When case-insensitive and case-sensitive inputs are mixed, the letters of
 * the case-insensitive inputs are expanded into explicit character sets so
 * that the output can be case-sensitive as a whole.
 *
 * @param {Array.<RegExp>} regexs non multiline, non-global regexs.
 * @return {RegExp} a global regex.
 * @throws {Error} if any of the inputs is global or multiline.
 */
function combinePrefixPatterns(regexs) {
  // Number of capturing groups emitted so far across all rewritten inputs.
  var capturedGroupIndex = 0;

  // Decide whether the output can simply carry the 'i' flag, or whether
  // case-insensitive inputs must be expanded by hand because at least one
  // case-sensitive input contains letters.
  var needToFoldCase = false;
  var ignoreCase = false;
  for (var i = 0, n = regexs.length; i < n; ++i) {
    var regex = regexs[i];
    if (regex.ignoreCase) {
      ignoreCase = true;
    } else if (/[a-z]/i.test(regex.source.replace(
        // Strip well-formed escapes before looking for letters.  A '\x' or
        // '\u' that is not followed by the right number of hex digits is an
        // identity escape for the letter itself, so it is left in place.
        /\\u[0-9a-f]{4}|\\x[0-9a-f]{2}|\\[^ux]/gi, ''))) {
      needToFoldCase = true;
      ignoreCase = false;
      break;
    }
  }

  /**
   * Decodes one element of a character set to its character code.
   * @param {string} charsetPart a single character or escape sequence.
   * @return {number} the code unit that the part stands for.
   */
  function decodeEscape(charsetPart) {
    if (charsetPart.charAt(0) !== '\\') { return charsetPart.charCodeAt(0); }
    switch (charsetPart.charAt(1)) {
      case 'b': return 8;  // Inside a character set, \b is a backspace.
      case 't': return 9;
      case 'n': return 0xa;
      case 'v': return 0xb;
      case 'f': return 0xc;
      case 'r': return 0xd;
      case 'u': case 'x':
        var hexValue = parseInt(charsetPart.substring(2), 16);
        // Only fall back to the literal letter when there are no hex digits.
        // A plain `||` fallback would mis-decode \x00 and \u0000.
        return isNaN(hexValue) ? charsetPart.charCodeAt(1) : hexValue;
      case '0': case '1': case '2': case '3': case '4':
      case '5': case '6': case '7':
        return parseInt(charsetPart.substring(1), 8);
      default: return charsetPart.charCodeAt(1);
    }
  }

  /**
   * Encodes a character code so that it can appear inside a character set.
   * @param {number} charCode a code unit.
   * @return {string} the character, escaped where necessary.
   */
  function encodeEscape(charCode) {
    if (charCode < 0x20) {
      return (charCode < 0x10 ? '\\x0' : '\\x') + charCode.toString(16);
    }
    var ch = String.fromCharCode(charCode);
    // '^' must be escaped as well: ranges are emitted in sorted order, so it
    // may end up first in the set, where it would otherwise invert the set.
    if (ch === '\\' || ch === '-' || ch === '[' || ch === ']' || ch === '^') {
      ch = '\\' + ch;
    }
    return ch;
  }

  /**
   * Rewrites a character set so that it matches both cases of every ASCII
   * letter it contains.
   * @param {string} charSet the source text of a character set, including
   *     the enclosing square brackets.
   * @return {string} the source text of an equivalent case-folded set.
   */
  function caseFoldCharset(charSet) {
    // match() yields null for an empty set ('[]'); treat that as no parts.
    var charsetParts = charSet.substring(1, charSet.length - 1).match(
        new RegExp(
            '\\\\u[0-9A-Fa-f]{4}'
            + '|\\\\x[0-9A-Fa-f]{2}'
            + '|\\\\[0-3][0-7]{0,2}'
            + '|\\\\[0-7]{1,2}'
            + '|\\\\[\\s\\S]'
            + '|-'
            + '|[^-\\\\]',
            'g')) || [];
    var groups = [];
    var ranges = [];
    var inverse = charsetParts[0] === '^';
    for (var i = inverse ? 1 : 0, n = charsetParts.length; i < n; ++i) {
      var p = charsetParts[i];
      switch (p) {
        // Class escapes are copied through untouched.
        case '\\B': case '\\b':
        case '\\D': case '\\d':
        case '\\S': case '\\s':
        case '\\W': case '\\w':
          groups.push(p);
          continue;
      }
      var start = decodeEscape(p);
      var end;
      if (i + 2 < n && '-' === charsetParts[i + 1]) {
        end = decodeEscape(charsetParts[i + 2]);
        i += 2;
      } else {
        end = start;
      }
      ranges.push([start, end]);
      // If the range might intersect letters, then expand it.
      if (!(end < 65 || start > 122)) {
        if (!(end < 65 || start > 90)) {
          // Upper-case portion: add the lower-case counterpart.
          ranges.push([Math.max(65, start) | 32, Math.min(end, 90) | 32]);
        }
        if (!(end < 97 || start > 122)) {
          // Lower-case portion: add the upper-case counterpart.
          ranges.push([Math.max(97, start) & ~32, Math.min(end, 122) & ~32]);
        }
      }
    }

    // [[1, 10], [3, 4], [8, 12], [14, 14], [16, 16], [17, 17]]
    // -> [[1, 12], [14, 14], [16, 17]]
    ranges.sort(function (a, b) { return (a[0] - b[0]) || (b[1] - a[1]); });
    var consolidatedRanges = [];
    // NaN bounds guarantee that the first real range starts a new entry.
    var lastRange = [NaN, NaN];
    for (var j = 0; j < ranges.length; ++j) {
      var candidate = ranges[j];
      if (candidate[0] <= lastRange[1] + 1) {
        lastRange[1] = Math.max(lastRange[1], candidate[1]);
      } else {
        consolidatedRanges.push(lastRange = candidate);
      }
    }

    var out = ['['];
    if (inverse) { out.push('^'); }
    out.push.apply(out, groups);
    for (var k = 0; k < consolidatedRanges.length; ++k) {
      var range = consolidatedRanges[k];
      out.push(encodeEscape(range[0]));
      if (range[1] > range[0]) {
        out.push('-', encodeEscape(range[1]));
      }
    }
    out.push(']');
    return out.join('');
  }

  /**
   * Rewrites the source of one input regex so that it can be embedded in the
   * combined pattern: drops '^' anchors, renumbers back-referenced groups,
   * makes all other groups non-capturing, and folds case where required.
   * @param {RegExp} regex one of the inputs.
   * @return {string} the rewritten pattern source.
   */
  function allowAnywhereFoldCaseAndRenumberGroups(regex) {
    // Split into character sets, escape sequences, punctuation strings
    // like ('(', '(?:', ')', '^'), and runs of characters that do not
    // include any of the above.
    var parts = regex.source.match(
        new RegExp(
            '(?:'
            + '\\[(?:[^\\x5C\\x5D]|\\\\[\\s\\S])*\\]'  // a character set
            + '|\\\\u[A-Fa-f0-9]{4}'  // a unicode escape
            + '|\\\\x[A-Fa-f0-9]{2}'  // a hex escape
            + '|\\\\[0-9]+'  // a back-reference or octal escape
            + '|\\\\[^ux0-9]'  // other escape sequence
            + '|\\(\\?[:!=]'  // start of a non-capturing group
            + '|[\\(\\)\\^]'  // start/end of a group, or line start
            + '|[^\\x5B\\x5C\\(\\)\\^]+'  // run of other characters
            + ')',
            'g'));
    var n = parts.length;

    // Maps captured group numbers to the number they will occupy in
    // the output or to -1 if that has not been determined, or to
    // undefined if they need not be capturing in the output.
    var capturedGroups = [];

    var i, p, groupIndex, decimalValue;

    // Walk over and identify back references to build the capturedGroups
    // mapping.
    // NOTE(review): a numeric escape larger than the number of groups seen
    // so far is left untouched; in the combined pattern it could be read as
    // a back-reference to a group of an earlier input.
    for (i = 0, groupIndex = 0; i < n; ++i) {
      p = parts[i];
      if (p === '(') {
        // groups are 1-indexed, so max group index is count of '('
        ++groupIndex;
      } else if ('\\' === p.charAt(0)) {
        decimalValue = +p.substring(1);
        if (decimalValue && decimalValue <= groupIndex) {
          capturedGroups[decimalValue] = -1;
        }
      }
    }

    // Renumber groups and reduce capturing groups to non-capturing groups
    // where possible.
    for (i = 1; i < capturedGroups.length; ++i) {
      if (-1 === capturedGroups[i]) {
        capturedGroups[i] = ++capturedGroupIndex;
      }
    }
    for (i = 0, groupIndex = 0; i < n; ++i) {
      p = parts[i];
      if (p === '(') {
        ++groupIndex;
        if (capturedGroups[groupIndex] === undefined) {
          parts[i] = '(?:';
        }
      } else if ('\\' === p.charAt(0)) {
        decimalValue = +p.substring(1);
        if (decimalValue && decimalValue <= groupIndex) {
          // Look up the group that the back-reference names, not the most
          // recently opened group.
          parts[i] = '\\' + capturedGroups[decimalValue];
        }
      }
    }

    // Remove any prefix anchors so that the output will match anywhere.
    for (i = 0; i < n; ++i) {
      if ('^' === parts[i]) { parts[i] = ''; }
    }

    // Expand letters to groups to handle mixing of case-sensitive and
    // case-insensitive patterns if necessary.
    if (regex.ignoreCase && needToFoldCase) {
      for (i = 0; i < n; ++i) {
        p = parts[i];
        var ch0 = p.charAt(0);
        if (p.length >= 2 && ch0 === '[') {
          parts[i] = caseFoldCharset(p);
        } else if (ch0 !== '\\') {
          // TODO: handle letters in numeric escapes.
          parts[i] = p.replace(
              /[a-zA-Z]/g,
              function (ch) {
                var cc = ch.charCodeAt(0);
                return '[' + String.fromCharCode(cc & ~32, cc | 32) + ']';
              });
        }
      }
    }

    return parts.join('');
  }

  var rewritten = [];
  for (var r = 0, count = regexs.length; r < count; ++r) {
    var input = regexs[r];
    if (input.global || input.multiline) { throw new Error('' + input); }
    rewritten.push(
        '(?:' + allowAnywhereFoldCaseAndRenumberGroups(input) + ')');
  }

  return new RegExp(rewritten.join('|'), ignoreCase ? 'gi' : 'g');
}

0 commit comments

Comments
 (0)