added supporting file

mikesamuel@gmail.com · mikesamuel@gmail.com · commit 6fb03b0f1730 · 2009-05-12T17:57:55.000Z
diff --git a/src/combineRegexs.js b/src/combineRegexs.js
@@ -0,0 +1,228 @@
/**
 * Given a group of {@link RegExp}s, returns a {@code RegExp} that globally
 * matches the union of the sets of strings matched by the input RegExps.
 * Since it matches globally, if the input strings have a start-of-input
 * anchor (/^.../), it is ignored for the purposes of unioning.
 *
 * Capturing groups that are never back-referenced are turned into
 * non-capturing groups; the remaining groups and their back-references are
 * renumbered so that they stay consistent inside the combined pattern.
 * When a case-insensitive pattern is combined with a case-sensitive pattern
 * that contains letters, the letters of the case-insensitive pattern are
 * expanded (e.g. {@code a} becomes {@code [Aa]}) and the result is
 * case-sensitive.
 *
 * @param {Array.<RegExp>} regexs non multiline, non-global regexs.
 * @return {RegExp} a global regex.
 * @throws {Error} if any of the input regexs is global or multiline.
 */
function combinePrefixPatterns(regexs) {
  // Number of capturing groups emitted so far over all rewritten patterns.
  var capturedGroupIndex = 0;

  // The combined pattern may only carry the "i" flag when no case-sensitive
  // input contains a letter.  Otherwise case-insensitive inputs have to be
  // rewritten letter by letter (needToFoldCase).
  var needToFoldCase = false;
  var ignoreCase = false;
  for (var i = 0, n = regexs.length; i < n; ++i) {
    var regex = regexs[i];
    if (regex.ignoreCase) {
      ignoreCase = true;
    } else if (/[a-z]/i.test(regex.source.replace(
                   /\\u[0-9a-f]{4}|\\x[0-9a-f]{2}|\\[^u]/gi, ''))) {
      needToFoldCase = true;
      ignoreCase = false;
      break;
    }
  }

  /**
   * Decodes one token of a character set to the code unit it denotes.
   * @param {string} charsetPart a literal character or an escape sequence.
   * @return {number} the code unit.
   */
  function decodeEscape(charsetPart) {
    if (charsetPart.charAt(0) !== '\\') { return charsetPart.charCodeAt(0); }
    switch (charsetPart.charAt(1)) {
      case 'b': return 8;
      case 't': return 9;
      case 'n': return 0xa;
      case 'v': return 0xb;
      case 'f': return 0xc;
      case 'r': return 0xd;
      case 'u': case 'x':
        var hexValue = parseInt(charsetPart.substring(2), 16);
        // Only fall back to the letter itself when no hex digits follow.
        // A decoded value of 0 (\x00, \u0000) is a valid result.
        return isNaN(hexValue) ? charsetPart.charCodeAt(1) : hexValue;
      case '0': case '1': case '2': case '3': case '4':
      case '5': case '6': case '7':
        return parseInt(charsetPart.substring(1), 8);
      default: return charsetPart.charCodeAt(1);
    }
  }

  /**
   * Produces text that denotes the given code unit inside a character set.
   * @param {number} charCode a code unit.
   * @return {string} a \xHH escape for values below 0x20, otherwise the
   *     character itself, backslash-escaped if it is special in a set.
   */
  function encodeEscape(charCode) {
    if (charCode < 0x20) {
      return (charCode < 0x10 ? '\\x0' : '\\x') + charCode.toString(16);
    }
    var ch = String.fromCharCode(charCode);
    // '^' must be escaped too: emitted first in a set it would negate it.
    if (ch === '\\' || ch === '-' || ch === '[' || ch === ']' || ch === '^') {
      ch = '\\' + ch;
    }
    return ch;
  }

  /**
   * Rewrites a character set so that it matches both cases of every letter
   * it contains without relying on the "i" flag.
   * @param {string} charSet the source text of a set, including brackets.
   * @return {string} the source text of the expanded set.
   */
  function caseFoldCharset(charSet) {
    // match() returns null for an empty set such as "[]".
    var charsetParts = charSet.substring(1, charSet.length - 1).match(
        new RegExp(
            '\\\\u[0-9A-Fa-f]{4}'
            + '|\\\\x[0-9A-Fa-f]{2}'
            + '|\\\\[0-3][0-7]{0,2}'
            + '|\\\\[0-7]{1,2}'
            + '|\\\\[\\s\\S]'
            + '|-'
            + '|[^-\\\\]',
            'g')) || [];
    var groups = [];  // class escapes, copied to the output unchanged
    var ranges = [];  // [start, end] pairs of code units, inclusive
    var inverse = charsetParts[0] === '^';
    var partCount = charsetParts.length;
    for (var k = inverse ? 1 : 0; k < partCount; ++k) {
      var part = charsetParts[k];
      switch (part) {
        case '\\B': case '\\b':
        case '\\D': case '\\d':
        case '\\S': case '\\s':
        case '\\W': case '\\w':
          groups.push(part);
          continue;
      }
      var start = decodeEscape(part);
      var end;
      if (k + 2 < partCount && '-' === charsetParts[k + 1]) {
        end = decodeEscape(charsetParts[k + 2]);
        k += 2;
      } else {
        end = start;
      }
      ranges.push([start, end]);
      // If the range might intersect letters, then expand it.
      if (!(end < 65 || start > 122)) {
        if (!(end < 65 || start > 90)) {
          // Overlaps A-Z: add the lower-case counterpart of the overlap.
          ranges.push([Math.max(65, start) | 32, Math.min(end, 90) | 32]);
        }
        if (!(end < 97 || start > 122)) {
          // Overlaps a-z: add the upper-case counterpart of the overlap.
          ranges.push([Math.max(97, start) & ~32, Math.min(end, 122) & ~32]);
        }
      }
    }

    // Merge overlapping and adjacent ranges:
    // [[1, 10], [3, 4], [8, 12], [14, 14], [16, 16], [17, 17]]
    // -> [[1, 12], [14, 14], [16, 17]]
    ranges.sort(function (a, b) { return (a[0] - b[0]) || (b[1] - a[1]); });
    var consolidatedRanges = [];
    // NaN bounds make the first comparison below fail, so the first range
    // always starts a new consolidated entry.
    var lastRange = [NaN, NaN];
    for (var r = 0; r < ranges.length; ++r) {
      var range = ranges[r];
      if (range[0] <= lastRange[1] + 1) {
        lastRange[1] = Math.max(lastRange[1], range[1]);
      } else {
        consolidatedRanges.push(lastRange = range);
      }
    }

    var out = ['['];
    if (inverse) { out.push('^'); }
    out.push.apply(out, groups);
    for (var c = 0; c < consolidatedRanges.length; ++c) {
      var merged = consolidatedRanges[c];
      out.push(encodeEscape(merged[0]));
      if (merged[1] > merged[0]) {
        // Two adjacent code units need no dash between them.
        if (merged[1] > merged[0] + 1) { out.push('-'); }
        out.push(encodeEscape(merged[1]));
      }
    }
    out.push(']');
    return out.join('');
  }

  /**
   * Rewrites the source of one regex so that it can be OR-ed with the
   * others: unreferenced capturing groups become non-capturing, referenced
   * groups and their back-references are renumbered, '^' anchors are
   * dropped, and letters are case-folded when required.
   * @param {RegExp} regex one of the input regexs.
   * @return {string} the rewritten pattern source.
   */
  function allowAnywhereFoldCaseAndRenumberGroups(regex) {
    // Split into character sets, escape sequences, punctuation strings
    // like ('(', '(?:', ')', '^'), and runs of characters that do not
    // include any of the above.
    var parts = regex.source.match(
        new RegExp(
            '(?:'
            + '\\[(?:[^\\x5C\\x5D]|\\\\[\\s\\S])*\\]'  // a character set
            + '|\\\\u[A-Fa-f0-9]{4}'  // a unicode escape
            + '|\\\\x[A-Fa-f0-9]{2}'  // a hex escape
            + '|\\\\[0-9]+'  // a back-reference or octal escape
            + '|\\\\[^ux0-9]'  // other escape sequence
            + '|\\(\\?[:!=]'  // start of a non-capturing group
            + '|[\\(\\)\\^]'  // start/end of a group, or line start
            + '|[^\\x5B\\x5C\\(\\)\\^]+'  // run of other characters
            + ')',
            'g')) || [];
    var n = parts.length;

    // Maps captured group numbers to the number they will occupy in
    // the output or to -1 if that has not been determined, or to
    // undefined if they need not be capturing in the output.
    var capturedGroups = [];

    var i, p, decimalValue, groupIndex;

    // Walk over and identify back references to build the capturedGroups
    // mapping.
    for (i = 0, groupIndex = 0; i < n; ++i) {
      p = parts[i];
      if (p === '(') {
        // groups are 1-indexed, so max group index is count of '('
        ++groupIndex;
      } else if ('\\' === p.charAt(0)) {
        // NaN for non-numeric escapes, which fails the test below.
        decimalValue = +p.substring(1);
        if (decimalValue && decimalValue <= groupIndex) {
          capturedGroups[decimalValue] = -1;
        }
      }
    }

    // Renumber groups and reduce capturing groups to non-capturing groups
    // where possible.
    for (i = 1; i < capturedGroups.length; ++i) {
      if (-1 === capturedGroups[i]) {
        capturedGroups[i] = ++capturedGroupIndex;
      }
    }
    for (i = 0, groupIndex = 0; i < n; ++i) {
      p = parts[i];
      if (p === '(') {
        ++groupIndex;
        if (capturedGroups[groupIndex] === undefined) {
          parts[i] = '(?:';
        }
      } else if ('\\' === p.charAt(0)) {
        decimalValue = +p.substring(1);
        if (decimalValue && decimalValue <= groupIndex) {
          // Look up the group that is referenced, not the group that was
          // opened most recently.
          parts[i] = '\\' + capturedGroups[decimalValue];
        }
      }
    }

    // Remove any prefix anchors so that the output will match anywhere.
    for (i = 0; i < n; ++i) {
      if ('^' === parts[i]) { parts[i] = ''; }
    }

    // Expand letters to groups to handle mixing of case-sensitive and
    // case-insensitive patterns if necessary.
    if (regex.ignoreCase && needToFoldCase) {
      for (i = 0; i < n; ++i) {
        p = parts[i];
        var ch0 = p.charAt(0);
        if (p.length >= 2 && ch0 === '[') {
          parts[i] = caseFoldCharset(p);
        } else if (ch0 !== '\\') {
          // TODO: handle letters in numeric escapes.
          parts[i] = p.replace(
              /[a-zA-Z]/g,
              function (ch) {
                var cc = ch.charCodeAt(0);
                return '[' + String.fromCharCode(cc & ~32, cc | 32) + ']';
              });
        }
      }
    }

    return parts.join('');
  }

  var rewritten = [];
  for (var j = 0, count = regexs.length; j < count; ++j) {
    var input = regexs[j];
    if (input.global || input.multiline) { throw new Error('' + input); }
    rewritten.push(
        '(?:' + allowAnywhereFoldCaseAndRenumberGroups(input) + ')');
  }

  return new RegExp(rewritten.join('|'), ignoreCase ? 'gi' : 'g');
}