googlearchive
diff --git a/‎Makefile‎
Lines changed: 16 additions & 0 deletions b/‎Makefile‎
Lines changed: 16 additions & 0 deletions
diff --git a/‎js-modules/combinePrefixPatterns.js‎
Lines changed: 229 additions & 0 deletions b/‎js-modules/combinePrefixPatterns.js‎
Lines changed: 229 additions & 0 deletions
diff --git a/‎js-modules/extractSourceSpans.js‎
Lines changed: 99 additions & 0 deletions b/‎js-modules/extractSourceSpans.js‎
Lines changed: 99 additions & 0 deletions
@@ -0,0 +1,16 @@
# Assemble src/prettify.js from js-modules/prettify.js.
#
# Every line of the form
#     include("name");
# is replaced by the contents of js-modules/name, and each inserted line is
# prefixed with the whitespace that preceded the include(...) statement so the
# included code keeps the indentation of its include site.
#
# "$$" is make's escape for a literal "$" inside the embedded Perl.
#
# If perl fails, the partially written output is removed AND the recipe exits
# non-zero, so make reports the failure instead of treating the target as
# successfully built.
src/prettify.js: js-modules/*.js
	perl -e '\
	  sub readInclude($$$$) {\
	    my $$prefix = $$_[0];\
	    my $$name = "js-modules/" . $$_[1];\
	    my $$buf = "";\
	    open(IN, "<$$name") or die "$$name: $$!";\
	    while (<IN>) {\
	      $$buf .= "$$prefix$$_";\
	    }\
	    return $$buf;\
	  }' \
	  -pe 's/^(\s*)include\("([^"]+)"\);/readInclude($$1, $$2)/ge' \
	  js-modules/prettify.js \
	  > src/prettify.js \
	  || { rm -f src/prettify.js; exit 1; }
@@ -0,0 +1,229 @@
+
/**
 * Given a group of {@link RegExp}s, returns a {@code RegExp} that globally
 * matches the union of the sets of strings matched by the input RegExp.
 * Since it matches globally, if the input strings have a start-of-input
 * anchor (/^.../), it is ignored for the purposes of unioning.
 *
 * <p>
 * Capturing groups that are never back-referenced are rewritten as
 * non-capturing groups; the remaining groups and the back-references to them
 * are renumbered so they stay consistent inside the combined pattern.</p>
 *
 * <p>
 * If a case-sensitive input containing a letter is encountered, the output is
 * case-sensitive and the letters of the case-insensitive inputs are expanded
 * (e.g. {@code a} becomes {@code [Aa]}).  Otherwise the output carries the
 * {@code i} flag when at least one input does.</p>
 *
 * @param {Array.<RegExp>} regexs non multiline, non-global regexs.
 * @return {RegExp} a global regex.
 * @throws {Error} if any input regex is global or multiline.
 */
function combinePrefixPatterns(regexs) {
  // Number of capturing groups emitted so far.  Shared by all inputs so that
  // group numbers are unique within the combined pattern.
  var capturedGroupIndex = 0;

  // Decide whether the output can simply use the 'i' flag, or whether the
  // case-insensitive inputs must be rewritten to match both cases explicitly.
  var needToFoldCase = false;
  var ignoreCase = false;
  for (var i = 0, n = regexs.length; i < n; ++i) {
    var regex = regexs[i];
    if (regex.ignoreCase) {
      ignoreCase = true;
    } else if (/[a-z]/i.test(regex.source.replace(
                   /\\u[0-9a-f]{4}|\\x[0-9a-f]{2}|\\[^ux]/gi, ''))) {
      // A case-sensitive pattern that mentions a letter (outside of escape
      // sequences) cannot be combined under a global 'i' flag.
      needToFoldCase = true;
      ignoreCase = false;
      break;
    }
  }

  /**
   * Converts one token of a character set (a literal character or an escape
   * sequence) to its numeric code unit.
   * @param {string} charsetPart e.g. {@code 'a'}, {@code '\\n'} or
   *     {@code '\\x41'}.
   * @return {number} the code unit the token stands for.
   */
  function decodeEscape(charsetPart) {
    if (charsetPart.charAt(0) !== '\\') { return charsetPart.charCodeAt(0); }
    switch (charsetPart.charAt(1)) {
      case 'b': return 8;
      case 't': return 9;
      case 'n': return 0xa;
      case 'v': return 0xb;
      case 'f': return 0xc;
      case 'r': return 0xd;
      case 'u': case 'x':
        // An incomplete escape such as "\x" followed by non-hex digits is
        // tokenized as the two characters "\x" and stands for the letter
        // itself.  Test for NaN explicitly so that \x00 and \u0000 decode
        // to 0 instead of falling through to the letter.
        var codeUnit = parseInt(charsetPart.substring(2), 16);
        return isNaN(codeUnit) ? charsetPart.charCodeAt(1) : codeUnit;
      case '0': case '1': case '2': case '3': case '4':
      case '5': case '6': case '7':
        return parseInt(charsetPart.substring(1), 8);
      default: return charsetPart.charCodeAt(1);
    }
  }

  /**
   * Converts a code unit to text that is safe to embed in a character set.
   * @param {number} charCode a code unit.
   * @return {string} the character, escaped where necessary.
   */
  function encodeEscape(charCode) {
    if (charCode < 0x20) {
      return (charCode < 0x10 ? '\\x0' : '\\x') + charCode.toString(16);
    }
    var ch = String.fromCharCode(charCode);
    // '^' must be escaped too: ranges are emitted in sorted order, so an
    // unescaped '^' could end up first in the set and negate it.
    if (ch === '\\' || ch === '-' || ch === '[' || ch === ']' || ch === '^') {
      ch = '\\' + ch;
    }
    return ch;
  }

  /**
   * Rewrites a character set so that it matches both cases of every ASCII
   * letter it contains, without relying on the 'i' flag.
   * @param {string} charSet the source of a character set including the
   *     square brackets, e.g. {@code '[a-c]'}.
   * @return {string} an equivalent case-folded set, e.g. {@code '[A-Ca-c]'}.
   */
  function caseFoldCharset(charSet) {
    // match() yields null for the empty set "[]"; treat that as no tokens.
    var charsetParts = charSet.substring(1, charSet.length - 1).match(
        new RegExp(
            '\\\\u[0-9A-Fa-f]{4}'
            + '|\\\\x[0-9A-Fa-f]{2}'
            + '|\\\\[0-3][0-7]{0,2}'
            + '|\\\\[0-7]{1,2}'
            + '|\\\\[\\s\\S]'
            + '|-'
            + '|[^-\\\\]',
            'g')) || [];
    var groups = [];
    var ranges = [];
    var inverse = charsetParts[0] === '^';
    for (var i = inverse ? 1 : 0, n = charsetParts.length; i < n; ++i) {
      var p = charsetParts[i];
      switch (p) {
        // Character class escapes are passed through untouched.
        case '\\B': case '\\b':
        case '\\D': case '\\d':
        case '\\S': case '\\s':
        case '\\W': case '\\w':
          groups.push(p);
          continue;
      }
      var start = decodeEscape(p);
      var end;
      if (i + 2 < n && '-' === charsetParts[i + 1]) {
        end = decodeEscape(charsetParts[i + 2]);
        i += 2;
      } else {
        end = start;
      }
      ranges.push([start, end]);
      // If the range might intersect letters, then expand it.
      if (!(end < 65 || start > 122)) {
        if (!(end < 65 || start > 90)) {
          // Upper-case portion: add the lower-case counterpart.
          ranges.push([Math.max(65, start) | 32, Math.min(end, 90) | 32]);
        }
        if (!(end < 97 || start > 122)) {
          // Lower-case portion: add the upper-case counterpart.
          ranges.push([Math.max(97, start) & ~32, Math.min(end, 122) & ~32]);
        }
      }
    }

    // Merge overlapping and adjacent ranges.
    // [[1, 10], [3, 4], [8, 12], [14, 14], [16, 16], [17, 17]]
    // -> [[1, 12], [14, 14], [16, 17]]
    ranges.sort(function (a, b) { return (a[0] - b[0]) || (b[1]  - a[1]); });
    var consolidatedRanges = [];
    var lastRange = [NaN, NaN];
    for (var i = 0; i < ranges.length; ++i) {
      var range = ranges[i];
      if (range[0] <= lastRange[1] + 1) {
        lastRange[1] = Math.max(lastRange[1], range[1]);
      } else {
        consolidatedRanges.push(lastRange = range);
      }
    }

    var out = ['['];
    if (inverse) { out.push('^'); }
    out.push.apply(out, groups);
    for (var i = 0; i < consolidatedRanges.length; ++i) {
      var range = consolidatedRanges[i];
      out.push(encodeEscape(range[0]));
      if (range[1] > range[0]) {
        // A hyphen is only needed when the range spans three or more code
        // units; two adjacent code units are simply listed.
        if (range[1] > range[0] + 1) { out.push('-'); }
        out.push(encodeEscape(range[1]));
      }
    }
    out.push(']');
    return out.join('');
  }

  /**
   * Rewrites one input pattern so it can be a branch of the combined regex:
   * strips leading anchors, renumbers capturing groups and back-references,
   * and folds case when required.
   * @param {RegExp} regex one of the input patterns.
   * @return {string} the rewritten pattern source.
   */
  function allowAnywhereFoldCaseAndRenumberGroups(regex) {
    // Split into character sets, escape sequences, punctuation strings
    // like ('(', '(?:', ')', '^'), and runs of characters that do not
    // include any of the above.
    var parts = regex.source.match(
        new RegExp(
            '(?:'
            + '\\[(?:[^\\x5C\\x5D]|\\\\[\\s\\S])*\\]'  // a character set
            + '|\\\\u[A-Fa-f0-9]{4}'  // a unicode escape
            + '|\\\\x[A-Fa-f0-9]{2}'  // a hex escape
            + '|\\\\[0-9]+'  // a back-reference or octal escape
            + '|\\\\[^ux0-9]'  // other escape sequence
            + '|\\(\\?[:!=]'  // start of a non-capturing group
            + '|[\\(\\)\\^]'  // start/end of a group, or line start
            + '|[^\\x5B\\x5C\\(\\)\\^]+'  // run of other characters
            + ')',
            'g'));
    var n = parts.length;

    // Maps captured group numbers to the number they will occupy in
    // the output or to -1 if that has not been determined, or to
    // undefined if they need not be capturing in the output.
    var capturedGroups = [];

    // Walk over and identify back references to build the capturedGroups
    // mapping.
    for (var i = 0, groupIndex = 0; i < n; ++i) {
      var p = parts[i];
      if (p === '(') {
        // groups are 1-indexed, so max group index is count of '('
        ++groupIndex;
      } else if ('\\' === p.charAt(0)) {
        var decimalValue = +p.substring(1);
        if (decimalValue && decimalValue <= groupIndex) {
          capturedGroups[decimalValue] = -1;
        }
      }
    }

    // Renumber groups and reduce capturing groups to non-capturing groups
    // where possible.
    for (var i = 1; i < capturedGroups.length; ++i) {
      if (-1 === capturedGroups[i]) {
        capturedGroups[i] = ++capturedGroupIndex;
      }
    }
    for (var i = 0, groupIndex = 0; i < n; ++i) {
      var p = parts[i];
      if (p === '(') {
        ++groupIndex;
        if (capturedGroups[groupIndex] === undefined) {
          parts[i] = '(?:';
        }
      } else if ('\\' === p.charAt(0)) {
        var decimalValue = +p.substring(1);
        if (decimalValue && decimalValue <= groupIndex) {
          // Look up the new number of the group being referenced, not of
          // the group that happens to have been opened most recently.
          parts[i] = '\\' + capturedGroups[decimalValue];
        }
      }
    }

    // Remove any prefix anchors so that the output will match anywhere.
    // ^^ really does mean an anchored match though.
    for (var i = 0; i < n; ++i) {
      if ('^' === parts[i] && '^' !== parts[i + 1]) { parts[i] = ''; }
    }

    // Expand letters to groups to handle mixing of case-sensitive and
    // case-insensitive patterns if necessary.
    if (regex.ignoreCase && needToFoldCase) {
      for (var i = 0; i < n; ++i) {
        var p = parts[i];
        var ch0 = p.charAt(0);
        if (p.length >= 2 && ch0 === '[') {
          parts[i] = caseFoldCharset(p);
        } else if (ch0 !== '\\') {
          // TODO: handle letters in numeric escapes.
          parts[i] = p.replace(
              /[a-zA-Z]/g,
              function (ch) {
                var cc = ch.charCodeAt(0);
                return '[' + String.fromCharCode(cc & ~32, cc | 32) + ']';
              });
        }
      }
    }

    return parts.join('');
  }

  var rewritten = [];
  for (var i = 0, n = regexs.length; i < n; ++i) {
    var regex = regexs[i];
    if (regex.global || regex.multiline) { throw new Error('' + regex); }
    rewritten.push(
        '(?:' + allowAnywhereFoldCaseAndRenumberGroups(regex) + ')');
  }

  return new RegExp(rewritten.join('|'), ignoreCase ? 'gi' : 'g');
}
@@ -0,0 +1,99 @@
/**
 * Split markup into a string of source code and an array mapping ranges in
 * that string to the text nodes in which they appear.
 *
 * <p>
 * The HTML DOM structure:</p>
 * <pre>
 * (Element   "p"
 *   (Element "b"
 *     (Text  "print "))       ; #1
 *   (Text    "'Hello '")      ; #2
 *   (Element "br")            ; #3
 *   (Text    "  + 'World';")) ; #4
 * </pre>
 * <p>
 * corresponds to the HTML
 * {@code <p><b>print </b>'Hello '<br>  + 'World';</p>}.</p>
 *
 * <p>
 * It will produce the output:</p>
 * <pre>
 * {
 *   source: "print 'Hello '\n  + 'World';",
 *   //                 1         2
 *   //       012345678901234 5678901234567
 *   spans: [0, #1, 6, #2, 14, #3, 15, #4]
 * }
 * </pre>
 * <p>
 * where #1 is a reference to the {@code "print "} text node above, and so
 * on for the other text nodes.
 * </p>
 *
 * <p>
 * The {@code spans} array is an array of pairs.  Even elements are the start
 * indices of substrings, and odd elements are the text nodes (or BR/LI
 * elements) that contain the text for those substrings.
 * Substrings continue until the next index or the end of the source.
 * </p>
 *
 * <p>
 * Elements whose class list contains {@code nocode} are skipped together with
 * their descendants.  Unless the computed {@code white-space} of {@code node}
 * starts with {@code pre}, runs of whitespace in text nodes are collapsed to
 * a single space; otherwise only line endings are normalized to
 * {@code "\n"}.  A single trailing newline is removed from the source.
 * </p>
 *
 * @param {Node} node an HTML DOM subtree containing source-code.
 * @return {Object} source code and the text nodes in which they occur, as
 *     {@code {source: string, spans: Array}}.
 */
function extractSourceSpans(node) {
  var nocode = /(?:^|\s)nocode(?:\s|$)/;

  var chunks = [];
  var length = 0;  // Running length of the source assembled so far.
  var spans = [];
  var k = 0;       // Index of the next chunk / span pair.

  // Determine the effective white-space style of the root node.  The lookups
  // are guarded so that a missing API (or a non-browser environment) simply
  // means "not preformatted" instead of a ReferenceError/TypeError.
  var whitespace;
  if (node.currentStyle) {
    whitespace = node.currentStyle.whiteSpace;
  } else if (typeof document !== 'undefined' && document.defaultView
             && document.defaultView.getComputedStyle) {
    var computedStyle = document.defaultView.getComputedStyle(node, null);
    whitespace = computedStyle
        && computedStyle.getPropertyValue('white-space');
  }
  var isPreformatted = whitespace && 'pre' === whitespace.substring(0, 3);

  function walk(node) {
    switch (node.nodeType) {
      case 1:  // Element
        if (nocode.test(node.className)) { return; }
        for (var child = node.firstChild; child; child = child.nextSibling) {
          walk(child);
        }
        // nodeName keeps its original case in XHTML/XML documents, so
        // normalize before comparing.
        var nodeName = node.nodeName.toLowerCase();
        if ('br' === nodeName || 'li' === nodeName) {
          // Line-breaking elements contribute a newline of their own.
          chunks[k] = '\n';
          spans[k << 1] = length++;
          spans[(k++ << 1) | 1] = node;
        }
        break;
      case 3: case 4:  // Text
        var text = node.nodeValue;
        if (text.length) {
          if (!isPreformatted) {
            text = text.replace(/[ \t\r\n]+/g, ' ');
          } else {
            text = text.replace(/\r\n?/g, '\n');  // Normalize newlines.
          }
          // TODO: handle tabs here?
          chunks[k] = text;
          spans[k << 1] = length;
          length += text.length;
          spans[(k++ << 1) | 1] = node;
        }
        break;
    }
  }

  walk(node);

  return {
    source: chunks.join('').replace(/\n$/, ''),
    spans: spans
  };
}