Skip to content
This repository was archived by the owner on Apr 22, 2020. It is now read-only.

Commit 0b3341b

Browse files
author
mikesamuel@gmail.com
committed
First pass at a way to dodge newline issues in IE.
1 parent 1160ad9 commit 0b3341b

12 files changed

Lines changed: 3386 additions & 1773 deletions

Makefile

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# Builds src/prettify.js from js-modules/prettify.js by textually expanding
# every line of the form
#     include("name.js");
# into the contents of js-modules/name.js.  Each included line is prefixed
# with the whitespace that preceded the include(...) call, so the inlined
# code keeps the indentation of the include site.
#
# "$$" is make's escape for a literal "$" handed to the shell/perl, so
# "$$$$" below is the two-scalar perl prototype "$$".
#
# If perl fails (for example because an included file is missing), the
# partially written output is removed AND the recipe exits non-zero, so
# make reports the failure instead of treating a deleted target as success.
src/prettify.js: js-modules/*.js
	perl -e '\
	  sub readInclude($$$$) {\
	    my $$prefix = $$_[0];\
	    my $$name = "js-modules/" . $$_[1];\
	    my $$buf = "";\
	    open(IN, "<$$name") or die "$$name: $$!";\
	    while (<IN>) {\
	      $$buf .= "$$prefix$$_";\
	    }\
	    return $$buf;\
	  }' \
	  -pe 's/^(\s*)include\("([^"]+)"\);/readInclude($$1, $$2)/ge' \
	  js-modules/prettify.js \
	  > src/prettify.js \
	  || { rm -f src/prettify.js; exit 1; }
Lines changed: 229 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,229 @@
1+
2+
/**
 * Given a group of {@link RegExp}s, returns a {@code RegExp} that globally
 * matches the union of the sets of strings matched by the input RegExp.
 * Since it matches globally, if the input strings have a start-of-input
 * anchor (/^.../), it is ignored for the purposes of unioning.
 *
 * <p>Capturing groups that are never the target of a back-reference are
 * rewritten as non-capturing groups, and the remaining groups (together with
 * the back-references that point at them) are renumbered so that they stay
 * consistent once all the inputs are joined into a single pattern.</p>
 *
 * <p>If case-insensitive and case-sensitive inputs that contain letters are
 * mixed, the result is case-sensitive and the case-insensitive inputs are
 * expanded letter by letter (e.g. {@code a} becomes {@code [Aa]}).</p>
 *
 * @param {Array.<RegExp>} regexs non multiline, non-global regexs.
 * @return {RegExp} a global regex.
 * @throws {Error} if any input regex is global or multiline.
 */
function combinePrefixPatterns(regexs) {
  // Number of capturing groups emitted so far across ALL input regexs.
  var capturedGroupIndex = 0;

  // Decide whether the output can simply carry the 'i' flag, or whether
  // case-insensitive inputs have to be expanded by hand because at least one
  // case-sensitive input contains a letter.
  var needToFoldCase = false;
  var ignoreCase = false;
  for (var i = 0, n = regexs.length; i < n; ++i) {
    var regex = regexs[i];
    if (regex.ignoreCase) {
      ignoreCase = true;
    } else if (/[a-z]/i.test(regex.source.replace(
                   /\\u[0-9a-f]{4}|\\x[0-9a-f]{2}|\\[^ux]/gi, ''))) {
      needToFoldCase = true;
      ignoreCase = false;
      break;
    }
  }

  /**
   * Maps one token of a character set (a literal character or an escape
   * sequence) to the character code it denotes.
   */
  function decodeEscape(charsetPart) {
    if (charsetPart.charAt(0) !== '\\') { return charsetPart.charCodeAt(0); }
    switch (charsetPart.charAt(1)) {
      case 'b': return 8;
      case 't': return 9;
      case 'n': return 0xa;
      case 'v': return 0xb;
      case 'f': return 0xc;
      case 'r': return 0xd;
      case 'u': case 'x':
        // A bare "\u" or "\x" (no hex digits) denotes the letter itself.
        // Test for NaN explicitly so that \x00 and \u0000 decode to 0
        // instead of falling through to the letter.
        var hexValue = parseInt(charsetPart.substring(2), 16);
        return isNaN(hexValue) ? charsetPart.charCodeAt(1) : hexValue;
      case '0': case '1': case '2': case '3': case '4':
      case '5': case '6': case '7':
        return parseInt(charsetPart.substring(1), 8);
      default: return charsetPart.charCodeAt(1);
    }
  }

  /**
   * Produces text for a character code that is safe to place anywhere
   * inside a character set.
   */
  function encodeEscape(charCode) {
    if (charCode < 0x20) {
      return (charCode < 0x10 ? '\\x0' : '\\x') + charCode.toString(16);
    }
    var ch = String.fromCharCode(charCode);
    // '^' must be escaped too: emitted first in a set it would negate it.
    if (ch === '\\' || ch === '-' || ch === '[' || ch === ']' || ch === '^') {
      ch = '\\' + ch;
    }
    return ch;
  }

  /**
   * Rewrites a character set such as {@code [a-c]} so that it matches both
   * cases of every letter it contains, e.g. {@code [A-Ca-c]}.
   */
  function caseFoldCharset(charSet) {
    // match() yields null for an empty set ("[]"); treat that as no parts.
    var charsetParts = charSet.substring(1, charSet.length - 1).match(
        new RegExp(
            '\\\\u[0-9A-Fa-f]{4}'
            + '|\\\\x[0-9A-Fa-f]{2}'
            + '|\\\\[0-3][0-7]{0,2}'
            + '|\\\\[0-7]{1,2}'
            + '|\\\\[\\s\\S]'
            + '|-'
            + '|[^-\\\\]',
            'g')) || [];
    var groups = [];
    var ranges = [];
    var inverse = charsetParts[0] === '^';
    for (var i = inverse ? 1 : 0, n = charsetParts.length; i < n; ++i) {
      var p = charsetParts[i];
      switch (p) {
        // Character classes are copied through verbatim.
        case '\\B': case '\\b':
        case '\\D': case '\\d':
        case '\\S': case '\\s':
        case '\\W': case '\\w':
          groups.push(p);
          continue;
      }
      var start = decodeEscape(p);
      var end;
      if (i + 2 < n && '-' === charsetParts[i + 1]) {
        end = decodeEscape(charsetParts[i + 2]);
        i += 2;
      } else {
        end = start;
      }
      ranges.push([start, end]);
      // If the range might intersect letters, then expand it.
      if (!(end < 65 || start > 122)) {
        if (!(end < 65 || start > 90)) {
          // Upper-case portion -> add the lower-case equivalent.
          ranges.push([Math.max(65, start) | 32, Math.min(end, 90) | 32]);
        }
        if (!(end < 97 || start > 122)) {
          // Lower-case portion -> add the upper-case equivalent.
          ranges.push([Math.max(97, start) & ~32, Math.min(end, 122) & ~32]);
        }
      }
    }

    // [[1, 10], [3, 4], [8, 12], [14, 14], [16, 16], [17, 17]]
    // -> [[1, 12], [14, 14], [16, 17]]
    ranges.sort(function (a, b) { return (a[0] - b[0]) || (b[1] - a[1]); });
    var consolidatedRanges = [];
    // NaN endpoints guarantee the first real range never merges with this.
    var lastRange = [NaN, NaN];
    for (var j = 0; j < ranges.length; ++j) {
      var candidate = ranges[j];
      if (candidate[0] <= lastRange[1] + 1) {
        lastRange[1] = Math.max(lastRange[1], candidate[1]);
      } else {
        consolidatedRanges.push(lastRange = candidate);
      }
    }

    var out = ['['];
    if (inverse) { out.push('^'); }
    out.push.apply(out, groups);
    for (var k = 0; k < consolidatedRanges.length; ++k) {
      var range = consolidatedRanges[k];
      out.push(encodeEscape(range[0]));
      if (range[1] > range[0]) {
        // Two adjacent characters need no '-' between them.
        if (range[1] > range[0] + 1) { out.push('-'); }
        out.push(encodeEscape(range[1]));
      }
    }
    out.push(']');
    return out.join('');
  }

  /**
   * Returns the source of {@code regex} rewritten so that it is unanchored,
   * case-folded if required, and uses group numbers that are valid in the
   * combined pattern.
   */
  function allowAnywhereFoldCaseAndRenumberGroups(regex) {
    // Split into character sets, escape sequences, punctuation strings
    // like ('(', '(?:', ')', '^'), and runs of characters that do not
    // include any of the above.
    var parts = regex.source.match(
        new RegExp(
            '(?:'
            + '\\[(?:[^\\x5C\\x5D]|\\\\[\\s\\S])*\\]'  // a character set
            + '|\\\\u[A-Fa-f0-9]{4}'  // a unicode escape
            + '|\\\\x[A-Fa-f0-9]{2}'  // a hex escape
            + '|\\\\[0-9]+'  // a back-reference or octal escape
            + '|\\\\[^ux0-9]'  // other escape sequence
            + '|\\(\\?[:!=]'  // start of a non-capturing group
            + '|[\\(\\)\\^]'  // start/end of a group, or line start
            + '|[^\\x5B\\x5C\\(\\)\\^]+'  // run of other characters
            + ')',
            'g')) || [];  // match() yields null for an empty source.
    var n = parts.length;

    // Maps captured group numbers to the number they will occupy in
    // the output or to -1 if that has not been determined, or to
    // undefined if they need not be capturing in the output.
    var capturedGroups = [];

    // Walk over and identify back references to build the capturedGroups
    // mapping.
    for (var i = 0, groupIndex = 0; i < n; ++i) {
      var p = parts[i];
      if (p === '(') {
        // groups are 1-indexed, so max group index is count of '('
        ++groupIndex;
      } else if ('\\' === p.charAt(0)) {
        var decimalValue = +p.substring(1);
        if (decimalValue && decimalValue <= groupIndex) {
          capturedGroups[decimalValue] = -1;
        }
      }
    }

    // Renumber groups and reduce capturing groups to non-capturing groups
    // where possible.
    for (var i = 1; i < capturedGroups.length; ++i) {
      if (-1 === capturedGroups[i]) {
        capturedGroups[i] = ++capturedGroupIndex;
      }
    }
    for (var i = 0, groupIndex = 0; i < n; ++i) {
      var p = parts[i];
      if (p === '(') {
        ++groupIndex;
        if (capturedGroups[groupIndex] === undefined) {
          parts[i] = '(?:';
        }
      } else if ('\\' === p.charAt(0)) {
        var decimalValue = +p.substring(1);
        if (decimalValue && decimalValue <= groupIndex) {
          // Look up the group the back-reference points at, not the
          // number of groups opened so far.
          parts[i] = '\\' + capturedGroups[decimalValue];
        }
      }
    }

    // Remove any prefix anchors so that the output will match anywhere.
    // ^^ really does mean an anchored match though.
    for (var i = 0; i < n; ++i) {
      if ('^' === parts[i] && '^' !== parts[i + 1]) { parts[i] = ''; }
    }

    // Expand letters to groups to handle mixing of case-sensitive and
    // case-insensitive patterns if necessary.
    if (regex.ignoreCase && needToFoldCase) {
      for (var i = 0; i < n; ++i) {
        var p = parts[i];
        var ch0 = p.charAt(0);
        if (p.length >= 2 && ch0 === '[') {
          parts[i] = caseFoldCharset(p);
        } else if (ch0 !== '\\') {
          // TODO: handle letters in numeric escapes.
          parts[i] = p.replace(
              /[a-zA-Z]/g,
              function (ch) {
                var cc = ch.charCodeAt(0);
                return '[' + String.fromCharCode(cc & ~32, cc | 32) + ']';
              });
        }
      }
    }

    return parts.join('');
  }

  var rewritten = [];
  for (var i = 0, n = regexs.length; i < n; ++i) {
    var regex = regexs[i];
    if (regex.global || regex.multiline) { throw new Error('' + regex); }
    rewritten.push(
        '(?:' + allowAnywhereFoldCaseAndRenumberGroups(regex) + ')');
  }

  return new RegExp(rewritten.join('|'), ignoreCase ? 'gi' : 'g');
}

js-modules/extractSourceSpans.js

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
/**
 * Split markup into a string of source code and an array mapping ranges in
 * that string to the text nodes in which they appear.
 *
 * <p>
 * The HTML DOM structure:</p>
 * <pre>
 * (Element   "p"
 *   (Element "b"
 *     (Text  "print "))       ; #1
 *   (Text    "'Hello '")      ; #2
 *   (Element "br")            ; #3
 *   (Text    "  + 'World';")) ; #4
 * </pre>
 * <p>
 * corresponds to the HTML
 * {@code <p><b>print </b>'Hello '<br>  + 'World';</p>}.</p>
 *
 * <p>
 * It will produce the output:</p>
 * <pre>
 * {
 *   source: "print 'Hello '\n  + 'World';",
 *   //                 1         2
 *   //       012345678901234 5678901234567
 *   spans: [0, #1, 6, #2, 14, #3, 15, #4]
 * }
 * </pre>
 * <p>
 * where #1 is a reference to the {@code "print "} text node above, and so
 * on for the other text nodes.
 * </p>
 *
 * <p>
 * The {@code spans} array is an array of pairs.  Even elements are the start
 * indices of substrings, and odd elements are the text nodes (or BR/LI
 * elements) that contain the text for those substrings.
 * Substrings continue until the next index or the end of the source.
 * </p>
 *
 * <p>
 * Elements whose class name contains the word {@code nocode} are skipped
 * together with their descendants.  A single trailing newline is stripped
 * from the returned source.
 * </p>
 *
 * @param {Node} node an HTML DOM subtree containing source-code.
 * @return {Object} source code and the text nodes in which they occur.
 */
function extractSourceSpans(node) {
  var nocode = /(?:^|\s)nocode(?:\s|$)/;

  var chunks = [];  // chunks[k] is the text contributed by the k-th span
  var length = 0;   // total length of all chunks collected so far
  var spans = [];   // [start0, node0, start1, node1, ...]
  var k = 0;        // index of the next span

  // Find the effective CSS white-space value of the root: currentStyle is
  // the legacy IE API, getComputedStyle the standard one.
  var whitespace;
  if (node.currentStyle) {
    whitespace = node.currentStyle.whiteSpace;
  } else if (window.getComputedStyle) {
    var computedStyle = document.defaultView.getComputedStyle(node, null);
    // Guard against a null style object instead of dereferencing it blindly.
    if (computedStyle) {
      whitespace = computedStyle.getPropertyValue('white-space');
    }
  }
  // Matches 'pre', 'pre-wrap' and 'pre-line'.
  var isPreformatted = whitespace && 'pre' === whitespace.substring(0, 3);

  function walk(node) {
    switch (node.nodeType) {
      case 1:  // Element
        if (nocode.test(node.className)) { return; }
        for (var child = node.firstChild; child; child = child.nextSibling) {
          walk(child);
        }
        // nodeName is upper-case in HTML documents but keeps its source case
        // in XHTML/XML documents, so normalize before comparing.
        var nodeName = node.nodeName.toUpperCase();
        if ('BR' === nodeName || 'LI' === nodeName) {
          // Line-breaking elements contribute a one character newline span
          // that is attributed to the element itself.
          chunks[k] = '\n';
          spans[k << 1] = length++;
          spans[(k++ << 1) | 1] = node;
        }
        break;
      case 3: case 4:  // Text
        var text = node.nodeValue;
        if (text.length) {
          if (!isPreformatted) {
            // Collapse whitespace runs the way the browser would render them.
            text = text.replace(/[ \t\r\n]+/g, ' ');
          } else {
            text = text.replace(/\r\n?/g, '\n');  // Normalize newlines.
          }
          // TODO: handle tabs here?
          chunks[k] = text;
          spans[k << 1] = length;
          length += text.length;
          spans[(k++ << 1) | 1] = node;
        }
        break;
    }
  }

  walk(node);

  return {
    source: chunks.join('').replace(/\n$/, ''),
    spans: spans
  };
}

0 commit comments

Comments
 (0)