Skip to content

Commit b3ef91f

Browse files
committed
Improve Tokenizer performance by inlining text parsing and removing some Scanner::current calls
1 parent 7453ab0 commit b3ef91f

1 file changed

Lines changed: 63 additions & 21 deletions

File tree

src/HTML5/Parser/Tokenizer.php

Lines changed: 63 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -128,7 +128,31 @@ protected function consumeData()
128128
$this->characterReference();
129129
$this->tagOpen();
130130
$this->eof();
131-
$this->characterData();
131+
132+
// Inline the parsing of characters as it's the critical performance path
133+
$tok = $this->scanner->current();
134+
if ($tok !== false) {
135+
switch ($this->textMode) {
136+
case Elements::TEXT_RAW:
137+
$this->rawText($tok);
138+
break;
139+
140+
case Elements::TEXT_RCDATA:
141+
$this->rcdata($tok);
142+
break;
143+
144+
default:
145+
if (!strspn($tok, "<&")) {
146+
// NULL character
147+
if ($tok === "\00") {
148+
$this->parseError("Received null character.");
149+
}
150+
151+
$this->text .= $tok;
152+
$this->scanner->next();
153+
}
154+
}
155+
}
132156

133157
return $this->carryOn;
134158
}
@@ -148,64 +172,78 @@ protected function characterData()
148172
}
149173
switch ($this->textMode) {
150174
case Elements::TEXT_RAW:
151-
return $this->rawText();
175+
return $this->rawText($tok);
152176
case Elements::TEXT_RCDATA:
153-
return $this->rcdata();
177+
return $this->rcdata($tok);
154178
default:
155179
if (strspn($tok, "<&")) {
156180
return false;
157181
}
158-
return $this->text();
182+
return $this->text($tok);
159183
}
160184
}
161185

162186
/**
163187
* This buffers the current token as character data.
188+
*
189+
* @param string $tok The current token.
190+
*
191+
* @return bool
164192
*/
165-
protected function text()
193+
protected function text($tok)
166194
{
167-
$tok = $this->scanner->current();
168-
169195
// This should never happen...
170196
if ($tok === false) {
171197
return false;
172198
}
173-
// Null
199+
200+
// NULL character
174201
if ($tok === "\00") {
175202
$this->parseError("Received null character.");
176203
}
177-
// fprintf(STDOUT, "Writing '%s'", $tok);
204+
178205
$this->buffer($tok);
179206
$this->scanner->next();
207+
180208
return true;
181209
}
182210

183211
/**
184212
* Read text in RAW mode.
213+
*
214+
* @param string $tok The current token.
215+
*
216+
* @return bool
185217
*/
186-
protected function rawText()
218+
protected function rawText($tok)
187219
{
188220
if (is_null($this->untilTag)) {
189-
return $this->text();
221+
return $this->text($tok);
190222
}
223+
191224
$sequence = '</' . $this->untilTag . '>';
192225
$txt = $this->readUntilSequence($sequence);
193226
$this->events->text($txt);
194227
$this->setTextMode(0);
228+
195229
return $this->endTag();
196230
}
197231

198232
/**
199233
* Read text in RCDATA mode.
234+
*
235+
* @param string $tok The current token.
236+
*
237+
* @return bool
200238
*/
201-
protected function rcdata()
239+
protected function rcdata($tok)
202240
{
203241
if (is_null($this->untilTag)) {
204-
return $this->text();
242+
return $this->text($tok);
205243
}
244+
206245
$sequence = '</' . $this->untilTag;
207246
$txt = '';
208-
$tok = $this->scanner->current();
209247

210248
$caseSensitive = !Elements::isHtml5Element($this->untilTag);
211249
while ($tok !== false && ! ($tok == '<' && ($this->sequenceMatches($sequence, $caseSensitive)))) {
@@ -223,9 +261,11 @@ protected function rcdata()
223261
if ($this->scanner->current() !== '>') {
224262
$this->parseError("Unclosed RCDATA end tag");
225263
}
264+
226265
$this->scanner->unconsume($len);
227266
$this->events->text($txt);
228267
$this->setTextMode(0);
268+
229269
return $this->endTag();
230270
}
231271

@@ -279,7 +319,7 @@ protected function tagOpen()
279319
$this->scanner->next();
280320

281321
return $this->markupDeclaration() || $this->endTag() || $this->processingInstruction() || $this->tagName() ||
282-
/* This always returns false. */
322+
// This always returns false.
283323
$this->parseError("Illegal tag opening") || $this->characterData();
284324
}
285325

@@ -343,8 +383,9 @@ protected function endTag()
343383
// Trash whitespace.
344384
$this->scanner->whitespace();
345385

346-
if ($this->scanner->current() != '>') {
347-
$this->parseError("Expected >, got '%s'", $this->scanner->current());
386+
$tok = $this->scanner->current();
387+
if ($tok != '>') {
388+
$this->parseError("Expected >, got '%s'", $tok);
348389
// We just trash stuff until we get to the next tag close.
349390
$this->scanner->charsUntil('>');
350391
}
@@ -456,10 +497,11 @@ protected function attribute(&$attributes)
456497
$name = strtolower($this->scanner->charsUntil("/>=\n\f\t "));
457498

458499
if (strlen($name) == 0) {
459-
$this->parseError("Expected an attribute name, got %s.", $this->scanner->current());
500+
$tok = $this->scanner->current();
501+
$this->parseError("Expected an attribute name, got %s.", $tok);
460502
// Really, only '=' can be the char here. Everything else gets absorbed
461503
// under one rule or another.
462-
$name = $this->scanner->current();
504+
$name = $tok;
463505
$this->scanner->next();
464506
}
465507

@@ -556,7 +598,7 @@ protected function quotedAttributeValue($quote)
556598

557599
$tok = $this->scanner->current();
558600
if ($tok == '&') {
559-
$val .= $this->decodeCharacterReference(true, $tok);
601+
$val .= $this->decodeCharacterReference(true);
560602
continue;
561603
}
562604
break;
@@ -1032,6 +1074,7 @@ protected function parseError($msg)
10321074
$line = $this->scanner->currentLine();
10331075
$col = $this->scanner->columnOffset();
10341076
$this->events->parseError($msg, $line, $col);
1077+
10351078
return false;
10361079
}
10371080

@@ -1049,7 +1092,6 @@ protected function parseError($msg)
10491092
*/
10501093
protected function decodeCharacterReference($inAttribute = false)
10511094
{
1052-
10531095
// If it fails this, it's definitely not an entity.
10541096
if ($this->scanner->current() != '&') {
10551097
return false;

0 commit comments

Comments
 (0)