@@ -128,7 +128,31 @@ protected function consumeData()
128128 $ this ->characterReference ();
129129 $ this ->tagOpen ();
130130 $ this ->eof ();
131- $ this ->characterData ();
131+
132+ // Inline the parsing of characters as it's the critical performance path
133+ $ tok = $ this ->scanner ->current ();
134+ if ($ tok !== false ) {
135+ switch ($ this ->textMode ) {
136+ case Elements::TEXT_RAW :
137+ $ this ->rawText ($ tok );
138+ break ;
139+
140+ case Elements::TEXT_RCDATA :
141+ $ this ->rcdata ($ tok );
142+ break ;
143+
144+ default :
145+ if (!strspn ($ tok , "<& " )) {
146+ // NULL character
147+ if ($ tok === "\00" ) {
148+ $ this ->parseError ("Received null character. " );
149+ }
150+
151+ $ this ->text .= $ tok ;
152+ $ this ->scanner ->next ();
153+ }
154+ }
155+ }
132156
133157 return $ this ->carryOn ;
134158 }
@@ -148,64 +172,78 @@ protected function characterData()
148172 }
149173 switch ($ this ->textMode ) {
150174 case Elements::TEXT_RAW :
151- return $ this ->rawText ();
175+ return $ this ->rawText ($ tok );
152176 case Elements::TEXT_RCDATA :
153- return $ this ->rcdata ();
177+ return $ this ->rcdata ($ tok );
154178 default :
155179 if (strspn ($ tok , "<& " )) {
156180 return false ;
157181 }
158- return $ this ->text ();
182+ return $ this ->text ($ tok );
159183 }
160184 }
161185
162186 /**
163187 * This buffers the current token as character data.
188+ *
189+ * @param string $tok The current token.
190+ *
191+ * @return bool
164192 */
165- protected function text ()
193+ protected function text ($ tok )
166194 {
167- $ tok = $ this ->scanner ->current ();
168-
169195 // This should never happen...
170196 if ($ tok === false ) {
171197 return false ;
172198 }
173- // Null
199+
200+ // NULL character
174201 if ($ tok === "\00" ) {
175202 $ this ->parseError ("Received null character. " );
176203 }
177- // fprintf(STDOUT, "Writing '%s'", $tok);
204+
178205 $ this ->buffer ($ tok );
179206 $ this ->scanner ->next ();
207+
180208 return true ;
181209 }
182210
183211 /**
184212 * Read text in RAW mode.
213+ *
214+ * @param string $tok The current token.
215+ *
216+ * @return bool
185217 */
186- protected function rawText ()
218+ protected function rawText ($ tok )
187219 {
188220 if (is_null ($ this ->untilTag )) {
189- return $ this ->text ();
221+ return $ this ->text ($ tok );
190222 }
223+
191224 $ sequence = '</ ' . $ this ->untilTag . '> ' ;
192225 $ txt = $ this ->readUntilSequence ($ sequence );
193226 $ this ->events ->text ($ txt );
194227 $ this ->setTextMode (0 );
228+
195229 return $ this ->endTag ();
196230 }
197231
198232 /**
199233 * Read text in RCDATA mode.
234+ *
235+ * @param string $tok The current token.
236+ *
237+ * @return bool
200238 */
201- protected function rcdata ()
239+ protected function rcdata ($ tok )
202240 {
203241 if (is_null ($ this ->untilTag )) {
204- return $ this ->text ();
242+ return $ this ->text ($ tok );
205243 }
244+
206245 $ sequence = '</ ' . $ this ->untilTag ;
207246 $ txt = '' ;
208- $ tok = $ this ->scanner ->current ();
209247
210248 $ caseSensitive = !Elements::isHtml5Element ($ this ->untilTag );
211249 while ($ tok !== false && ! ($ tok == '< ' && ($ this ->sequenceMatches ($ sequence , $ caseSensitive )))) {
@@ -223,9 +261,11 @@ protected function rcdata()
223261 if ($ this ->scanner ->current () !== '> ' ) {
224262 $ this ->parseError ("Unclosed RCDATA end tag " );
225263 }
264+
226265 $ this ->scanner ->unconsume ($ len );
227266 $ this ->events ->text ($ txt );
228267 $ this ->setTextMode (0 );
268+
229269 return $ this ->endTag ();
230270 }
231271
@@ -279,7 +319,7 @@ protected function tagOpen()
279319 $ this ->scanner ->next ();
280320
281321 return $ this ->markupDeclaration () || $ this ->endTag () || $ this ->processingInstruction () || $ this ->tagName () ||
282- /* This always returns false. */
322+ // This always returns false.
283323 $ this ->parseError ("Illegal tag opening " ) || $ this ->characterData ();
284324 }
285325
@@ -343,8 +383,9 @@ protected function endTag()
343383 // Trash whitespace.
344384 $ this ->scanner ->whitespace ();
345385
346- if ($ this ->scanner ->current () != '> ' ) {
347- $ this ->parseError ("Expected >, got '%s' " , $ this ->scanner ->current ());
386+ $ tok = $ this ->scanner ->current ();
387+ if ($ tok != '> ' ) {
388+ $ this ->parseError ("Expected >, got '%s' " , $ tok );
348389 // We just trash stuff until we get to the next tag close.
349390 $ this ->scanner ->charsUntil ('> ' );
350391 }
@@ -456,10 +497,11 @@ protected function attribute(&$attributes)
456497 $ name = strtolower ($ this ->scanner ->charsUntil ("/>= \n\f\t " ));
457498
458499 if (strlen ($ name ) == 0 ) {
459- $ this ->parseError ("Expected an attribute name, got %s. " , $ this ->scanner ->current ());
500+ $ tok = $ this ->scanner ->current ();
501+ $ this ->parseError ("Expected an attribute name, got %s. " , $ tok );
460502 // Really, only '=' can be the char here. Everything else gets absorbed
461503 // under one rule or another.
462- $ name = $ this -> scanner -> current () ;
504+ $ name = $ tok ;
463505 $ this ->scanner ->next ();
464506 }
465507
@@ -556,7 +598,7 @@ protected function quotedAttributeValue($quote)
556598
557599 $ tok = $ this ->scanner ->current ();
558600 if ($ tok == '& ' ) {
559- $ val .= $ this ->decodeCharacterReference (true , $ tok );
601+ $ val .= $ this ->decodeCharacterReference (true );
560602 continue ;
561603 }
562604 break ;
@@ -1032,6 +1074,7 @@ protected function parseError($msg)
10321074 $ line = $ this ->scanner ->currentLine ();
10331075 $ col = $ this ->scanner ->columnOffset ();
10341076 $ this ->events ->parseError ($ msg , $ line , $ col );
1077+
10351078 return false ;
10361079 }
10371080
@@ -1049,7 +1092,6 @@ protected function parseError($msg)
10491092 */
10501093 protected function decodeCharacterReference ($ inAttribute = false )
10511094 {
1052-
10531095 // If it fails this, it's definitely not an entity.
10541096 if ($ this ->scanner ->current () != '& ' ) {
10551097 return false ;
0 commit comments