source: trip-planner-front/node_modules/parse5/lib/tokenizer/index.js@ ceaed42

Last change on this file since ceaed42 was 6a3a178, checked in by Ema <ema_spirova@…>, 3 years ago

initial commit

  • Property mode set to 100644
File size: 78.3 KB
Line 
1'use strict';
2
3const Preprocessor = require('./preprocessor');
4const unicode = require('../common/unicode');
5const neTree = require('./named-entity-data');
6const ERR = require('../common/error-codes');
7
8//Aliases
9const $ = unicode.CODE_POINTS;
10const $$ = unicode.CODE_POINT_SEQUENCES;
11
12//C1 Unicode control character reference replacements
// Lookup table keyed by code points in the 0x80-0x9f range; each value is the code point that
// replaces the key. Some keys in that range (0x81, 0x8d, 0x8f, 0x90, 0x9d) have no entry.
// NOTE(review): the lookup site is outside this view - presumably numeric character reference
// resolution; confirm against the rest of the file.
const C1_CONTROLS_REFERENCE_REPLACEMENTS = {
    0x80: 0x20ac,
    0x82: 0x201a,
    0x83: 0x0192,
    0x84: 0x201e,
    0x85: 0x2026,
    0x86: 0x2020,
    0x87: 0x2021,
    0x88: 0x02c6,
    0x89: 0x2030,
    0x8a: 0x0160,
    0x8b: 0x2039,
    0x8c: 0x0152,
    0x8e: 0x017d,
    0x91: 0x2018,
    0x92: 0x2019,
    0x93: 0x201c,
    0x94: 0x201d,
    0x95: 0x2022,
    0x96: 0x2013,
    0x97: 0x2014,
    0x98: 0x02dc,
    0x99: 0x2122,
    0x9a: 0x0161,
    0x9b: 0x203a,
    0x9c: 0x0153,
    0x9e: 0x017e,
    0x9f: 0x0178
};
42
43// Named entity tree flags
// Bit flags tested against marker entries of the named entity tree (neTree)
// in _matchNamedCharacterReference.
// Set when the node carries result data.
const HAS_DATA_FLAG = 1 << 0;
// Set when the result data spans two tree entries instead of one.
const DATA_DUPLET_FLAG = 1 << 1;
// Set when the node has child branches that can be searched.
const HAS_BRANCHES_FLAG = 1 << 2;
// All three flags combined (7). A tree entry strictly below this value is treated as a node
// marker; any other entry is treated as a code point to compare against the input.
const MAX_BRANCH_MARKER_VALUE = HAS_DATA_FLAG | DATA_DUPLET_FLAG | HAS_BRANCHES_FLAG;
48
49//States
// Tokenizer state names. Each string doubles as the name of the Tokenizer method implementing
// that state: handlers are declared with computed keys (e.g. `[DATA_STATE](cp)`) and dispatched
// through `this[this.state](cp)`, so the values must stay unique.
const DATA_STATE = 'DATA_STATE';
const RCDATA_STATE = 'RCDATA_STATE';
const RAWTEXT_STATE = 'RAWTEXT_STATE';
const SCRIPT_DATA_STATE = 'SCRIPT_DATA_STATE';
const PLAINTEXT_STATE = 'PLAINTEXT_STATE';
const TAG_OPEN_STATE = 'TAG_OPEN_STATE';
const END_TAG_OPEN_STATE = 'END_TAG_OPEN_STATE';
const TAG_NAME_STATE = 'TAG_NAME_STATE';
const RCDATA_LESS_THAN_SIGN_STATE = 'RCDATA_LESS_THAN_SIGN_STATE';
const RCDATA_END_TAG_OPEN_STATE = 'RCDATA_END_TAG_OPEN_STATE';
const RCDATA_END_TAG_NAME_STATE = 'RCDATA_END_TAG_NAME_STATE';
const RAWTEXT_LESS_THAN_SIGN_STATE = 'RAWTEXT_LESS_THAN_SIGN_STATE';
const RAWTEXT_END_TAG_OPEN_STATE = 'RAWTEXT_END_TAG_OPEN_STATE';
const RAWTEXT_END_TAG_NAME_STATE = 'RAWTEXT_END_TAG_NAME_STATE';
const SCRIPT_DATA_LESS_THAN_SIGN_STATE = 'SCRIPT_DATA_LESS_THAN_SIGN_STATE';
const SCRIPT_DATA_END_TAG_OPEN_STATE = 'SCRIPT_DATA_END_TAG_OPEN_STATE';
const SCRIPT_DATA_END_TAG_NAME_STATE = 'SCRIPT_DATA_END_TAG_NAME_STATE';
const SCRIPT_DATA_ESCAPE_START_STATE = 'SCRIPT_DATA_ESCAPE_START_STATE';
const SCRIPT_DATA_ESCAPE_START_DASH_STATE = 'SCRIPT_DATA_ESCAPE_START_DASH_STATE';
const SCRIPT_DATA_ESCAPED_STATE = 'SCRIPT_DATA_ESCAPED_STATE';
const SCRIPT_DATA_ESCAPED_DASH_STATE = 'SCRIPT_DATA_ESCAPED_DASH_STATE';
const SCRIPT_DATA_ESCAPED_DASH_DASH_STATE = 'SCRIPT_DATA_ESCAPED_DASH_DASH_STATE';
const SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE = 'SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE';
const SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE = 'SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE';
const SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE = 'SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE';
const SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE';
const SCRIPT_DATA_DOUBLE_ESCAPED_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPED_STATE';
const SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE';
const SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE';
const SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE';
const SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE';
const BEFORE_ATTRIBUTE_NAME_STATE = 'BEFORE_ATTRIBUTE_NAME_STATE';
const ATTRIBUTE_NAME_STATE = 'ATTRIBUTE_NAME_STATE';
const AFTER_ATTRIBUTE_NAME_STATE = 'AFTER_ATTRIBUTE_NAME_STATE';
const BEFORE_ATTRIBUTE_VALUE_STATE = 'BEFORE_ATTRIBUTE_VALUE_STATE';
const ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE = 'ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE';
const ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE = 'ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE';
const ATTRIBUTE_VALUE_UNQUOTED_STATE = 'ATTRIBUTE_VALUE_UNQUOTED_STATE';
const AFTER_ATTRIBUTE_VALUE_QUOTED_STATE = 'AFTER_ATTRIBUTE_VALUE_QUOTED_STATE';
const SELF_CLOSING_START_TAG_STATE = 'SELF_CLOSING_START_TAG_STATE';
const BOGUS_COMMENT_STATE = 'BOGUS_COMMENT_STATE';
const MARKUP_DECLARATION_OPEN_STATE = 'MARKUP_DECLARATION_OPEN_STATE';
const COMMENT_START_STATE = 'COMMENT_START_STATE';
const COMMENT_START_DASH_STATE = 'COMMENT_START_DASH_STATE';
const COMMENT_STATE = 'COMMENT_STATE';
const COMMENT_LESS_THAN_SIGN_STATE = 'COMMENT_LESS_THAN_SIGN_STATE';
const COMMENT_LESS_THAN_SIGN_BANG_STATE = 'COMMENT_LESS_THAN_SIGN_BANG_STATE';
const COMMENT_LESS_THAN_SIGN_BANG_DASH_STATE = 'COMMENT_LESS_THAN_SIGN_BANG_DASH_STATE';
const COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH_STATE = 'COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH_STATE';
const COMMENT_END_DASH_STATE = 'COMMENT_END_DASH_STATE';
const COMMENT_END_STATE = 'COMMENT_END_STATE';
const COMMENT_END_BANG_STATE = 'COMMENT_END_BANG_STATE';
const DOCTYPE_STATE = 'DOCTYPE_STATE';
const BEFORE_DOCTYPE_NAME_STATE = 'BEFORE_DOCTYPE_NAME_STATE';
const DOCTYPE_NAME_STATE = 'DOCTYPE_NAME_STATE';
const AFTER_DOCTYPE_NAME_STATE = 'AFTER_DOCTYPE_NAME_STATE';
const AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE = 'AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE';
const BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE = 'BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE';
const DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE = 'DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE';
const DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE = 'DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE';
const AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE = 'AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE';
const BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE = 'BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE';
const AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE = 'AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE';
const BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE = 'BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE';
const DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE = 'DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE';
const DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE = 'DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE';
const AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE = 'AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE';
const BOGUS_DOCTYPE_STATE = 'BOGUS_DOCTYPE_STATE';
const CDATA_SECTION_STATE = 'CDATA_SECTION_STATE';
const CDATA_SECTION_BRACKET_STATE = 'CDATA_SECTION_BRACKET_STATE';
const CDATA_SECTION_END_STATE = 'CDATA_SECTION_END_STATE';
const CHARACTER_REFERENCE_STATE = 'CHARACTER_REFERENCE_STATE';
const NAMED_CHARACTER_REFERENCE_STATE = 'NAMED_CHARACTER_REFERENCE_STATE';
// NOTE: the string value below is spelled 'AMBIGUOS...' (missing a 'U'), unlike the identifier.
// Code that goes through the constant is unaffected; the literal is left as-is on purpose.
const AMBIGUOUS_AMPERSAND_STATE = 'AMBIGUOS_AMPERSAND_STATE';
const NUMERIC_CHARACTER_REFERENCE_STATE = 'NUMERIC_CHARACTER_REFERENCE_STATE';
const HEXADEMICAL_CHARACTER_REFERENCE_START_STATE = 'HEXADEMICAL_CHARACTER_REFERENCE_START_STATE';
const DECIMAL_CHARACTER_REFERENCE_START_STATE = 'DECIMAL_CHARACTER_REFERENCE_START_STATE';
const HEXADEMICAL_CHARACTER_REFERENCE_STATE = 'HEXADEMICAL_CHARACTER_REFERENCE_STATE';
const DECIMAL_CHARACTER_REFERENCE_STATE = 'DECIMAL_CHARACTER_REFERENCE_STATE';
const NUMERIC_CHARACTER_REFERENCE_END_STATE = 'NUMERIC_CHARACTER_REFERENCE_END_STATE';
130
131//Utils
132
//OPTIMIZATION: these utility functions should not be moved out of this module. V8 Crankshaft will not inline
//these functions if they are located in another module, due to the context switch.
//Always perform an inlining check before modifying these functions ('node --trace-inlining').
// Returns true when `cp` is one of the four code points this tokenizer treats as whitespace:
// space, line feed, tabulation or form feed.
function isWhitespace(cp) {
    switch (cp) {
        case $.SPACE:
        case $.LINE_FEED:
        case $.TABULATION:
        case $.FORM_FEED:
            return true;
        default:
            return false;
    }
}
139
// Returns true when `cp` lies in the inclusive range DIGIT_0..DIGIT_9.
function isAsciiDigit(cp) {
    const atLeastZero = cp >= $.DIGIT_0;
    const atMostNine = cp <= $.DIGIT_9;

    return atLeastZero && atMostNine;
}
143
// Returns true when `cp` lies in the inclusive range LATIN_CAPITAL_A..LATIN_CAPITAL_Z.
function isAsciiUpper(cp) {
    const atLeastA = cp >= $.LATIN_CAPITAL_A;
    const atMostZ = cp <= $.LATIN_CAPITAL_Z;

    return atLeastA && atMostZ;
}
147
// Returns true when `cp` lies in the inclusive range LATIN_SMALL_A..LATIN_SMALL_Z.
function isAsciiLower(cp) {
    const atLeastA = cp >= $.LATIN_SMALL_A;
    const atMostZ = cp <= $.LATIN_SMALL_Z;

    return atLeastA && atMostZ;
}
151
// Returns true when `cp` is an ASCII letter of either case (lower case is checked first).
function isAsciiLetter(cp) {
    if (isAsciiLower(cp)) {
        return true;
    }

    return isAsciiUpper(cp);
}
155
// Returns true when `cp` is an ASCII letter or an ASCII digit.
function isAsciiAlphaNumeric(cp) {
    if (isAsciiLetter(cp)) {
        return true;
    }

    return isAsciiDigit(cp);
}
159
// Returns true when `cp` lies in the inclusive range LATIN_CAPITAL_A..LATIN_CAPITAL_F.
function isAsciiUpperHexDigit(cp) {
    const atLeastA = cp >= $.LATIN_CAPITAL_A;
    const atMostF = cp <= $.LATIN_CAPITAL_F;

    return atLeastA && atMostF;
}
163
// Returns true when `cp` lies in the inclusive range LATIN_SMALL_A..LATIN_SMALL_F.
function isAsciiLowerHexDigit(cp) {
    const atLeastA = cp >= $.LATIN_SMALL_A;
    const atMostF = cp <= $.LATIN_SMALL_F;

    return atLeastA && atMostF;
}
167
// Returns true when `cp` is a hexadecimal digit: 0-9, A-F or a-f (checked in that order).
function isAsciiHexDigit(cp) {
    if (isAsciiDigit(cp)) {
        return true;
    }

    if (isAsciiUpperHexDigit(cp)) {
        return true;
    }

    return isAsciiLowerHexDigit(cp);
}
171
// Adds 0x20 to `cp`, which maps an upper-case ASCII letter onto its lower-case counterpart.
// No range check is performed; callers are expected to pass an upper-case letter.
function toAsciiLowerCodePoint(cp) {
    const caseOffset = 0x0020;

    return cp + caseOffset;
}
175
176//NOTE: String.fromCharCode() function can handle only characters from BMP subset.
177//So, we need to workaround this manually.
178//(see: https://developer.mozilla.org/en-US/docs/JavaScript/Reference/Global_Objects/String/fromCharCode#Getting_it_to_work_with_higher_values)
// Converts a code point to a string. Values up to 0xffff map to a single UTF-16 code unit;
// larger values are split manually into a high/low surrogate pair.
function toChar(cp) {
    const fitsInSingleUnit = cp <= 0xffff;

    if (fitsInSingleUnit) {
        return String.fromCharCode(cp);
    }

    const offset = cp - 0x10000;
    const highSurrogate = ((offset >>> 10) & 0x3ff) | 0xd800;
    const lowSurrogate = 0xdc00 | (offset & 0x3ff);

    return String.fromCharCode(highSurrogate, lowSurrogate);
}
187
// Returns the one-character string for `cp` shifted to lower case via toAsciiLowerCodePoint().
function toAsciiLowerChar(cp) {
    const loweredCp = toAsciiLowerCodePoint(cp);

    return String.fromCharCode(loweredCp);
}
191
// Binary-searches the branches of the named entity tree node at `nodeIx` for code point `cp`.
// Layout read from neTree: [nodeIx + 1] holds the branch count, followed by that many branch
// code points (searched here, so they must be sorted ascending), followed by the same number
// of branch targets. Returns the target stored for the matching branch, or -1 when none matches.
function findNamedEntityTreeBranch(nodeIx, cp) {
    const branchCount = neTree[nodeIx + 1];
    const firstKeyIx = nodeIx + 2;
    let low = firstKeyIx;
    let high = firstKeyIx + branchCount - 1;

    while (low <= high) {
        const middle = (low + high) >>> 1;
        const candidate = neTree[middle];

        if (candidate < cp) {
            low = middle + 1;
            continue;
        }

        if (candidate > cp) {
            high = middle - 1;
            continue;
        }

        // The target for key k is stored exactly branchCount entries after the key itself.
        return neTree[middle + branchCount];
    }

    return -1;
}
212
213//Tokenizer
214class Tokenizer {
215 constructor() {
216 this.preprocessor = new Preprocessor();
217
218 this.tokenQueue = [];
219
220 this.allowCDATA = false;
221
222 this.state = DATA_STATE;
223 this.returnState = '';
224
225 this.charRefCode = -1;
226 this.tempBuff = [];
227 this.lastStartTagName = '';
228
229 this.consumedAfterSnapshot = -1;
230 this.active = false;
231
232 this.currentCharacterToken = null;
233 this.currentToken = null;
234 this.currentAttr = null;
235 }
236
237 //Errors
238 _err() {
239 // NOTE: err reporting is noop by default. Enabled by mixin.
240 }
241
242 _errOnNextCodePoint(err) {
243 this._consume();
244 this._err(err);
245 this._unconsume();
246 }
247
248 //API
249 getNextToken() {
250 while (!this.tokenQueue.length && this.active) {
251 this.consumedAfterSnapshot = 0;
252
253 const cp = this._consume();
254
255 if (!this._ensureHibernation()) {
256 this[this.state](cp);
257 }
258 }
259
260 return this.tokenQueue.shift();
261 }
262
263 write(chunk, isLastChunk) {
264 this.active = true;
265 this.preprocessor.write(chunk, isLastChunk);
266 }
267
268 insertHtmlAtCurrentPos(chunk) {
269 this.active = true;
270 this.preprocessor.insertHtmlAtCurrentPos(chunk);
271 }
272
273 //Hibernation
274 _ensureHibernation() {
275 if (this.preprocessor.endOfChunkHit) {
276 for (; this.consumedAfterSnapshot > 0; this.consumedAfterSnapshot--) {
277 this.preprocessor.retreat();
278 }
279
280 this.active = false;
281 this.tokenQueue.push({ type: Tokenizer.HIBERNATION_TOKEN });
282
283 return true;
284 }
285
286 return false;
287 }
288
289 //Consumption
290 _consume() {
291 this.consumedAfterSnapshot++;
292 return this.preprocessor.advance();
293 }
294
295 _unconsume() {
296 this.consumedAfterSnapshot--;
297 this.preprocessor.retreat();
298 }
299
300 _reconsumeInState(state) {
301 this.state = state;
302 this._unconsume();
303 }
304
305 _consumeSequenceIfMatch(pattern, startCp, caseSensitive) {
306 let consumedCount = 0;
307 let isMatch = true;
308 const patternLength = pattern.length;
309 let patternPos = 0;
310 let cp = startCp;
311 let patternCp = void 0;
312
313 for (; patternPos < patternLength; patternPos++) {
314 if (patternPos > 0) {
315 cp = this._consume();
316 consumedCount++;
317 }
318
319 if (cp === $.EOF) {
320 isMatch = false;
321 break;
322 }
323
324 patternCp = pattern[patternPos];
325
326 if (cp !== patternCp && (caseSensitive || cp !== toAsciiLowerCodePoint(patternCp))) {
327 isMatch = false;
328 break;
329 }
330 }
331
332 if (!isMatch) {
333 while (consumedCount--) {
334 this._unconsume();
335 }
336 }
337
338 return isMatch;
339 }
340
341 //Temp buffer
342 _isTempBufferEqualToScriptString() {
343 if (this.tempBuff.length !== $$.SCRIPT_STRING.length) {
344 return false;
345 }
346
347 for (let i = 0; i < this.tempBuff.length; i++) {
348 if (this.tempBuff[i] !== $$.SCRIPT_STRING[i]) {
349 return false;
350 }
351 }
352
353 return true;
354 }
355
356 //Token creation
357 _createStartTagToken() {
358 this.currentToken = {
359 type: Tokenizer.START_TAG_TOKEN,
360 tagName: '',
361 selfClosing: false,
362 ackSelfClosing: false,
363 attrs: []
364 };
365 }
366
367 _createEndTagToken() {
368 this.currentToken = {
369 type: Tokenizer.END_TAG_TOKEN,
370 tagName: '',
371 selfClosing: false,
372 attrs: []
373 };
374 }
375
376 _createCommentToken() {
377 this.currentToken = {
378 type: Tokenizer.COMMENT_TOKEN,
379 data: ''
380 };
381 }
382
383 _createDoctypeToken(initialName) {
384 this.currentToken = {
385 type: Tokenizer.DOCTYPE_TOKEN,
386 name: initialName,
387 forceQuirks: false,
388 publicId: null,
389 systemId: null
390 };
391 }
392
393 _createCharacterToken(type, ch) {
394 this.currentCharacterToken = {
395 type: type,
396 chars: ch
397 };
398 }
399
400 _createEOFToken() {
401 this.currentToken = { type: Tokenizer.EOF_TOKEN };
402 }
403
404 //Tag attributes
405 _createAttr(attrNameFirstCh) {
406 this.currentAttr = {
407 name: attrNameFirstCh,
408 value: ''
409 };
410 }
411
412 _leaveAttrName(toState) {
413 if (Tokenizer.getTokenAttr(this.currentToken, this.currentAttr.name) === null) {
414 this.currentToken.attrs.push(this.currentAttr);
415 } else {
416 this._err(ERR.duplicateAttribute);
417 }
418
419 this.state = toState;
420 }
421
422 _leaveAttrValue(toState) {
423 this.state = toState;
424 }
425
426 //Token emission
427 _emitCurrentToken() {
428 this._emitCurrentCharacterToken();
429
430 const ct = this.currentToken;
431
432 this.currentToken = null;
433
434 //NOTE: store emited start tag's tagName to determine is the following end tag token is appropriate.
435 if (ct.type === Tokenizer.START_TAG_TOKEN) {
436 this.lastStartTagName = ct.tagName;
437 } else if (ct.type === Tokenizer.END_TAG_TOKEN) {
438 if (ct.attrs.length > 0) {
439 this._err(ERR.endTagWithAttributes);
440 }
441
442 if (ct.selfClosing) {
443 this._err(ERR.endTagWithTrailingSolidus);
444 }
445 }
446
447 this.tokenQueue.push(ct);
448 }
449
450 _emitCurrentCharacterToken() {
451 if (this.currentCharacterToken) {
452 this.tokenQueue.push(this.currentCharacterToken);
453 this.currentCharacterToken = null;
454 }
455 }
456
457 _emitEOFToken() {
458 this._createEOFToken();
459 this._emitCurrentToken();
460 }
461
462 //Characters emission
463
464 //OPTIMIZATION: specification uses only one type of character tokens (one token per character).
465 //This causes a huge memory overhead and a lot of unnecessary parser loops. parse5 uses 3 groups of characters.
466 //If we have a sequence of characters that belong to the same group, parser can process it
467 //as a single solid character token.
468 //So, there are 3 types of character tokens in parse5:
469 //1)NULL_CHARACTER_TOKEN - \u0000-character sequences (e.g. '\u0000\u0000\u0000')
470 //2)WHITESPACE_CHARACTER_TOKEN - any whitespace/new-line character sequences (e.g. '\n \r\t \f')
//3)CHARACTER_TOKEN - any character sequence which doesn't belong to groups 1 and 2 (e.g. 'abcdef1234@@#$%^')
472 _appendCharToCurrentCharacterToken(type, ch) {
473 if (this.currentCharacterToken && this.currentCharacterToken.type !== type) {
474 this._emitCurrentCharacterToken();
475 }
476
477 if (this.currentCharacterToken) {
478 this.currentCharacterToken.chars += ch;
479 } else {
480 this._createCharacterToken(type, ch);
481 }
482 }
483
484 _emitCodePoint(cp) {
485 let type = Tokenizer.CHARACTER_TOKEN;
486
487 if (isWhitespace(cp)) {
488 type = Tokenizer.WHITESPACE_CHARACTER_TOKEN;
489 } else if (cp === $.NULL) {
490 type = Tokenizer.NULL_CHARACTER_TOKEN;
491 }
492
493 this._appendCharToCurrentCharacterToken(type, toChar(cp));
494 }
495
496 _emitSeveralCodePoints(codePoints) {
497 for (let i = 0; i < codePoints.length; i++) {
498 this._emitCodePoint(codePoints[i]);
499 }
500 }
501
//NOTE: used when we emit a character explicitly. This is always a non-whitespace and a non-null character,
//so we can avoid additional checks here.
504 _emitChars(ch) {
505 this._appendCharToCurrentCharacterToken(Tokenizer.CHARACTER_TOKEN, ch);
506 }
507
508 // Character reference helpers
// Walks the named entity tree (neTree) greedily, starting from `startCp` (which the caller has
// already consumed), and returns the data of the longest matching entry: an array of one or
// two tree entries (presumably the replacement code points - TODO confirm against the tree
// generator), or null when nothing matched.
// Every code point looked at is pushed to tempBuff. On return, tempBuff and the input position
// are rewound to the end of the last match; with no match at all that includes `startCp`.
_matchNamedCharacterReference(startCp) {
    let result = null;
    // Number of trailing code points that are NOT part of the last recorded match.
    // Starts at 1 to account for startCp itself.
    let excess = 1;
    let i = findNamedEntityTreeBranch(0, startCp);

    this.tempBuff.push(startCp);

    while (i > -1) {
        const current = neTree[i];
        // Entries below MAX_BRANCH_MARKER_VALUE are node markers (flag sets); anything else
        // is a code point the next input code point must equal.
        const inNode = current < MAX_BRANCH_MARKER_VALUE;
        const nodeWithData = inNode && current & HAS_DATA_FLAG;

        if (nodeWithData) {
            //NOTE: we use greedy search, so we continue lookup at this point
            // The data entries follow the marker; `++i` moves past them so that `i` ends up
            // just before the branch count expected by findNamedEntityTreeBranch().
            result = current & DATA_DUPLET_FLAG ? [neTree[++i], neTree[++i]] : [neTree[++i]];
            // Everything consumed so far belongs to this match.
            excess = 0;
        }

        const cp = this._consume();

        this.tempBuff.push(cp);
        excess++;

        if (cp === $.EOF) {
            break;
        }

        if (inNode) {
            i = current & HAS_BRANCHES_FLAG ? findNamedEntityTreeBranch(i, cp) : -1;
        } else {
            i = cp === current ? ++i : -1;
        }
    }

    // Give back everything read past the last match, both from tempBuff and from the input.
    while (excess--) {
        this.tempBuff.pop();
        this._unconsume();
    }

    return result;
}
550
551 _isCharacterReferenceInAttribute() {
552 return (
553 this.returnState === ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE ||
554 this.returnState === ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE ||
555 this.returnState === ATTRIBUTE_VALUE_UNQUOTED_STATE
556 );
557 }
558
559 _isCharacterReferenceAttributeQuirk(withSemicolon) {
560 if (!withSemicolon && this._isCharacterReferenceInAttribute()) {
561 const nextCp = this._consume();
562
563 this._unconsume();
564
565 return nextCp === $.EQUALS_SIGN || isAsciiAlphaNumeric(nextCp);
566 }
567
568 return false;
569 }
570
571 _flushCodePointsConsumedAsCharacterReference() {
572 if (this._isCharacterReferenceInAttribute()) {
573 for (let i = 0; i < this.tempBuff.length; i++) {
574 this.currentAttr.value += toChar(this.tempBuff[i]);
575 }
576 } else {
577 this._emitSeveralCodePoints(this.tempBuff);
578 }
579
580 this.tempBuff = [];
581 }
582
583 // State machine
584
585 // Data state
586 //------------------------------------------------------------------
587 [DATA_STATE](cp) {
588 this.preprocessor.dropParsedChunk();
589
590 if (cp === $.LESS_THAN_SIGN) {
591 this.state = TAG_OPEN_STATE;
592 } else if (cp === $.AMPERSAND) {
593 this.returnState = DATA_STATE;
594 this.state = CHARACTER_REFERENCE_STATE;
595 } else if (cp === $.NULL) {
596 this._err(ERR.unexpectedNullCharacter);
597 this._emitCodePoint(cp);
598 } else if (cp === $.EOF) {
599 this._emitEOFToken();
600 } else {
601 this._emitCodePoint(cp);
602 }
603 }
604
605 // RCDATA state
606 //------------------------------------------------------------------
607 [RCDATA_STATE](cp) {
608 this.preprocessor.dropParsedChunk();
609
610 if (cp === $.AMPERSAND) {
611 this.returnState = RCDATA_STATE;
612 this.state = CHARACTER_REFERENCE_STATE;
613 } else if (cp === $.LESS_THAN_SIGN) {
614 this.state = RCDATA_LESS_THAN_SIGN_STATE;
615 } else if (cp === $.NULL) {
616 this._err(ERR.unexpectedNullCharacter);
617 this._emitChars(unicode.REPLACEMENT_CHARACTER);
618 } else if (cp === $.EOF) {
619 this._emitEOFToken();
620 } else {
621 this._emitCodePoint(cp);
622 }
623 }
624
625 // RAWTEXT state
626 //------------------------------------------------------------------
627 [RAWTEXT_STATE](cp) {
628 this.preprocessor.dropParsedChunk();
629
630 if (cp === $.LESS_THAN_SIGN) {
631 this.state = RAWTEXT_LESS_THAN_SIGN_STATE;
632 } else if (cp === $.NULL) {
633 this._err(ERR.unexpectedNullCharacter);
634 this._emitChars(unicode.REPLACEMENT_CHARACTER);
635 } else if (cp === $.EOF) {
636 this._emitEOFToken();
637 } else {
638 this._emitCodePoint(cp);
639 }
640 }
641
642 // Script data state
643 //------------------------------------------------------------------
644 [SCRIPT_DATA_STATE](cp) {
645 this.preprocessor.dropParsedChunk();
646
647 if (cp === $.LESS_THAN_SIGN) {
648 this.state = SCRIPT_DATA_LESS_THAN_SIGN_STATE;
649 } else if (cp === $.NULL) {
650 this._err(ERR.unexpectedNullCharacter);
651 this._emitChars(unicode.REPLACEMENT_CHARACTER);
652 } else if (cp === $.EOF) {
653 this._emitEOFToken();
654 } else {
655 this._emitCodePoint(cp);
656 }
657 }
658
659 // PLAINTEXT state
660 //------------------------------------------------------------------
661 [PLAINTEXT_STATE](cp) {
662 this.preprocessor.dropParsedChunk();
663
664 if (cp === $.NULL) {
665 this._err(ERR.unexpectedNullCharacter);
666 this._emitChars(unicode.REPLACEMENT_CHARACTER);
667 } else if (cp === $.EOF) {
668 this._emitEOFToken();
669 } else {
670 this._emitCodePoint(cp);
671 }
672 }
673
674 // Tag open state
675 //------------------------------------------------------------------
676 [TAG_OPEN_STATE](cp) {
677 if (cp === $.EXCLAMATION_MARK) {
678 this.state = MARKUP_DECLARATION_OPEN_STATE;
679 } else if (cp === $.SOLIDUS) {
680 this.state = END_TAG_OPEN_STATE;
681 } else if (isAsciiLetter(cp)) {
682 this._createStartTagToken();
683 this._reconsumeInState(TAG_NAME_STATE);
684 } else if (cp === $.QUESTION_MARK) {
685 this._err(ERR.unexpectedQuestionMarkInsteadOfTagName);
686 this._createCommentToken();
687 this._reconsumeInState(BOGUS_COMMENT_STATE);
688 } else if (cp === $.EOF) {
689 this._err(ERR.eofBeforeTagName);
690 this._emitChars('<');
691 this._emitEOFToken();
692 } else {
693 this._err(ERR.invalidFirstCharacterOfTagName);
694 this._emitChars('<');
695 this._reconsumeInState(DATA_STATE);
696 }
697 }
698
699 // End tag open state
700 //------------------------------------------------------------------
701 [END_TAG_OPEN_STATE](cp) {
702 if (isAsciiLetter(cp)) {
703 this._createEndTagToken();
704 this._reconsumeInState(TAG_NAME_STATE);
705 } else if (cp === $.GREATER_THAN_SIGN) {
706 this._err(ERR.missingEndTagName);
707 this.state = DATA_STATE;
708 } else if (cp === $.EOF) {
709 this._err(ERR.eofBeforeTagName);
710 this._emitChars('</');
711 this._emitEOFToken();
712 } else {
713 this._err(ERR.invalidFirstCharacterOfTagName);
714 this._createCommentToken();
715 this._reconsumeInState(BOGUS_COMMENT_STATE);
716 }
717 }
718
719 // Tag name state
720 //------------------------------------------------------------------
721 [TAG_NAME_STATE](cp) {
722 if (isWhitespace(cp)) {
723 this.state = BEFORE_ATTRIBUTE_NAME_STATE;
724 } else if (cp === $.SOLIDUS) {
725 this.state = SELF_CLOSING_START_TAG_STATE;
726 } else if (cp === $.GREATER_THAN_SIGN) {
727 this.state = DATA_STATE;
728 this._emitCurrentToken();
729 } else if (isAsciiUpper(cp)) {
730 this.currentToken.tagName += toAsciiLowerChar(cp);
731 } else if (cp === $.NULL) {
732 this._err(ERR.unexpectedNullCharacter);
733 this.currentToken.tagName += unicode.REPLACEMENT_CHARACTER;
734 } else if (cp === $.EOF) {
735 this._err(ERR.eofInTag);
736 this._emitEOFToken();
737 } else {
738 this.currentToken.tagName += toChar(cp);
739 }
740 }
741
742 // RCDATA less-than sign state
743 //------------------------------------------------------------------
744 [RCDATA_LESS_THAN_SIGN_STATE](cp) {
745 if (cp === $.SOLIDUS) {
746 this.tempBuff = [];
747 this.state = RCDATA_END_TAG_OPEN_STATE;
748 } else {
749 this._emitChars('<');
750 this._reconsumeInState(RCDATA_STATE);
751 }
752 }
753
754 // RCDATA end tag open state
755 //------------------------------------------------------------------
756 [RCDATA_END_TAG_OPEN_STATE](cp) {
757 if (isAsciiLetter(cp)) {
758 this._createEndTagToken();
759 this._reconsumeInState(RCDATA_END_TAG_NAME_STATE);
760 } else {
761 this._emitChars('</');
762 this._reconsumeInState(RCDATA_STATE);
763 }
764 }
765
766 // RCDATA end tag name state
767 //------------------------------------------------------------------
768 [RCDATA_END_TAG_NAME_STATE](cp) {
769 if (isAsciiUpper(cp)) {
770 this.currentToken.tagName += toAsciiLowerChar(cp);
771 this.tempBuff.push(cp);
772 } else if (isAsciiLower(cp)) {
773 this.currentToken.tagName += toChar(cp);
774 this.tempBuff.push(cp);
775 } else {
776 if (this.lastStartTagName === this.currentToken.tagName) {
777 if (isWhitespace(cp)) {
778 this.state = BEFORE_ATTRIBUTE_NAME_STATE;
779 return;
780 }
781
782 if (cp === $.SOLIDUS) {
783 this.state = SELF_CLOSING_START_TAG_STATE;
784 return;
785 }
786
787 if (cp === $.GREATER_THAN_SIGN) {
788 this.state = DATA_STATE;
789 this._emitCurrentToken();
790 return;
791 }
792 }
793
794 this._emitChars('</');
795 this._emitSeveralCodePoints(this.tempBuff);
796 this._reconsumeInState(RCDATA_STATE);
797 }
798 }
799
800 // RAWTEXT less-than sign state
801 //------------------------------------------------------------------
802 [RAWTEXT_LESS_THAN_SIGN_STATE](cp) {
803 if (cp === $.SOLIDUS) {
804 this.tempBuff = [];
805 this.state = RAWTEXT_END_TAG_OPEN_STATE;
806 } else {
807 this._emitChars('<');
808 this._reconsumeInState(RAWTEXT_STATE);
809 }
810 }
811
812 // RAWTEXT end tag open state
813 //------------------------------------------------------------------
814 [RAWTEXT_END_TAG_OPEN_STATE](cp) {
815 if (isAsciiLetter(cp)) {
816 this._createEndTagToken();
817 this._reconsumeInState(RAWTEXT_END_TAG_NAME_STATE);
818 } else {
819 this._emitChars('</');
820 this._reconsumeInState(RAWTEXT_STATE);
821 }
822 }
823
824 // RAWTEXT end tag name state
825 //------------------------------------------------------------------
826 [RAWTEXT_END_TAG_NAME_STATE](cp) {
827 if (isAsciiUpper(cp)) {
828 this.currentToken.tagName += toAsciiLowerChar(cp);
829 this.tempBuff.push(cp);
830 } else if (isAsciiLower(cp)) {
831 this.currentToken.tagName += toChar(cp);
832 this.tempBuff.push(cp);
833 } else {
834 if (this.lastStartTagName === this.currentToken.tagName) {
835 if (isWhitespace(cp)) {
836 this.state = BEFORE_ATTRIBUTE_NAME_STATE;
837 return;
838 }
839
840 if (cp === $.SOLIDUS) {
841 this.state = SELF_CLOSING_START_TAG_STATE;
842 return;
843 }
844
845 if (cp === $.GREATER_THAN_SIGN) {
846 this._emitCurrentToken();
847 this.state = DATA_STATE;
848 return;
849 }
850 }
851
852 this._emitChars('</');
853 this._emitSeveralCodePoints(this.tempBuff);
854 this._reconsumeInState(RAWTEXT_STATE);
855 }
856 }
857
858 // Script data less-than sign state
859 //------------------------------------------------------------------
860 [SCRIPT_DATA_LESS_THAN_SIGN_STATE](cp) {
861 if (cp === $.SOLIDUS) {
862 this.tempBuff = [];
863 this.state = SCRIPT_DATA_END_TAG_OPEN_STATE;
864 } else if (cp === $.EXCLAMATION_MARK) {
865 this.state = SCRIPT_DATA_ESCAPE_START_STATE;
866 this._emitChars('<!');
867 } else {
868 this._emitChars('<');
869 this._reconsumeInState(SCRIPT_DATA_STATE);
870 }
871 }
872
873 // Script data end tag open state
874 //------------------------------------------------------------------
875 [SCRIPT_DATA_END_TAG_OPEN_STATE](cp) {
876 if (isAsciiLetter(cp)) {
877 this._createEndTagToken();
878 this._reconsumeInState(SCRIPT_DATA_END_TAG_NAME_STATE);
879 } else {
880 this._emitChars('</');
881 this._reconsumeInState(SCRIPT_DATA_STATE);
882 }
883 }
884
885 // Script data end tag name state
886 //------------------------------------------------------------------
887 [SCRIPT_DATA_END_TAG_NAME_STATE](cp) {
888 if (isAsciiUpper(cp)) {
889 this.currentToken.tagName += toAsciiLowerChar(cp);
890 this.tempBuff.push(cp);
891 } else if (isAsciiLower(cp)) {
892 this.currentToken.tagName += toChar(cp);
893 this.tempBuff.push(cp);
894 } else {
895 if (this.lastStartTagName === this.currentToken.tagName) {
896 if (isWhitespace(cp)) {
897 this.state = BEFORE_ATTRIBUTE_NAME_STATE;
898 return;
899 } else if (cp === $.SOLIDUS) {
900 this.state = SELF_CLOSING_START_TAG_STATE;
901 return;
902 } else if (cp === $.GREATER_THAN_SIGN) {
903 this._emitCurrentToken();
904 this.state = DATA_STATE;
905 return;
906 }
907 }
908
909 this._emitChars('</');
910 this._emitSeveralCodePoints(this.tempBuff);
911 this._reconsumeInState(SCRIPT_DATA_STATE);
912 }
913 }
914
915 // Script data escape start state
916 //------------------------------------------------------------------
917 [SCRIPT_DATA_ESCAPE_START_STATE](cp) {
918 if (cp === $.HYPHEN_MINUS) {
919 this.state = SCRIPT_DATA_ESCAPE_START_DASH_STATE;
920 this._emitChars('-');
921 } else {
922 this._reconsumeInState(SCRIPT_DATA_STATE);
923 }
924 }
925
926 // Script data escape start dash state
927 //------------------------------------------------------------------
928 [SCRIPT_DATA_ESCAPE_START_DASH_STATE](cp) {
929 if (cp === $.HYPHEN_MINUS) {
930 this.state = SCRIPT_DATA_ESCAPED_DASH_DASH_STATE;
931 this._emitChars('-');
932 } else {
933 this._reconsumeInState(SCRIPT_DATA_STATE);
934 }
935 }
936
937 // Script data escaped state
938 //------------------------------------------------------------------
939 [SCRIPT_DATA_ESCAPED_STATE](cp) {
940 if (cp === $.HYPHEN_MINUS) {
941 this.state = SCRIPT_DATA_ESCAPED_DASH_STATE;
942 this._emitChars('-');
943 } else if (cp === $.LESS_THAN_SIGN) {
944 this.state = SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE;
945 } else if (cp === $.NULL) {
946 this._err(ERR.unexpectedNullCharacter);
947 this._emitChars(unicode.REPLACEMENT_CHARACTER);
948 } else if (cp === $.EOF) {
949 this._err(ERR.eofInScriptHtmlCommentLikeText);
950 this._emitEOFToken();
951 } else {
952 this._emitCodePoint(cp);
953 }
954 }
955
956 // Script data escaped dash state
957 //------------------------------------------------------------------
958 [SCRIPT_DATA_ESCAPED_DASH_STATE](cp) {
959 if (cp === $.HYPHEN_MINUS) {
960 this.state = SCRIPT_DATA_ESCAPED_DASH_DASH_STATE;
961 this._emitChars('-');
962 } else if (cp === $.LESS_THAN_SIGN) {
963 this.state = SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE;
964 } else if (cp === $.NULL) {
965 this._err(ERR.unexpectedNullCharacter);
966 this.state = SCRIPT_DATA_ESCAPED_STATE;
967 this._emitChars(unicode.REPLACEMENT_CHARACTER);
968 } else if (cp === $.EOF) {
969 this._err(ERR.eofInScriptHtmlCommentLikeText);
970 this._emitEOFToken();
971 } else {
972 this.state = SCRIPT_DATA_ESCAPED_STATE;
973 this._emitCodePoint(cp);
974 }
975 }
976
977 // Script data escaped dash dash state
978 //------------------------------------------------------------------
979 [SCRIPT_DATA_ESCAPED_DASH_DASH_STATE](cp) {
980 if (cp === $.HYPHEN_MINUS) {
981 this._emitChars('-');
982 } else if (cp === $.LESS_THAN_SIGN) {
983 this.state = SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE;
984 } else if (cp === $.GREATER_THAN_SIGN) {
985 this.state = SCRIPT_DATA_STATE;
986 this._emitChars('>');
987 } else if (cp === $.NULL) {
988 this._err(ERR.unexpectedNullCharacter);
989 this.state = SCRIPT_DATA_ESCAPED_STATE;
990 this._emitChars(unicode.REPLACEMENT_CHARACTER);
991 } else if (cp === $.EOF) {
992 this._err(ERR.eofInScriptHtmlCommentLikeText);
993 this._emitEOFToken();
994 } else {
995 this.state = SCRIPT_DATA_ESCAPED_STATE;
996 this._emitCodePoint(cp);
997 }
998 }
999
1000 // Script data escaped less-than sign state
1001 //------------------------------------------------------------------
1002 [SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE](cp) {
1003 if (cp === $.SOLIDUS) {
1004 this.tempBuff = [];
1005 this.state = SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE;
1006 } else if (isAsciiLetter(cp)) {
1007 this.tempBuff = [];
1008 this._emitChars('<');
1009 this._reconsumeInState(SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE);
1010 } else {
1011 this._emitChars('<');
1012 this._reconsumeInState(SCRIPT_DATA_ESCAPED_STATE);
1013 }
1014 }
1015
1016 // Script data escaped end tag open state
1017 //------------------------------------------------------------------
1018 [SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE](cp) {
1019 if (isAsciiLetter(cp)) {
1020 this._createEndTagToken();
1021 this._reconsumeInState(SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE);
1022 } else {
1023 this._emitChars('</');
1024 this._reconsumeInState(SCRIPT_DATA_ESCAPED_STATE);
1025 }
1026 }
1027
1028 // Script data escaped end tag name state
1029 //------------------------------------------------------------------
1030 [SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE](cp) {
1031 if (isAsciiUpper(cp)) {
1032 this.currentToken.tagName += toAsciiLowerChar(cp);
1033 this.tempBuff.push(cp);
1034 } else if (isAsciiLower(cp)) {
1035 this.currentToken.tagName += toChar(cp);
1036 this.tempBuff.push(cp);
1037 } else {
1038 if (this.lastStartTagName === this.currentToken.tagName) {
1039 if (isWhitespace(cp)) {
1040 this.state = BEFORE_ATTRIBUTE_NAME_STATE;
1041 return;
1042 }
1043
1044 if (cp === $.SOLIDUS) {
1045 this.state = SELF_CLOSING_START_TAG_STATE;
1046 return;
1047 }
1048
1049 if (cp === $.GREATER_THAN_SIGN) {
1050 this._emitCurrentToken();
1051 this.state = DATA_STATE;
1052 return;
1053 }
1054 }
1055
1056 this._emitChars('</');
1057 this._emitSeveralCodePoints(this.tempBuff);
1058 this._reconsumeInState(SCRIPT_DATA_ESCAPED_STATE);
1059 }
1060 }
1061
1062 // Script data double escape start state
1063 //------------------------------------------------------------------
1064 [SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE](cp) {
1065 if (isWhitespace(cp) || cp === $.SOLIDUS || cp === $.GREATER_THAN_SIGN) {
1066 this.state = this._isTempBufferEqualToScriptString()
1067 ? SCRIPT_DATA_DOUBLE_ESCAPED_STATE
1068 : SCRIPT_DATA_ESCAPED_STATE;
1069 this._emitCodePoint(cp);
1070 } else if (isAsciiUpper(cp)) {
1071 this.tempBuff.push(toAsciiLowerCodePoint(cp));
1072 this._emitCodePoint(cp);
1073 } else if (isAsciiLower(cp)) {
1074 this.tempBuff.push(cp);
1075 this._emitCodePoint(cp);
1076 } else {
1077 this._reconsumeInState(SCRIPT_DATA_ESCAPED_STATE);
1078 }
1079 }
1080
1081 // Script data double escaped state
1082 //------------------------------------------------------------------
1083 [SCRIPT_DATA_DOUBLE_ESCAPED_STATE](cp) {
1084 if (cp === $.HYPHEN_MINUS) {
1085 this.state = SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE;
1086 this._emitChars('-');
1087 } else if (cp === $.LESS_THAN_SIGN) {
1088 this.state = SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE;
1089 this._emitChars('<');
1090 } else if (cp === $.NULL) {
1091 this._err(ERR.unexpectedNullCharacter);
1092 this._emitChars(unicode.REPLACEMENT_CHARACTER);
1093 } else if (cp === $.EOF) {
1094 this._err(ERR.eofInScriptHtmlCommentLikeText);
1095 this._emitEOFToken();
1096 } else {
1097 this._emitCodePoint(cp);
1098 }
1099 }
1100
1101 // Script data double escaped dash state
1102 //------------------------------------------------------------------
1103 [SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE](cp) {
1104 if (cp === $.HYPHEN_MINUS) {
1105 this.state = SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE;
1106 this._emitChars('-');
1107 } else if (cp === $.LESS_THAN_SIGN) {
1108 this.state = SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE;
1109 this._emitChars('<');
1110 } else if (cp === $.NULL) {
1111 this._err(ERR.unexpectedNullCharacter);
1112 this.state = SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
1113 this._emitChars(unicode.REPLACEMENT_CHARACTER);
1114 } else if (cp === $.EOF) {
1115 this._err(ERR.eofInScriptHtmlCommentLikeText);
1116 this._emitEOFToken();
1117 } else {
1118 this.state = SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
1119 this._emitCodePoint(cp);
1120 }
1121 }
1122
1123 // Script data double escaped dash dash state
1124 //------------------------------------------------------------------
1125 [SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE](cp) {
1126 if (cp === $.HYPHEN_MINUS) {
1127 this._emitChars('-');
1128 } else if (cp === $.LESS_THAN_SIGN) {
1129 this.state = SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE;
1130 this._emitChars('<');
1131 } else if (cp === $.GREATER_THAN_SIGN) {
1132 this.state = SCRIPT_DATA_STATE;
1133 this._emitChars('>');
1134 } else if (cp === $.NULL) {
1135 this._err(ERR.unexpectedNullCharacter);
1136 this.state = SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
1137 this._emitChars(unicode.REPLACEMENT_CHARACTER);
1138 } else if (cp === $.EOF) {
1139 this._err(ERR.eofInScriptHtmlCommentLikeText);
1140 this._emitEOFToken();
1141 } else {
1142 this.state = SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
1143 this._emitCodePoint(cp);
1144 }
1145 }
1146
1147 // Script data double escaped less-than sign state
1148 //------------------------------------------------------------------
1149 [SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE](cp) {
1150 if (cp === $.SOLIDUS) {
1151 this.tempBuff = [];
1152 this.state = SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE;
1153 this._emitChars('/');
1154 } else {
1155 this._reconsumeInState(SCRIPT_DATA_DOUBLE_ESCAPED_STATE);
1156 }
1157 }
1158
1159 // Script data double escape end state
1160 //------------------------------------------------------------------
1161 [SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE](cp) {
1162 if (isWhitespace(cp) || cp === $.SOLIDUS || cp === $.GREATER_THAN_SIGN) {
1163 this.state = this._isTempBufferEqualToScriptString()
1164 ? SCRIPT_DATA_ESCAPED_STATE
1165 : SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
1166
1167 this._emitCodePoint(cp);
1168 } else if (isAsciiUpper(cp)) {
1169 this.tempBuff.push(toAsciiLowerCodePoint(cp));
1170 this._emitCodePoint(cp);
1171 } else if (isAsciiLower(cp)) {
1172 this.tempBuff.push(cp);
1173 this._emitCodePoint(cp);
1174 } else {
1175 this._reconsumeInState(SCRIPT_DATA_DOUBLE_ESCAPED_STATE);
1176 }
1177 }
1178
1179 // Before attribute name state
1180 //------------------------------------------------------------------
1181 [BEFORE_ATTRIBUTE_NAME_STATE](cp) {
1182 if (isWhitespace(cp)) {
1183 return;
1184 }
1185
1186 if (cp === $.SOLIDUS || cp === $.GREATER_THAN_SIGN || cp === $.EOF) {
1187 this._reconsumeInState(AFTER_ATTRIBUTE_NAME_STATE);
1188 } else if (cp === $.EQUALS_SIGN) {
1189 this._err(ERR.unexpectedEqualsSignBeforeAttributeName);
1190 this._createAttr('=');
1191 this.state = ATTRIBUTE_NAME_STATE;
1192 } else {
1193 this._createAttr('');
1194 this._reconsumeInState(ATTRIBUTE_NAME_STATE);
1195 }
1196 }
1197
1198 // Attribute name state
1199 //------------------------------------------------------------------
1200 [ATTRIBUTE_NAME_STATE](cp) {
1201 if (isWhitespace(cp) || cp === $.SOLIDUS || cp === $.GREATER_THAN_SIGN || cp === $.EOF) {
1202 this._leaveAttrName(AFTER_ATTRIBUTE_NAME_STATE);
1203 this._unconsume();
1204 } else if (cp === $.EQUALS_SIGN) {
1205 this._leaveAttrName(BEFORE_ATTRIBUTE_VALUE_STATE);
1206 } else if (isAsciiUpper(cp)) {
1207 this.currentAttr.name += toAsciiLowerChar(cp);
1208 } else if (cp === $.QUOTATION_MARK || cp === $.APOSTROPHE || cp === $.LESS_THAN_SIGN) {
1209 this._err(ERR.unexpectedCharacterInAttributeName);
1210 this.currentAttr.name += toChar(cp);
1211 } else if (cp === $.NULL) {
1212 this._err(ERR.unexpectedNullCharacter);
1213 this.currentAttr.name += unicode.REPLACEMENT_CHARACTER;
1214 } else {
1215 this.currentAttr.name += toChar(cp);
1216 }
1217 }
1218
1219 // After attribute name state
1220 //------------------------------------------------------------------
1221 [AFTER_ATTRIBUTE_NAME_STATE](cp) {
1222 if (isWhitespace(cp)) {
1223 return;
1224 }
1225
1226 if (cp === $.SOLIDUS) {
1227 this.state = SELF_CLOSING_START_TAG_STATE;
1228 } else if (cp === $.EQUALS_SIGN) {
1229 this.state = BEFORE_ATTRIBUTE_VALUE_STATE;
1230 } else if (cp === $.GREATER_THAN_SIGN) {
1231 this.state = DATA_STATE;
1232 this._emitCurrentToken();
1233 } else if (cp === $.EOF) {
1234 this._err(ERR.eofInTag);
1235 this._emitEOFToken();
1236 } else {
1237 this._createAttr('');
1238 this._reconsumeInState(ATTRIBUTE_NAME_STATE);
1239 }
1240 }
1241
1242 // Before attribute value state
1243 //------------------------------------------------------------------
1244 [BEFORE_ATTRIBUTE_VALUE_STATE](cp) {
1245 if (isWhitespace(cp)) {
1246 return;
1247 }
1248
1249 if (cp === $.QUOTATION_MARK) {
1250 this.state = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1251 } else if (cp === $.APOSTROPHE) {
1252 this.state = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1253 } else if (cp === $.GREATER_THAN_SIGN) {
1254 this._err(ERR.missingAttributeValue);
1255 this.state = DATA_STATE;
1256 this._emitCurrentToken();
1257 } else {
1258 this._reconsumeInState(ATTRIBUTE_VALUE_UNQUOTED_STATE);
1259 }
1260 }
1261
1262 // Attribute value (double-quoted) state
1263 //------------------------------------------------------------------
1264 [ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE](cp) {
1265 if (cp === $.QUOTATION_MARK) {
1266 this.state = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1267 } else if (cp === $.AMPERSAND) {
1268 this.returnState = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1269 this.state = CHARACTER_REFERENCE_STATE;
1270 } else if (cp === $.NULL) {
1271 this._err(ERR.unexpectedNullCharacter);
1272 this.currentAttr.value += unicode.REPLACEMENT_CHARACTER;
1273 } else if (cp === $.EOF) {
1274 this._err(ERR.eofInTag);
1275 this._emitEOFToken();
1276 } else {
1277 this.currentAttr.value += toChar(cp);
1278 }
1279 }
1280
1281 // Attribute value (single-quoted) state
1282 //------------------------------------------------------------------
1283 [ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE](cp) {
1284 if (cp === $.APOSTROPHE) {
1285 this.state = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1286 } else if (cp === $.AMPERSAND) {
1287 this.returnState = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1288 this.state = CHARACTER_REFERENCE_STATE;
1289 } else if (cp === $.NULL) {
1290 this._err(ERR.unexpectedNullCharacter);
1291 this.currentAttr.value += unicode.REPLACEMENT_CHARACTER;
1292 } else if (cp === $.EOF) {
1293 this._err(ERR.eofInTag);
1294 this._emitEOFToken();
1295 } else {
1296 this.currentAttr.value += toChar(cp);
1297 }
1298 }
1299
1300 // Attribute value (unquoted) state
1301 //------------------------------------------------------------------
1302 [ATTRIBUTE_VALUE_UNQUOTED_STATE](cp) {
1303 if (isWhitespace(cp)) {
1304 this._leaveAttrValue(BEFORE_ATTRIBUTE_NAME_STATE);
1305 } else if (cp === $.AMPERSAND) {
1306 this.returnState = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1307 this.state = CHARACTER_REFERENCE_STATE;
1308 } else if (cp === $.GREATER_THAN_SIGN) {
1309 this._leaveAttrValue(DATA_STATE);
1310 this._emitCurrentToken();
1311 } else if (cp === $.NULL) {
1312 this._err(ERR.unexpectedNullCharacter);
1313 this.currentAttr.value += unicode.REPLACEMENT_CHARACTER;
1314 } else if (
1315 cp === $.QUOTATION_MARK ||
1316 cp === $.APOSTROPHE ||
1317 cp === $.LESS_THAN_SIGN ||
1318 cp === $.EQUALS_SIGN ||
1319 cp === $.GRAVE_ACCENT
1320 ) {
1321 this._err(ERR.unexpectedCharacterInUnquotedAttributeValue);
1322 this.currentAttr.value += toChar(cp);
1323 } else if (cp === $.EOF) {
1324 this._err(ERR.eofInTag);
1325 this._emitEOFToken();
1326 } else {
1327 this.currentAttr.value += toChar(cp);
1328 }
1329 }
1330
1331 // After attribute value (quoted) state
1332 //------------------------------------------------------------------
1333 [AFTER_ATTRIBUTE_VALUE_QUOTED_STATE](cp) {
1334 if (isWhitespace(cp)) {
1335 this._leaveAttrValue(BEFORE_ATTRIBUTE_NAME_STATE);
1336 } else if (cp === $.SOLIDUS) {
1337 this._leaveAttrValue(SELF_CLOSING_START_TAG_STATE);
1338 } else if (cp === $.GREATER_THAN_SIGN) {
1339 this._leaveAttrValue(DATA_STATE);
1340 this._emitCurrentToken();
1341 } else if (cp === $.EOF) {
1342 this._err(ERR.eofInTag);
1343 this._emitEOFToken();
1344 } else {
1345 this._err(ERR.missingWhitespaceBetweenAttributes);
1346 this._reconsumeInState(BEFORE_ATTRIBUTE_NAME_STATE);
1347 }
1348 }
1349
1350 // Self-closing start tag state
1351 //------------------------------------------------------------------
1352 [SELF_CLOSING_START_TAG_STATE](cp) {
1353 if (cp === $.GREATER_THAN_SIGN) {
1354 this.currentToken.selfClosing = true;
1355 this.state = DATA_STATE;
1356 this._emitCurrentToken();
1357 } else if (cp === $.EOF) {
1358 this._err(ERR.eofInTag);
1359 this._emitEOFToken();
1360 } else {
1361 this._err(ERR.unexpectedSolidusInTag);
1362 this._reconsumeInState(BEFORE_ATTRIBUTE_NAME_STATE);
1363 }
1364 }
1365
1366 // Bogus comment state
1367 //------------------------------------------------------------------
1368 [BOGUS_COMMENT_STATE](cp) {
1369 if (cp === $.GREATER_THAN_SIGN) {
1370 this.state = DATA_STATE;
1371 this._emitCurrentToken();
1372 } else if (cp === $.EOF) {
1373 this._emitCurrentToken();
1374 this._emitEOFToken();
1375 } else if (cp === $.NULL) {
1376 this._err(ERR.unexpectedNullCharacter);
1377 this.currentToken.data += unicode.REPLACEMENT_CHARACTER;
1378 } else {
1379 this.currentToken.data += toChar(cp);
1380 }
1381 }
1382
1383 // Markup declaration open state
1384 //------------------------------------------------------------------
1385 [MARKUP_DECLARATION_OPEN_STATE](cp) {
1386 if (this._consumeSequenceIfMatch($$.DASH_DASH_STRING, cp, true)) {
1387 this._createCommentToken();
1388 this.state = COMMENT_START_STATE;
1389 } else if (this._consumeSequenceIfMatch($$.DOCTYPE_STRING, cp, false)) {
1390 this.state = DOCTYPE_STATE;
1391 } else if (this._consumeSequenceIfMatch($$.CDATA_START_STRING, cp, true)) {
1392 if (this.allowCDATA) {
1393 this.state = CDATA_SECTION_STATE;
1394 } else {
1395 this._err(ERR.cdataInHtmlContent);
1396 this._createCommentToken();
1397 this.currentToken.data = '[CDATA[';
1398 this.state = BOGUS_COMMENT_STATE;
1399 }
1400 }
1401
1402 //NOTE: sequence lookup can be abrupted by hibernation. In that case lookup
1403 //results are no longer valid and we will need to start over.
1404 else if (!this._ensureHibernation()) {
1405 this._err(ERR.incorrectlyOpenedComment);
1406 this._createCommentToken();
1407 this._reconsumeInState(BOGUS_COMMENT_STATE);
1408 }
1409 }
1410
1411 // Comment start state
1412 //------------------------------------------------------------------
1413 [COMMENT_START_STATE](cp) {
1414 if (cp === $.HYPHEN_MINUS) {
1415 this.state = COMMENT_START_DASH_STATE;
1416 } else if (cp === $.GREATER_THAN_SIGN) {
1417 this._err(ERR.abruptClosingOfEmptyComment);
1418 this.state = DATA_STATE;
1419 this._emitCurrentToken();
1420 } else {
1421 this._reconsumeInState(COMMENT_STATE);
1422 }
1423 }
1424
1425 // Comment start dash state
1426 //------------------------------------------------------------------
1427 [COMMENT_START_DASH_STATE](cp) {
1428 if (cp === $.HYPHEN_MINUS) {
1429 this.state = COMMENT_END_STATE;
1430 } else if (cp === $.GREATER_THAN_SIGN) {
1431 this._err(ERR.abruptClosingOfEmptyComment);
1432 this.state = DATA_STATE;
1433 this._emitCurrentToken();
1434 } else if (cp === $.EOF) {
1435 this._err(ERR.eofInComment);
1436 this._emitCurrentToken();
1437 this._emitEOFToken();
1438 } else {
1439 this.currentToken.data += '-';
1440 this._reconsumeInState(COMMENT_STATE);
1441 }
1442 }
1443
1444 // Comment state
1445 //------------------------------------------------------------------
1446 [COMMENT_STATE](cp) {
1447 if (cp === $.HYPHEN_MINUS) {
1448 this.state = COMMENT_END_DASH_STATE;
1449 } else if (cp === $.LESS_THAN_SIGN) {
1450 this.currentToken.data += '<';
1451 this.state = COMMENT_LESS_THAN_SIGN_STATE;
1452 } else if (cp === $.NULL) {
1453 this._err(ERR.unexpectedNullCharacter);
1454 this.currentToken.data += unicode.REPLACEMENT_CHARACTER;
1455 } else if (cp === $.EOF) {
1456 this._err(ERR.eofInComment);
1457 this._emitCurrentToken();
1458 this._emitEOFToken();
1459 } else {
1460 this.currentToken.data += toChar(cp);
1461 }
1462 }
1463
1464 // Comment less-than sign state
1465 //------------------------------------------------------------------
1466 [COMMENT_LESS_THAN_SIGN_STATE](cp) {
1467 if (cp === $.EXCLAMATION_MARK) {
1468 this.currentToken.data += '!';
1469 this.state = COMMENT_LESS_THAN_SIGN_BANG_STATE;
1470 } else if (cp === $.LESS_THAN_SIGN) {
1471 this.currentToken.data += '!';
1472 } else {
1473 this._reconsumeInState(COMMENT_STATE);
1474 }
1475 }
1476
1477 // Comment less-than sign bang state
1478 //------------------------------------------------------------------
1479 [COMMENT_LESS_THAN_SIGN_BANG_STATE](cp) {
1480 if (cp === $.HYPHEN_MINUS) {
1481 this.state = COMMENT_LESS_THAN_SIGN_BANG_DASH_STATE;
1482 } else {
1483 this._reconsumeInState(COMMENT_STATE);
1484 }
1485 }
1486
1487 // Comment less-than sign bang dash state
1488 //------------------------------------------------------------------
1489 [COMMENT_LESS_THAN_SIGN_BANG_DASH_STATE](cp) {
1490 if (cp === $.HYPHEN_MINUS) {
1491 this.state = COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH_STATE;
1492 } else {
1493 this._reconsumeInState(COMMENT_END_DASH_STATE);
1494 }
1495 }
1496
1497 // Comment less-than sign bang dash dash state
1498 //------------------------------------------------------------------
1499 [COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH_STATE](cp) {
1500 if (cp !== $.GREATER_THAN_SIGN && cp !== $.EOF) {
1501 this._err(ERR.nestedComment);
1502 }
1503
1504 this._reconsumeInState(COMMENT_END_STATE);
1505 }
1506
1507 // Comment end dash state
1508 //------------------------------------------------------------------
1509 [COMMENT_END_DASH_STATE](cp) {
1510 if (cp === $.HYPHEN_MINUS) {
1511 this.state = COMMENT_END_STATE;
1512 } else if (cp === $.EOF) {
1513 this._err(ERR.eofInComment);
1514 this._emitCurrentToken();
1515 this._emitEOFToken();
1516 } else {
1517 this.currentToken.data += '-';
1518 this._reconsumeInState(COMMENT_STATE);
1519 }
1520 }
1521
1522 // Comment end state
1523 //------------------------------------------------------------------
1524 [COMMENT_END_STATE](cp) {
1525 if (cp === $.GREATER_THAN_SIGN) {
1526 this.state = DATA_STATE;
1527 this._emitCurrentToken();
1528 } else if (cp === $.EXCLAMATION_MARK) {
1529 this.state = COMMENT_END_BANG_STATE;
1530 } else if (cp === $.HYPHEN_MINUS) {
1531 this.currentToken.data += '-';
1532 } else if (cp === $.EOF) {
1533 this._err(ERR.eofInComment);
1534 this._emitCurrentToken();
1535 this._emitEOFToken();
1536 } else {
1537 this.currentToken.data += '--';
1538 this._reconsumeInState(COMMENT_STATE);
1539 }
1540 }
1541
1542 // Comment end bang state
1543 //------------------------------------------------------------------
1544 [COMMENT_END_BANG_STATE](cp) {
1545 if (cp === $.HYPHEN_MINUS) {
1546 this.currentToken.data += '--!';
1547 this.state = COMMENT_END_DASH_STATE;
1548 } else if (cp === $.GREATER_THAN_SIGN) {
1549 this._err(ERR.incorrectlyClosedComment);
1550 this.state = DATA_STATE;
1551 this._emitCurrentToken();
1552 } else if (cp === $.EOF) {
1553 this._err(ERR.eofInComment);
1554 this._emitCurrentToken();
1555 this._emitEOFToken();
1556 } else {
1557 this.currentToken.data += '--!';
1558 this._reconsumeInState(COMMENT_STATE);
1559 }
1560 }
1561
1562 // DOCTYPE state
1563 //------------------------------------------------------------------
1564 [DOCTYPE_STATE](cp) {
1565 if (isWhitespace(cp)) {
1566 this.state = BEFORE_DOCTYPE_NAME_STATE;
1567 } else if (cp === $.GREATER_THAN_SIGN) {
1568 this._reconsumeInState(BEFORE_DOCTYPE_NAME_STATE);
1569 } else if (cp === $.EOF) {
1570 this._err(ERR.eofInDoctype);
1571 this._createDoctypeToken(null);
1572 this.currentToken.forceQuirks = true;
1573 this._emitCurrentToken();
1574 this._emitEOFToken();
1575 } else {
1576 this._err(ERR.missingWhitespaceBeforeDoctypeName);
1577 this._reconsumeInState(BEFORE_DOCTYPE_NAME_STATE);
1578 }
1579 }
1580
1581 // Before DOCTYPE name state
1582 //------------------------------------------------------------------
1583 [BEFORE_DOCTYPE_NAME_STATE](cp) {
1584 if (isWhitespace(cp)) {
1585 return;
1586 }
1587
1588 if (isAsciiUpper(cp)) {
1589 this._createDoctypeToken(toAsciiLowerChar(cp));
1590 this.state = DOCTYPE_NAME_STATE;
1591 } else if (cp === $.NULL) {
1592 this._err(ERR.unexpectedNullCharacter);
1593 this._createDoctypeToken(unicode.REPLACEMENT_CHARACTER);
1594 this.state = DOCTYPE_NAME_STATE;
1595 } else if (cp === $.GREATER_THAN_SIGN) {
1596 this._err(ERR.missingDoctypeName);
1597 this._createDoctypeToken(null);
1598 this.currentToken.forceQuirks = true;
1599 this._emitCurrentToken();
1600 this.state = DATA_STATE;
1601 } else if (cp === $.EOF) {
1602 this._err(ERR.eofInDoctype);
1603 this._createDoctypeToken(null);
1604 this.currentToken.forceQuirks = true;
1605 this._emitCurrentToken();
1606 this._emitEOFToken();
1607 } else {
1608 this._createDoctypeToken(toChar(cp));
1609 this.state = DOCTYPE_NAME_STATE;
1610 }
1611 }
1612
1613 // DOCTYPE name state
1614 //------------------------------------------------------------------
1615 [DOCTYPE_NAME_STATE](cp) {
1616 if (isWhitespace(cp)) {
1617 this.state = AFTER_DOCTYPE_NAME_STATE;
1618 } else if (cp === $.GREATER_THAN_SIGN) {
1619 this.state = DATA_STATE;
1620 this._emitCurrentToken();
1621 } else if (isAsciiUpper(cp)) {
1622 this.currentToken.name += toAsciiLowerChar(cp);
1623 } else if (cp === $.NULL) {
1624 this._err(ERR.unexpectedNullCharacter);
1625 this.currentToken.name += unicode.REPLACEMENT_CHARACTER;
1626 } else if (cp === $.EOF) {
1627 this._err(ERR.eofInDoctype);
1628 this.currentToken.forceQuirks = true;
1629 this._emitCurrentToken();
1630 this._emitEOFToken();
1631 } else {
1632 this.currentToken.name += toChar(cp);
1633 }
1634 }
1635
1636 // After DOCTYPE name state
1637 //------------------------------------------------------------------
1638 [AFTER_DOCTYPE_NAME_STATE](cp) {
1639 if (isWhitespace(cp)) {
1640 return;
1641 }
1642
1643 if (cp === $.GREATER_THAN_SIGN) {
1644 this.state = DATA_STATE;
1645 this._emitCurrentToken();
1646 } else if (cp === $.EOF) {
1647 this._err(ERR.eofInDoctype);
1648 this.currentToken.forceQuirks = true;
1649 this._emitCurrentToken();
1650 this._emitEOFToken();
1651 } else if (this._consumeSequenceIfMatch($$.PUBLIC_STRING, cp, false)) {
1652 this.state = AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE;
1653 } else if (this._consumeSequenceIfMatch($$.SYSTEM_STRING, cp, false)) {
1654 this.state = AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE;
1655 }
1656 //NOTE: sequence lookup can be abrupted by hibernation. In that case lookup
1657 //results are no longer valid and we will need to start over.
1658 else if (!this._ensureHibernation()) {
1659 this._err(ERR.invalidCharacterSequenceAfterDoctypeName);
1660 this.currentToken.forceQuirks = true;
1661 this._reconsumeInState(BOGUS_DOCTYPE_STATE);
1662 }
1663 }
1664
1665 // After DOCTYPE public keyword state
1666 //------------------------------------------------------------------
1667 [AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE](cp) {
1668 if (isWhitespace(cp)) {
1669 this.state = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1670 } else if (cp === $.QUOTATION_MARK) {
1671 this._err(ERR.missingWhitespaceAfterDoctypePublicKeyword);
1672 this.currentToken.publicId = '';
1673 this.state = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
1674 } else if (cp === $.APOSTROPHE) {
1675 this._err(ERR.missingWhitespaceAfterDoctypePublicKeyword);
1676 this.currentToken.publicId = '';
1677 this.state = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
1678 } else if (cp === $.GREATER_THAN_SIGN) {
1679 this._err(ERR.missingDoctypePublicIdentifier);
1680 this.currentToken.forceQuirks = true;
1681 this.state = DATA_STATE;
1682 this._emitCurrentToken();
1683 } else if (cp === $.EOF) {
1684 this._err(ERR.eofInDoctype);
1685 this.currentToken.forceQuirks = true;
1686 this._emitCurrentToken();
1687 this._emitEOFToken();
1688 } else {
1689 this._err(ERR.missingQuoteBeforeDoctypePublicIdentifier);
1690 this.currentToken.forceQuirks = true;
1691 this._reconsumeInState(BOGUS_DOCTYPE_STATE);
1692 }
1693 }
1694
1695 // Before DOCTYPE public identifier state
1696 //------------------------------------------------------------------
1697 [BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE](cp) {
1698 if (isWhitespace(cp)) {
1699 return;
1700 }
1701
1702 if (cp === $.QUOTATION_MARK) {
1703 this.currentToken.publicId = '';
1704 this.state = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
1705 } else if (cp === $.APOSTROPHE) {
1706 this.currentToken.publicId = '';
1707 this.state = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
1708 } else if (cp === $.GREATER_THAN_SIGN) {
1709 this._err(ERR.missingDoctypePublicIdentifier);
1710 this.currentToken.forceQuirks = true;
1711 this.state = DATA_STATE;
1712 this._emitCurrentToken();
1713 } else if (cp === $.EOF) {
1714 this._err(ERR.eofInDoctype);
1715 this.currentToken.forceQuirks = true;
1716 this._emitCurrentToken();
1717 this._emitEOFToken();
1718 } else {
1719 this._err(ERR.missingQuoteBeforeDoctypePublicIdentifier);
1720 this.currentToken.forceQuirks = true;
1721 this._reconsumeInState(BOGUS_DOCTYPE_STATE);
1722 }
1723 }
1724
1725 // DOCTYPE public identifier (double-quoted) state
1726 //------------------------------------------------------------------
1727 [DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE](cp) {
1728 if (cp === $.QUOTATION_MARK) {
1729 this.state = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1730 } else if (cp === $.NULL) {
1731 this._err(ERR.unexpectedNullCharacter);
1732 this.currentToken.publicId += unicode.REPLACEMENT_CHARACTER;
1733 } else if (cp === $.GREATER_THAN_SIGN) {
1734 this._err(ERR.abruptDoctypePublicIdentifier);
1735 this.currentToken.forceQuirks = true;
1736 this._emitCurrentToken();
1737 this.state = DATA_STATE;
1738 } else if (cp === $.EOF) {
1739 this._err(ERR.eofInDoctype);
1740 this.currentToken.forceQuirks = true;
1741 this._emitCurrentToken();
1742 this._emitEOFToken();
1743 } else {
1744 this.currentToken.publicId += toChar(cp);
1745 }
1746 }
1747
1748 // DOCTYPE public identifier (single-quoted) state
1749 //------------------------------------------------------------------
1750 [DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE](cp) {
1751 if (cp === $.APOSTROPHE) {
1752 this.state = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1753 } else if (cp === $.NULL) {
1754 this._err(ERR.unexpectedNullCharacter);
1755 this.currentToken.publicId += unicode.REPLACEMENT_CHARACTER;
1756 } else if (cp === $.GREATER_THAN_SIGN) {
1757 this._err(ERR.abruptDoctypePublicIdentifier);
1758 this.currentToken.forceQuirks = true;
1759 this._emitCurrentToken();
1760 this.state = DATA_STATE;
1761 } else if (cp === $.EOF) {
1762 this._err(ERR.eofInDoctype);
1763 this.currentToken.forceQuirks = true;
1764 this._emitCurrentToken();
1765 this._emitEOFToken();
1766 } else {
1767 this.currentToken.publicId += toChar(cp);
1768 }
1769 }
1770
1771 // After DOCTYPE public identifier state
1772 //------------------------------------------------------------------
1773 [AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE](cp) {
1774 if (isWhitespace(cp)) {
1775 this.state = BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE;
1776 } else if (cp === $.GREATER_THAN_SIGN) {
1777 this.state = DATA_STATE;
1778 this._emitCurrentToken();
1779 } else if (cp === $.QUOTATION_MARK) {
1780 this._err(ERR.missingWhitespaceBetweenDoctypePublicAndSystemIdentifiers);
1781 this.currentToken.systemId = '';
1782 this.state = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
1783 } else if (cp === $.APOSTROPHE) {
1784 this._err(ERR.missingWhitespaceBetweenDoctypePublicAndSystemIdentifiers);
1785 this.currentToken.systemId = '';
1786 this.state = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
1787 } else if (cp === $.EOF) {
1788 this._err(ERR.eofInDoctype);
1789 this.currentToken.forceQuirks = true;
1790 this._emitCurrentToken();
1791 this._emitEOFToken();
1792 } else {
1793 this._err(ERR.missingQuoteBeforeDoctypeSystemIdentifier);
1794 this.currentToken.forceQuirks = true;
1795 this._reconsumeInState(BOGUS_DOCTYPE_STATE);
1796 }
1797 }
1798
1799 // Between DOCTYPE public and system identifiers state
1800 //------------------------------------------------------------------
1801 [BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE](cp) {
1802 if (isWhitespace(cp)) {
1803 return;
1804 }
1805
1806 if (cp === $.GREATER_THAN_SIGN) {
1807 this._emitCurrentToken();
1808 this.state = DATA_STATE;
1809 } else if (cp === $.QUOTATION_MARK) {
1810 this.currentToken.systemId = '';
1811 this.state = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
1812 } else if (cp === $.APOSTROPHE) {
1813 this.currentToken.systemId = '';
1814 this.state = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
1815 } else if (cp === $.EOF) {
1816 this._err(ERR.eofInDoctype);
1817 this.currentToken.forceQuirks = true;
1818 this._emitCurrentToken();
1819 this._emitEOFToken();
1820 } else {
1821 this._err(ERR.missingQuoteBeforeDoctypeSystemIdentifier);
1822 this.currentToken.forceQuirks = true;
1823 this._reconsumeInState(BOGUS_DOCTYPE_STATE);
1824 }
1825 }
1826
1827 // After DOCTYPE system keyword state
1828 //------------------------------------------------------------------
1829 [AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE](cp) {
1830 if (isWhitespace(cp)) {
1831 this.state = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1832 } else if (cp === $.QUOTATION_MARK) {
1833 this._err(ERR.missingWhitespaceAfterDoctypeSystemKeyword);
1834 this.currentToken.systemId = '';
1835 this.state = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
1836 } else if (cp === $.APOSTROPHE) {
1837 this._err(ERR.missingWhitespaceAfterDoctypeSystemKeyword);
1838 this.currentToken.systemId = '';
1839 this.state = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
1840 } else if (cp === $.GREATER_THAN_SIGN) {
1841 this._err(ERR.missingDoctypeSystemIdentifier);
1842 this.currentToken.forceQuirks = true;
1843 this.state = DATA_STATE;
1844 this._emitCurrentToken();
1845 } else if (cp === $.EOF) {
1846 this._err(ERR.eofInDoctype);
1847 this.currentToken.forceQuirks = true;
1848 this._emitCurrentToken();
1849 this._emitEOFToken();
1850 } else {
1851 this._err(ERR.missingQuoteBeforeDoctypeSystemIdentifier);
1852 this.currentToken.forceQuirks = true;
1853 this._reconsumeInState(BOGUS_DOCTYPE_STATE);
1854 }
1855 }
1856
1857 // Before DOCTYPE system identifier state
1858 //------------------------------------------------------------------
1859 [BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE](cp) {
1860 if (isWhitespace(cp)) {
1861 return;
1862 }
1863
1864 if (cp === $.QUOTATION_MARK) {
1865 this.currentToken.systemId = '';
1866 this.state = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
1867 } else if (cp === $.APOSTROPHE) {
1868 this.currentToken.systemId = '';
1869 this.state = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
1870 } else if (cp === $.GREATER_THAN_SIGN) {
1871 this._err(ERR.missingDoctypeSystemIdentifier);
1872 this.currentToken.forceQuirks = true;
1873 this.state = DATA_STATE;
1874 this._emitCurrentToken();
1875 } else if (cp === $.EOF) {
1876 this._err(ERR.eofInDoctype);
1877 this.currentToken.forceQuirks = true;
1878 this._emitCurrentToken();
1879 this._emitEOFToken();
1880 } else {
1881 this._err(ERR.missingQuoteBeforeDoctypeSystemIdentifier);
1882 this.currentToken.forceQuirks = true;
1883 this._reconsumeInState(BOGUS_DOCTYPE_STATE);
1884 }
1885 }
1886
1887 // DOCTYPE system identifier (double-quoted) state
1888 //------------------------------------------------------------------
1889 [DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE](cp) {
1890 if (cp === $.QUOTATION_MARK) {
1891 this.state = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1892 } else if (cp === $.NULL) {
1893 this._err(ERR.unexpectedNullCharacter);
1894 this.currentToken.systemId += unicode.REPLACEMENT_CHARACTER;
1895 } else if (cp === $.GREATER_THAN_SIGN) {
1896 this._err(ERR.abruptDoctypeSystemIdentifier);
1897 this.currentToken.forceQuirks = true;
1898 this._emitCurrentToken();
1899 this.state = DATA_STATE;
1900 } else if (cp === $.EOF) {
1901 this._err(ERR.eofInDoctype);
1902 this.currentToken.forceQuirks = true;
1903 this._emitCurrentToken();
1904 this._emitEOFToken();
1905 } else {
1906 this.currentToken.systemId += toChar(cp);
1907 }
1908 }
1909
1910 // DOCTYPE system identifier (single-quoted) state
1911 //------------------------------------------------------------------
1912 [DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE](cp) {
1913 if (cp === $.APOSTROPHE) {
1914 this.state = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1915 } else if (cp === $.NULL) {
1916 this._err(ERR.unexpectedNullCharacter);
1917 this.currentToken.systemId += unicode.REPLACEMENT_CHARACTER;
1918 } else if (cp === $.GREATER_THAN_SIGN) {
1919 this._err(ERR.abruptDoctypeSystemIdentifier);
1920 this.currentToken.forceQuirks = true;
1921 this._emitCurrentToken();
1922 this.state = DATA_STATE;
1923 } else if (cp === $.EOF) {
1924 this._err(ERR.eofInDoctype);
1925 this.currentToken.forceQuirks = true;
1926 this._emitCurrentToken();
1927 this._emitEOFToken();
1928 } else {
1929 this.currentToken.systemId += toChar(cp);
1930 }
1931 }
1932
1933 // After DOCTYPE system identifier state
1934 //------------------------------------------------------------------
1935 [AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE](cp) {
1936 if (isWhitespace(cp)) {
1937 return;
1938 }
1939
1940 if (cp === $.GREATER_THAN_SIGN) {
1941 this._emitCurrentToken();
1942 this.state = DATA_STATE;
1943 } else if (cp === $.EOF) {
1944 this._err(ERR.eofInDoctype);
1945 this.currentToken.forceQuirks = true;
1946 this._emitCurrentToken();
1947 this._emitEOFToken();
1948 } else {
1949 this._err(ERR.unexpectedCharacterAfterDoctypeSystemIdentifier);
1950 this._reconsumeInState(BOGUS_DOCTYPE_STATE);
1951 }
1952 }
1953
1954 // Bogus DOCTYPE state
1955 //------------------------------------------------------------------
1956 [BOGUS_DOCTYPE_STATE](cp) {
1957 if (cp === $.GREATER_THAN_SIGN) {
1958 this._emitCurrentToken();
1959 this.state = DATA_STATE;
1960 } else if (cp === $.NULL) {
1961 this._err(ERR.unexpectedNullCharacter);
1962 } else if (cp === $.EOF) {
1963 this._emitCurrentToken();
1964 this._emitEOFToken();
1965 }
1966 }
1967
1968 // CDATA section state
1969 //------------------------------------------------------------------
1970 [CDATA_SECTION_STATE](cp) {
1971 if (cp === $.RIGHT_SQUARE_BRACKET) {
1972 this.state = CDATA_SECTION_BRACKET_STATE;
1973 } else if (cp === $.EOF) {
1974 this._err(ERR.eofInCdata);
1975 this._emitEOFToken();
1976 } else {
1977 this._emitCodePoint(cp);
1978 }
1979 }
1980
1981 // CDATA section bracket state
1982 //------------------------------------------------------------------
1983 [CDATA_SECTION_BRACKET_STATE](cp) {
1984 if (cp === $.RIGHT_SQUARE_BRACKET) {
1985 this.state = CDATA_SECTION_END_STATE;
1986 } else {
1987 this._emitChars(']');
1988 this._reconsumeInState(CDATA_SECTION_STATE);
1989 }
1990 }
1991
1992 // CDATA section end state
1993 //------------------------------------------------------------------
1994 [CDATA_SECTION_END_STATE](cp) {
1995 if (cp === $.GREATER_THAN_SIGN) {
1996 this.state = DATA_STATE;
1997 } else if (cp === $.RIGHT_SQUARE_BRACKET) {
1998 this._emitChars(']');
1999 } else {
2000 this._emitChars(']]');
2001 this._reconsumeInState(CDATA_SECTION_STATE);
2002 }
2003 }
2004
2005 // Character reference state
2006 //------------------------------------------------------------------
2007 [CHARACTER_REFERENCE_STATE](cp) {
2008 this.tempBuff = [$.AMPERSAND];
2009
2010 if (cp === $.NUMBER_SIGN) {
2011 this.tempBuff.push(cp);
2012 this.state = NUMERIC_CHARACTER_REFERENCE_STATE;
2013 } else if (isAsciiAlphaNumeric(cp)) {
2014 this._reconsumeInState(NAMED_CHARACTER_REFERENCE_STATE);
2015 } else {
2016 this._flushCodePointsConsumedAsCharacterReference();
2017 this._reconsumeInState(this.returnState);
2018 }
2019 }
2020
2021 // Named character reference state
2022 //------------------------------------------------------------------
2023 [NAMED_CHARACTER_REFERENCE_STATE](cp) {
2024 const matchResult = this._matchNamedCharacterReference(cp);
2025
2026 //NOTE: matching can be abrupted by hibernation. In that case match
2027 //results are no longer valid and we will need to start over.
2028 if (this._ensureHibernation()) {
2029 this.tempBuff = [$.AMPERSAND];
2030 } else if (matchResult) {
2031 const withSemicolon = this.tempBuff[this.tempBuff.length - 1] === $.SEMICOLON;
2032
2033 if (!this._isCharacterReferenceAttributeQuirk(withSemicolon)) {
2034 if (!withSemicolon) {
2035 this._errOnNextCodePoint(ERR.missingSemicolonAfterCharacterReference);
2036 }
2037
2038 this.tempBuff = matchResult;
2039 }
2040
2041 this._flushCodePointsConsumedAsCharacterReference();
2042 this.state = this.returnState;
2043 } else {
2044 this._flushCodePointsConsumedAsCharacterReference();
2045 this.state = AMBIGUOUS_AMPERSAND_STATE;
2046 }
2047 }
2048
2049 // Ambiguous ampersand state
2050 //------------------------------------------------------------------
2051 [AMBIGUOUS_AMPERSAND_STATE](cp) {
2052 if (isAsciiAlphaNumeric(cp)) {
2053 if (this._isCharacterReferenceInAttribute()) {
2054 this.currentAttr.value += toChar(cp);
2055 } else {
2056 this._emitCodePoint(cp);
2057 }
2058 } else {
2059 if (cp === $.SEMICOLON) {
2060 this._err(ERR.unknownNamedCharacterReference);
2061 }
2062
2063 this._reconsumeInState(this.returnState);
2064 }
2065 }
2066
2067 // Numeric character reference state
2068 //------------------------------------------------------------------
2069 [NUMERIC_CHARACTER_REFERENCE_STATE](cp) {
2070 this.charRefCode = 0;
2071
2072 if (cp === $.LATIN_SMALL_X || cp === $.LATIN_CAPITAL_X) {
2073 this.tempBuff.push(cp);
2074 this.state = HEXADEMICAL_CHARACTER_REFERENCE_START_STATE;
2075 } else {
2076 this._reconsumeInState(DECIMAL_CHARACTER_REFERENCE_START_STATE);
2077 }
2078 }
2079
2080 // Hexadecimal character reference start state
2081 //------------------------------------------------------------------
2082 [HEXADEMICAL_CHARACTER_REFERENCE_START_STATE](cp) {
2083 if (isAsciiHexDigit(cp)) {
2084 this._reconsumeInState(HEXADEMICAL_CHARACTER_REFERENCE_STATE);
2085 } else {
2086 this._err(ERR.absenceOfDigitsInNumericCharacterReference);
2087 this._flushCodePointsConsumedAsCharacterReference();
2088 this._reconsumeInState(this.returnState);
2089 }
2090 }
2091
2092 // Decimal character reference start state
2093 //------------------------------------------------------------------
2094 [DECIMAL_CHARACTER_REFERENCE_START_STATE](cp) {
2095 if (isAsciiDigit(cp)) {
2096 this._reconsumeInState(DECIMAL_CHARACTER_REFERENCE_STATE);
2097 } else {
2098 this._err(ERR.absenceOfDigitsInNumericCharacterReference);
2099 this._flushCodePointsConsumedAsCharacterReference();
2100 this._reconsumeInState(this.returnState);
2101 }
2102 }
2103
2104 // Hexadecimal character reference state
2105 //------------------------------------------------------------------
2106 [HEXADEMICAL_CHARACTER_REFERENCE_STATE](cp) {
2107 if (isAsciiUpperHexDigit(cp)) {
2108 this.charRefCode = this.charRefCode * 16 + cp - 0x37;
2109 } else if (isAsciiLowerHexDigit(cp)) {
2110 this.charRefCode = this.charRefCode * 16 + cp - 0x57;
2111 } else if (isAsciiDigit(cp)) {
2112 this.charRefCode = this.charRefCode * 16 + cp - 0x30;
2113 } else if (cp === $.SEMICOLON) {
2114 this.state = NUMERIC_CHARACTER_REFERENCE_END_STATE;
2115 } else {
2116 this._err(ERR.missingSemicolonAfterCharacterReference);
2117 this._reconsumeInState(NUMERIC_CHARACTER_REFERENCE_END_STATE);
2118 }
2119 }
2120
2121 // Decimal character reference state
2122 //------------------------------------------------------------------
2123 [DECIMAL_CHARACTER_REFERENCE_STATE](cp) {
2124 if (isAsciiDigit(cp)) {
2125 this.charRefCode = this.charRefCode * 10 + cp - 0x30;
2126 } else if (cp === $.SEMICOLON) {
2127 this.state = NUMERIC_CHARACTER_REFERENCE_END_STATE;
2128 } else {
2129 this._err(ERR.missingSemicolonAfterCharacterReference);
2130 this._reconsumeInState(NUMERIC_CHARACTER_REFERENCE_END_STATE);
2131 }
2132 }
2133
2134 // Numeric character reference end state
2135 //------------------------------------------------------------------
2136 [NUMERIC_CHARACTER_REFERENCE_END_STATE]() {
2137 if (this.charRefCode === $.NULL) {
2138 this._err(ERR.nullCharacterReference);
2139 this.charRefCode = $.REPLACEMENT_CHARACTER;
2140 } else if (this.charRefCode > 0x10ffff) {
2141 this._err(ERR.characterReferenceOutsideUnicodeRange);
2142 this.charRefCode = $.REPLACEMENT_CHARACTER;
2143 } else if (unicode.isSurrogate(this.charRefCode)) {
2144 this._err(ERR.surrogateCharacterReference);
2145 this.charRefCode = $.REPLACEMENT_CHARACTER;
2146 } else if (unicode.isUndefinedCodePoint(this.charRefCode)) {
2147 this._err(ERR.noncharacterCharacterReference);
2148 } else if (unicode.isControlCodePoint(this.charRefCode) || this.charRefCode === $.CARRIAGE_RETURN) {
2149 this._err(ERR.controlCharacterReference);
2150
2151 const replacement = C1_CONTROLS_REFERENCE_REPLACEMENTS[this.charRefCode];
2152
2153 if (replacement) {
2154 this.charRefCode = replacement;
2155 }
2156 }
2157
2158 this.tempBuff = [this.charRefCode];
2159
2160 this._flushCodePointsConsumedAsCharacterReference();
2161 this._reconsumeInState(this.returnState);
2162 }
2163}
2164
2165//Token types
// Every token type constant is a string equal to its own property name.
for (const tokenType of [
    'CHARACTER_TOKEN',
    'NULL_CHARACTER_TOKEN',
    'WHITESPACE_CHARACTER_TOKEN',
    'START_TAG_TOKEN',
    'END_TAG_TOKEN',
    'COMMENT_TOKEN',
    'DOCTYPE_TOKEN',
    'EOF_TOKEN',
    'HIBERNATION_TOKEN'
]) {
    Tokenizer[tokenType] = tokenType;
}
2175
2176//Tokenizer initial states for different modes
// Maps each public mode name to the tokenizer state it stands for.
Tokenizer.MODE = {};
Tokenizer.MODE.DATA = DATA_STATE;
Tokenizer.MODE.RCDATA = RCDATA_STATE;
Tokenizer.MODE.RAWTEXT = RAWTEXT_STATE;
Tokenizer.MODE.SCRIPT_DATA = SCRIPT_DATA_STATE;
Tokenizer.MODE.PLAINTEXT = PLAINTEXT_STATE;
2184
2185//Static
// Looks up an attribute value on a token.
//
// token    - object with an `attrs` array of `{ name, value }` entries.
// attrName - attribute name, compared with strict equality.
//
// The list is scanned from the end, so when several entries share the name
// the last one wins. Returns `null` when no entry matches.
Tokenizer.getTokenAttr = function(token, attrName) {
    const attrs = token.attrs;
    let idx = attrs.length;

    while (idx-- > 0) {
        const attr = attrs[idx];

        if (attr.name === attrName) {
            return attr.value;
        }
    }

    return null;
};
2195
2196module.exports = Tokenizer;
Note: See TracBrowser for help on using the repository browser.