/**
 * All the states the tokenizer can be in.
 *
 * This is an ambient `const enum`: it is declared but not exported from this
 * module, and is referenced by `Callbacks.onerror` and `Tokenizer._state`.
 * Member values are explicit and run contiguously from 1 (`Text`) to
 * 66 (`InHexEntity`).
 *
 * The numbered members (e.g. `BeforeCdata1` .. `BeforeCdata6`) presumably
 * correspond to successive characters of a fixed sequence being matched;
 * the implementation is not part of this declaration file, so confirm there.
 */
declare const enum State {
    Text = 1,

    // Tags
    BeforeTagName = 2,
    InTagName = 3,
    InSelfClosingTag = 4,
    BeforeClosingTagName = 5,
    InClosingTagName = 6,
    AfterClosingTagName = 7,

    // Attributes
    BeforeAttributeName = 8,
    InAttributeName = 9,
    AfterAttributeName = 10,
    BeforeAttributeValue = 11,
    // Dq / Sq / Nq: presumably double-quoted, single-quoted and unquoted values
    // (cf. the `stateInAttributeValue*Quotes` members of `Tokenizer`).
    InAttributeValueDq = 12,
    InAttributeValueSq = 13,
    InAttributeValueNq = 14,

    // Declarations and processing instructions
    BeforeDeclaration = 15,
    InDeclaration = 16,
    InProcessingInstruction = 17,

    // Comments
    BeforeComment = 18,
    InComment = 19,
    InSpecialComment = 20,
    AfterComment1 = 21,
    AfterComment2 = 22,

    // CDATA sections
    BeforeCdata1 = 23,
    BeforeCdata2 = 24,
    BeforeCdata3 = 25,
    BeforeCdata4 = 26,
    BeforeCdata5 = 27,
    BeforeCdata6 = 28,
    InCdata = 29,
    AfterCdata1 = 30,
    AfterCdata2 = 31,

    // Special tags: script and style
    BeforeSpecialS = 32,
    BeforeSpecialSEnd = 33,
    BeforeScript1 = 34,
    BeforeScript2 = 35,
    BeforeScript3 = 36,
    BeforeScript4 = 37,
    BeforeScript5 = 38,
    AfterScript1 = 39,
    AfterScript2 = 40,
    AfterScript3 = 41,
    AfterScript4 = 42,
    AfterScript5 = 43,
    BeforeStyle1 = 44,
    BeforeStyle2 = 45,
    BeforeStyle3 = 46,
    BeforeStyle4 = 47,
    AfterStyle1 = 48,
    AfterStyle2 = 49,
    AfterStyle3 = 50,
    AfterStyle4 = 51,

    // Special tags: title
    BeforeSpecialT = 52,
    BeforeSpecialTEnd = 53,
    BeforeTitle1 = 54,
    BeforeTitle2 = 55,
    BeforeTitle3 = 56,
    BeforeTitle4 = 57,
    AfterTitle1 = 58,
    AfterTitle2 = 59,
    AfterTitle3 = 60,
    AfterTitle4 = 61,

    // Entities
    BeforeEntity = 62,
    BeforeNumericEntity = 63,
    InNamedEntity = 64,
    InNumericEntity = 65,
    InHexEntity = 66
}
/**
 * The set of handlers a `Tokenizer` is constructed with (the `cbs` argument
 * of its constructor).
 *
 * Only the signatures are declared here; when and how often each handler is
 * invoked is determined by the tokenizer implementation, which is not part of
 * this declaration file.
 */
export interface Callbacks {
    /** @param value Attribute data, as a string. */
    onattribdata(value: string): void;
    /** @param quote A string, or `undefined` / `null`; presumably the quote character that delimited the attribute value, if any (confirm in the implementation). */
    onattribend(quote: string | undefined | null): void;
    /** @param name The attribute name. */
    onattribname(name: string): void;
    /** @param data CDATA content, as a string. */
    oncdata(data: string): void;
    /** @param name The name of the closing tag. */
    onclosetag(name: string): void;
    /** @param data Comment content, as a string. */
    oncomment(data: string): void;
    /** @param content Declaration content, as a string. */
    ondeclaration(content: string): void;
    /** Takes no arguments; named for the end of tokenization. */
    onend(): void;
    /**
     * @param error The error being reported.
     * @param state Optional tokenizer `State` accompanying the error.
     */
    onerror(error: Error, state?: State): void;
    /** Takes no arguments; named for the end of an opening tag. */
    onopentagend(): void;
    /** @param name The name of the opening tag. */
    onopentagname(name: string): void;
    /** @param instruction Processing-instruction content, as a string. */
    onprocessinginstruction(instruction: string): void;
    /** Takes no arguments; named for a self-closing tag. */
    onselfclosingtag(): void;
    /** @param value Text content, as a string. */
    ontext(value: string): void;
}
/**
 * Type declaration of the tokenizer class.
 *
 * It is constructed with an options object (or `null`) and a `Callbacks`
 * implementation, and is fed input through `write()` / `end()`. Only the
 * public surface carries type information here; private members are listed
 * by name only, as is usual for generated declaration files.
 */
export default class Tokenizer {
    /** The current state the tokenizer is in. */
    _state: State;
    /** The read buffer. */
    private buffer;
    /** The beginning of the section that is currently being read. */
    sectionStart: number;
    /** The index within the buffer that we are currently looking at. */
    _index: number;
    /**
     * Data that has already been processed will be removed from the buffer occasionally.
     * `bufferOffset` keeps track of how many characters have been removed, to make sure position information is accurate.
     */
    private bufferOffset;
    /** Some behavior, e.g. when decoding entities, is done while we are in another state. This keeps track of the other state type. */
    private baseState;
    /** For special parsing behavior inside of script and style tags. */
    private special;
    /** Tracks whether the tokenizer is running or has been paused (see `pause()` / `resume()`). */
    private running;
    /** Indicates whether the tokenizer has finished running / `.end` has been called. */
    private ended;
    /** The callbacks supplied to the constructor. */
    private readonly cbs;
    /** Presumably mirrors the `xmlMode` constructor option (confirm in the implementation). */
    private readonly xmlMode;
    /** Presumably mirrors the `decodeEntities` constructor option (confirm in the implementation). */
    private readonly decodeEntities;
    /**
     * @param options Optional settings (`xmlMode`, `decodeEntities`), or `null`.
     * @param cbs Handlers the tokenizer reports to.
     */
    constructor(options: {
        xmlMode?: boolean;
        decodeEntities?: boolean;
    } | null, cbs: Callbacks);
    reset(): void;
    /** @param chunk A piece of input to add to the tokenizer. */
    write(chunk: string): void;
    /** @param chunk Optional final piece of input. */
    end(chunk?: string): void;
    pause(): void;
    resume(): void;
    /**
     * The current index within all of the written data.
     */
    getAbsoluteIndex(): number;
    private stateText;
    /**
     * HTML only allows ASCII alpha characters (a-z and A-Z) at the beginning of a tag name.
     *
     * XML allows a lot more characters here (@see https://www.w3.org/TR/REC-xml/#NT-NameStartChar).
     * We allow anything that wouldn't end the tag.
     */
    private isTagStartChar;
    private stateBeforeTagName;
    private stateInTagName;
    private stateBeforeClosingTagName;
    private stateInClosingTagName;
    private stateAfterClosingTagName;
    private stateBeforeAttributeName;
    private stateInSelfClosingTag;
    private stateInAttributeName;
    private stateAfterAttributeName;
    private stateBeforeAttributeValue;
    private handleInAttributeValue;
    private stateInAttributeValueDoubleQuotes;
    private stateInAttributeValueSingleQuotes;
    private stateInAttributeValueNoQuotes;
    private stateBeforeDeclaration;
    private stateInDeclaration;
    private stateInProcessingInstruction;
    private stateBeforeComment;
    private stateInComment;
    private stateInSpecialComment;
    private stateAfterComment1;
    private stateAfterComment2;
    private stateBeforeCdata6;
    private stateInCdata;
    private stateAfterCdata1;
    private stateAfterCdata2;
    private stateBeforeSpecialS;
    private stateBeforeSpecialSEnd;
    private stateBeforeSpecialLast;
    private stateAfterSpecialLast;
    private parseFixedEntity;
    private parseLegacyEntity;
    private stateInNamedEntity;
    private decodeNumericEntity;
    private stateInNumericEntity;
    private stateInHexEntity;
    private cleanup;
    /**
     * Iterates through the buffer, calling the function corresponding to the current state.
     *
     * States that are more likely to be hit are higher up, as a performance improvement.
     */
    private parse;
    private finish;
    private handleTrailingData;
    private getSection;
    private emitToken;
    private emitPartial;
}
// Empty export: keeps this file a module, so only the explicit exports above are visible to importers.
export {};
//# sourceMappingURL=Tokenizer.d.ts.map