source: node_modules/entities/lib/esm/decode.js

Last change on this file was 57e58a3, checked in by ste08 <sjovanoska@…>, 4 months ago

Initial commit

  • Property mode set to 100644
File size: 19.3 KB
Line 
1import htmlDecodeTree from "./generated/decode-data-html.js";
2import xmlDecodeTree from "./generated/decode-data-xml.js";
3import decodeCodePoint, { replaceCodePoint, fromCodePoint, } from "./decode_codepoint.js";
4// Re-export for use by eg. htmlparser2
5export { htmlDecodeTree, xmlDecodeTree, decodeCodePoint };
6export { replaceCodePoint, fromCodePoint } from "./decode_codepoint.js";
/**
 * Character codes the decoder compares against.
 *
 * Built as a two-way map: `CharCodes.NUM === 35` and `CharCodes[35] === "NUM"`.
 */
const CharCodes = {};
for (const [label, charCode] of [
    ["NUM", 35], // "#"
    ["SEMI", 59], // ";"
    ["EQUALS", 61], // "="
    ["ZERO", 48], // "0"
    ["NINE", 57], // "9"
    ["LOWER_A", 97], // "a"
    ["LOWER_F", 102], // "f"
    ["LOWER_X", 120], // "x"
    ["LOWER_Z", 122], // "z"
    ["UPPER_A", 65], // "A"
    ["UPPER_F", 70], // "F"
    ["UPPER_Z", 90], // "Z"
]) {
    CharCodes[label] = charCode;
    CharCodes[charCode] = label;
}
/**
 * OR-ing an upper case ASCII letter's char code with this bit yields the
 * char code of the matching lower case letter (e.g. "X" -> "x").
 */
const TO_LOWER_BIT = 0x20;
/**
 * Bit masks for the fields packed into a decode-tree node.
 *
 * Built as a two-way map, so `BinTrieFlags[127] === "JUMP_TABLE"` as well.
 */
export var BinTrieFlags = {};
[
    // Bits 14-15.
    ["VALUE_LENGTH", 0b1100000000000000],
    // Bits 7-13.
    ["BRANCH_LENGTH", 0b0011111110000000],
    // Bits 0-6.
    ["JUMP_TABLE", 0b0000000001111111],
].forEach(([flagName, mask]) => {
    BinTrieFlags[flagName] = mask;
    BinTrieFlags[mask] = flagName;
});
/**
 * Tells whether a char code is an ASCII decimal digit ("0" through "9").
 *
 * @param code The char code to test.
 * @returns `true` when `code` lies between `CharCodes.ZERO` and `CharCodes.NINE`, inclusive.
 */
function isNumber(code) {
    const { ZERO, NINE } = CharCodes;
    return code >= ZERO && code <= NINE;
}
/**
 * Tells whether a char code is a hexadecimal letter ("A"-"F" or "a"-"f").
 *
 * Decimal digits are NOT matched here; callers combine this with `isNumber`.
 *
 * @param code The char code to test.
 * @returns `true` for an upper or lower case hexadecimal letter.
 */
function isHexadecimalCharacter(code) {
    const isUpperHexLetter = code >= CharCodes.UPPER_A && code <= CharCodes.UPPER_F;
    const isLowerHexLetter = code >= CharCodes.LOWER_A && code <= CharCodes.LOWER_F;
    return isUpperHexLetter || isLowerHexLetter;
}
/**
 * Tells whether a char code is an ASCII letter or an ASCII decimal digit.
 *
 * @param code The char code to test.
 * @returns `true` for "A"-"Z", "a"-"z" and "0"-"9".
 */
function isAsciiAlphaNumeric(code) {
    const isUpperLetter = code >= CharCodes.UPPER_A && code <= CharCodes.UPPER_Z;
    const isLowerLetter = code >= CharCodes.LOWER_A && code <= CharCodes.LOWER_Z;
    return isUpperLetter || isLowerLetter || isNumber(code);
}
/**
 * Decides whether the character following an entity inside an attribute value
 * prevents that entity from being decoded.
 *
 * Attribute values that aren't terminated properly aren't parsed, and shouldn't lead to a parser error.
 * See the example in https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
 *
 * @param code The char code that follows the entity.
 * @returns `true` when `code` is "=" or an ASCII alphanumeric character.
 */
function isEntityInAttributeInvalidEnd(code) {
    if (code === CharCodes.EQUALS) {
        return true;
    }
    return isAsciiAlphaNumeric(code);
}
/**
 * States of the `EntityDecoder` state machine.
 *
 * Built as a two-way map: each name maps to its ordinal and each ordinal maps
 * back to its name.
 */
const EntityDecoderState = {};
[
    "EntityStart",
    "NumericStart",
    "NumericDecimal",
    "NumericHex",
    "NamedEntity",
].forEach((stateName, ordinal) => {
    EntityDecoderState[stateName] = ordinal;
    EntityDecoderState[ordinal] = stateName;
});
/**
 * Decoding modes accepted by the decoder.
 *
 * Built as a two-way map: each name maps to its ordinal and each ordinal maps
 * back to its name.
 */
export var DecodingMode = {};
for (const [modeName, ordinal] of [
    // Entities in text nodes that can end with any character.
    ["Legacy", 0],
    // Only allow entities terminated with a semicolon.
    ["Strict", 1],
    // Entities in attributes have limitations on ending characters.
    ["Attribute", 2],
]) {
    DecodingMode[modeName] = ordinal;
    DecodingMode[ordinal] = modeName;
}
/**
 * Token decoder with support of writing partial entities.
 *
 * Usage: call `startEntity` once per entity, feed input through `write`
 * (possibly several times when the input arrives in chunks), and call `end`
 * when no more input will arrive.
 */
export class EntityDecoder {
    /**
     * @param decodeTree The tree used to decode entities.
     * @param emitCodePoint The function that is called when a codepoint is
     *   decoded. It receives the decoded codepoint and the number of
     *   characters consumed by the decoder. For named entities that decode to
     *   two codepoints it is called twice, with the same `consumed` value.
     * @param errors An optional object that is used to produce errors.
     */
    constructor(decodeTree, emitCodePoint, errors) {
        this.decodeTree = decodeTree;
        this.emitCodePoint = emitCodePoint;
        this.errors = errors;
        /** The current state of the decoder. */
        this.state = EntityDecoderState.EntityStart;
        /** Characters that were consumed while parsing an entity. */
        this.consumed = 1;
        /**
         * The result of the entity.
         *
         * Either the decode-tree index of a named entity, or the codepoint of
         * a numeric entity.
         */
        this.result = 0;
        /** The current index in the decode tree. */
        this.treeIndex = 0;
        /** The number of characters that were consumed in excess. */
        this.excess = 1;
        /** The mode in which the decoder is operating. */
        this.decodeMode = DecodingMode.Strict;
    }

    /**
     * Resets the instance to make it reusable.
     *
     * @param decodeMode The `DecodingMode` to use for the next entity.
     */
    startEntity(decodeMode) {
        this.decodeMode = decodeMode;
        this.state = EntityDecoderState.EntityStart;
        this.result = 0;
        this.treeIndex = 0;
        this.excess = 1;
        this.consumed = 1;
    }

    /**
     * Write an entity to the decoder. This can be called multiple times with partial entities.
     * If the entity is incomplete, the decoder will return -1.
     *
     * Mirrors the implementation of `getDecoder`, but with the ability to stop decoding if the
     * entity is incomplete, and resume when the next string is written.
     *
     * @param str The string containing the entity (or a continuation of the entity).
     * @param offset The offset at which the entity begins. Should be 0 if this is not the first call.
     * @returns The number of characters that were consumed (0 if no entity
     *   was decoded), or -1 if the entity is incomplete.
     */
    write(str, offset) {
        switch (this.state) {
            case EntityDecoderState.EntityStart: {
                if (str.charCodeAt(offset) === CharCodes.NUM) {
                    this.state = EntityDecoderState.NumericStart;
                    // Account for the "#".
                    this.consumed += 1;
                    return this.stateNumericStart(str, offset + 1);
                }
                this.state = EntityDecoderState.NamedEntity;
                return this.stateNamedEntity(str, offset);
            }
            case EntityDecoderState.NumericStart: {
                return this.stateNumericStart(str, offset);
            }
            case EntityDecoderState.NumericDecimal: {
                return this.stateNumericDecimal(str, offset);
            }
            case EntityDecoderState.NumericHex: {
                return this.stateNumericHex(str, offset);
            }
            case EntityDecoderState.NamedEntity: {
                return this.stateNamedEntity(str, offset);
            }
        }
    }

    /**
     * Switches between the numeric decimal and hexadecimal states.
     *
     * Equivalent to the `Numeric character reference state` in the HTML spec.
     *
     * @param str The string containing the entity (or a continuation of the entity).
     * @param offset The current offset.
     * @returns The number of characters that were consumed, or -1 if the entity is incomplete.
     */
    stateNumericStart(str, offset) {
        if (offset >= str.length) {
            return -1;
        }
        // Setting the lower-case bit lets one comparison match both "x" and "X".
        if ((str.charCodeAt(offset) | TO_LOWER_BIT) === CharCodes.LOWER_X) {
            this.state = EntityDecoderState.NumericHex;
            this.consumed += 1;
            return this.stateNumericHex(str, offset + 1);
        }
        this.state = EntityDecoderState.NumericDecimal;
        return this.stateNumericDecimal(str, offset);
    }

    /**
     * Appends the digits in `str[start, end)` to the numeric result and adds
     * their count to `consumed`. Does nothing when the range is empty.
     *
     * @param str The string containing the digits.
     * @param start Index of the first digit.
     * @param end Index one past the last digit.
     * @param base The radix of the digits (10 or 16 for the callers in this class).
     */
    addToNumericResult(str, start, end, base) {
        if (start !== end) {
            const digitCount = end - start;
            // Shift the digits collected by earlier calls to the left before
            // adding the new ones, so chunked input accumulates correctly.
            this.result =
                this.result * Math.pow(base, digitCount) +
                    Number.parseInt(str.slice(start, end), base);
            this.consumed += digitCount;
        }
    }

    /**
     * Parses a hexadecimal numeric entity.
     *
     * Equivalent to the `Hexadecimal character reference state` in the HTML spec.
     *
     * @param str The string containing the entity (or a continuation of the entity).
     * @param offset The current offset.
     * @returns The number of characters that were consumed, or -1 if the entity is incomplete.
     */
    stateNumericHex(str, offset) {
        const startIdx = offset;
        while (offset < str.length) {
            const char = str.charCodeAt(offset);
            if (isNumber(char) || isHexadecimalCharacter(char)) {
                offset += 1;
            }
            else {
                this.addToNumericResult(str, startIdx, offset, 16);
                // 3 === "&" + "#" + "x": anything beyond that is a digit.
                return this.emitNumericEntity(char, 3);
            }
        }
        // Ran out of input: keep what we have and wait for more.
        this.addToNumericResult(str, startIdx, offset, 16);
        return -1;
    }

    /**
     * Parses a decimal numeric entity.
     *
     * Equivalent to the `Decimal character reference state` in the HTML spec.
     *
     * @param str The string containing the entity (or a continuation of the entity).
     * @param offset The current offset.
     * @returns The number of characters that were consumed, or -1 if the entity is incomplete.
     */
    stateNumericDecimal(str, offset) {
        const startIdx = offset;
        while (offset < str.length) {
            const char = str.charCodeAt(offset);
            if (isNumber(char)) {
                offset += 1;
            }
            else {
                this.addToNumericResult(str, startIdx, offset, 10);
                // 2 === "&" + "#": anything beyond that is a digit.
                return this.emitNumericEntity(char, 2);
            }
        }
        // Ran out of input: keep what we have and wait for more.
        this.addToNumericResult(str, startIdx, offset, 10);
        return -1;
    }

    /**
     * Validate and emit a numeric entity.
     *
     * Implements the logic from the `Hexadecimal character reference start
     * state` and `Numeric character reference end state` in the HTML spec.
     *
     * @param lastCp The last code point of the entity. Used to see if the
     *               entity was terminated with a semicolon.
     * @param expectedLength The minimum number of characters that should be
     *                       consumed. Used to validate that at least one digit
     *                       was consumed.
     * @returns The number of characters that were consumed, or 0 if nothing
     *          was emitted.
     */
    emitNumericEntity(lastCp, expectedLength) {
        // Ensure we consumed at least one digit.
        if (this.consumed <= expectedLength) {
            const { errors } = this;
            if (errors !== null && errors !== undefined) {
                errors.absenceOfDigitsInNumericCharacterReference(this.consumed);
            }
            return 0;
        }
        // Figure out if this is a legit end of the entity
        if (lastCp === CharCodes.SEMI) {
            this.consumed += 1;
        }
        else if (this.decodeMode === DecodingMode.Strict) {
            // Strict mode only accepts entities terminated by a semicolon.
            return 0;
        }
        this.emitCodePoint(replaceCodePoint(this.result), this.consumed);
        if (this.errors) {
            if (lastCp !== CharCodes.SEMI) {
                this.errors.missingSemicolonAfterCharacterReference();
            }
            this.errors.validateNumericCharacterReference(this.result);
        }
        return this.consumed;
    }

    /**
     * Parses a named entity.
     *
     * Equivalent to the `Named character reference state` in the HTML spec.
     *
     * @param str The string containing the entity (or a continuation of the entity).
     * @param offset The current offset.
     * @returns The number of characters that were consumed, or -1 if the entity is incomplete.
     */
    stateNamedEntity(str, offset) {
        const { decodeTree } = this;
        let current = decodeTree[this.treeIndex];
        // Number of tree slots taken by the node's value, including the node itself.
        let valueLength = (current & BinTrieFlags.VALUE_LENGTH) >> 14;
        for (; offset < str.length; offset++, this.excess++) {
            const char = str.charCodeAt(offset);
            this.treeIndex = determineBranch(decodeTree, current, this.treeIndex + Math.max(1, valueLength), char);
            if (this.treeIndex < 0) {
                return this.result === 0 ||
                    // If we are parsing an attribute
                    (this.decodeMode === DecodingMode.Attribute &&
                        // We shouldn't have consumed any characters after the entity,
                        (valueLength === 0 ||
                            // And there should be no invalid characters.
                            isEntityInAttributeInvalidEnd(char)))
                    ? 0
                    : this.emitNotTerminatedNamedEntity();
            }
            current = decodeTree[this.treeIndex];
            valueLength = (current & BinTrieFlags.VALUE_LENGTH) >> 14;
            // If the branch is a value, store it and continue
            if (valueLength !== 0) {
                // If the entity is terminated by a semicolon, we are done.
                if (char === CharCodes.SEMI) {
                    return this.emitNamedEntityData(this.treeIndex, valueLength, this.consumed + this.excess);
                }
                // If we encounter a non-terminated (legacy) entity while parsing strictly, then ignore it.
                if (this.decodeMode !== DecodingMode.Strict) {
                    this.result = this.treeIndex;
                    this.consumed += this.excess;
                    this.excess = 0;
                }
            }
        }
        return -1;
    }

    /**
     * Emit a named entity that was not terminated with a semicolon.
     *
     * @returns The number of characters consumed.
     */
    emitNotTerminatedNamedEntity() {
        const { result, decodeTree } = this;
        const valueLength = (decodeTree[result] & BinTrieFlags.VALUE_LENGTH) >> 14;
        this.emitNamedEntityData(result, valueLength, this.consumed);
        const { errors } = this;
        if (errors !== null && errors !== undefined) {
            errors.missingSemicolonAfterCharacterReference();
        }
        return this.consumed;
    }

    /**
     * Emit a named entity.
     *
     * @param result The index of the entity in the decode tree.
     * @param valueLength The value length stored in the node at `result`
     *   (1: the codepoint is packed into the node itself; 3: two codepoints
     *   follow the node; otherwise one codepoint follows the node).
     * @param consumed The number of characters consumed.
     *
     * @returns The number of characters consumed.
     */
    emitNamedEntityData(result, valueLength, consumed) {
        const { decodeTree } = this;
        this.emitCodePoint(valueLength === 1
            ? decodeTree[result] & ~BinTrieFlags.VALUE_LENGTH
            : decodeTree[result + 1], consumed);
        if (valueLength === 3) {
            // The entity decodes to two codepoints; emit the second one too.
            this.emitCodePoint(decodeTree[result + 2], consumed);
        }
        return consumed;
    }

    /**
     * Signal to the parser that the end of the input was reached.
     *
     * Remaining data will be emitted and relevant errors will be produced.
     *
     * @returns The number of characters consumed.
     */
    end() {
        switch (this.state) {
            case EntityDecoderState.NamedEntity: {
                // Emit a named entity if we have one.
                return this.result !== 0 &&
                    (this.decodeMode !== DecodingMode.Attribute ||
                        this.result === this.treeIndex)
                    ? this.emitNotTerminatedNamedEntity()
                    : 0;
            }
            // Otherwise, emit a numeric entity if we have one.
            case EntityDecoderState.NumericDecimal: {
                return this.emitNumericEntity(0, 2);
            }
            case EntityDecoderState.NumericHex: {
                return this.emitNumericEntity(0, 3);
            }
            case EntityDecoderState.NumericStart: {
                const { errors } = this;
                if (errors !== null && errors !== undefined) {
                    errors.absenceOfDigitsInNumericCharacterReference(this.consumed);
                }
                return 0;
            }
            case EntityDecoderState.EntityStart: {
                // Return 0 if we have no entity.
                return 0;
            }
        }
    }
}
/**
 * Builds a string decoder on top of a decode tree.
 *
 * The returned function copies its input verbatim, except that each `&` is
 * handed to a shared `EntityDecoder`; whatever that decoder emits replaces the
 * characters it reports as consumed.
 *
 * @param decodeTree The decode tree.
 * @returns A function `(str, decodeMode) => string` that decodes entities in a string.
 */
function getDecoder(decodeTree) {
    // Output accumulated by the call that is currently in progress.
    let buffer = "";
    const decoder = new EntityDecoder(decodeTree, (codePoint) => (buffer += fromCodePoint(codePoint)));
    return function decodeWithTrie(str, decodeMode) {
        // Index of the first input character not yet copied to `buffer`.
        let copiedUpTo = 0;
        let ampersandAt = str.indexOf("&", 0);
        while (ampersandAt >= 0) {
            buffer += str.slice(copiedUpTo, ampersandAt);
            decoder.startEntity(decodeMode);
            // Start right after the "&".
            const consumed = decoder.write(str, ampersandAt + 1);
            if (consumed < 0) {
                // Input ended inside the entity: flush what the decoder has.
                copiedUpTo = ampersandAt + decoder.end();
                break;
            }
            copiedUpTo = ampersandAt + consumed;
            // Nothing decoded: step over this "&" so the search moves on.
            const resumeAt = consumed === 0 ? copiedUpTo + 1 : copiedUpTo;
            ampersandAt = str.indexOf("&", resumeAt);
        }
        const decoded = buffer + str.slice(copiedUpTo);
        // Make sure we don't keep a reference to the final string.
        buffer = "";
        return decoded;
    };
}
/**
 * Picks the child of a trie node that matches a character.
 *
 * A node stores its children in one of three layouts, chosen by the branch
 * count and jump offset packed into `current`: a single inline character, a
 * jump table, or a sorted dictionary.
 *
 * @param decodeTree The trie.
 * @param current The current node.
 * @param nodeIdx The index right after the current node and its value.
 * @param char The current character.
 * @returns The index of the next node, or -1 if no branch is taken.
 */
export function determineBranch(decodeTree, current, nodeIdx, char) {
    const branches = (current & BinTrieFlags.BRANCH_LENGTH) >> 7;
    const jump = current & BinTrieFlags.JUMP_TABLE;
    // Layout 1: a single branch whose character sits in the jump-offset bits.
    if (branches === 0) {
        if (jump !== 0 && char === jump) {
            return nodeIdx;
        }
        return -1;
    }
    // Layout 2: jump table with one slot per character, starting at `jump`.
    if (jump !== 0) {
        const slot = char - jump;
        if (slot < 0 || slot >= branches) {
            return -1;
        }
        // Slots store the target index plus one.
        return decodeTree[nodeIdx + slot] - 1;
    }
    // Layout 3: `branches` sorted characters followed by `branches` targets.
    // Binary search for the character.
    let low = nodeIdx;
    let high = low + branches - 1;
    while (low <= high) {
        const middle = (low + high) >>> 1;
        const candidate = decodeTree[middle];
        if (candidate < char) {
            low = middle + 1;
            continue;
        }
        if (candidate > char) {
            high = middle - 1;
            continue;
        }
        return decodeTree[middle + branches];
    }
    return -1;
}
/** Decoder function built once from the HTML decode tree and reused by every call. */
const htmlDecoder = getDecoder(htmlDecodeTree);
/** Decoder function built once from the XML decode tree and reused by every call. */
const xmlDecoder = getDecoder(xmlDecodeTree);
/**
 * Decodes the entities in a string using the HTML decode tree.
 *
 * @param str The string to decode.
 * @param mode The decoding mode; `DecodingMode.Legacy` when omitted.
 * @returns The decoded string.
 */
export function decodeHTML(str, mode = DecodingMode.Legacy) {
    const decoded = htmlDecoder(str, mode);
    return decoded;
}
/**
 * Decodes the entities in an HTML attribute value.
 *
 * Runs the HTML decoder in `DecodingMode.Attribute`.
 *
 * @param str The string to decode.
 * @returns The decoded string.
 */
export function decodeHTMLAttribute(str) {
    const decoded = htmlDecoder(str, DecodingMode.Attribute);
    return decoded;
}
/**
 * Decodes the entities in an HTML string, accepting only entities that are
 * terminated by a semicolon.
 *
 * Runs the HTML decoder in `DecodingMode.Strict`.
 *
 * @param str The string to decode.
 * @returns The decoded string.
 */
export function decodeHTMLStrict(str) {
    const decoded = htmlDecoder(str, DecodingMode.Strict);
    return decoded;
}
/**
 * Decodes the entities in an XML string, accepting only entities that are
 * terminated by a semicolon.
 *
 * Runs the XML decoder in `DecodingMode.Strict`.
 *
 * @param str The string to decode.
 * @returns The decoded string.
 */
export function decodeXML(str) {
    const decoded = xmlDecoder(str, DecodingMode.Strict);
    return decoded;
}
496//# sourceMappingURL=decode.js.map
Note: See TracBrowser for help on using the repository browser.