source: node_modules/entities/lib/decode.js

Last change on this file was 57e58a3, checked in by ste08 <sjovanoska@…>, 4 months ago

Initial commit

  • Property mode set to 100644
File size: 22.1 KB
Line 
"use strict";
/**
 * TypeScript interop helper: re-exposes property `k` of module `m` on object
 * `o` under the name `k2` (defaults to `k`).
 *
 * When `Object.create` is available, the property is defined through a
 * descriptor. If the source property is a plain (writable or configurable)
 * data property, or an accessor on a non-ES module, it is replaced by a getter
 * that reads `m[k]` on each access, so later changes to `m[k]` stay visible.
 * Otherwise the value is copied once.
 *
 * An already-installed helper on `this` takes precedence.
 */
var __createBinding = (this && this.__createBinding) || (Object.create ? (function (o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
        // Forward reads to the source object instead of snapshotting the value.
        desc = { enumerable: true, get: function () { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function (o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
/**
 * TypeScript interop helper: stores `v` as the enumerable `default` property
 * of the namespace object `o`.
 *
 * An already-installed helper on `this` takes precedence.
 */
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function (o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function (o, v) {
    o["default"] = v;
});
/**
 * TypeScript interop helper for `import * as ns from "mod"`.
 *
 * ES modules (objects flagged with `__esModule`) are returned untouched. For
 * anything else a fresh namespace object is built: every own enumerable
 * property except `default` is bound onto it, and the module itself becomes
 * its `default`.
 *
 * An already-installed helper on `this` takes precedence.
 */
var __importStar = (this && this.__importStar) || function (mod) {
    if (mod && mod.__esModule) return mod;
    var result = {};
    if (mod != null) {
        for (var k in mod) {
            if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) {
                __createBinding(result, mod, k);
            }
        }
    }
    __setModuleDefault(result, mod);
    return result;
};
/**
 * TypeScript interop helper for `import x from "mod"`.
 *
 * ES modules (objects flagged with `__esModule`) are returned as-is; any other
 * value is wrapped so that it is reachable as `.default`.
 *
 * An already-installed helper on `this` takes precedence.
 */
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
// Declare every named export up front (as undefined); the real values are assigned further down.
exports.decodeXML = exports.decodeHTMLStrict = exports.decodeHTMLAttribute = exports.decodeHTML = exports.determineBranch = exports.EntityDecoder = exports.DecodingMode = exports.BinTrieFlags = exports.fromCodePoint = exports.replaceCodePoint = exports.decodeCodePoint = exports.xmlDecodeTree = exports.htmlDecodeTree = void 0;
var decode_data_html_js_1 = __importDefault(require("./generated/decode-data-html.js"));
exports.htmlDecodeTree = decode_data_html_js_1.default;
var decode_data_xml_js_1 = __importDefault(require("./generated/decode-data-xml.js"));
exports.xmlDecodeTree = decode_data_xml_js_1.default;
var decode_codepoint_js_1 = __importStar(require("./decode_codepoint.js"));
exports.decodeCodePoint = decode_codepoint_js_1.default;
// Re-export the code point helpers through getters that read from the required module on each access.
var decode_codepoint_js_2 = require("./decode_codepoint.js");
Object.defineProperty(exports, "replaceCodePoint", { enumerable: true, get: function () { return decode_codepoint_js_2.replaceCodePoint; } });
Object.defineProperty(exports, "fromCodePoint", { enumerable: true, get: function () { return decode_codepoint_js_2.fromCodePoint; } });
/**
 * Character codes used by the decoder, as a two-way map
 * (name -> code and code -> name).
 */
var CharCodes;
(function (CharCodes) {
    CharCodes[CharCodes["NUM"] = 35] = "NUM"; // "#"
    CharCodes[CharCodes["SEMI"] = 59] = "SEMI"; // ";"
    CharCodes[CharCodes["EQUALS"] = 61] = "EQUALS"; // "="
    CharCodes[CharCodes["ZERO"] = 48] = "ZERO"; // "0"
    CharCodes[CharCodes["NINE"] = 57] = "NINE"; // "9"
    CharCodes[CharCodes["LOWER_A"] = 97] = "LOWER_A"; // "a"
    CharCodes[CharCodes["LOWER_F"] = 102] = "LOWER_F"; // "f"
    CharCodes[CharCodes["LOWER_X"] = 120] = "LOWER_X"; // "x"
    CharCodes[CharCodes["LOWER_Z"] = 122] = "LOWER_Z"; // "z"
    CharCodes[CharCodes["UPPER_A"] = 65] = "UPPER_A"; // "A"
    CharCodes[CharCodes["UPPER_F"] = 70] = "UPPER_F"; // "F"
    CharCodes[CharCodes["UPPER_Z"] = 90] = "UPPER_Z"; // "Z"
})(CharCodes || (CharCodes = {}));
/** Bit that needs to be set to convert an upper case ASCII character to lower case. */
var TO_LOWER_BIT = 32;
/**
 * Bit masks for reading a node of the binary decode trie.
 * Exported as a two-way map (name -> mask and mask -> name).
 */
var BinTrieFlags;
(function (BinTrieFlags) {
    /** Bits 14-15 (0xC000): length of the value stored at the node; 0 means no value. */
    BinTrieFlags[BinTrieFlags["VALUE_LENGTH"] = 49152] = "VALUE_LENGTH";
    /** Bits 7-13 (0x3F80): number of branches leaving the node. */
    BinTrieFlags[BinTrieFlags["BRANCH_LENGTH"] = 16256] = "BRANCH_LENGTH";
    /** Bits 0-6 (0x7F): jump offset, or the single branch character when the branch count is 0. */
    BinTrieFlags[BinTrieFlags["JUMP_TABLE"] = 127] = "JUMP_TABLE";
})(BinTrieFlags = exports.BinTrieFlags || (exports.BinTrieFlags = {}));
/**
 * Checks whether a character code is an ASCII decimal digit (`0`-`9`).
 *
 * @param code The character code to test.
 * @returns `true` if the code is within `CharCodes.ZERO`..`CharCodes.NINE`.
 */
function isNumber(code) {
    return code >= CharCodes.ZERO && code <= CharCodes.NINE;
}
/**
 * Checks whether a character code is a hexadecimal letter (`A`-`F` or `a`-`f`).
 *
 * Decimal digits are NOT matched here; callers combine this with `isNumber`.
 *
 * @param code The character code to test.
 * @returns `true` for `A`-`F` and `a`-`f`, `false` otherwise.
 */
function isHexadecimalCharacter(code) {
    return ((code >= CharCodes.UPPER_A && code <= CharCodes.UPPER_F) ||
        (code >= CharCodes.LOWER_A && code <= CharCodes.LOWER_F));
}
/**
 * Checks whether a character code is an ASCII letter or decimal digit.
 *
 * @param code The character code to test.
 * @returns `true` for `A`-`Z`, `a`-`z` and `0`-`9`, `false` otherwise.
 */
function isAsciiAlphaNumeric(code) {
    return ((code >= CharCodes.UPPER_A && code <= CharCodes.UPPER_Z) ||
        (code >= CharCodes.LOWER_A && code <= CharCodes.LOWER_Z) ||
        isNumber(code));
}
/**
 * Checks if the given character, when it directly follows an entity inside an
 * attribute value, prevents that entity from being decoded.
 *
 * Attribute values that aren't terminated properly aren't parsed, and shouldn't lead to a parser error.
 * See the example in https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
 *
 * @param code The character code following the entity.
 * @returns `true` if the character is `=` or an ASCII letter or digit.
 */
function isEntityInAttributeInvalidEnd(code) {
    return code === CharCodes.EQUALS || isAsciiAlphaNumeric(code);
}
/**
 * States of the `EntityDecoder` state machine, as a two-way map
 * (name -> number and number -> name).
 */
var EntityDecoderState;
(function (EntityDecoderState) {
    /** Nothing after the `&` has been examined yet. */
    EntityDecoderState[EntityDecoderState["EntityStart"] = 0] = "EntityStart";
    /** A `#` was read; the next character selects decimal or hexadecimal. */
    EntityDecoderState[EntityDecoderState["NumericStart"] = 1] = "NumericStart";
    /** Reading the digits of a decimal numeric entity. */
    EntityDecoderState[EntityDecoderState["NumericDecimal"] = 2] = "NumericDecimal";
    /** Reading the digits of a hexadecimal numeric entity. */
    EntityDecoderState[EntityDecoderState["NumericHex"] = 3] = "NumericHex";
    /** Walking the decode tree for a named entity. */
    EntityDecoderState[EntityDecoderState["NamedEntity"] = 4] = "NamedEntity";
})(EntityDecoderState || (EntityDecoderState = {}));
/**
 * Modes that control which entity terminations the decoder accepts.
 * Exported as a two-way map (name -> number and number -> name).
 */
var DecodingMode;
(function (DecodingMode) {
    /** Entities in text nodes that can end with any character. */
    DecodingMode[DecodingMode["Legacy"] = 0] = "Legacy";
    /** Only allow entities terminated with a semicolon. */
    DecodingMode[DecodingMode["Strict"] = 1] = "Strict";
    /** Entities in attributes have limitations on ending characters. */
    DecodingMode[DecodingMode["Attribute"] = 2] = "Attribute";
})(DecodingMode = exports.DecodingMode || (exports.DecodingMode = {}));
/**
 * Token decoder with support of writing partial entities.
 *
 * Usage: call `startEntity` after a `&` was seen, feed the following text
 * through `write` (possibly in several chunks), and call `end` when the input
 * is exhausted while an entity is still pending.
 */
var EntityDecoder = /** @class */ (function () {
    function EntityDecoder(
    /** The tree used to decode entities. */
    decodeTree, 
    /**
     * The function that is called when a codepoint is decoded.
     *
     * For named entities whose value spans two code units, this is called
     * twice (once per unit), both times with the same `consumed` value.
     *
     * @param codepoint The decoded codepoint.
     * @param consumed The number of characters consumed by the decoder.
     */
    emitCodePoint, 
    /** An object that is used to produce errors. May be omitted. */
    errors) {
        this.decodeTree = decodeTree;
        this.emitCodePoint = emitCodePoint;
        this.errors = errors;
        /** The current state of the decoder. */
        this.state = EntityDecoderState.EntityStart;
        /** Characters that were consumed while parsing an entity. Starts at 1 to account for the `&`. */
        this.consumed = 1;
        /**
         * The result of the entity.
         *
         * Either the result index (in the decode tree) of a named entity, or
         * the codepoint of a numeric entity.
         */
        this.result = 0;
        /** The current index in the decode tree. */
        this.treeIndex = 0;
        /**
         * The number of characters that were read past the last accepted
         * value and are not (yet) part of `consumed`.
         */
        this.excess = 1;
        /** The mode in which the decoder is operating. */
        this.decodeMode = DecodingMode.Strict;
    }
    /**
     * Resets the instance to make it reusable.
     *
     * @param decodeMode The `DecodingMode` to use for the next entity.
     */
    EntityDecoder.prototype.startEntity = function (decodeMode) {
        this.decodeMode = decodeMode;
        this.state = EntityDecoderState.EntityStart;
        this.result = 0;
        this.treeIndex = 0;
        this.excess = 1;
        this.consumed = 1;
    };
    /**
     * Write an entity to the decoder. This can be called multiple times with partial entities.
     * If the entity is incomplete, the decoder will return -1.
     *
     * Mirrors the implementation of `getDecoder`, but with the ability to stop decoding if the
     * entity is incomplete, and resume when the next string is written.
     *
     * @param str The string containing the entity (or a continuation of the entity).
     * @param offset The offset at which the entity begins. Should be 0 if this is not the first call.
     * @returns The number of characters that were consumed, or -1 if the entity is incomplete.
     */
    EntityDecoder.prototype.write = function (str, offset) {
        switch (this.state) {
            case EntityDecoderState.EntityStart: {
                if (str.charCodeAt(offset) === CharCodes.NUM) {
                    // `#` introduces a numeric entity; count it and move past it.
                    this.state = EntityDecoderState.NumericStart;
                    this.consumed += 1;
                    return this.stateNumericStart(str, offset + 1);
                }
                this.state = EntityDecoderState.NamedEntity;
                return this.stateNamedEntity(str, offset);
            }
            case EntityDecoderState.NumericStart: {
                return this.stateNumericStart(str, offset);
            }
            case EntityDecoderState.NumericDecimal: {
                return this.stateNumericDecimal(str, offset);
            }
            case EntityDecoderState.NumericHex: {
                return this.stateNumericHex(str, offset);
            }
            case EntityDecoderState.NamedEntity: {
                return this.stateNamedEntity(str, offset);
            }
        }
    };
    /**
     * Switches between the numeric decimal and hexadecimal states.
     *
     * Equivalent to the `Numeric character reference state` in the HTML spec.
     *
     * @param str The string containing the entity (or a continuation of the entity).
     * @param offset The current offset.
     * @returns The number of characters that were consumed, or -1 if the entity is incomplete.
     */
    EntityDecoder.prototype.stateNumericStart = function (str, offset) {
        if (offset >= str.length) {
            return -1;
        }
        // OR-ing with TO_LOWER_BIT makes the comparison match both `x` and `X`.
        if ((str.charCodeAt(offset) | TO_LOWER_BIT) === CharCodes.LOWER_X) {
            this.state = EntityDecoderState.NumericHex;
            this.consumed += 1;
            return this.stateNumericHex(str, offset + 1);
        }
        this.state = EntityDecoderState.NumericDecimal;
        return this.stateNumericDecimal(str, offset);
    };
    /**
     * Appends the digits in `str[start, end)` to the numeric result and adds
     * their count to `consumed`. Does nothing when the range is empty.
     *
     * @param str The string containing the digits.
     * @param start Index of the first digit.
     * @param end Index just past the last digit.
     * @param base The radix of the digits (10 or 16).
     */
    EntityDecoder.prototype.addToNumericResult = function (str, start, end, base) {
        if (start !== end) {
            var digitCount = end - start;
            // Shift what was accumulated by earlier writes, then add the new digits.
            this.result =
                this.result * Math.pow(base, digitCount) +
                    parseInt(str.slice(start, end), base);
            this.consumed += digitCount;
        }
    };
    /**
     * Parses a hexadecimal numeric entity.
     *
     * Equivalent to the `Hexadecimal character reference state` in the HTML spec.
     *
     * @param str The string containing the entity (or a continuation of the entity).
     * @param offset The current offset.
     * @returns The number of characters that were consumed, or -1 if the entity is incomplete.
     */
    EntityDecoder.prototype.stateNumericHex = function (str, offset) {
        var startIdx = offset;
        while (offset < str.length) {
            var char = str.charCodeAt(offset);
            if (isNumber(char) || isHexadecimalCharacter(char)) {
                offset += 1;
            }
            else {
                this.addToNumericResult(str, startIdx, offset, 16);
                // 3 === "&#x".length: anything above that means digits were read.
                return this.emitNumericEntity(char, 3);
            }
        }
        // Ran out of input: remember the digits seen so far and wait for more.
        this.addToNumericResult(str, startIdx, offset, 16);
        return -1;
    };
    /**
     * Parses a decimal numeric entity.
     *
     * Equivalent to the `Decimal character reference state` in the HTML spec.
     *
     * @param str The string containing the entity (or a continuation of the entity).
     * @param offset The current offset.
     * @returns The number of characters that were consumed, or -1 if the entity is incomplete.
     */
    EntityDecoder.prototype.stateNumericDecimal = function (str, offset) {
        var startIdx = offset;
        while (offset < str.length) {
            var char = str.charCodeAt(offset);
            if (isNumber(char)) {
                offset += 1;
            }
            else {
                this.addToNumericResult(str, startIdx, offset, 10);
                // 2 === "&#".length: anything above that means digits were read.
                return this.emitNumericEntity(char, 2);
            }
        }
        // Ran out of input: remember the digits seen so far and wait for more.
        this.addToNumericResult(str, startIdx, offset, 10);
        return -1;
    };
    /**
     * Validate and emit a numeric entity.
     *
     * Implements the logic from the `Hexadecimal character reference start
     * state` and `Numeric character reference end state` in the HTML spec.
     *
     * @param lastCp The last code point of the entity. Used to see if the
     *               entity was terminated with a semicolon.
     * @param expectedLength The minimum number of characters that should be
     *                       consumed. Used to validate that at least one digit
     *                       was consumed.
     * @returns The number of characters that were consumed, or 0 if nothing
     *          was emitted.
     */
    EntityDecoder.prototype.emitNumericEntity = function (lastCp, expectedLength) {
        // Ensure we consumed at least one digit.
        if (this.consumed <= expectedLength) {
            if (this.errors != null) {
                this.errors.absenceOfDigitsInNumericCharacterReference(this.consumed);
            }
            return 0;
        }
        // Figure out if this is a legit end of the entity
        if (lastCp === CharCodes.SEMI) {
            this.consumed += 1;
        }
        else if (this.decodeMode === DecodingMode.Strict) {
            // Strict mode only accepts semicolon-terminated entities.
            return 0;
        }
        this.emitCodePoint((0, decode_codepoint_js_1.replaceCodePoint)(this.result), this.consumed);
        if (this.errors) {
            if (lastCp !== CharCodes.SEMI) {
                this.errors.missingSemicolonAfterCharacterReference();
            }
            this.errors.validateNumericCharacterReference(this.result);
        }
        return this.consumed;
    };
    /**
     * Parses a named entity.
     *
     * Equivalent to the `Named character reference state` in the HTML spec.
     *
     * @param str The string containing the entity (or a continuation of the entity).
     * @param offset The current offset.
     * @returns The number of characters that were consumed, or -1 if the entity is incomplete.
     */
    EntityDecoder.prototype.stateNamedEntity = function (str, offset) {
        var decodeTree = this.decodeTree;
        var current = decodeTree[this.treeIndex];
        // The mask is the number of bytes of the value, including the current byte.
        var valueLength = (current & BinTrieFlags.VALUE_LENGTH) >> 14;
        for (; offset < str.length; offset++, this.excess++) {
            var char = str.charCodeAt(offset);
            this.treeIndex = determineBranch(decodeTree, current, this.treeIndex + Math.max(1, valueLength), char);
            if (this.treeIndex < 0) {
                // No branch for this character: the name ends here.
                return this.result === 0 ||
                    // If we are parsing an attribute
                    (this.decodeMode === DecodingMode.Attribute &&
                        // We shouldn't have consumed any characters after the entity,
                        (valueLength === 0 ||
                            // And there should be no invalid characters.
                            isEntityInAttributeInvalidEnd(char)))
                    ? 0
                    : this.emitNotTerminatedNamedEntity();
            }
            current = decodeTree[this.treeIndex];
            valueLength = (current & BinTrieFlags.VALUE_LENGTH) >> 14;
            // If the branch is a value, store it and continue
            if (valueLength !== 0) {
                // If the entity is terminated by a semicolon, we are done.
                if (char === CharCodes.SEMI) {
                    return this.emitNamedEntityData(this.treeIndex, valueLength, this.consumed + this.excess);
                }
                // If we encounter a non-terminated (legacy) entity while parsing strictly, then ignore it.
                if (this.decodeMode !== DecodingMode.Strict) {
                    // Remember this match; a longer entity may still follow.
                    this.result = this.treeIndex;
                    this.consumed += this.excess;
                    this.excess = 0;
                }
            }
        }
        return -1;
    };
    /**
     * Emit a named entity that was not terminated with a semicolon.
     *
     * @returns The number of characters consumed.
     */
    EntityDecoder.prototype.emitNotTerminatedNamedEntity = function () {
        var result = this.result;
        var valueLength = (this.decodeTree[result] & BinTrieFlags.VALUE_LENGTH) >> 14;
        this.emitNamedEntityData(result, valueLength, this.consumed);
        if (this.errors != null) {
            this.errors.missingSemicolonAfterCharacterReference();
        }
        return this.consumed;
    };
    /**
     * Emit a named entity.
     *
     * @param result The index of the entity in the decode tree.
     * @param valueLength The number of bytes in the entity.
     * @param consumed The number of characters consumed.
     *
     * @returns The number of characters consumed.
     */
    EntityDecoder.prototype.emitNamedEntityData = function (result, valueLength, consumed) {
        var decodeTree = this.decodeTree;
        // A length of 1 means the value is stored in the node itself, next to
        // the length bits; otherwise it lives in the slot(s) after the node.
        this.emitCodePoint(valueLength === 1
            ? decodeTree[result] & ~BinTrieFlags.VALUE_LENGTH
            : decodeTree[result + 1], consumed);
        if (valueLength === 3) {
            // For multi-byte values, we need to emit the second byte.
            this.emitCodePoint(decodeTree[result + 2], consumed);
        }
        return consumed;
    };
    /**
     * Signal to the parser that the end of the input was reached.
     *
     * Remaining data will be emitted and relevant errors will be produced.
     *
     * @returns The number of characters consumed.
     */
    EntityDecoder.prototype.end = function () {
        switch (this.state) {
            case EntityDecoderState.NamedEntity: {
                // Emit a named entity if we have one.
                return this.result !== 0 &&
                    (this.decodeMode !== DecodingMode.Attribute ||
                        this.result === this.treeIndex)
                    ? this.emitNotTerminatedNamedEntity()
                    : 0;
            }
            // Otherwise, emit a numeric entity if we have one.
            case EntityDecoderState.NumericDecimal: {
                return this.emitNumericEntity(0, 2);
            }
            case EntityDecoderState.NumericHex: {
                return this.emitNumericEntity(0, 3);
            }
            case EntityDecoderState.NumericStart: {
                if (this.errors != null) {
                    this.errors.absenceOfDigitsInNumericCharacterReference(this.consumed);
                }
                return 0;
            }
            case EntityDecoderState.EntityStart: {
                // Return 0 if we have no entity.
                return 0;
            }
        }
    };
    return EntityDecoder;
}());
exports.EntityDecoder = EntityDecoder;
/**
 * Creates a function that decodes entities in a string.
 *
 * The returned function shares one `EntityDecoder` and one output buffer
 * across calls; the buffer is emptied before each call returns.
 *
 * @param decodeTree The decode tree.
 * @returns A function `(str, decodeMode) => string` that decodes entities in a string.
 */
function getDecoder(decodeTree) {
    var ret = "";
    var decoder = new EntityDecoder(decodeTree, function (str) { return (ret += (0, decode_codepoint_js_1.fromCodePoint)(str)); });
    return function decodeWithTrie(str, decodeMode) {
        // Index up to which `str` has already been copied or decoded into `ret`.
        var lastIndex = 0;
        var offset = 0;
        while ((offset = str.indexOf("&", offset)) >= 0) {
            // Copy the literal text preceding the `&`.
            ret += str.slice(lastIndex, offset);
            decoder.startEntity(decodeMode);
            var len = decoder.write(str, 
            // Skip the "&"
            offset + 1);
            if (len < 0) {
                // The input ended inside an entity; let the decoder flush what it has.
                lastIndex = offset + decoder.end();
                break;
            }
            lastIndex = offset + len;
            // If `len` is 0, skip the current `&` and continue.
            offset = len === 0 ? lastIndex + 1 : lastIndex;
        }
        var result = ret + str.slice(lastIndex);
        // Make sure we don't keep a reference to the final string.
        ret = "";
        return result;
    };
}
/**
 * Determines the branch of the current node that is taken given the current
 * character. This function is used to traverse the trie.
 *
 * A node encodes its branches in one of three ways:
 *  1. branch count 0: a single branch whose character is stored in the
 *     jump-offset bits; the child is at `nodeIdx`.
 *  2. branch count > 0 with a non-zero jump offset: a jump table at `nodeIdx`,
 *     indexed by `char - jumpOffset`, holding child indices plus one
 *     (so an entry of 0 yields -1).
 *  3. branch count > 0 with a zero jump offset: a sorted list of `branchCount`
 *     characters at `nodeIdx`, followed by `branchCount` child indices.
 *
 * @param decodeTree The trie.
 * @param current The current node.
 * @param nodeIdx The index right after the current node and its value.
 * @param char The current character.
 * @returns The index of the next node, or -1 if no branch is taken.
 */
function determineBranch(decodeTree, current, nodeIdx, char) {
    var branchCount = (current & BinTrieFlags.BRANCH_LENGTH) >> 7;
    var jumpOffset = current & BinTrieFlags.JUMP_TABLE;
    // Case 1: Single branch encoded in jump offset
    if (branchCount === 0) {
        return jumpOffset !== 0 && char === jumpOffset ? nodeIdx : -1;
    }
    // Case 2: Multiple branches encoded in jump table
    if (jumpOffset) {
        var value = char - jumpOffset;
        return value < 0 || value >= branchCount
            ? -1
            : decodeTree[nodeIdx + value] - 1;
    }
    // Case 3: Multiple branches encoded in dictionary
    // Binary search for the character.
    var lo = nodeIdx;
    var hi = lo + branchCount - 1;
    while (lo <= hi) {
        var mid = (lo + hi) >>> 1;
        var midVal = decodeTree[mid];
        if (midVal < char) {
            lo = mid + 1;
        }
        else if (midVal > char) {
            hi = mid - 1;
        }
        else {
            // The child indices follow the `branchCount` key slots.
            return decodeTree[mid + branchCount];
        }
    }
    return -1;
}
exports.determineBranch = determineBranch;
// Module-level decoders, one per decode tree, used by the decode* functions below.
var htmlDecoder = getDecoder(decode_data_html_js_1.default);
var xmlDecoder = getDecoder(decode_data_xml_js_1.default);
/**
 * Decodes an HTML string.
 *
 * @param str The string to decode.
 * @param mode The decoding mode. Defaults to `DecodingMode.Legacy`.
 * @returns The decoded string.
 */
function decodeHTML(str, mode) {
    if (mode === void 0) { mode = DecodingMode.Legacy; }
    return htmlDecoder(str, mode);
}
exports.decodeHTML = decodeHTML;
/**
 * Decodes an HTML string in an attribute, using `DecodingMode.Attribute`.
 *
 * @param str The string to decode.
 * @returns The decoded string.
 */
function decodeHTMLAttribute(str) {
    return htmlDecoder(str, DecodingMode.Attribute);
}
exports.decodeHTMLAttribute = decodeHTMLAttribute;
/**
 * Decodes an HTML string, requiring all entities to be terminated by a semicolon.
 *
 * @param str The string to decode.
 * @returns The decoded string.
 */
function decodeHTMLStrict(str) {
    return htmlDecoder(str, DecodingMode.Strict);
}
exports.decodeHTMLStrict = decodeHTMLStrict;
/**
 * Decodes an XML string, requiring all entities to be terminated by a semicolon.
 *
 * @param str The string to decode.
 * @returns The decoded string.
 */
function decodeXML(str) {
    return xmlDecoder(str, DecodingMode.Strict);
}
exports.decodeXML = decodeXML;
//# sourceMappingURL=decode.js.map
Note: See TracBrowser for help on using the repository browser.