Context Navigation

← Previous Revision
Next Revision →
Blame
Revision Log

source: node_modules/entities/lib/esm/decode.js

Last change on this file was 57e58a3, checked in by ste08 <sjovanoska@…>, 4 months ago
Initial commit
Property mode set to `100644`
File size: 19.3 KB

Line
1	import htmlDecodeTree from "./generated/decode-data-html.js";
2	import xmlDecodeTree from "./generated/decode-data-xml.js";
3	import decodeCodePoint, { replaceCodePoint, fromCodePoint, } from "./decode_codepoint.js";
4	// Re-export for use by eg. htmlparser2
5	export { htmlDecodeTree, xmlDecodeTree, decodeCodePoint };
6	export { replaceCodePoint, fromCodePoint } from "./decode_codepoint.js";
/**
 * ASCII character codes used while scanning entities.
 *
 * Compiled enum shape: every name maps to its code and every code maps back
 * to its name (e.g. `CharCodes.NUM === 35` and `CharCodes[35] === "NUM"`).
 */
var CharCodes;
(function (CharCodes) {
    CharCodes[CharCodes["NUM"] = 35] = "NUM"; // "#"
    CharCodes[CharCodes["SEMI"] = 59] = "SEMI"; // ";"
    CharCodes[CharCodes["EQUALS"] = 61] = "EQUALS"; // "="
    CharCodes[CharCodes["ZERO"] = 48] = "ZERO"; // "0"
    CharCodes[CharCodes["NINE"] = 57] = "NINE"; // "9"
    CharCodes[CharCodes["LOWER_A"] = 97] = "LOWER_A"; // "a"
    CharCodes[CharCodes["LOWER_F"] = 102] = "LOWER_F"; // "f"
    CharCodes[CharCodes["LOWER_X"] = 120] = "LOWER_X"; // "x"
    CharCodes[CharCodes["LOWER_Z"] = 122] = "LOWER_Z"; // "z"
    CharCodes[CharCodes["UPPER_A"] = 65] = "UPPER_A"; // "A"
    CharCodes[CharCodes["UPPER_F"] = 70] = "UPPER_F"; // "F"
    CharCodes[CharCodes["UPPER_Z"] = 90] = "UPPER_Z"; // "Z"
})(CharCodes || (CharCodes = {}));
/**
 * Bit that needs to be set to convert an upper case ASCII character to lower case.
 *
 * OR-ing a character code with this bit lets a single comparison match both
 * cases of an ASCII letter (e.g. `"X"` and `"x"`).
 */
const TO_LOWER_BIT = 0b100000;
/**
 * Bit masks used to read a node word of the binary decode trie.
 *
 * - `VALUE_LENGTH` (0xC000): read as `(node & VALUE_LENGTH) >> 14`.
 * - `BRANCH_LENGTH` (0x3F80): read as `(node & BRANCH_LENGTH) >> 7`.
 * - `JUMP_TABLE` (0x007F): read as `node & JUMP_TABLE`.
 *
 * Compiled enum shape: names map to masks and masks map back to names.
 */
export var BinTrieFlags;
(function (BinTrieFlags) {
    BinTrieFlags[BinTrieFlags["VALUE_LENGTH"] = 49152] = "VALUE_LENGTH";
    BinTrieFlags[BinTrieFlags["BRANCH_LENGTH"] = 16256] = "BRANCH_LENGTH";
    BinTrieFlags[BinTrieFlags["JUMP_TABLE"] = 127] = "JUMP_TABLE";
})(BinTrieFlags || (BinTrieFlags = {}));
/**
 * Checks whether a character code is an ASCII decimal digit (`0`-`9`).
 *
 * @param code The character code to test.
 * @returns `true` if `code` lies between `CharCodes.ZERO` and `CharCodes.NINE`, inclusive.
 */
function isNumber(code) {
    return code >= CharCodes.ZERO && code <= CharCodes.NINE;
}
/**
 * Checks whether a character code is a hexadecimal *letter* (`A`-`F` or `a`-`f`).
 *
 * Decimal digits are not matched here; callers combine this with `isNumber`
 * to accept a full hexadecimal digit.
 *
 * @param code The character code to test.
 * @returns `true` if `code` is in `A`-`F` or `a`-`f`.
 */
function isHexadecimalCharacter(code) {
    return ((code >= CharCodes.UPPER_A && code <= CharCodes.UPPER_F) ||
        (code >= CharCodes.LOWER_A && code <= CharCodes.LOWER_F));
}
/**
 * Checks whether a character code is an ASCII letter (`A`-`Z`, `a`-`z`) or
 * an ASCII decimal digit (`0`-`9`).
 *
 * @param code The character code to test.
 * @returns `true` if `code` is an ASCII letter or digit.
 */
function isAsciiAlphaNumeric(code) {
    return ((code >= CharCodes.UPPER_A && code <= CharCodes.UPPER_Z) ||
        (code >= CharCodes.LOWER_A && code <= CharCodes.LOWER_Z) ||
        isNumber(code));
}
/**
 * Checks if the given character is a valid end character for an entity in an attribute.
 *
 * Attribute values that aren't terminated properly aren't parsed, and shouldn't lead to a parser error.
 * See the example in https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
 *
 * @param code The character code that follows the entity.
 * @returns `true` if `code` is `=` or an ASCII letter/digit, i.e. a character
 *     after which an unterminated entity in an attribute is left undecoded.
 */
function isEntityInAttributeInvalidEnd(code) {
    return code === CharCodes.EQUALS || isAsciiAlphaNumeric(code);
}
/**
 * States of the entity decoder's state machine.
 *
 * Compiled enum shape: names map to numbers and numbers map back to names.
 */
var EntityDecoderState;
(function (EntityDecoderState) {
    /** Nothing after the `&` has been inspected yet. */
    EntityDecoderState[EntityDecoderState["EntityStart"] = 0] = "EntityStart";
    /** A `#` was seen; deciding between decimal and hexadecimal. */
    EntityDecoderState[EntityDecoderState["NumericStart"] = 1] = "NumericStart";
    /** Reading the digits of a decimal numeric entity. */
    EntityDecoderState[EntityDecoderState["NumericDecimal"] = 2] = "NumericDecimal";
    /** Reading the digits of a hexadecimal numeric entity. */
    EntityDecoderState[EntityDecoderState["NumericHex"] = 3] = "NumericHex";
    /** Walking the decode tree for a named entity. */
    EntityDecoderState[EntityDecoderState["NamedEntity"] = 4] = "NamedEntity";
})(EntityDecoderState || (EntityDecoderState = {}));
/**
 * Modes that control which entity terminations the decoder accepts.
 *
 * Compiled enum shape: names map to numbers and numbers map back to names.
 */
export var DecodingMode;
(function (DecodingMode) {
    /** Entities in text nodes that can end with any character. */
    DecodingMode[DecodingMode["Legacy"] = 0] = "Legacy";
    /** Only allow entities terminated with a semicolon. */
    DecodingMode[DecodingMode["Strict"] = 1] = "Strict";
    /** Entities in attributes have limitations on ending characters. */
    DecodingMode[DecodingMode["Attribute"] = 2] = "Attribute";
})(DecodingMode || (DecodingMode = {}));
/**
 * Token decoder with support of writing partial entities.
 *
 * Usage: call `startEntity(mode)` once per entity, then `write(str, offset)`
 * one or more times (the offset points just past the `&` on the first call),
 * and finally `end()` if the input ran out while `write` was still
 * returning -1.
 */
export class EntityDecoder {
    /**
     * @param decodeTree The tree used to decode entities.
     * @param emitCodePoint The function that is called when a codepoint is decoded.
     *     It receives `(codepoint, consumed)`. For multi-byte named entities,
     *     this will be called multiple times, with the second codepoint, and
     *     the same `consumed` value.
     * @param errors An optional object that is used to produce errors.
     */
    constructor(decodeTree, emitCodePoint, errors) {
        this.decodeTree = decodeTree;
        this.emitCodePoint = emitCodePoint;
        this.errors = errors;
        /** The current state of the decoder. */
        this.state = EntityDecoderState.EntityStart;
        /** Characters that were consumed while parsing an entity. */
        this.consumed = 1;
        /**
         * The result of the entity.
         *
         * Either the result index of a named entity (an index into the decode
         * tree), or the codepoint of a numeric entity.
         */
        this.result = 0;
        /** The current index in the decode tree. */
        this.treeIndex = 0;
        /** The number of characters that were consumed in excess. */
        this.excess = 1;
        /** The mode in which the decoder is operating. */
        this.decodeMode = DecodingMode.Strict;
    }

    /**
     * Resets the instance to make it reusable.
     *
     * @param decodeMode The `DecodingMode` to use for the next entity.
     */
    startEntity(decodeMode) {
        this.decodeMode = decodeMode;
        this.state = EntityDecoderState.EntityStart;
        this.result = 0;
        this.treeIndex = 0;
        this.excess = 1;
        // Starts at 1 to account for the leading "&".
        this.consumed = 1;
    }

    /**
     * Write an entity to the decoder. This can be called multiple times with partial entities.
     * If the entity is incomplete, the decoder will return -1.
     *
     * Mirrors the implementation of `getDecoder`, but with the ability to stop decoding if the
     * entity is incomplete, and resume when the next string is written.
     *
     * @param str The string containing the entity (or a continuation of the entity).
     * @param offset The offset at which the entity begins. Should be 0 if this is not the first call.
     * @returns The number of characters that were consumed, or -1 if the entity is incomplete.
     */
    write(str, offset) {
        switch (this.state) {
            case EntityDecoderState.EntityStart: {
                if (str.charCodeAt(offset) === CharCodes.NUM) {
                    this.state = EntityDecoderState.NumericStart;
                    // Account for the "#".
                    this.consumed += 1;
                    return this.stateNumericStart(str, offset + 1);
                }
                this.state = EntityDecoderState.NamedEntity;
                return this.stateNamedEntity(str, offset);
            }
            case EntityDecoderState.NumericStart: {
                return this.stateNumericStart(str, offset);
            }
            case EntityDecoderState.NumericDecimal: {
                return this.stateNumericDecimal(str, offset);
            }
            case EntityDecoderState.NumericHex: {
                return this.stateNumericHex(str, offset);
            }
            case EntityDecoderState.NamedEntity: {
                return this.stateNamedEntity(str, offset);
            }
        }
    }

    /**
     * Switches between the numeric decimal and hexadecimal states.
     *
     * Equivalent to the `Numeric character reference state` in the HTML spec.
     *
     * @param str The string containing the entity (or a continuation of the entity).
     * @param offset The current offset.
     * @returns The number of characters that were consumed, or -1 if the entity is incomplete.
     */
    stateNumericStart(str, offset) {
        if (offset >= str.length) {
            return -1;
        }
        // Setting the lower-case bit matches both "x" and "X".
        if ((str.charCodeAt(offset) | TO_LOWER_BIT) === CharCodes.LOWER_X) {
            this.state = EntityDecoderState.NumericHex;
            this.consumed += 1;
            return this.stateNumericHex(str, offset + 1);
        }
        this.state = EntityDecoderState.NumericDecimal;
        return this.stateNumericDecimal(str, offset);
    }

    /**
     * Folds the digits in `str[start, end)` into `this.result` and counts them
     * as consumed. Does nothing when the range is empty.
     *
     * @param str The string containing the digits.
     * @param start Index of the first digit.
     * @param end Index just past the last digit.
     * @param base The radix of the digits (10 or 16).
     */
    addToNumericResult(str, start, end, base) {
        if (start !== end) {
            const digitCount = end - start;
            // Shift the digits gathered by earlier writes, then append the new ones.
            this.result =
                this.result * Math.pow(base, digitCount) +
                    parseInt(str.slice(start, end), base);
            this.consumed += digitCount;
        }
    }

    /**
     * Parses a hexadecimal numeric entity.
     *
     * Equivalent to the `Hexadecimal character reference state` in the HTML spec.
     *
     * @param str The string containing the entity (or a continuation of the entity).
     * @param offset The current offset.
     * @returns The number of characters that were consumed, or -1 if the entity is incomplete.
     */
    stateNumericHex(str, offset) {
        const startIdx = offset;
        while (offset < str.length) {
            const char = str.charCodeAt(offset);
            if (isNumber(char) || isHexadecimalCharacter(char)) {
                offset += 1;
            }
            else {
                this.addToNumericResult(str, startIdx, offset, 16);
                // 3 === "&#x": anything beyond that means a digit was read.
                return this.emitNumericEntity(char, 3);
            }
        }
        // Ran out of input: keep what we have and wait for more (or `end()`).
        this.addToNumericResult(str, startIdx, offset, 16);
        return -1;
    }

    /**
     * Parses a decimal numeric entity.
     *
     * Equivalent to the `Decimal character reference state` in the HTML spec.
     *
     * @param str The string containing the entity (or a continuation of the entity).
     * @param offset The current offset.
     * @returns The number of characters that were consumed, or -1 if the entity is incomplete.
     */
    stateNumericDecimal(str, offset) {
        const startIdx = offset;
        while (offset < str.length) {
            const char = str.charCodeAt(offset);
            if (isNumber(char)) {
                offset += 1;
            }
            else {
                this.addToNumericResult(str, startIdx, offset, 10);
                // 2 === "&#": anything beyond that means a digit was read.
                return this.emitNumericEntity(char, 2);
            }
        }
        // Ran out of input: keep what we have and wait for more (or `end()`).
        this.addToNumericResult(str, startIdx, offset, 10);
        return -1;
    }

    /**
     * Validate and emit a numeric entity.
     *
     * Implements the logic from the `Hexadecimal character reference start
     * state` and `Numeric character reference end state` in the HTML spec.
     *
     * @param lastCp The last code point of the entity. Used to see if the
     *     entity was terminated with a semicolon.
     * @param expectedLength The minimum number of characters that should be
     *     consumed. Used to validate that at least one digit was consumed.
     * @returns The number of characters that were consumed, or 0 if nothing
     *     was emitted (no digits, or a missing semicolon in strict mode).
     */
    emitNumericEntity(lastCp, expectedLength) {
        // Ensure we consumed at least one digit.
        if (this.consumed <= expectedLength) {
            const reporter = this.errors;
            if (reporter != null) {
                reporter.absenceOfDigitsInNumericCharacterReference(this.consumed);
            }
            return 0;
        }
        // Figure out if this is a legit end of the entity
        if (lastCp === CharCodes.SEMI) {
            this.consumed += 1;
        }
        else if (this.decodeMode === DecodingMode.Strict) {
            return 0;
        }
        this.emitCodePoint(replaceCodePoint(this.result), this.consumed);
        if (this.errors) {
            if (lastCp !== CharCodes.SEMI) {
                this.errors.missingSemicolonAfterCharacterReference();
            }
            this.errors.validateNumericCharacterReference(this.result);
        }
        return this.consumed;
    }

    /**
     * Parses a named entity.
     *
     * Equivalent to the `Named character reference state` in the HTML spec.
     *
     * @param str The string containing the entity (or a continuation of the entity).
     * @param offset The current offset.
     * @returns The number of characters that were consumed, or -1 if the entity is incomplete.
     */
    stateNamedEntity(str, offset) {
        const { decodeTree } = this;
        let current = decodeTree[this.treeIndex];
        // The mask is the number of bytes of the value, including the current byte.
        let valueLength = (current & BinTrieFlags.VALUE_LENGTH) >> 14;
        for (; offset < str.length; offset++, this.excess++) {
            const char = str.charCodeAt(offset);
            this.treeIndex = determineBranch(decodeTree, current, this.treeIndex + Math.max(1, valueLength), char);
            if (this.treeIndex < 0) {
                return this.result === 0 ||
                    // If we are parsing an attribute
                    (this.decodeMode === DecodingMode.Attribute &&
                        // We shouldn't have consumed any characters after the entity,
                        (valueLength === 0 ||
                            // And there should be no invalid characters.
                            isEntityInAttributeInvalidEnd(char)))
                    ? 0
                    : this.emitNotTerminatedNamedEntity();
            }
            current = decodeTree[this.treeIndex];
            valueLength = (current & BinTrieFlags.VALUE_LENGTH) >> 14;
            // If the branch is a value, store it and continue
            if (valueLength !== 0) {
                // If the entity is terminated by a semicolon, we are done.
                if (char === CharCodes.SEMI) {
                    return this.emitNamedEntityData(this.treeIndex, valueLength, this.consumed + this.excess);
                }
                // If we encounter a non-terminated (legacy) entity while parsing strictly, then ignore it.
                if (this.decodeMode !== DecodingMode.Strict) {
                    this.result = this.treeIndex;
                    this.consumed += this.excess;
                    this.excess = 0;
                }
            }
        }
        return -1;
    }

    /**
     * Emit a named entity that was not terminated with a semicolon.
     *
     * @returns The number of characters consumed.
     */
    emitNotTerminatedNamedEntity() {
        const { result, decodeTree } = this;
        const valueLength = (decodeTree[result] & BinTrieFlags.VALUE_LENGTH) >> 14;
        this.emitNamedEntityData(result, valueLength, this.consumed);
        const reporter = this.errors;
        if (reporter != null) {
            reporter.missingSemicolonAfterCharacterReference();
        }
        return this.consumed;
    }

    /**
     * Emit a named entity.
     *
     * @param result The index of the entity in the decode tree.
     * @param valueLength The number of bytes in the entity.
     * @param consumed The number of characters consumed.
     *
     * @returns The number of characters consumed.
     */
    emitNamedEntityData(result, valueLength, consumed) {
        const { decodeTree } = this;
        // A length of 1 means the value is stored in the node word itself
        // (minus the length flag bits); otherwise it follows the node.
        this.emitCodePoint(valueLength === 1
            ? decodeTree[result] & ~BinTrieFlags.VALUE_LENGTH
            : decodeTree[result + 1], consumed);
        if (valueLength === 3) {
            // For multi-byte values, we need to emit the second byte.
            this.emitCodePoint(decodeTree[result + 2], consumed);
        }
        return consumed;
    }

    /**
     * Signal to the parser that the end of the input was reached.
     *
     * Remaining data will be emitted and relevant errors will be produced.
     *
     * @returns The number of characters consumed.
     */
    end() {
        switch (this.state) {
            case EntityDecoderState.NamedEntity: {
                // Emit a named entity if we have one.
                return this.result !== 0 &&
                    (this.decodeMode !== DecodingMode.Attribute ||
                        this.result === this.treeIndex)
                    ? this.emitNotTerminatedNamedEntity()
                    : 0;
            }
            // Otherwise, emit a numeric entity if we have one.
            case EntityDecoderState.NumericDecimal: {
                return this.emitNumericEntity(0, 2);
            }
            case EntityDecoderState.NumericHex: {
                return this.emitNumericEntity(0, 3);
            }
            case EntityDecoderState.NumericStart: {
                const reporter = this.errors;
                if (reporter != null) {
                    reporter.absenceOfDigitsInNumericCharacterReference(this.consumed);
                }
                return 0;
            }
            case EntityDecoderState.EntityStart: {
                // Return 0 if we have no entity.
                return 0;
            }
        }
    }
}
/**
 * Creates a function that decodes entities in a string.
 *
 * The returned function shares one `EntityDecoder` and one output
 * accumulator across calls, so the accumulator is always cleared when a call
 * finishes — including when it throws — to keep partial output from leaking
 * into the next call.
 *
 * @param decodeTree The decode tree.
 * @returns A function `(str, decodeMode) => string` that decodes entities in a string.
 */
function getDecoder(decodeTree) {
    let ret = "";
    const decoder = new EntityDecoder(decodeTree, (codePoint) => (ret += fromCodePoint(codePoint)));
    return function decodeWithTrie(str, decodeMode) {
        let lastIndex = 0;
        let offset = 0;
        try {
            while ((offset = str.indexOf("&", offset)) >= 0) {
                // Copy the literal text that precedes this "&".
                ret += str.slice(lastIndex, offset);
                decoder.startEntity(decodeMode);
                const len = decoder.write(str, 
                // Skip the "&"
                offset + 1);
                if (len < 0) {
                    // Input ended inside the entity; flush whatever is decodable.
                    lastIndex = offset + decoder.end();
                    break;
                }
                lastIndex = offset + len;
                // If `len` is 0, skip the current `&` and continue.
                offset = len === 0 ? lastIndex + 1 : lastIndex;
            }
            return ret + str.slice(lastIndex);
        }
        finally {
            // Make sure we don't keep a reference to the final string, and
            // that a failed call cannot pollute the next one.
            ret = "";
        }
    };
}
/**
 * Determines the branch of the current node that is taken given the current
 * character. This function is used to traverse the trie.
 *
 * Three node encodings are handled, selected by the branch count and jump
 * offset stored in `current`:
 * 1. Branch count 0: the jump offset is the single accepted character and the
 *    next node is `nodeIdx` itself.
 * 2. Non-zero jump offset: `nodeIdx` starts a jump table indexed by
 *    `char - jumpOffset`; entries hold the target index plus one, so an entry
 *    of 0 yields -1.
 * 3. Otherwise: `nodeIdx` starts `branchCount` sorted characters followed by
 *    `branchCount` target indices; the character is found by binary search.
 *
 * @param decodeTree The trie.
 * @param current The current node.
 * @param nodeIdx The index right after the current node and its value.
 * @param char The current character.
 * @returns The index of the next node, or -1 if no branch is taken.
 */
export function determineBranch(decodeTree, current, nodeIdx, char) {
    const branchCount = (current & BinTrieFlags.BRANCH_LENGTH) >> 7;
    const jumpOffset = current & BinTrieFlags.JUMP_TABLE;
    // Case 1: Single branch encoded in jump offset
    if (branchCount === 0) {
        return jumpOffset !== 0 && char === jumpOffset ? nodeIdx : -1;
    }
    // Case 2: Multiple branches encoded in jump table
    if (jumpOffset) {
        const value = char - jumpOffset;
        return value < 0 || value >= branchCount
            ? -1
            : decodeTree[nodeIdx + value] - 1;
    }
    // Case 3: Multiple branches encoded in dictionary
    // Binary search for the character.
    let lo = nodeIdx;
    let hi = lo + branchCount - 1;
    while (lo <= hi) {
        const mid = (lo + hi) >>> 1;
        const midVal = decodeTree[mid];
        if (midVal < char) {
            lo = mid + 1;
        }
        else if (midVal > char) {
            hi = mid - 1;
        }
        else {
            return decodeTree[mid + branchCount];
        }
    }
    return -1;
}
// Module-level decoders, built once and shared by the exported decode functions.
/** Decoder backed by the HTML decode tree. */
const htmlDecoder = getDecoder(htmlDecodeTree);
/** Decoder backed by the XML decode tree. */
const xmlDecoder = getDecoder(xmlDecodeTree);
/**
 * Decodes an HTML string.
 *
 * @param str The string to decode.
 * @param mode The decoding mode. Defaults to `DecodingMode.Legacy`.
 * @returns The decoded string.
 */
export function decodeHTML(str, mode = DecodingMode.Legacy) {
    return htmlDecoder(str, mode);
}
/**
 * Decodes an HTML string in an attribute.
 *
 * Always decodes with `DecodingMode.Attribute`.
 *
 * @param str The string to decode.
 * @returns The decoded string.
 */
export function decodeHTMLAttribute(str) {
    return htmlDecoder(str, DecodingMode.Attribute);
}
/**
 * Decodes an HTML string, requiring all entities to be terminated by a semicolon.
 *
 * Always decodes with `DecodingMode.Strict`.
 *
 * @param str The string to decode.
 * @returns The decoded string.
 */
export function decodeHTMLStrict(str) {
    return htmlDecoder(str, DecodingMode.Strict);
}
/**
 * Decodes an XML string, requiring all entities to be terminated by a semicolon.
 *
 * Uses the XML decoder and always decodes with `DecodingMode.Strict`.
 *
 * @param str The string to decode.
 * @returns The decoded string.
 */
export function decodeXML(str) {
    return xmlDecoder(str, DecodingMode.Strict);
}
496	//# sourceMappingURL=decode.js.map

Note: See TracBrowser for help on using the repository browser.

Download in other formats: