Context Navigation

← Previous Revision
Next Revision →
Normal
Revision Log

source: node_modules/entities/lib/esm/decode.js

Last change on this file was 57e58a3, checked in by ste08 <sjovanoska@…>, 4 months ago
Initial commit
Property mode set to `100644`
File size: 19.3 KB

Rev	Line
[57e58a3]	1	import htmlDecodeTree from "./generated/decode-data-html.js";
	2	import xmlDecodeTree from "./generated/decode-data-xml.js";
	3	import decodeCodePoint, { replaceCodePoint, fromCodePoint, } from "./decode_codepoint.js";
	4	// Re-export for use by eg. htmlparser2
	5	export { htmlDecodeTree, xmlDecodeTree, decodeCodePoint };
	6	export { replaceCodePoint, fromCodePoint } from "./decode_codepoint.js";
	7	var CharCodes;
	8	(function (CharCodes) {
	9	CharCodes[CharCodes["NUM"] = 35] = "NUM";
	10	CharCodes[CharCodes["SEMI"] = 59] = "SEMI";
	11	CharCodes[CharCodes["EQUALS"] = 61] = "EQUALS";
	12	CharCodes[CharCodes["ZERO"] = 48] = "ZERO";
	13	CharCodes[CharCodes["NINE"] = 57] = "NINE";
	14	CharCodes[CharCodes["LOWER_A"] = 97] = "LOWER_A";
	15	CharCodes[CharCodes["LOWER_F"] = 102] = "LOWER_F";
	16	CharCodes[CharCodes["LOWER_X"] = 120] = "LOWER_X";
	17	CharCodes[CharCodes["LOWER_Z"] = 122] = "LOWER_Z";
	18	CharCodes[CharCodes["UPPER_A"] = 65] = "UPPER_A";
	19	CharCodes[CharCodes["UPPER_F"] = 70] = "UPPER_F";
	20	CharCodes[CharCodes["UPPER_Z"] = 90] = "UPPER_Z";
	21	})(CharCodes \|\| (CharCodes = {}));
	22	/** Bit that needs to be set to convert an upper case ASCII character to lower case */
	23	const TO_LOWER_BIT = 0b100000;
	24	export var BinTrieFlags;
	25	(function (BinTrieFlags) {
	26	BinTrieFlags[BinTrieFlags["VALUE_LENGTH"] = 49152] = "VALUE_LENGTH";
	27	BinTrieFlags[BinTrieFlags["BRANCH_LENGTH"] = 16256] = "BRANCH_LENGTH";
	28	BinTrieFlags[BinTrieFlags["JUMP_TABLE"] = 127] = "JUMP_TABLE";
	29	})(BinTrieFlags \|\| (BinTrieFlags = {}));
	30	function isNumber(code) {
	31	return code >= CharCodes.ZERO && code <= CharCodes.NINE;
	32	}
	33	function isHexadecimalCharacter(code) {
	34	return ((code >= CharCodes.UPPER_A && code <= CharCodes.UPPER_F) \|\|
	35	(code >= CharCodes.LOWER_A && code <= CharCodes.LOWER_F));
	36	}
	37	function isAsciiAlphaNumeric(code) {
	38	return ((code >= CharCodes.UPPER_A && code <= CharCodes.UPPER_Z) \|\|
	39	(code >= CharCodes.LOWER_A && code <= CharCodes.LOWER_Z) \|\|
	40	isNumber(code));
	41	}
	42	/**
	43	* Checks if the given character is a valid end character for an entity in an attribute.
	44	*
	45	* Attribute values that aren't terminated properly aren't parsed, and shouldn't lead to a parser error.
	46	* See the example in https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
	47	*/
	48	function isEntityInAttributeInvalidEnd(code) {
	49	return code === CharCodes.EQUALS \|\| isAsciiAlphaNumeric(code);
	50	}
	51	var EntityDecoderState;
	52	(function (EntityDecoderState) {
	53	EntityDecoderState[EntityDecoderState["EntityStart"] = 0] = "EntityStart";
	54	EntityDecoderState[EntityDecoderState["NumericStart"] = 1] = "NumericStart";
	55	EntityDecoderState[EntityDecoderState["NumericDecimal"] = 2] = "NumericDecimal";
	56	EntityDecoderState[EntityDecoderState["NumericHex"] = 3] = "NumericHex";
	57	EntityDecoderState[EntityDecoderState["NamedEntity"] = 4] = "NamedEntity";
	58	})(EntityDecoderState \|\| (EntityDecoderState = {}));
	59	export var DecodingMode;
	60	(function (DecodingMode) {
	61	/** Entities in text nodes that can end with any character. */
	62	DecodingMode[DecodingMode["Legacy"] = 0] = "Legacy";
	63	/** Only allow entities terminated with a semicolon. */
	64	DecodingMode[DecodingMode["Strict"] = 1] = "Strict";
	65	/** Entities in attributes have limitations on ending characters. */
	66	DecodingMode[DecodingMode["Attribute"] = 2] = "Attribute";
	67	})(DecodingMode \|\| (DecodingMode = {}));
	68	/**
	69	* Token decoder with support of writing partial entities.
	70	*/
	71	export class EntityDecoder {
	72	constructor(
	73	/** The tree used to decode entities. */
	74	decodeTree,
	75	/**
	76	* The function that is called when a codepoint is decoded.
	77	*
	78	* For multi-byte named entities, this will be called multiple times,
	79	* with the second codepoint, and the same `consumed` value.
	80	*
	81	* @param codepoint The decoded codepoint.
	82	* @param consumed The number of bytes consumed by the decoder.
	83	*/
	84	emitCodePoint,
	85	/** An object that is used to produce errors. */
	86	errors) {
	87	this.decodeTree = decodeTree;
	88	this.emitCodePoint = emitCodePoint;
	89	this.errors = errors;
	90	/** The current state of the decoder. */
	91	this.state = EntityDecoderState.EntityStart;
	92	/** Characters that were consumed while parsing an entity. */
	93	this.consumed = 1;
	94	/**
	95	* The result of the entity.
	96	*
	97	* Either the result index of a numeric entity, or the codepoint of a
	98	* numeric entity.
	99	*/
	100	this.result = 0;
	101	/** The current index in the decode tree. */
	102	this.treeIndex = 0;
	103	/** The number of characters that were consumed in excess. */
	104	this.excess = 1;
	105	/** The mode in which the decoder is operating. */
	106	this.decodeMode = DecodingMode.Strict;
	107	}
	108	/** Resets the instance to make it reusable. */
	109	startEntity(decodeMode) {
	110	this.decodeMode = decodeMode;
	111	this.state = EntityDecoderState.EntityStart;
	112	this.result = 0;
	113	this.treeIndex = 0;
	114	this.excess = 1;
	115	this.consumed = 1;
	116	}
	117	/**
	118	* Write an entity to the decoder. This can be called multiple times with partial entities.
	119	* If the entity is incomplete, the decoder will return -1.
	120	*
	121	* Mirrors the implementation of `getDecoder`, but with the ability to stop decoding if the
	122	* entity is incomplete, and resume when the next string is written.
	123	*
	124	* @param string The string containing the entity (or a continuation of the entity).
	125	* @param offset The offset at which the entity begins. Should be 0 if this is not the first call.
	126	* @returns The number of characters that were consumed, or -1 if the entity is incomplete.
	127	*/
	128	write(str, offset) {
	129	switch (this.state) {
	130	case EntityDecoderState.EntityStart: {
	131	if (str.charCodeAt(offset) === CharCodes.NUM) {
	132	this.state = EntityDecoderState.NumericStart;
	133	this.consumed += 1;
	134	return this.stateNumericStart(str, offset + 1);
	135	}
	136	this.state = EntityDecoderState.NamedEntity;
	137	return this.stateNamedEntity(str, offset);
	138	}
	139	case EntityDecoderState.NumericStart: {
	140	return this.stateNumericStart(str, offset);
	141	}
	142	case EntityDecoderState.NumericDecimal: {
	143	return this.stateNumericDecimal(str, offset);
	144	}
	145	case EntityDecoderState.NumericHex: {
	146	return this.stateNumericHex(str, offset);
	147	}
	148	case EntityDecoderState.NamedEntity: {
	149	return this.stateNamedEntity(str, offset);
	150	}
	151	}
	152	}
	153	/**
	154	* Switches between the numeric decimal and hexadecimal states.
	155	*
	156	* Equivalent to the `Numeric character reference state` in the HTML spec.
	157	*
	158	* @param str The string containing the entity (or a continuation of the entity).
	159	* @param offset The current offset.
	160	* @returns The number of characters that were consumed, or -1 if the entity is incomplete.
	161	*/
	162	stateNumericStart(str, offset) {
	163	if (offset >= str.length) {
	164	return -1;
	165	}
	166	if ((str.charCodeAt(offset) \| TO_LOWER_BIT) === CharCodes.LOWER_X) {
	167	this.state = EntityDecoderState.NumericHex;
	168	this.consumed += 1;
	169	return this.stateNumericHex(str, offset + 1);
	170	}
	171	this.state = EntityDecoderState.NumericDecimal;
	172	return this.stateNumericDecimal(str, offset);
	173	}
	174	addToNumericResult(str, start, end, base) {
	175	if (start !== end) {
	176	const digitCount = end - start;
	177	this.result =
	178	this.result * Math.pow(base, digitCount) +
	179	parseInt(str.substr(start, digitCount), base);
	180	this.consumed += digitCount;
	181	}
	182	}
	183	/**
	184	* Parses a hexadecimal numeric entity.
	185	*
	186	* Equivalent to the `Hexademical character reference state` in the HTML spec.
	187	*
	188	* @param str The string containing the entity (or a continuation of the entity).
	189	* @param offset The current offset.
	190	* @returns The number of characters that were consumed, or -1 if the entity is incomplete.
	191	*/
	192	stateNumericHex(str, offset) {
	193	const startIdx = offset;
	194	while (offset < str.length) {
	195	const char = str.charCodeAt(offset);
	196	if (isNumber(char) \|\| isHexadecimalCharacter(char)) {
	197	offset += 1;
	198	}
	199	else {
	200	this.addToNumericResult(str, startIdx, offset, 16);
	201	return this.emitNumericEntity(char, 3);
	202	}
	203	}
	204	this.addToNumericResult(str, startIdx, offset, 16);
	205	return -1;
	206	}
	207	/**
	208	* Parses a decimal numeric entity.
	209	*
	210	* Equivalent to the `Decimal character reference state` in the HTML spec.
	211	*
	212	* @param str The string containing the entity (or a continuation of the entity).
	213	* @param offset The current offset.
	214	* @returns The number of characters that were consumed, or -1 if the entity is incomplete.
	215	*/
	216	stateNumericDecimal(str, offset) {
	217	const startIdx = offset;
	218	while (offset < str.length) {
	219	const char = str.charCodeAt(offset);
	220	if (isNumber(char)) {
	221	offset += 1;
	222	}
	223	else {
	224	this.addToNumericResult(str, startIdx, offset, 10);
	225	return this.emitNumericEntity(char, 2);
	226	}
	227	}
	228	this.addToNumericResult(str, startIdx, offset, 10);
	229	return -1;
	230	}
	231	/**
	232	* Validate and emit a numeric entity.
	233	*
	234	* Implements the logic from the `Hexademical character reference start
	235	* state` and `Numeric character reference end state` in the HTML spec.
	236	*
	237	* @param lastCp The last code point of the entity. Used to see if the
	238	* entity was terminated with a semicolon.
	239	* @param expectedLength The minimum number of characters that should be
	240	* consumed. Used to validate that at least one digit
	241	* was consumed.
	242	* @returns The number of characters that were consumed.
	243	*/
	244	emitNumericEntity(lastCp, expectedLength) {
	245	var _a;
	246	// Ensure we consumed at least one digit.
	247	if (this.consumed <= expectedLength) {
	248	(_a = this.errors) === null \|\| _a === void 0 ? void 0 : _a.absenceOfDigitsInNumericCharacterReference(this.consumed);
	249	return 0;
	250	}
	251	// Figure out if this is a legit end of the entity
	252	if (lastCp === CharCodes.SEMI) {
	253	this.consumed += 1;
	254	}
	255	else if (this.decodeMode === DecodingMode.Strict) {
	256	return 0;
	257	}
	258	this.emitCodePoint(replaceCodePoint(this.result), this.consumed);
	259	if (this.errors) {
	260	if (lastCp !== CharCodes.SEMI) {
	261	this.errors.missingSemicolonAfterCharacterReference();
	262	}
	263	this.errors.validateNumericCharacterReference(this.result);
	264	}
	265	return this.consumed;
	266	}
	267	/**
	268	* Parses a named entity.
	269	*
	270	* Equivalent to the `Named character reference state` in the HTML spec.
	271	*
	272	* @param str The string containing the entity (or a continuation of the entity).
	273	* @param offset The current offset.
	274	* @returns The number of characters that were consumed, or -1 if the entity is incomplete.
	275	*/
	276	stateNamedEntity(str, offset) {
	277	const { decodeTree } = this;
	278	let current = decodeTree[this.treeIndex];
	279	// The mask is the number of bytes of the value, including the current byte.
	280	let valueLength = (current & BinTrieFlags.VALUE_LENGTH) >> 14;
	281	for (; offset < str.length; offset++, this.excess++) {
	282	const char = str.charCodeAt(offset);
	283	this.treeIndex = determineBranch(decodeTree, current, this.treeIndex + Math.max(1, valueLength), char);
	284	if (this.treeIndex < 0) {
	285	return this.result === 0 \|\|
	286	// If we are parsing an attribute
	287	(this.decodeMode === DecodingMode.Attribute &&
	288	// We shouldn't have consumed any characters after the entity,
	289	(valueLength === 0 \|\|
	290	// And there should be no invalid characters.
	291	isEntityInAttributeInvalidEnd(char)))
	292	? 0
	293	: this.emitNotTerminatedNamedEntity();
	294	}
	295	current = decodeTree[this.treeIndex];
	296	valueLength = (current & BinTrieFlags.VALUE_LENGTH) >> 14;
	297	// If the branch is a value, store it and continue
	298	if (valueLength !== 0) {
	299	// If the entity is terminated by a semicolon, we are done.
	300	if (char === CharCodes.SEMI) {
	301	return this.emitNamedEntityData(this.treeIndex, valueLength, this.consumed + this.excess);
	302	}
	303	// If we encounter a non-terminated (legacy) entity while parsing strictly, then ignore it.
	304	if (this.decodeMode !== DecodingMode.Strict) {
	305	this.result = this.treeIndex;
	306	this.consumed += this.excess;
	307	this.excess = 0;
	308	}
	309	}
	310	}
	311	return -1;
	312	}
	313	/**
	314	* Emit a named entity that was not terminated with a semicolon.
	315	*
	316	* @returns The number of characters consumed.
	317	*/
	318	emitNotTerminatedNamedEntity() {
	319	var _a;
	320	const { result, decodeTree } = this;
	321	const valueLength = (decodeTree[result] & BinTrieFlags.VALUE_LENGTH) >> 14;
	322	this.emitNamedEntityData(result, valueLength, this.consumed);
	323	(_a = this.errors) === null \|\| _a === void 0 ? void 0 : _a.missingSemicolonAfterCharacterReference();
	324	return this.consumed;
	325	}
	326	/**
	327	* Emit a named entity.
	328	*
	329	* @param result The index of the entity in the decode tree.
	330	* @param valueLength The number of bytes in the entity.
	331	* @param consumed The number of characters consumed.
	332	*
	333	* @returns The number of characters consumed.
	334	*/
	335	emitNamedEntityData(result, valueLength, consumed) {
	336	const { decodeTree } = this;
	337	this.emitCodePoint(valueLength === 1
	338	? decodeTree[result] & ~BinTrieFlags.VALUE_LENGTH
	339	: decodeTree[result + 1], consumed);
	340	if (valueLength === 3) {
	341	// For multi-byte values, we need to emit the second byte.
	342	this.emitCodePoint(decodeTree[result + 2], consumed);
	343	}
	344	return consumed;
	345	}
	346	/**
	347	* Signal to the parser that the end of the input was reached.
	348	*
	349	* Remaining data will be emitted and relevant errors will be produced.
	350	*
	351	* @returns The number of characters consumed.
	352	*/
	353	end() {
	354	var _a;
	355	switch (this.state) {
	356	case EntityDecoderState.NamedEntity: {
	357	// Emit a named entity if we have one.
	358	return this.result !== 0 &&
	359	(this.decodeMode !== DecodingMode.Attribute \|\|
	360	this.result === this.treeIndex)
	361	? this.emitNotTerminatedNamedEntity()
	362	: 0;
	363	}
	364	// Otherwise, emit a numeric entity if we have one.
	365	case EntityDecoderState.NumericDecimal: {
	366	return this.emitNumericEntity(0, 2);
	367	}
	368	case EntityDecoderState.NumericHex: {
	369	return this.emitNumericEntity(0, 3);
	370	}
	371	case EntityDecoderState.NumericStart: {
	372	(_a = this.errors) === null \|\| _a === void 0 ? void 0 : _a.absenceOfDigitsInNumericCharacterReference(this.consumed);
	373	return 0;
	374	}
	375	case EntityDecoderState.EntityStart: {
	376	// Return 0 if we have no entity.
	377	return 0;
	378	}
	379	}
	380	}
	381	}
	382	/**
	383	* Creates a function that decodes entities in a string.
	384	*
	385	* @param decodeTree The decode tree.
	386	* @returns A function that decodes entities in a string.
	387	*/
	388	function getDecoder(decodeTree) {
	389	let ret = "";
	390	const decoder = new EntityDecoder(decodeTree, (str) => (ret += fromCodePoint(str)));
	391	return function decodeWithTrie(str, decodeMode) {
	392	let lastIndex = 0;
	393	let offset = 0;
	394	while ((offset = str.indexOf("&", offset)) >= 0) {
	395	ret += str.slice(lastIndex, offset);
	396	decoder.startEntity(decodeMode);
	397	const len = decoder.write(str,
	398	// Skip the "&"
	399	offset + 1);
	400	if (len < 0) {
	401	lastIndex = offset + decoder.end();
	402	break;
	403	}
	404	lastIndex = offset + len;
	405	// If `len` is 0, skip the current `&` and continue.
	406	offset = len === 0 ? lastIndex + 1 : lastIndex;
	407	}
	408	const result = ret + str.slice(lastIndex);
	409	// Make sure we don't keep a reference to the final string.
	410	ret = "";
	411	return result;
	412	};
	413	}
	414	/**
	415	* Determines the branch of the current node that is taken given the current
	416	* character. This function is used to traverse the trie.
	417	*
	418	* @param decodeTree The trie.
	419	* @param current The current node.
	420	* @param nodeIdx The index right after the current node and its value.
	421	* @param char The current character.
	422	* @returns The index of the next node, or -1 if no branch is taken.
	423	*/
	424	export function determineBranch(decodeTree, current, nodeIdx, char) {
	425	const branchCount = (current & BinTrieFlags.BRANCH_LENGTH) >> 7;
	426	const jumpOffset = current & BinTrieFlags.JUMP_TABLE;
	427	// Case 1: Single branch encoded in jump offset
	428	if (branchCount === 0) {
	429	return jumpOffset !== 0 && char === jumpOffset ? nodeIdx : -1;
	430	}
	431	// Case 2: Multiple branches encoded in jump table
	432	if (jumpOffset) {
	433	const value = char - jumpOffset;
	434	return value < 0 \|\| value >= branchCount
	435	? -1
	436	: decodeTree[nodeIdx + value] - 1;
	437	}
	438	// Case 3: Multiple branches encoded in dictionary
	439	// Binary search for the character.
	440	let lo = nodeIdx;
	441	let hi = lo + branchCount - 1;
	442	while (lo <= hi) {
	443	const mid = (lo + hi) >>> 1;
	444	const midVal = decodeTree[mid];
	445	if (midVal < char) {
	446	lo = mid + 1;
	447	}
	448	else if (midVal > char) {
	449	hi = mid - 1;
	450	}
	451	else {
	452	return decodeTree[mid + branchCount];
	453	}
	454	}
	455	return -1;
	456	}
	457	const htmlDecoder = getDecoder(htmlDecodeTree);
	458	const xmlDecoder = getDecoder(xmlDecodeTree);
	459	/**
	460	* Decodes an HTML string.
	461	*
	462	* @param str The string to decode.
	463	* @param mode The decoding mode.
	464	* @returns The decoded string.
	465	*/
	466	export function decodeHTML(str, mode = DecodingMode.Legacy) {
	467	return htmlDecoder(str, mode);
	468	}
	469	/**
	470	* Decodes an HTML string in an attribute.
	471	*
	472	* @param str The string to decode.
	473	* @returns The decoded string.
	474	*/
	475	export function decodeHTMLAttribute(str) {
	476	return htmlDecoder(str, DecodingMode.Attribute);
	477	}
	478	/**
	479	* Decodes an HTML string, requiring all entities to be terminated by a semicolon.
	480	*
	481	* @param str The string to decode.
	482	* @returns The decoded string.
	483	*/
	484	export function decodeHTMLStrict(str) {
	485	return htmlDecoder(str, DecodingMode.Strict);
	486	}
	487	/**
	488	* Decodes an XML string, requiring all entities to be terminated by a semicolon.
	489	*
	490	* @param str The string to decode.
	491	* @returns The decoded string.
	492	*/
	493	export function decodeXML(str) {
	494	return xmlDecoder(str, DecodingMode.Strict);
	495	}
	496	//# sourceMappingURL=decode.js.map

Note: See TracBrowser for help on using the repository browser.

Download in other formats: