Context Navigation

← Previous Revision
Next Revision →
Blame
Revision Log

source: node_modules/autolinker/dist/es2015/htmlParser/parse-html.js

main

Last change on this file was d24f17c, checked in by Aleksandar Panovski <apano77@…>, 19 months ago
Initial commit
Property mode set to `100644`
File size: 24.4 KB

Line
1	import { __assign } from "tslib";
2	import { letterRe, digitRe, whitespaceRe, quoteRe, controlCharsRe } from '../regex-lib';
3	import { throwUnhandledCaseError } from '../utils';
4	// For debugging: search for other "For debugging" lines
5	// import CliTable from 'cli-table';
/**
 * Parses an HTML string, calling the callbacks to notify of tags and text.
 *
 * ## History
 *
 * This file previously used a regular expression to find html tags in the input
 * text. Unfortunately, we ran into a bunch of catastrophic backtracking issues
 * with certain input text, causing Autolinker to either hang or just take a
 * really long time to parse the string.
 *
 * The current code is intended to be a O(n) algorithm that walks through
 * the string in one pass, and tries to be as cheap as possible. We don't need
 * to implement the full HTML spec, but rather simply determine where the string
 * looks like an HTML tag, and where it looks like text (so that we can autolink
 * that).
 *
 * This state machine parser is intended just to be a simple but performant
 * parser of HTML for the subset of requirements we have. We simply need to:
 *
 * 1. Determine where HTML tags are
 * 2. Determine the tag name (Autolinker specifically only cares about <a>,
 *    <script>, and <style> tags, so as not to link any text within them)
 *
 * We don't need to:
 *
 * 1. Create a parse tree
 * 2. Auto-close tags with invalid markup
 * 3. etc.
 *
 * The other intention behind this is that we didn't want to add external
 * dependencies on the Autolinker utility which would increase its size. For
 * instance, adding htmlparser2 adds 125kb to the minified output file,
 * increasing its final size from 47kb to 172kb (at the time of writing). It
 * also doesn't work exactly correctly, treating the string "<3 blah blah blah"
 * as an HTML tag.
 *
 * Reference for HTML spec:
 *
 *     https://www.w3.org/TR/html51/syntax.html#sec-tokenization
 *
 * @param {String} html The HTML to parse
 * @param {Object} callbacks
 * @param {Function} callbacks.onOpenTag Callback function to call when an open
 *   tag is parsed. Called with the lower-cased tag name as its first argument,
 *   and the offset (number) of the tag's '<' character as its second.
 * @param {Function} callbacks.onCloseTag Callback function to call when a close
 *   tag is parsed. Called with the lower-cased tag name as its first argument,
 *   and the offset (number) of the tag's '<' character as its second. If a
 *   self-closing tag is found, `onCloseTag` is called immediately after
 *   `onOpenTag`.
 * @param {Function} callbacks.onText Callback function to call when text (i.e
 *   not an HTML tag) is parsed. Called with the text (string) as its first
 *   argument, and offset (number) into the string as its second.
 * @param {Function} callbacks.onComment Callback function to call when an HTML
 *   comment is parsed. Called with the offset (number) of the comment's '<'
 *   character.
 * @param {Function} callbacks.onDoctype Callback function to call when a
 *   doctype declaration is parsed. Called with the offset (number) of the
 *   declaration's '<' character.
 */
export function parseHtml(html, _a) {
    var onOpenTag = _a.onOpenTag;
    var onCloseTag = _a.onCloseTag;
    var onText = _a.onText;
    var onComment = _a.onComment;
    var onDoctype = _a.onDoctype;
    // Sentinel used whenever no tag is currently being read
    var noCurrentTag = new CurrentTag();
    var charIdx = 0;
    var len = html.length;
    var state = 0 /* Data */;
    var currentDataIdx = 0; // where the current data start index is
    var currentTag = noCurrentTag; // describes the current tag that is being read
    // State handlers, indexed by the numeric parser state. The nested function
    // declarations below are hoisted, so they can be referenced here.
    var stateHandlers = [
        stateData, // 0: Data
        stateTagOpen, // 1: TagOpen
        stateEndTagOpen, // 2: EndTagOpen
        stateTagName, // 3: TagName
        stateBeforeAttributeName, // 4: BeforeAttributeName
        stateAttributeName, // 5: AttributeName
        stateAfterAttributeName, // 6: AfterAttributeName
        stateBeforeAttributeValue, // 7: BeforeAttributeValue
        stateAttributeValueDoubleQuoted, // 8: AttributeValueDoubleQuoted
        stateAttributeValueSingleQuoted, // 9: AttributeValueSingleQuoted
        stateAttributeValueUnquoted, // 10: AttributeValueUnquoted
        stateAfterAttributeValueQuoted, // 11: AfterAttributeValueQuoted
        stateSelfClosingStartTag, // 12: SelfClosingStartTag
        stateMarkupDeclarationOpen, // 13: MarkupDeclarationOpenState
        stateCommentStart, // 14: CommentStart
        stateCommentStartDash, // 15: CommentStartDash
        stateComment, // 16: Comment
        stateCommentEndDash, // 17: CommentEndDash
        stateCommentEnd, // 18: CommentEnd
        stateCommentEndBang, // 19: CommentEndBang
        stateDoctype // 20: Doctype
    ];
    while (charIdx < len) {
        var char = html.charAt(charIdx);
        var handler = stateHandlers[state];
        if (handler) {
            // Note: a handler may move `charIdx` (consume extra characters, or
            // step back to re-consume the current one)
            handler(char);
        }
        else {
            throwUnhandledCaseError(state);
        }
        charIdx++;
    }
    // Whatever is left over (including an unfinished tag) is emitted as text
    if (currentDataIdx < charIdx) {
        emitText();
    }
    /**
     * Replaces the current tag with a copy that has `props` applied on top of
     * its existing properties.
     */
    function updateCurrentTag(props) {
        currentTag = new CurrentTag(__assign(__assign({}, currentTag), props));
    }
    // Called when non-tags are being read (i.e. the text around HTML tags)
    // https://www.w3.org/TR/html51/syntax.html#data-state
    function stateData(char) {
        if (char === '<') {
            startNewTag();
        }
    }
    // Called after a '<' is read from the Data state
    // https://www.w3.org/TR/html51/syntax.html#tag-open-state
    function stateTagOpen(char) {
        if (char === '!') {
            state = 13 /* MarkupDeclarationOpenState */;
        }
        else if (char === '/') {
            state = 2 /* EndTagOpen */;
            updateCurrentTag({ isClosing: true });
        }
        else if (char === '<') {
            // start of another tag (ignore the previous, incomplete one)
            startNewTag();
        }
        else if (letterRe.test(char)) {
            // tag name start (and no '/' read)
            state = 3 /* TagName */;
            updateCurrentTag({ isOpening: true });
        }
        else {
            // Any other character: the '<' was just text
            state = 0 /* Data */;
            currentTag = noCurrentTag;
        }
    }
    // After a '<x', '</x' sequence is read (where 'x' is a letter character),
    // this is to continue reading the tag name
    // https://www.w3.org/TR/html51/syntax.html#tag-name-state
    function stateTagName(char) {
        if (whitespaceRe.test(char)) {
            updateCurrentTag({ name: captureTagName() });
            state = 4 /* BeforeAttributeName */;
        }
        else if (char === '<') {
            // start of another tag (ignore the previous, incomplete one)
            startNewTag();
        }
        else if (char === '/') {
            updateCurrentTag({ name: captureTagName() });
            state = 12 /* SelfClosingStartTag */;
        }
        else if (char === '>') {
            updateCurrentTag({ name: captureTagName() });
            emitTagAndPreviousTextNode(); // resets to Data state as well
        }
        else if (!letterRe.test(char) && !digitRe.test(char) && char !== ':') {
            // Anything else that does not form an html tag. Note: the colon
            // character is accepted for XML namespaced tags
            resetToDataState();
        }
        // otherwise: continue reading the tag name
    }
    // Called after the '/' is read from a '</' sequence
    // https://www.w3.org/TR/html51/syntax.html#end-tag-open-state
    function stateEndTagOpen(char) {
        if (char === '>') {
            // parse error. Encountered "</>". Skip it without treating as a tag
            resetToDataState();
        }
        else if (letterRe.test(char)) {
            state = 3 /* TagName */;
        }
        else {
            // some other non-tag-like character, don't treat this as a tag
            resetToDataState();
        }
    }
    // https://www.w3.org/TR/html51/syntax.html#before-attribute-name-state
    function stateBeforeAttributeName(char) {
        if (whitespaceRe.test(char)) {
            // stay in BeforeAttributeName state - continue reading chars
        }
        else if (char === '/') {
            state = 12 /* SelfClosingStartTag */;
        }
        else if (char === '>') {
            emitTagAndPreviousTextNode(); // resets to Data state as well
        }
        else if (char === '<') {
            // start of another tag (ignore the previous, incomplete one)
            startNewTag();
        }
        else if (char === "=" || quoteRe.test(char) || controlCharsRe.test(char)) {
            // "Parse error" characters that, according to the spec, should be
            // appended to the attribute name, but we'll treat these characters
            // as not forming a real HTML tag
            resetToDataState();
        }
        else {
            // Any other char, start of a new attribute name
            state = 5 /* AttributeName */;
        }
    }
    // https://www.w3.org/TR/html51/syntax.html#attribute-name-state
    function stateAttributeName(char) {
        if (whitespaceRe.test(char)) {
            state = 6 /* AfterAttributeName */;
        }
        else if (char === '/') {
            state = 12 /* SelfClosingStartTag */;
        }
        else if (char === '=') {
            state = 7 /* BeforeAttributeValue */;
        }
        else if (char === '>') {
            emitTagAndPreviousTextNode(); // resets to Data state as well
        }
        else if (char === '<') {
            // start of another tag (ignore the previous, incomplete one)
            startNewTag();
        }
        else if (quoteRe.test(char)) {
            // "Parse error" characters that, according to the spec, should be
            // appended to the attribute name, but we'll treat these characters
            // as not forming a real HTML tag
            resetToDataState();
        }
        // otherwise: continue reading the attribute name
    }
    // https://www.w3.org/TR/html51/syntax.html#after-attribute-name-state
    function stateAfterAttributeName(char) {
        if (whitespaceRe.test(char)) {
            // ignore the character - continue reading
        }
        else if (char === '/') {
            state = 12 /* SelfClosingStartTag */;
        }
        else if (char === '=') {
            state = 7 /* BeforeAttributeValue */;
        }
        else if (char === '>') {
            emitTagAndPreviousTextNode();
        }
        else if (char === '<') {
            // start of another tag (ignore the previous, incomplete one)
            startNewTag();
        }
        else if (quoteRe.test(char)) {
            // "Parse error" characters that, according to the spec, should be
            // appended to the attribute name, but we'll treat these characters
            // as not forming a real HTML tag
            resetToDataState();
        }
        else {
            // Any other character, start a new attribute in the current tag
            state = 5 /* AttributeName */;
        }
    }
    // https://www.w3.org/TR/html51/syntax.html#before-attribute-value-state
    function stateBeforeAttributeValue(char) {
        if (whitespaceRe.test(char)) {
            // ignore the character - continue reading
        }
        else if (char === "\"") {
            state = 8 /* AttributeValueDoubleQuoted */;
        }
        else if (char === "'") {
            state = 9 /* AttributeValueSingleQuoted */;
        }
        else if (/[>=`]/.test(char)) {
            // Invalid chars after an '=' for an attribute value, don't count
            // the current tag as an HTML tag
            resetToDataState();
        }
        else if (char === '<') {
            // start of another tag (ignore the previous, incomplete one)
            startNewTag();
        }
        else {
            // Any other character, consider it an unquoted attribute value
            state = 10 /* AttributeValueUnquoted */;
        }
    }
    // https://www.w3.org/TR/html51/syntax.html#attribute-value-double-quoted-state
    function stateAttributeValueDoubleQuoted(char) {
        if (char === "\"") {
            // end the current double-quoted attribute
            state = 11 /* AfterAttributeValueQuoted */;
        }
        // otherwise: consume the character as part of the double-quoted
        // attribute value
    }
    // https://www.w3.org/TR/html51/syntax.html#attribute-value-single-quoted-state
    function stateAttributeValueSingleQuoted(char) {
        if (char === "'") {
            // end the current single-quoted attribute
            state = 11 /* AfterAttributeValueQuoted */;
        }
        // otherwise: consume the character as part of the single-quoted
        // attribute value
    }
    // https://www.w3.org/TR/html51/syntax.html#attribute-value-unquoted-state
    function stateAttributeValueUnquoted(char) {
        if (whitespaceRe.test(char)) {
            state = 4 /* BeforeAttributeName */;
        }
        else if (char === '>') {
            emitTagAndPreviousTextNode();
        }
        else if (char === '<') {
            // start of another tag (ignore the previous, incomplete one)
            startNewTag();
        }
        // otherwise: treat the character as part of the attribute value
    }
    // https://www.w3.org/TR/html51/syntax.html#after-attribute-value-quoted-state
    function stateAfterAttributeValueQuoted(char) {
        if (whitespaceRe.test(char)) {
            state = 4 /* BeforeAttributeName */;
        }
        else if (char === '/') {
            state = 12 /* SelfClosingStartTag */;
        }
        else if (char === '>') {
            emitTagAndPreviousTextNode();
        }
        else if (char === '<') {
            // start of another tag (ignore the previous, incomplete one)
            startNewTag();
        }
        else {
            // Any other character, "parse error". Spec says to switch to the
            // BeforeAttributeState and re-consume the character, as it may be
            // the start of a new attribute name
            state = 4 /* BeforeAttributeName */;
            reconsumeCurrentCharacter();
        }
    }
    // A '/' has just been read in the current tag (presumably for '/>'), and
    // this handles the next character
    // https://www.w3.org/TR/html51/syntax.html#self-closing-start-tag-state
    function stateSelfClosingStartTag(char) {
        if (char === '>') {
            updateCurrentTag({ isClosing: true });
            emitTagAndPreviousTextNode(); // resets to Data state as well
        }
        else {
            state = 4 /* BeforeAttributeName */;
        }
    }
    // Called for the character right after a '<!' sequence
    // https://www.w3.org/TR/html51/syntax.html#markup-declaration-open-state
    // (HTML Comments or !DOCTYPE)
    function stateMarkupDeclarationOpen() {
        if (html.slice(charIdx, charIdx + 2) === '--') {
            // html comment. `charIdx` is on the first '-': move it onto the
            // second one, so that the main loop's increment lands on the first
            // character *after* the '<!--' sequence (advancing by the full
            // length here would skip that character)
            charIdx += 1;
            updateCurrentTag({ type: 'comment' });
            state = 14 /* CommentStart */;
        }
        else if (html.slice(charIdx, charIdx + 7).toUpperCase() === 'DOCTYPE') {
            // `charIdx` is on the 'D': move it onto the final 'E', so that the
            // main loop's increment lands on the first character after the
            // '<!DOCTYPE' sequence
            charIdx += 6;
            updateCurrentTag({ type: 'doctype' });
            state = 20 /* Doctype */;
        }
        else {
            // At this point, the spec specifies that the state machine should
            // enter the "bogus comment" state, in which case any character(s)
            // after the '<!' that were read should become an HTML comment up
            // until the first '>' that is read (or EOF). Instead, we'll assume
            // that a user just typed '<!' as part of text data
            resetToDataState();
        }
    }
    // Handles after the sequence '<!--' has been read
    // https://www.w3.org/TR/html51/syntax.html#comment-start-state
    function stateCommentStart(char) {
        if (char === '-') {
            // We've read the sequence '<!---' at this point (3 dashes)
            state = 15 /* CommentStartDash */;
        }
        else if (char === '>') {
            // At this point, we'll assume the comment wasn't a real comment
            // so we'll just emit it as data. We basically read the sequence
            // '<!-->'
            resetToDataState();
        }
        else {
            // Any other char, take it as part of the comment
            state = 16 /* Comment */;
        }
    }
    // We've read the sequence '<!---' at this point (3 dashes)
    // https://www.w3.org/TR/html51/syntax.html#comment-start-dash-state
    function stateCommentStartDash(char) {
        if (char === '-') {
            // We've read '<!----' (4 dashes) at this point
            state = 18 /* CommentEnd */;
        }
        else if (char === '>') {
            // At this point, we'll assume the comment wasn't a real comment
            // so we'll just emit it as data. We basically read the sequence
            // '<!--->'
            resetToDataState();
        }
        else {
            // Anything else, take it as a valid comment
            state = 16 /* Comment */;
        }
    }
    // Currently reading the comment's text (data)
    // https://www.w3.org/TR/html51/syntax.html#comment-state
    function stateComment(char) {
        if (char === '-') {
            state = 17 /* CommentEndDash */;
        }
        // otherwise: stay in the Comment state
    }
    // When we've read the first dash inside a comment, it may signal the
    // end of the comment if we read another dash
    // https://www.w3.org/TR/html51/syntax.html#comment-end-dash-state
    function stateCommentEndDash(char) {
        if (char === '-') {
            state = 18 /* CommentEnd */;
        }
        else {
            // Wasn't a dash, must still be part of the comment
            state = 16 /* Comment */;
        }
    }
    // After we've read two dashes inside a comment, it may signal the end of
    // the comment if we then read a '>' char
    // https://www.w3.org/TR/html51/syntax.html#comment-end-state
    function stateCommentEnd(char) {
        if (char === '>') {
            emitTagAndPreviousTextNode();
        }
        else if (char === '!') {
            state = 19 /* CommentEndBang */;
        }
        else if (char === '-') {
            // A 3rd '-' has been read: stay in the CommentEnd state
        }
        else {
            // Anything else, switch back to the comment state since we didn't
            // read the full "end comment" sequence (i.e. '-->')
            state = 16 /* Comment */;
        }
    }
    // We've read the sequence '--!' inside of a comment
    // https://www.w3.org/TR/html51/syntax.html#comment-end-bang-state
    function stateCommentEndBang(char) {
        if (char === '-') {
            // We read the sequence '--!-' inside of a comment. The last dash
            // could signify that the comment is going to close
            state = 17 /* CommentEndDash */;
        }
        else if (char === '>') {
            // End of comment with the sequence '--!>'
            emitTagAndPreviousTextNode();
        }
        else {
            // The '--!' was not followed by a '>', continue reading the
            // comment's text
            state = 16 /* Comment */;
        }
    }
    /**
     * For DOCTYPES in particular, we don't care about the attributes. Just
     * advance to the '>' character and emit the tag, unless we find a '<'
     * character in which case we'll start a new tag.
     *
     * Example doctype tag:
     *    <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
     *
     * Actual spec: https://www.w3.org/TR/html51/syntax.html#doctype-state
     */
    function stateDoctype(char) {
        if (char === '>') {
            emitTagAndPreviousTextNode();
        }
        else if (char === '<') {
            startNewTag();
        }
        // otherwise: stay in the Doctype state
    }
    /**
     * Resets the state back to the Data state, and removes the current tag.
     *
     * We'll generally run this function whenever a "parse error" is
     * encountered, where the current tag that is being read no longer looks
     * like a real HTML tag.
     */
    function resetToDataState() {
        state = 0 /* Data */;
        currentTag = noCurrentTag;
    }
    /**
     * Starts a new HTML tag at the current index, ignoring any previous HTML
     * tag that was being read.
     *
     * We'll generally run this function whenever we read a new '<' character,
     * including when we read a '<' character inside of an HTML tag that we were
     * previously reading.
     */
    function startNewTag() {
        state = 1 /* TagOpen */;
        currentTag = new CurrentTag({ idx: charIdx });
    }
    /**
     * Once we've decided to emit an open tag, that means we can also emit the
     * text node before it.
     */
    function emitTagAndPreviousTextNode() {
        var textBeforeTag = html.slice(currentDataIdx, currentTag.idx);
        // The text is empty when the html tag was the first element in the
        // html string, or when two tags are next to each other, in which case
        // we should not emit a text node
        if (textBeforeTag) {
            onText(textBeforeTag, currentDataIdx);
        }
        if (currentTag.type === 'comment') {
            onComment(currentTag.idx);
        }
        else if (currentTag.type === 'doctype') {
            onDoctype(currentTag.idx);
        }
        else {
            if (currentTag.isOpening) {
                onOpenTag(currentTag.name, currentTag.idx);
            }
            if (currentTag.isClosing) {
                // note: self-closing tags will emit both opening and closing
                onCloseTag(currentTag.name, currentTag.idx);
            }
        }
        // Since we just emitted a tag, reset to the data state for the next char
        resetToDataState();
        currentDataIdx = charIdx + 1;
    }
    /**
     * Emits everything from the current data start index up to (but not
     * including) the current character index as a text node.
     */
    function emitText() {
        var text = html.slice(currentDataIdx, charIdx);
        onText(text, currentDataIdx);
        currentDataIdx = charIdx + 1;
    }
    /**
     * Captures the tag name from the start of the tag to the current character
     * index, and converts it to lower case
     */
    function captureTagName() {
        // Skip the '<' of an open tag, or the '</' of a close tag
        var startIdx = currentTag.idx + (currentTag.isClosing ? 2 : 1);
        return html.slice(startIdx, charIdx).toLowerCase();
    }
    /**
     * Causes the main loop to re-consume the current character, such as after
     * encountering a "parse error" that changed state and needs to reconsume
     * the same character in that new state.
     */
    function reconsumeCurrentCharacter() {
        charIdx--;
    }
}
/**
 * Describes the tag that `parseHtml` is currently reading. Instances are
 * replaced rather than mutated by the parser, so every property is resolved
 * once here from the (optional) config object.
 *
 * @param {Object} [cfg]
 * @param {Number} [cfg.idx=-1] Index of the tag's '<' character in the input,
 *   or -1 when no tag is being read.
 * @param {String} [cfg.type='tag'] Either 'tag', 'comment', or 'doctype'.
 * @param {String} [cfg.name=''] The tag name.
 * @param {Boolean} [cfg.isOpening=false] `true` for an opening tag.
 * @param {Boolean} [cfg.isClosing=false] `true` for a closing tag (both flags
 *   are set for a self-closing tag).
 */
var CurrentTag = /** @class */ (function () {
    function CurrentTag(cfg) {
        if (cfg === void 0) { cfg = {}; }
        // An explicit `undefined` check so that index 0 is preserved
        this.idx = cfg.idx !== undefined ? cfg.idx : -1;
        this.type = cfg.type || 'tag';
        this.name = cfg.name || '';
        this.isOpening = !!cfg.isOpening;
        this.isClosing = !!cfg.isClosing;
    }
    return CurrentTag;
}());
633	//# sourceMappingURL=parse-html.js.map

Note: See TracBrowser for help on using the repository browser.

Download in other formats: