source: node_modules/autolinker/dist/es2015/htmlParser/parse-html.js@ d24f17c

main
Last change on this file since d24f17c was d24f17c, checked in by Aleksandar Panovski <apano77@…>, 15 months ago

Initial commit

  • Property mode set to 100644
File size: 24.4 KB
Line 
1import { __assign } from "tslib";
2import { letterRe, digitRe, whitespaceRe, quoteRe, controlCharsRe } from '../regex-lib';
3import { throwUnhandledCaseError } from '../utils';
4// For debugging: search for other "For debugging" lines
5// import CliTable from 'cli-table';
/**
 * Parses an HTML string, calling the callbacks to notify of tags and text.
 *
 * ## History
 *
 * This file previously used a regular expression to find html tags in the input
 * text. Unfortunately, we ran into a bunch of catastrophic backtracking issues
 * with certain input text, causing Autolinker to either hang or just take a
 * really long time to parse the string.
 *
 * The current code is intended to be a O(n) algorithm that walks through
 * the string in one pass, and tries to be as cheap as possible. We don't need
 * to implement the full HTML spec, but rather simply determine where the string
 * looks like an HTML tag, and where it looks like text (so that we can autolink
 * that).
 *
 * This state machine parser is intended just to be a simple but performant
 * parser of HTML for the subset of requirements we have. We simply need to:
 *
 * 1. Determine where HTML tags are
 * 2. Determine the tag name (Autolinker specifically only cares about <a>,
 *    <script>, and <style> tags, so as not to link any text within them)
 *
 * We don't need to:
 *
 * 1. Create a parse tree
 * 2. Auto-close tags with invalid markup
 * 3. etc.
 *
 * The other intention behind this is that we didn't want to add external
 * dependencies on the Autolinker utility which would increase its size. For
 * instance, adding htmlparser2 adds 125kb to the minified output file,
 * increasing its final size from 47kb to 172kb (at the time of writing). It
 * also doesn't work exactly correctly, treating the string "<3 blah blah blah"
 * as an HTML tag.
 *
 * Reference for HTML spec:
 *
 * https://www.w3.org/TR/html51/syntax.html#sec-tokenization
 *
 * @param {String} html The HTML to parse
 * @param {Object} callbacks
 * @param {Function} callbacks.onOpenTag Callback function to call when an open
 *   tag is parsed. Called with the lower-cased tag name as its first argument,
 *   and the offset (number) of the tag's '<' character as its second.
 * @param {Function} callbacks.onCloseTag Callback function to call when a close
 *   tag is parsed. Called with the lower-cased tag name as its first argument,
 *   and the offset (number) of the tag's '<' character as its second. If a
 *   self-closing tag is found, `onCloseTag` is called immediately after
 *   `onOpenTag`.
 * @param {Function} callbacks.onText Callback function to call when text (i.e
 *   not an HTML tag) is parsed. Called with the text (string) as its first
 *   argument, and offset (number) into the string as its second.
 * @param {Function} callbacks.onComment Callback function to call when an HTML
 *   comment is parsed. Called with the offset (number) of the comment's '<'
 *   character.
 * @param {Function} callbacks.onDoctype Callback function to call when a
 *   doctype declaration is parsed. Called with the offset (number) of the
 *   declaration's '<' character.
 */
export function parseHtml(html, _a) {
    var onOpenTag = _a.onOpenTag, onCloseTag = _a.onCloseTag, onText = _a.onText, onComment = _a.onComment, onDoctype = _a.onDoctype;
    var noCurrentTag = new CurrentTag();
    var charIdx = 0;
    var len = html.length;
    var state = 0 /* Data */;
    var currentDataIdx = 0; // where the current data start index is
    var currentTag = noCurrentTag; // describes the current tag that is being read
    // For debugging: search for other "For debugging" lines
    // const table = new CliTable( {
    //     head: [ 'charIdx', 'char', 'state', 'currentDataIdx', 'currentOpenTagIdx', 'tag.type' ]
    // } );
    while (charIdx < len) {
        var char = html.charAt(charIdx);
        // For debugging: search for other "For debugging" lines
        // ALSO: Temporarily remove the 'const' keyword on the State enum
        // table.push(
        //     [ charIdx, char, State[ state ], currentDataIdx, currentTag.idx, currentTag.idx === -1 ? '' : currentTag.type ]
        // );
        switch (state) {
            case 0 /* Data */:
                stateData(char);
                break;
            case 1 /* TagOpen */:
                stateTagOpen(char);
                break;
            case 2 /* EndTagOpen */:
                stateEndTagOpen(char);
                break;
            case 3 /* TagName */:
                stateTagName(char);
                break;
            case 4 /* BeforeAttributeName */:
                stateBeforeAttributeName(char);
                break;
            case 5 /* AttributeName */:
                stateAttributeName(char);
                break;
            case 6 /* AfterAttributeName */:
                stateAfterAttributeName(char);
                break;
            case 7 /* BeforeAttributeValue */:
                stateBeforeAttributeValue(char);
                break;
            case 8 /* AttributeValueDoubleQuoted */:
                stateAttributeValueDoubleQuoted(char);
                break;
            case 9 /* AttributeValueSingleQuoted */:
                stateAttributeValueSingleQuoted(char);
                break;
            case 10 /* AttributeValueUnquoted */:
                stateAttributeValueUnquoted(char);
                break;
            case 11 /* AfterAttributeValueQuoted */:
                stateAfterAttributeValueQuoted(char);
                break;
            case 12 /* SelfClosingStartTag */:
                stateSelfClosingStartTag(char);
                break;
            case 13 /* MarkupDeclarationOpenState */:
                stateMarkupDeclarationOpen(char);
                break;
            case 14 /* CommentStart */:
                stateCommentStart(char);
                break;
            case 15 /* CommentStartDash */:
                stateCommentStartDash(char);
                break;
            case 16 /* Comment */:
                stateComment(char);
                break;
            case 17 /* CommentEndDash */:
                stateCommentEndDash(char);
                break;
            case 18 /* CommentEnd */:
                stateCommentEnd(char);
                break;
            case 19 /* CommentEndBang */:
                stateCommentEndBang(char);
                break;
            case 20 /* Doctype */:
                stateDoctype(char);
                break;
            default:
                throwUnhandledCaseError(state);
        }
        // For debugging: search for other "For debugging" lines
        // ALSO: Temporarily remove the 'const' keyword on the State enum
        // table.push(
        //     [ charIdx, char, State[ state ], currentDataIdx, currentTag.idx, currentTag.idx === -1 ? '' : currentTag.type ]
        // );
        charIdx++;
    }
    // Anything left over after the last emitted tag (including an incomplete
    // tag that was cut off by the end of the input) is reported as text
    if (currentDataIdx < charIdx) {
        emitText();
    }
    // For debugging: search for other "For debugging" lines
    // console.log( '\n' + table.toString() );

    // Called when non-tags are being read (i.e. the text around HTML tags)
    // https://www.w3.org/TR/html51/syntax.html#data-state
    function stateData(char) {
        if (char === '<') {
            startNewTag();
        }
    }
    // Called after a '<' is read from the Data state
    // https://www.w3.org/TR/html51/syntax.html#tag-open-state
    function stateTagOpen(char) {
        if (char === '!') {
            state = 13 /* MarkupDeclarationOpenState */;
        }
        else if (char === '/') {
            state = 2 /* EndTagOpen */;
            updateCurrentTag({ isClosing: true });
        }
        else if (char === '<') {
            // start of another tag (ignore the previous, incomplete one)
            startNewTag();
        }
        else if (letterRe.test(char)) {
            // tag name start (and no '/' read)
            state = 3 /* TagName */;
            updateCurrentTag({ isOpening: true });
        }
        else {
            // Any other
            state = 0 /* Data */;
            currentTag = noCurrentTag;
        }
    }
    // After a '<x', '</x' sequence is read (where 'x' is a letter character),
    // this is to continue reading the tag name
    // https://www.w3.org/TR/html51/syntax.html#tag-name-state
    function stateTagName(char) {
        if (whitespaceRe.test(char)) {
            updateCurrentTag({ name: captureTagName() });
            state = 4 /* BeforeAttributeName */;
        }
        else if (char === '<') {
            // start of another tag (ignore the previous, incomplete one)
            startNewTag();
        }
        else if (char === '/') {
            updateCurrentTag({ name: captureTagName() });
            state = 12 /* SelfClosingStartTag */;
        }
        else if (char === '>') {
            updateCurrentTag({ name: captureTagName() });
            emitTagAndPreviousTextNode(); // resets to Data state as well
        }
        else if (!letterRe.test(char) && !digitRe.test(char) && char !== ':') {
            // Anything else that does not form an html tag. Note: the colon
            // character is accepted for XML namespaced tags
            resetToDataState();
        }
        else {
            // continue reading tag name
        }
    }
    // Called after the '/' is read from a '</' sequence
    // https://www.w3.org/TR/html51/syntax.html#end-tag-open-state
    function stateEndTagOpen(char) {
        if (char === '>') {
            // parse error. Encountered "</>". Skip it without treating as a tag
            resetToDataState();
        }
        else if (letterRe.test(char)) {
            state = 3 /* TagName */;
        }
        else {
            // some other non-tag-like character, don't treat this as a tag
            resetToDataState();
        }
    }
    // https://www.w3.org/TR/html51/syntax.html#before-attribute-name-state
    function stateBeforeAttributeName(char) {
        if (whitespaceRe.test(char)) {
            // stay in BeforeAttributeName state - continue reading chars
        }
        else if (char === '/') {
            state = 12 /* SelfClosingStartTag */;
        }
        else if (char === '>') {
            emitTagAndPreviousTextNode(); // resets to Data state as well
        }
        else if (char === '<') {
            // start of another tag (ignore the previous, incomplete one)
            startNewTag();
        }
        else if (char === "=" || quoteRe.test(char) || controlCharsRe.test(char)) {
            // "Parse error" characters that, according to the spec, should be
            // appended to the attribute name, but we'll treat these characters
            // as not forming a real HTML tag
            resetToDataState();
        }
        else {
            // Any other char, start of a new attribute name
            state = 5 /* AttributeName */;
        }
    }
    // https://www.w3.org/TR/html51/syntax.html#attribute-name-state
    function stateAttributeName(char) {
        if (whitespaceRe.test(char)) {
            state = 6 /* AfterAttributeName */;
        }
        else if (char === '/') {
            state = 12 /* SelfClosingStartTag */;
        }
        else if (char === '=') {
            state = 7 /* BeforeAttributeValue */;
        }
        else if (char === '>') {
            emitTagAndPreviousTextNode(); // resets to Data state as well
        }
        else if (char === '<') {
            // start of another tag (ignore the previous, incomplete one)
            startNewTag();
        }
        else if (quoteRe.test(char)) {
            // "Parse error" characters that, according to the spec, should be
            // appended to the attribute name, but we'll treat these characters
            // as not forming a real HTML tag
            resetToDataState();
        }
        else {
            // anything else: continue reading attribute name
        }
    }
    // https://www.w3.org/TR/html51/syntax.html#after-attribute-name-state
    function stateAfterAttributeName(char) {
        if (whitespaceRe.test(char)) {
            // ignore the character - continue reading
        }
        else if (char === '/') {
            state = 12 /* SelfClosingStartTag */;
        }
        else if (char === '=') {
            state = 7 /* BeforeAttributeValue */;
        }
        else if (char === '>') {
            emitTagAndPreviousTextNode();
        }
        else if (char === '<') {
            // start of another tag (ignore the previous, incomplete one)
            startNewTag();
        }
        else if (quoteRe.test(char)) {
            // "Parse error" characters that, according to the spec, should be
            // appended to the attribute name, but we'll treat these characters
            // as not forming a real HTML tag
            resetToDataState();
        }
        else {
            // Any other character, start a new attribute in the current tag
            state = 5 /* AttributeName */;
        }
    }
    // https://www.w3.org/TR/html51/syntax.html#before-attribute-value-state
    function stateBeforeAttributeValue(char) {
        if (whitespaceRe.test(char)) {
            // ignore the character - continue reading
        }
        else if (char === "\"") {
            state = 8 /* AttributeValueDoubleQuoted */;
        }
        else if (char === "'") {
            state = 9 /* AttributeValueSingleQuoted */;
        }
        else if (/[>=`]/.test(char)) {
            // Invalid chars after an '=' for an attribute value, don't count
            // the current tag as an HTML tag
            resetToDataState();
        }
        else if (char === '<') {
            // start of another tag (ignore the previous, incomplete one)
            startNewTag();
        }
        else {
            // Any other character, consider it an unquoted attribute value
            state = 10 /* AttributeValueUnquoted */;
        }
    }
    // https://www.w3.org/TR/html51/syntax.html#attribute-value-double-quoted-state
    function stateAttributeValueDoubleQuoted(char) {
        if (char === "\"") {
            // end the current double-quoted attribute
            state = 11 /* AfterAttributeValueQuoted */;
        }
        else {
            // consume the character as part of the double-quoted attribute value
        }
    }
    // https://www.w3.org/TR/html51/syntax.html#attribute-value-single-quoted-state
    function stateAttributeValueSingleQuoted(char) {
        if (char === "'") {
            // end the current single-quoted attribute
            state = 11 /* AfterAttributeValueQuoted */;
        }
        else {
            // consume the character as part of the single-quoted attribute value
        }
    }
    // https://www.w3.org/TR/html51/syntax.html#attribute-value-unquoted-state
    function stateAttributeValueUnquoted(char) {
        if (whitespaceRe.test(char)) {
            state = 4 /* BeforeAttributeName */;
        }
        else if (char === '>') {
            emitTagAndPreviousTextNode();
        }
        else if (char === '<') {
            // start of another tag (ignore the previous, incomplete one)
            startNewTag();
        }
        else {
            // Any other character, treat it as part of the attribute value
        }
    }
    // https://www.w3.org/TR/html51/syntax.html#after-attribute-value-quoted-state
    function stateAfterAttributeValueQuoted(char) {
        if (whitespaceRe.test(char)) {
            state = 4 /* BeforeAttributeName */;
        }
        else if (char === '/') {
            state = 12 /* SelfClosingStartTag */;
        }
        else if (char === '>') {
            emitTagAndPreviousTextNode();
        }
        else if (char === '<') {
            // start of another tag (ignore the previous, incomplete one)
            startNewTag();
        }
        else {
            // Any other character, "parse error". Spec says to switch to the
            // BeforeAttributeState and re-consume the character, as it may be
            // the start of a new attribute name
            state = 4 /* BeforeAttributeName */;
            reconsumeCurrentCharacter();
        }
    }
    // A '/' has just been read in the current tag (presumably for '/>'), and
    // this handles the next character
    // https://www.w3.org/TR/html51/syntax.html#self-closing-start-tag-state
    function stateSelfClosingStartTag(char) {
        if (char === '>') {
            updateCurrentTag({ isClosing: true });
            emitTagAndPreviousTextNode(); // resets to Data state as well
        }
        else {
            state = 4 /* BeforeAttributeName */;
        }
    }
    // https://www.w3.org/TR/html51/syntax.html#markup-declaration-open-state
    // (HTML Comments or !DOCTYPE)
    //
    // On entry, `charIdx` points at the first character after the '<!'
    // sequence. The main loop always performs one more `charIdx++` after this
    // handler returns, so we only advance to the LAST character of the keyword
    // here. Advancing past the whole keyword would silently skip the character
    // that follows it (e.g. the 3rd dash of '<!---->' or the '>' of
    // '<!DOCTYPE>').
    function stateMarkupDeclarationOpen(char) {
        if (html.slice(charIdx, charIdx + 2) === '--') {
            // html comment
            charIdx += 1; // "consume" the second '-'
            updateCurrentTag({ type: 'comment' });
            state = 14 /* CommentStart */;
        }
        else if (html.slice(charIdx, charIdx + 7).toUpperCase() === 'DOCTYPE') {
            charIdx += 6; // "consume" up to the final 'E' of 'DOCTYPE'
            updateCurrentTag({ type: 'doctype' });
            state = 20 /* Doctype */;
        }
        else {
            // At this point, the spec specifies that the state machine should
            // enter the "bogus comment" state, in which case any character(s)
            // after the '<!' that were read should become an HTML comment up
            // until the first '>' that is read (or EOF). Instead, we'll assume
            // that a user just typed '<!' as part of text data
            resetToDataState();
        }
    }
    // Handles after the sequence '<!--' has been read
    // https://www.w3.org/TR/html51/syntax.html#comment-start-state
    function stateCommentStart(char) {
        if (char === '-') {
            // We've read the sequence '<!---' at this point (3 dashes)
            state = 15 /* CommentStartDash */;
        }
        else if (char === '>') {
            // At this point, we'll assume the comment wasn't a real comment
            // so we'll just emit it as data. We basically read the sequence
            // '<!-->'
            resetToDataState();
        }
        else {
            // Any other char, take it as part of the comment
            state = 16 /* Comment */;
        }
    }
    // We've read the sequence '<!---' at this point (3 dashes)
    // https://www.w3.org/TR/html51/syntax.html#comment-start-dash-state
    function stateCommentStartDash(char) {
        if (char === '-') {
            // We've read '<!----' (4 dashes) at this point
            state = 18 /* CommentEnd */;
        }
        else if (char === '>') {
            // At this point, we'll assume the comment wasn't a real comment
            // so we'll just emit it as data. We basically read the sequence
            // '<!--->'
            resetToDataState();
        }
        else {
            // Anything else, take it as a valid comment
            state = 16 /* Comment */;
        }
    }
    // Currently reading the comment's text (data)
    // https://www.w3.org/TR/html51/syntax.html#comment-state
    function stateComment(char) {
        if (char === '-') {
            state = 17 /* CommentEndDash */;
        }
        else {
            // Any other character, stay in the Comment state
        }
    }
    // When we've read the first dash inside a comment, it may signal the
    // end of the comment if we read another dash
    // https://www.w3.org/TR/html51/syntax.html#comment-end-dash-state
    function stateCommentEndDash(char) {
        if (char === '-') {
            state = 18 /* CommentEnd */;
        }
        else {
            // Wasn't a dash, must still be part of the comment
            state = 16 /* Comment */;
        }
    }
    // After we've read two dashes inside a comment, it may signal the end of
    // the comment if we then read a '>' char
    // https://www.w3.org/TR/html51/syntax.html#comment-end-state
    function stateCommentEnd(char) {
        if (char === '>') {
            emitTagAndPreviousTextNode();
        }
        else if (char === '!') {
            state = 19 /* CommentEndBang */;
        }
        else if (char === '-') {
            // A 3rd '-' has been read: stay in the CommentEnd state
        }
        else {
            // Anything else, switch back to the comment state since we didn't
            // read the full "end comment" sequence (i.e. '-->')
            state = 16 /* Comment */;
        }
    }
    // We've read the sequence '--!' inside of a comment
    // https://www.w3.org/TR/html51/syntax.html#comment-end-bang-state
    function stateCommentEndBang(char) {
        if (char === '-') {
            // We read the sequence '--!-' inside of a comment. The last dash
            // could signify that the comment is going to close
            state = 17 /* CommentEndDash */;
        }
        else if (char === '>') {
            // End of comment with the sequence '--!>'
            emitTagAndPreviousTextNode();
        }
        else {
            // The '--!' was not followed by a '>', continue reading the
            // comment's text
            state = 16 /* Comment */;
        }
    }
    /**
     * For DOCTYPES in particular, we don't care about the attributes. Just
     * advance to the '>' character and emit the tag, unless we find a '<'
     * character in which case we'll start a new tag.
     *
     * Example doctype tag:
     *    <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
     *
     * Actual spec: https://www.w3.org/TR/html51/syntax.html#doctype-state
     */
    function stateDoctype(char) {
        if (char === '>') {
            emitTagAndPreviousTextNode();
        }
        else if (char === '<') {
            startNewTag();
        }
        else {
            // stay in the Doctype state
        }
    }
    /**
     * Replaces `currentTag` with a copy of itself that has the given
     * properties overridden. A new CurrentTag instance is created on every
     * change, so the shared `noCurrentTag` sentinel is never mutated.
     */
    function updateCurrentTag(props) {
        currentTag = new CurrentTag(__assign(__assign({}, currentTag), props));
    }
    /**
     * Resets the state back to the Data state, and removes the current tag.
     *
     * We'll generally run this function whenever a "parse error" is
     * encountered, where the current tag that is being read no longer looks
     * like a real HTML tag.
     */
    function resetToDataState() {
        state = 0 /* Data */;
        currentTag = noCurrentTag;
    }
    /**
     * Starts a new HTML tag at the current index, ignoring any previous HTML
     * tag that was being read.
     *
     * We'll generally run this function whenever we read a new '<' character,
     * including when we read a '<' character inside of an HTML tag that we were
     * previously reading.
     */
    function startNewTag() {
        state = 1 /* TagOpen */;
        currentTag = new CurrentTag({ idx: charIdx });
    }
    /**
     * Once we've decided to emit an open tag, that means we can also emit the
     * text node before it.
     */
    function emitTagAndPreviousTextNode() {
        var textBeforeTag = html.slice(currentDataIdx, currentTag.idx);
        if (textBeforeTag) {
            // Only emit a text node when there is text: the slice is empty
            // when the html tag was the first element in the html string, or
            // when two tags are next to each other
            onText(textBeforeTag, currentDataIdx);
        }
        if (currentTag.type === 'comment') {
            onComment(currentTag.idx);
        }
        else if (currentTag.type === 'doctype') {
            onDoctype(currentTag.idx);
        }
        else {
            if (currentTag.isOpening) {
                onOpenTag(currentTag.name, currentTag.idx);
            }
            if (currentTag.isClosing) {
                // note: self-closing tags will emit both opening and closing
                onCloseTag(currentTag.name, currentTag.idx);
            }
        }
        // Since we just emitted a tag, reset to the data state for the next char
        resetToDataState();
        currentDataIdx = charIdx + 1;
    }
    /**
     * Emits the text from the start of the current data run up to (but not
     * including) the current character index.
     */
    function emitText() {
        var text = html.slice(currentDataIdx, charIdx);
        onText(text, currentDataIdx);
        currentDataIdx = charIdx + 1;
    }
    /**
     * Captures the tag name from the start of the tag to the current character
     * index, and converts it to lower case
     */
    function captureTagName() {
        // skip the leading '<' (or '</' for a closing tag)
        var startIdx = currentTag.idx + (currentTag.isClosing ? 2 : 1);
        return html.slice(startIdx, charIdx).toLowerCase();
    }
    /**
     * Causes the main loop to re-consume the current character, such as after
     * encountering a "parse error" that changed state and needs to reconsume
     * the same character in that new state.
     */
    function reconsumeCurrentCharacter() {
        charIdx--;
    }
}
/**
 * Describes the tag that the HTML parser is currently reading.
 *
 * Every field falls back to a default when it is missing from `cfg`:
 *
 * - `idx`: offset of the tag in the input, `-1` when not provided
 * - `type`: kind of tag, `'tag'` when not provided (or falsy)
 * - `name`: tag name, `''` when not provided (or falsy)
 * - `isOpening` / `isClosing`: coerced to booleans, `false` when not provided
 */
var CurrentTag = /** @class */ (function () {
    function CurrentTag(cfg) {
        var options = cfg === undefined ? {} : cfg;
        var hasIdx = options.idx !== undefined;
        this.idx = hasIdx ? options.idx : -1;
        this.type = options.type ? options.type : 'tag';
        this.name = options.name ? options.name : '';
        this.isOpening = Boolean(options.isOpening);
        this.isClosing = Boolean(options.isClosing);
    }
    return CurrentTag;
}());
633//# sourceMappingURL=parse-html.js.map
Note: See TracBrowser for help on using the repository browser.