source: node_modules/autolinker/dist/commonjs/htmlParser/parse-html.js

main
Last change on this file was d24f17c, checked in by Aleksandar Panovski <apano77@…>, 15 months ago

Initial commit

  • Property mode set to 100644
File size: 24.9 KB
RevLine 
[d24f17c]1"use strict";
2Object.defineProperty(exports, "__esModule", { value: true });
3exports.parseHtml = void 0;
4var tslib_1 = require("tslib");
5var regex_lib_1 = require("../regex-lib");
6var utils_1 = require("../utils");
7// For debugging: search for other "For debugging" lines
8// import CliTable from 'cli-table';
/**
 * Parses an HTML string, calling the callbacks to notify of tags and text.
 *
 * ## History
 *
 * This file previously used a regular expression to find html tags in the input
 * text. Unfortunately, we ran into a bunch of catastrophic backtracking issues
 * with certain input text, causing Autolinker to either hang or just take a
 * really long time to parse the string.
 *
 * The current code is intended to be a O(n) algorithm that walks through
 * the string in one pass, and tries to be as cheap as possible. We don't need
 * to implement the full HTML spec, but rather simply determine where the string
 * looks like an HTML tag, and where it looks like text (so that we can autolink
 * that).
 *
 * This state machine parser is intended just to be a simple but performant
 * parser of HTML for the subset of requirements we have. We simply need to:
 *
 * 1. Determine where HTML tags are
 * 2. Determine the tag name (Autolinker specifically only cares about <a>,
 *    <script>, and <style> tags, so as not to link any text within them)
 *
 * We don't need to:
 *
 * 1. Create a parse tree
 * 2. Auto-close tags with invalid markup
 * 3. etc.
 *
 * The other intention behind this is that we didn't want to add external
 * dependencies on the Autolinker utility which would increase its size. For
 * instance, adding htmlparser2 adds 125kb to the minified output file,
 * increasing its final size from 47kb to 172kb (at the time of writing). It
 * also doesn't work exactly correctly, treating the string "<3 blah blah blah"
 * as an HTML tag.
 *
 * Reference for HTML spec:
 *
 *     https://www.w3.org/TR/html51/syntax.html#sec-tokenization
 *
 * @param {String} html The HTML to parse
 * @param {Object} callbacks
 * @param {Function} callbacks.onOpenTag Callback function to call when an open
 *   tag is parsed. Called with the lower-cased tagName as its first argument,
 *   and the offset (number) of the tag's '<' character as its second.
 * @param {Function} callbacks.onCloseTag Callback function to call when a close
 *   tag is parsed. Called with the lower-cased tagName as its first argument,
 *   and the offset (number) of the tag's '<' character as its second. If a
 *   self-closing tag is found, `onCloseTag` is called immediately after
 *   `onOpenTag`.
 * @param {Function} callbacks.onText Callback function to call when text (i.e
 *   not an HTML tag) is parsed. Called with the text (string) as its first
 *   argument, and offset (number) into the string as its second.
 * @param {Function} callbacks.onComment Callback function to call when an HTML
 *   comment is parsed. Called with the offset (number) of the comment's '<'
 *   character.
 * @param {Function} callbacks.onDoctype Callback function to call when a
 *   doctype declaration is parsed. Called with the offset (number) of the
 *   declaration's '<' character.
 */
function parseHtml(html, _a) {
    var onOpenTag = _a.onOpenTag, onCloseTag = _a.onCloseTag, onText = _a.onText, onComment = _a.onComment, onDoctype = _a.onDoctype;
    // Sentinel used whenever no tag is currently being read
    var noCurrentTag = new CurrentTag();
    var charIdx = 0;
    var len = html.length;
    var state = 0 /* Data */;
    var currentDataIdx = 0; // where the current data start index is
    var currentTag = noCurrentTag; // describes the current tag that is being read
    while (charIdx < len) {
        var char = html.charAt(charIdx);
        switch (state) {
            case 0 /* Data */:
                stateData(char);
                break;
            case 1 /* TagOpen */:
                stateTagOpen(char);
                break;
            case 2 /* EndTagOpen */:
                stateEndTagOpen(char);
                break;
            case 3 /* TagName */:
                stateTagName(char);
                break;
            case 4 /* BeforeAttributeName */:
                stateBeforeAttributeName(char);
                break;
            case 5 /* AttributeName */:
                stateAttributeName(char);
                break;
            case 6 /* AfterAttributeName */:
                stateAfterAttributeName(char);
                break;
            case 7 /* BeforeAttributeValue */:
                stateBeforeAttributeValue(char);
                break;
            case 8 /* AttributeValueDoubleQuoted */:
                stateAttributeValueDoubleQuoted(char);
                break;
            case 9 /* AttributeValueSingleQuoted */:
                stateAttributeValueSingleQuoted(char);
                break;
            case 10 /* AttributeValueUnquoted */:
                stateAttributeValueUnquoted(char);
                break;
            case 11 /* AfterAttributeValueQuoted */:
                stateAfterAttributeValueQuoted(char);
                break;
            case 12 /* SelfClosingStartTag */:
                stateSelfClosingStartTag(char);
                break;
            case 13 /* MarkupDeclarationOpenState */:
                stateMarkupDeclarationOpen(char);
                break;
            case 14 /* CommentStart */:
                stateCommentStart(char);
                break;
            case 15 /* CommentStartDash */:
                stateCommentStartDash(char);
                break;
            case 16 /* Comment */:
                stateComment(char);
                break;
            case 17 /* CommentEndDash */:
                stateCommentEndDash(char);
                break;
            case 18 /* CommentEnd */:
                stateCommentEnd(char);
                break;
            case 19 /* CommentEndBang */:
                stateCommentEndBang(char);
                break;
            case 20 /* Doctype */:
                stateDoctype(char);
                break;
            default:
                (0, utils_1.throwUnhandledCaseError)(state);
        }
        charIdx++;
    }
    // Whatever is left after the last emitted tag (including any incomplete
    // tag that never reached its '>') is reported as text
    if (currentDataIdx < charIdx) {
        emitText();
    }
    /**
     * Replaces `currentTag` with a copy of itself that has the given
     * properties overridden. A fresh object is created each time so that the
     * shared `noCurrentTag` sentinel is never mutated.
     */
    function updateCurrentTag(changes) {
        currentTag = new CurrentTag((0, tslib_1.__assign)((0, tslib_1.__assign)({}, currentTag), changes));
    }
    // Called when non-tags are being read (i.e. the text around HTML tags)
    // https://www.w3.org/TR/html51/syntax.html#data-state
    function stateData(char) {
        if (char === '<') {
            startNewTag();
        }
    }
    // Called after a '<' is read from the Data state
    // https://www.w3.org/TR/html51/syntax.html#tag-open-state
    function stateTagOpen(char) {
        if (char === '!') {
            state = 13 /* MarkupDeclarationOpenState */;
        }
        else if (char === '/') {
            state = 2 /* EndTagOpen */;
            updateCurrentTag({ isClosing: true });
        }
        else if (char === '<') {
            // start of another tag (ignore the previous, incomplete one)
            startNewTag();
        }
        else if (regex_lib_1.letterRe.test(char)) {
            // tag name start (and no '/' read)
            state = 3 /* TagName */;
            updateCurrentTag({ isOpening: true });
        }
        else {
            // Any other
            state = 0 /* Data */;
            currentTag = noCurrentTag;
        }
    }
    // After a '<x', '</x' sequence is read (where 'x' is a letter character),
    // this is to continue reading the tag name
    // https://www.w3.org/TR/html51/syntax.html#tag-name-state
    function stateTagName(char) {
        if (regex_lib_1.whitespaceRe.test(char)) {
            updateCurrentTag({ name: captureTagName() });
            state = 4 /* BeforeAttributeName */;
        }
        else if (char === '<') {
            // start of another tag (ignore the previous, incomplete one)
            startNewTag();
        }
        else if (char === '/') {
            updateCurrentTag({ name: captureTagName() });
            state = 12 /* SelfClosingStartTag */;
        }
        else if (char === '>') {
            updateCurrentTag({ name: captureTagName() });
            emitTagAndPreviousTextNode(); // resets to Data state as well
        }
        else if (!regex_lib_1.letterRe.test(char) && !regex_lib_1.digitRe.test(char) && char !== ':') {
            // Anything else that does not form an html tag. Note: the colon
            // character is accepted for XML namespaced tags
            resetToDataState();
        }
        else {
            // continue reading tag name
        }
    }
    // Called after the '/' is read from a '</' sequence
    // https://www.w3.org/TR/html51/syntax.html#end-tag-open-state
    function stateEndTagOpen(char) {
        if (char === '>') {
            // parse error. Encountered "</>". Skip it without treating as a tag
            resetToDataState();
        }
        else if (regex_lib_1.letterRe.test(char)) {
            state = 3 /* TagName */;
        }
        else {
            // some other non-tag-like character, don't treat this as a tag
            resetToDataState();
        }
    }
    // https://www.w3.org/TR/html51/syntax.html#before-attribute-name-state
    function stateBeforeAttributeName(char) {
        if (regex_lib_1.whitespaceRe.test(char)) {
            // stay in BeforeAttributeName state - continue reading chars
        }
        else if (char === '/') {
            state = 12 /* SelfClosingStartTag */;
        }
        else if (char === '>') {
            emitTagAndPreviousTextNode(); // resets to Data state as well
        }
        else if (char === '<') {
            // start of another tag (ignore the previous, incomplete one)
            startNewTag();
        }
        else if (char === "=" || regex_lib_1.quoteRe.test(char) || regex_lib_1.controlCharsRe.test(char)) {
            // "Parse error" characters that, according to the spec, should be
            // appended to the attribute name, but we'll treat these characters
            // as not forming a real HTML tag
            resetToDataState();
        }
        else {
            // Any other char, start of a new attribute name
            state = 5 /* AttributeName */;
        }
    }
    // https://www.w3.org/TR/html51/syntax.html#attribute-name-state
    function stateAttributeName(char) {
        if (regex_lib_1.whitespaceRe.test(char)) {
            state = 6 /* AfterAttributeName */;
        }
        else if (char === '/') {
            state = 12 /* SelfClosingStartTag */;
        }
        else if (char === '=') {
            state = 7 /* BeforeAttributeValue */;
        }
        else if (char === '>') {
            emitTagAndPreviousTextNode(); // resets to Data state as well
        }
        else if (char === '<') {
            // start of another tag (ignore the previous, incomplete one)
            startNewTag();
        }
        else if (regex_lib_1.quoteRe.test(char)) {
            // "Parse error" characters that, according to the spec, should be
            // appended to the attribute name, but we'll treat these characters
            // as not forming a real HTML tag
            resetToDataState();
        }
        else {
            // anything else: continue reading attribute name
        }
    }
    // https://www.w3.org/TR/html51/syntax.html#after-attribute-name-state
    function stateAfterAttributeName(char) {
        if (regex_lib_1.whitespaceRe.test(char)) {
            // ignore the character - continue reading
        }
        else if (char === '/') {
            state = 12 /* SelfClosingStartTag */;
        }
        else if (char === '=') {
            state = 7 /* BeforeAttributeValue */;
        }
        else if (char === '>') {
            emitTagAndPreviousTextNode();
        }
        else if (char === '<') {
            // start of another tag (ignore the previous, incomplete one)
            startNewTag();
        }
        else if (regex_lib_1.quoteRe.test(char)) {
            // "Parse error" characters that, according to the spec, should be
            // appended to the attribute name, but we'll treat these characters
            // as not forming a real HTML tag
            resetToDataState();
        }
        else {
            // Any other character, start a new attribute in the current tag
            state = 5 /* AttributeName */;
        }
    }
    // https://www.w3.org/TR/html51/syntax.html#before-attribute-value-state
    function stateBeforeAttributeValue(char) {
        if (regex_lib_1.whitespaceRe.test(char)) {
            // ignore the character - continue reading
        }
        else if (char === "\"") {
            state = 8 /* AttributeValueDoubleQuoted */;
        }
        else if (char === "'") {
            state = 9 /* AttributeValueSingleQuoted */;
        }
        else if (/[>=`]/.test(char)) {
            // Invalid chars after an '=' for an attribute value, don't count
            // the current tag as an HTML tag
            resetToDataState();
        }
        else if (char === '<') {
            // start of another tag (ignore the previous, incomplete one)
            startNewTag();
        }
        else {
            // Any other character, consider it an unquoted attribute value
            state = 10 /* AttributeValueUnquoted */;
        }
    }
    // https://www.w3.org/TR/html51/syntax.html#attribute-value-double-quoted-state
    function stateAttributeValueDoubleQuoted(char) {
        if (char === "\"") {
            // end the current double-quoted attribute
            state = 11 /* AfterAttributeValueQuoted */;
        }
        else {
            // consume the character as part of the double-quoted attribute value
        }
    }
    // https://www.w3.org/TR/html51/syntax.html#attribute-value-single-quoted-state
    function stateAttributeValueSingleQuoted(char) {
        if (char === "'") {
            // end the current single-quoted attribute
            state = 11 /* AfterAttributeValueQuoted */;
        }
        else {
            // consume the character as part of the single-quoted attribute value
        }
    }
    // https://www.w3.org/TR/html51/syntax.html#attribute-value-unquoted-state
    function stateAttributeValueUnquoted(char) {
        if (regex_lib_1.whitespaceRe.test(char)) {
            state = 4 /* BeforeAttributeName */;
        }
        else if (char === '>') {
            emitTagAndPreviousTextNode();
        }
        else if (char === '<') {
            // start of another tag (ignore the previous, incomplete one)
            startNewTag();
        }
        else {
            // Any other character, treat it as part of the attribute value
        }
    }
    // https://www.w3.org/TR/html51/syntax.html#after-attribute-value-quoted-state
    function stateAfterAttributeValueQuoted(char) {
        if (regex_lib_1.whitespaceRe.test(char)) {
            state = 4 /* BeforeAttributeName */;
        }
        else if (char === '/') {
            state = 12 /* SelfClosingStartTag */;
        }
        else if (char === '>') {
            emitTagAndPreviousTextNode();
        }
        else if (char === '<') {
            // start of another tag (ignore the previous, incomplete one)
            startNewTag();
        }
        else {
            // Any other character, "parse error". Spec says to switch to the
            // BeforeAttributeState and re-consume the character, as it may be
            // the start of a new attribute name
            state = 4 /* BeforeAttributeName */;
            reconsumeCurrentCharacter();
        }
    }
    // A '/' has just been read in the current tag (presumably for '/>'), and
    // this handles the next character
    // https://www.w3.org/TR/html51/syntax.html#self-closing-start-tag-state
    function stateSelfClosingStartTag(char) {
        if (char === '>') {
            updateCurrentTag({ isClosing: true });
            emitTagAndPreviousTextNode(); // resets to Data state as well
        }
        else {
            state = 4 /* BeforeAttributeName */;
        }
    }
    // https://www.w3.org/TR/html51/syntax.html#markup-declaration-open-state
    // (HTML Comments or !DOCTYPE)
    //
    // Note: `charIdx` points at the first character after '<!' here, and the
    // main loop increments `charIdx` once more after this function returns.
    // We therefore advance by one LESS than the length of the matched
    // sequence, so that the next iteration reads the character immediately
    // following it. (Advancing by the full length would silently skip that
    // character, e.g. breaking '<!---->' and '<!DOCTYPE>'.)
    function stateMarkupDeclarationOpen(char) {
        if (html.slice(charIdx, charIdx + 2) === '--') {
            // html comment
            charIdx += 1; // "consume" the second '-' (the loop consumes the first)
            updateCurrentTag({ type: 'comment' });
            state = 14 /* CommentStart */;
        }
        else if (html.slice(charIdx, charIdx + 7).toUpperCase() === 'DOCTYPE') {
            charIdx += 6; // "consume" 'OCTYPE' (the loop consumes the 'D')
            updateCurrentTag({ type: 'doctype' });
            state = 20 /* Doctype */;
        }
        else {
            // At this point, the spec specifies that the state machine should
            // enter the "bogus comment" state, in which case any character(s)
            // after the '<!' that were read should become an HTML comment up
            // until the first '>' that is read (or EOF). Instead, we'll assume
            // that a user just typed '<!' as part of text data
            resetToDataState();
        }
    }
    // Handles after the sequence '<!--' has been read
    // https://www.w3.org/TR/html51/syntax.html#comment-start-state
    function stateCommentStart(char) {
        if (char === '-') {
            // We've read the sequence '<!---' at this point (3 dashes)
            state = 15 /* CommentStartDash */;
        }
        else if (char === '>') {
            // At this point, we'll assume the comment wasn't a real comment
            // so we'll just emit it as data. We basically read the sequence
            // '<!-->'
            resetToDataState();
        }
        else {
            // Any other char, take it as part of the comment
            state = 16 /* Comment */;
        }
    }
    // We've read the sequence '<!---' at this point (3 dashes)
    // https://www.w3.org/TR/html51/syntax.html#comment-start-dash-state
    function stateCommentStartDash(char) {
        if (char === '-') {
            // We've read '<!----' (4 dashes) at this point
            state = 18 /* CommentEnd */;
        }
        else if (char === '>') {
            // At this point, we'll assume the comment wasn't a real comment
            // so we'll just emit it as data. We basically read the sequence
            // '<!--->'
            resetToDataState();
        }
        else {
            // Anything else, take it as a valid comment
            state = 16 /* Comment */;
        }
    }
    // Currently reading the comment's text (data)
    // https://www.w3.org/TR/html51/syntax.html#comment-state
    function stateComment(char) {
        if (char === '-') {
            state = 17 /* CommentEndDash */;
        }
        else {
            // Any other character, stay in the Comment state
        }
    }
    // When we've read the first dash inside a comment, it may signal the
    // end of the comment if we read another dash
    // https://www.w3.org/TR/html51/syntax.html#comment-end-dash-state
    function stateCommentEndDash(char) {
        if (char === '-') {
            state = 18 /* CommentEnd */;
        }
        else {
            // Wasn't a dash, must still be part of the comment
            state = 16 /* Comment */;
        }
    }
    // After we've read two dashes inside a comment, it may signal the end of
    // the comment if we then read a '>' char
    // https://www.w3.org/TR/html51/syntax.html#comment-end-state
    function stateCommentEnd(char) {
        if (char === '>') {
            emitTagAndPreviousTextNode();
        }
        else if (char === '!') {
            state = 19 /* CommentEndBang */;
        }
        else if (char === '-') {
            // A 3rd '-' has been read: stay in the CommentEnd state
        }
        else {
            // Anything else, switch back to the comment state since we didn't
            // read the full "end comment" sequence (i.e. '-->')
            state = 16 /* Comment */;
        }
    }
    // We've read the sequence '--!' inside of a comment
    // https://www.w3.org/TR/html51/syntax.html#comment-end-bang-state
    function stateCommentEndBang(char) {
        if (char === '-') {
            // We read the sequence '--!-' inside of a comment. The last dash
            // could signify that the comment is going to close
            state = 17 /* CommentEndDash */;
        }
        else if (char === '>') {
            // End of comment with the sequence '--!>'
            emitTagAndPreviousTextNode();
        }
        else {
            // The '--!' was not followed by a '>', continue reading the
            // comment's text
            state = 16 /* Comment */;
        }
    }
    /**
     * For DOCTYPES in particular, we don't care about the attributes. Just
     * advance to the '>' character and emit the tag, unless we find a '<'
     * character in which case we'll start a new tag.
     *
     * Example doctype tag:
     *    <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
     *
     * Actual spec: https://www.w3.org/TR/html51/syntax.html#doctype-state
     */
    function stateDoctype(char) {
        if (char === '>') {
            emitTagAndPreviousTextNode();
        }
        else if (char === '<') {
            startNewTag();
        }
        else {
            // stay in the Doctype state
        }
    }
    /**
     * Resets the state back to the Data state, and removes the current tag.
     *
     * We'll generally run this function whenever a "parse error" is
     * encountered, where the current tag that is being read no longer looks
     * like a real HTML tag.
     */
    function resetToDataState() {
        state = 0 /* Data */;
        currentTag = noCurrentTag;
    }
    /**
     * Starts a new HTML tag at the current index, ignoring any previous HTML
     * tag that was being read.
     *
     * We'll generally run this function whenever we read a new '<' character,
     * including when we read a '<' character inside of an HTML tag that we were
     * previously reading.
     */
    function startNewTag() {
        state = 1 /* TagOpen */;
        currentTag = new CurrentTag({ idx: charIdx });
    }
    /**
     * Once we've decided to emit an open tag, that means we can also emit the
     * text node before it.
     */
    function emitTagAndPreviousTextNode() {
        var textBeforeTag = html.slice(currentDataIdx, currentTag.idx);
        if (textBeforeTag) {
            // Only emit a text node when there is text: it is empty when the
            // html tag was the first element in the html string, or when two
            // tags are next to each other
            onText(textBeforeTag, currentDataIdx);
        }
        if (currentTag.type === 'comment') {
            onComment(currentTag.idx);
        }
        else if (currentTag.type === 'doctype') {
            onDoctype(currentTag.idx);
        }
        else {
            if (currentTag.isOpening) {
                onOpenTag(currentTag.name, currentTag.idx);
            }
            if (currentTag.isClosing) {
                // note: self-closing tags will emit both opening and closing
                onCloseTag(currentTag.name, currentTag.idx);
            }
        }
        // Since we just emitted a tag, reset to the data state for the next char
        resetToDataState();
        currentDataIdx = charIdx + 1;
    }
    /**
     * Emits the text from the current data start index up to (but not
     * including) the current character index.
     */
    function emitText() {
        var text = html.slice(currentDataIdx, charIdx);
        onText(text, currentDataIdx);
        currentDataIdx = charIdx + 1;
    }
    /**
     * Captures the tag name from the start of the tag to the current character
     * index, and converts it to lower case
     */
    function captureTagName() {
        // Skip '<' for an opening tag, or '</' for a closing tag
        var startIdx = currentTag.idx + (currentTag.isClosing ? 2 : 1);
        return html.slice(startIdx, charIdx).toLowerCase();
    }
    /**
     * Causes the main loop to re-consume the current character, such as after
     * encountering a "parse error" that changed state and needs to reconsume
     * the same character in that new state.
     */
    function reconsumeCurrentCharacter() {
        // Cancels out the `charIdx++` at the end of the current loop iteration
        charIdx--;
    }
}
625exports.parseHtml = parseHtml;
/**
 * Plain record describing the tag that the HTML parser is currently reading.
 *
 * Every field has a fallback, so the constructor may be called with no
 * argument at all to obtain a "no tag" placeholder.
 *
 * @param {Object} [cfg] Initial values.
 * @param {Number} [cfg.idx] Offset stored on the tag; -1 when not provided.
 * @param {String} [cfg.type] Kind of tag; falls back to 'tag' when missing
 *   or falsy.
 * @param {String} [cfg.name] Tag name; falls back to '' when missing or falsy.
 * @param {Boolean} [cfg.isOpening] Coerced to a boolean.
 * @param {Boolean} [cfg.isClosing] Coerced to a boolean.
 */
function CurrentTag(cfg) {
    var options = cfg === undefined ? {} : cfg;
    var providedIdx = options.idx;
    // An explicit idx of 0 is valid, so only `undefined` triggers the fallback
    this.idx = providedIdx === undefined ? -1 : providedIdx;
    this.type = options.type || 'tag';
    this.name = options.name || '';
    this.isOpening = Boolean(options.isOpening);
    this.isClosing = Boolean(options.isClosing);
}
637//# sourceMappingURL=parse-html.js.map
Note: See TracBrowser for help on using the repository browser.