[d24f17c] | 1 | "use strict";
|
---|
| 2 | Object.defineProperty(exports, "__esModule", { value: true });
|
---|
| 3 | exports.parseHtml = void 0;
|
---|
| 4 | var tslib_1 = require("tslib");
|
---|
| 5 | var regex_lib_1 = require("../regex-lib");
|
---|
| 6 | var utils_1 = require("../utils");
|
---|
| 7 | // For debugging: search for other "For debugging" lines
|
---|
| 8 | // import CliTable from 'cli-table';
|
---|
/**
 * Parses an HTML string, calling the callbacks to notify of tags and text.
 *
 * ## History
 *
 * This file previously used a regular expression to find html tags in the input
 * text. Unfortunately, we ran into a bunch of catastrophic backtracking issues
 * with certain input text, causing Autolinker to either hang or just take a
 * really long time to parse the string.
 *
 * The current code is intended to be a O(n) algorithm that walks through
 * the string in one pass, and tries to be as cheap as possible. We don't need
 * to implement the full HTML spec, but rather simply determine where the string
 * looks like an HTML tag, and where it looks like text (so that we can autolink
 * that).
 *
 * This state machine parser is intended just to be a simple but performant
 * parser of HTML for the subset of requirements we have. We simply need to:
 *
 * 1. Determine where HTML tags are
 * 2. Determine the tag name (Autolinker specifically only cares about <a>,
 *    <script>, and <style> tags, so as not to link any text within them)
 *
 * We don't need to:
 *
 * 1. Create a parse tree
 * 2. Auto-close tags with invalid markup
 * 3. etc.
 *
 * The other intention behind this is that we didn't want to add external
 * dependencies on the Autolinker utility which would increase its size. For
 * instance, adding htmlparser2 adds 125kb to the minified output file,
 * increasing its final size from 47kb to 172kb (at the time of writing). It
 * also doesn't work exactly correctly, treating the string "<3 blah blah blah"
 * as an HTML tag.
 *
 * Reference for HTML spec:
 *
 *     https://www.w3.org/TR/html51/syntax.html#sec-tokenization
 *
 * @param {String} html The HTML to parse
 * @param {Object} callbacks
 * @param {Function} callbacks.onOpenTag Callback function to call when an open
 *   tag is parsed. Called with the lower-cased tagName as its first argument,
 *   and the offset (number) of the tag's '<' character as its second.
 * @param {Function} callbacks.onCloseTag Callback function to call when a close
 *   tag is parsed. Called with the lower-cased tagName as its first argument,
 *   and the offset (number) of the tag's '<' character as its second. If a
 *   self-closing tag is found, `onCloseTag` is called immediately after
 *   `onOpenTag`.
 * @param {Function} callbacks.onText Callback function to call when text (i.e
 *   not an HTML tag) is parsed. Called with the text (string) as its first
 *   argument, and offset (number) into the string as its second.
 * @param {Function} callbacks.onComment Callback function to call when an HTML
 *   comment is parsed. Called with the offset (number) of the comment's '<'
 *   character.
 * @param {Function} callbacks.onDoctype Callback function to call when a
 *   doctype declaration is parsed. Called with the offset (number) of the
 *   declaration's '<' character.
 */
function parseHtml(html, _a) {
    var onOpenTag = _a.onOpenTag, onCloseTag = _a.onCloseTag, onText = _a.onText, onComment = _a.onComment, onDoctype = _a.onDoctype;
    // Shared sentinel meaning "not currently inside a tag" (its idx is -1)
    var noCurrentTag = new CurrentTag();
    var charIdx = 0, len = html.length, state = 0 /* Data */, currentDataIdx = 0, // where the current data start index is
    currentTag = noCurrentTag; // describes the current tag that is being read
    // For debugging: search for other "For debugging" lines
    // const table = new CliTable( {
    // 	head: [ 'charIdx', 'char', 'state', 'currentDataIdx', 'currentOpenTagIdx', 'tag.type' ]
    // } );
    while (charIdx < len) {
        var char = html.charAt(charIdx);
        // For debugging: search for other "For debugging" lines
        // ALSO: Temporarily remove the 'const' keyword on the State enum
        // table.push(
        // 	[ charIdx, char, State[ state ], currentDataIdx, currentTag.idx, currentTag.idx === -1 ? '' : currentTag.type ]
        // );
        switch (state) {
            case 0 /* Data */:
                stateData(char);
                break;
            case 1 /* TagOpen */:
                stateTagOpen(char);
                break;
            case 2 /* EndTagOpen */:
                stateEndTagOpen(char);
                break;
            case 3 /* TagName */:
                stateTagName(char);
                break;
            case 4 /* BeforeAttributeName */:
                stateBeforeAttributeName(char);
                break;
            case 5 /* AttributeName */:
                stateAttributeName(char);
                break;
            case 6 /* AfterAttributeName */:
                stateAfterAttributeName(char);
                break;
            case 7 /* BeforeAttributeValue */:
                stateBeforeAttributeValue(char);
                break;
            case 8 /* AttributeValueDoubleQuoted */:
                stateAttributeValueDoubleQuoted(char);
                break;
            case 9 /* AttributeValueSingleQuoted */:
                stateAttributeValueSingleQuoted(char);
                break;
            case 10 /* AttributeValueUnquoted */:
                stateAttributeValueUnquoted(char);
                break;
            case 11 /* AfterAttributeValueQuoted */:
                stateAfterAttributeValueQuoted(char);
                break;
            case 12 /* SelfClosingStartTag */:
                stateSelfClosingStartTag(char);
                break;
            case 13 /* MarkupDeclarationOpenState */:
                stateMarkupDeclarationOpen(char);
                break;
            case 14 /* CommentStart */:
                stateCommentStart(char);
                break;
            case 15 /* CommentStartDash */:
                stateCommentStartDash(char);
                break;
            case 16 /* Comment */:
                stateComment(char);
                break;
            case 17 /* CommentEndDash */:
                stateCommentEndDash(char);
                break;
            case 18 /* CommentEnd */:
                stateCommentEnd(char);
                break;
            case 19 /* CommentEndBang */:
                stateCommentEndBang(char);
                break;
            case 20 /* Doctype */:
                stateDoctype(char);
                break;
            default:
                (0, utils_1.throwUnhandledCaseError)(state);
        }
        // For debugging: search for other "For debugging" lines
        // ALSO: Temporarily remove the 'const' keyword on the State enum
        // table.push(
        // 	[ charIdx, char, State[ state ], currentDataIdx, currentTag.idx, currentTag.idx === -1 ? '' : currentTag.type ]
        // );
        charIdx++;
    }
    // Flush whatever remains after the last emitted tag (including any
    // incomplete tag that never reached its '>') as plain text
    if (currentDataIdx < charIdx) {
        emitText();
    }
    // For debugging: search for other "For debugging" lines
    // console.log( '\n' + table.toString() );
    // Called when non-tags are being read (i.e. the text around HTML tags)
    // https://www.w3.org/TR/html51/syntax.html#data-state
    function stateData(char) {
        if (char === '<') {
            startNewTag();
        }
    }
    // Called after a '<' is read from the Data state
    // https://www.w3.org/TR/html51/syntax.html#tag-open-state
    function stateTagOpen(char) {
        if (char === '!') {
            state = 13 /* MarkupDeclarationOpenState */;
        }
        else if (char === '/') {
            state = 2 /* EndTagOpen */;
            currentTag = new CurrentTag((0, tslib_1.__assign)((0, tslib_1.__assign)({}, currentTag), { isClosing: true }));
        }
        else if (char === '<') {
            // start of another tag (ignore the previous, incomplete one)
            startNewTag();
        }
        else if (regex_lib_1.letterRe.test(char)) {
            // tag name start (and no '/' read)
            state = 3 /* TagName */;
            currentTag = new CurrentTag((0, tslib_1.__assign)((0, tslib_1.__assign)({}, currentTag), { isOpening: true }));
        }
        else {
            // Any other
            state = 0 /* Data */;
            currentTag = noCurrentTag;
        }
    }
    // After a '<x', '</x' sequence is read (where 'x' is a letter character),
    // this is to continue reading the tag name
    // https://www.w3.org/TR/html51/syntax.html#tag-name-state
    function stateTagName(char) {
        if (regex_lib_1.whitespaceRe.test(char)) {
            currentTag = new CurrentTag((0, tslib_1.__assign)((0, tslib_1.__assign)({}, currentTag), { name: captureTagName() }));
            state = 4 /* BeforeAttributeName */;
        }
        else if (char === '<') {
            // start of another tag (ignore the previous, incomplete one)
            startNewTag();
        }
        else if (char === '/') {
            currentTag = new CurrentTag((0, tslib_1.__assign)((0, tslib_1.__assign)({}, currentTag), { name: captureTagName() }));
            state = 12 /* SelfClosingStartTag */;
        }
        else if (char === '>') {
            currentTag = new CurrentTag((0, tslib_1.__assign)((0, tslib_1.__assign)({}, currentTag), { name: captureTagName() }));
            emitTagAndPreviousTextNode(); // resets to Data state as well
        }
        else if (!regex_lib_1.letterRe.test(char) && !regex_lib_1.digitRe.test(char) && char !== ':') {
            // Anything else that does not form an html tag. Note: the colon
            // character is accepted for XML namespaced tags
            resetToDataState();
        }
        else {
            // continue reading tag name
        }
    }
    // Called after the '/' is read from a '</' sequence
    // https://www.w3.org/TR/html51/syntax.html#end-tag-open-state
    function stateEndTagOpen(char) {
        if (char === '>') {
            // parse error. Encountered "</>". Skip it without treating as a tag
            resetToDataState();
        }
        else if (regex_lib_1.letterRe.test(char)) {
            state = 3 /* TagName */;
        }
        else {
            // some other non-tag-like character, don't treat this as a tag
            resetToDataState();
        }
    }
    // https://www.w3.org/TR/html51/syntax.html#before-attribute-name-state
    function stateBeforeAttributeName(char) {
        if (regex_lib_1.whitespaceRe.test(char)) {
            // stay in BeforeAttributeName state - continue reading chars
        }
        else if (char === '/') {
            state = 12 /* SelfClosingStartTag */;
        }
        else if (char === '>') {
            emitTagAndPreviousTextNode(); // resets to Data state as well
        }
        else if (char === '<') {
            // start of another tag (ignore the previous, incomplete one)
            startNewTag();
        }
        else if (char === "=" || regex_lib_1.quoteRe.test(char) || regex_lib_1.controlCharsRe.test(char)) {
            // "Parse error" characters that, according to the spec, should be
            // appended to the attribute name, but we'll treat these characters
            // as not forming a real HTML tag
            resetToDataState();
        }
        else {
            // Any other char, start of a new attribute name
            state = 5 /* AttributeName */;
        }
    }
    // https://www.w3.org/TR/html51/syntax.html#attribute-name-state
    function stateAttributeName(char) {
        if (regex_lib_1.whitespaceRe.test(char)) {
            state = 6 /* AfterAttributeName */;
        }
        else if (char === '/') {
            state = 12 /* SelfClosingStartTag */;
        }
        else if (char === '=') {
            state = 7 /* BeforeAttributeValue */;
        }
        else if (char === '>') {
            emitTagAndPreviousTextNode(); // resets to Data state as well
        }
        else if (char === '<') {
            // start of another tag (ignore the previous, incomplete one)
            startNewTag();
        }
        else if (regex_lib_1.quoteRe.test(char)) {
            // "Parse error" characters that, according to the spec, should be
            // appended to the attribute name, but we'll treat these characters
            // as not forming a real HTML tag
            resetToDataState();
        }
        else {
            // anything else: continue reading attribute name
        }
    }
    // https://www.w3.org/TR/html51/syntax.html#after-attribute-name-state
    function stateAfterAttributeName(char) {
        if (regex_lib_1.whitespaceRe.test(char)) {
            // ignore the character - continue reading
        }
        else if (char === '/') {
            state = 12 /* SelfClosingStartTag */;
        }
        else if (char === '=') {
            state = 7 /* BeforeAttributeValue */;
        }
        else if (char === '>') {
            emitTagAndPreviousTextNode();
        }
        else if (char === '<') {
            // start of another tag (ignore the previous, incomplete one)
            startNewTag();
        }
        else if (regex_lib_1.quoteRe.test(char)) {
            // "Parse error" characters that, according to the spec, should be
            // appended to the attribute name, but we'll treat these characters
            // as not forming a real HTML tag
            resetToDataState();
        }
        else {
            // Any other character, start a new attribute in the current tag
            state = 5 /* AttributeName */;
        }
    }
    // https://www.w3.org/TR/html51/syntax.html#before-attribute-value-state
    function stateBeforeAttributeValue(char) {
        if (regex_lib_1.whitespaceRe.test(char)) {
            // ignore the character - continue reading
        }
        else if (char === "\"") {
            state = 8 /* AttributeValueDoubleQuoted */;
        }
        else if (char === "'") {
            state = 9 /* AttributeValueSingleQuoted */;
        }
        else if (/[>=`]/.test(char)) {
            // Invalid chars after an '=' for an attribute value, don't count
            // the current tag as an HTML tag
            resetToDataState();
        }
        else if (char === '<') {
            // start of another tag (ignore the previous, incomplete one)
            startNewTag();
        }
        else {
            // Any other character, consider it an unquoted attribute value
            state = 10 /* AttributeValueUnquoted */;
        }
    }
    // https://www.w3.org/TR/html51/syntax.html#attribute-value-double-quoted-state
    function stateAttributeValueDoubleQuoted(char) {
        if (char === "\"") {
            // end the current double-quoted attribute
            state = 11 /* AfterAttributeValueQuoted */;
        }
        else {
            // consume the character as part of the double-quoted attribute value
        }
    }
    // https://www.w3.org/TR/html51/syntax.html#attribute-value-single-quoted-state
    function stateAttributeValueSingleQuoted(char) {
        if (char === "'") {
            // end the current single-quoted attribute
            state = 11 /* AfterAttributeValueQuoted */;
        }
        else {
            // consume the character as part of the single-quoted attribute value
        }
    }
    // https://www.w3.org/TR/html51/syntax.html#attribute-value-unquoted-state
    function stateAttributeValueUnquoted(char) {
        if (regex_lib_1.whitespaceRe.test(char)) {
            state = 4 /* BeforeAttributeName */;
        }
        else if (char === '>') {
            emitTagAndPreviousTextNode();
        }
        else if (char === '<') {
            // start of another tag (ignore the previous, incomplete one)
            startNewTag();
        }
        else {
            // Any other character, treat it as part of the attribute value
        }
    }
    // https://www.w3.org/TR/html51/syntax.html#after-attribute-value-quoted-state
    function stateAfterAttributeValueQuoted(char) {
        if (regex_lib_1.whitespaceRe.test(char)) {
            state = 4 /* BeforeAttributeName */;
        }
        else if (char === '/') {
            state = 12 /* SelfClosingStartTag */;
        }
        else if (char === '>') {
            emitTagAndPreviousTextNode();
        }
        else if (char === '<') {
            // start of another tag (ignore the previous, incomplete one)
            startNewTag();
        }
        else {
            // Any other character, "parse error". Spec says to switch to the
            // BeforeAttributeState and re-consume the character, as it may be
            // the start of a new attribute name
            state = 4 /* BeforeAttributeName */;
            reconsumeCurrentCharacter();
        }
    }
    // A '/' has just been read in the current tag (presumably for '/>'), and
    // this handles the next character
    // https://www.w3.org/TR/html51/syntax.html#self-closing-start-tag-state
    function stateSelfClosingStartTag(char) {
        if (char === '>') {
            currentTag = new CurrentTag((0, tslib_1.__assign)((0, tslib_1.__assign)({}, currentTag), { isClosing: true }));
            emitTagAndPreviousTextNode(); // resets to Data state as well
        }
        else {
            state = 4 /* BeforeAttributeName */;
        }
    }
    // https://www.w3.org/TR/html51/syntax.html#markup-declaration-open-state
    // (HTML Comments or !DOCTYPE)
    //
    // Entered on the first character after '<!', so `charIdx` already points
    // at the first character of a potential '--' or 'DOCTYPE' sequence. The
    // main loop increments `charIdx` once more after this handler returns, so
    // we only advance by (sequence length - 1) here. Advancing by the full
    // length would skip the character that follows the sequence, which made
    // '<!---->' and '<!DOCTYPE>' fail to be recognized.
    function stateMarkupDeclarationOpen(char) {
        if (html.slice(charIdx, charIdx + 2) === '--') {
            // html comment
            charIdx += 1; // "consume" the second '-'; the loop moves past it
            currentTag = new CurrentTag((0, tslib_1.__assign)((0, tslib_1.__assign)({}, currentTag), { type: 'comment' }));
            state = 14 /* CommentStart */;
        }
        else if (html.slice(charIdx, charIdx + 7).toUpperCase() === 'DOCTYPE') {
            charIdx += 6; // "consume" through the final 'E'; the loop moves past it
            currentTag = new CurrentTag((0, tslib_1.__assign)((0, tslib_1.__assign)({}, currentTag), { type: 'doctype' }));
            state = 20 /* Doctype */;
        }
        else {
            // At this point, the spec specifies that the state machine should
            // enter the "bogus comment" state, in which case any character(s)
            // after the '<!' that were read should become an HTML comment up
            // until the first '>' that is read (or EOF). Instead, we'll assume
            // that a user just typed '<!' as part of text data
            resetToDataState();
        }
    }
    // Handles after the sequence '<!--' has been read
    // https://www.w3.org/TR/html51/syntax.html#comment-start-state
    function stateCommentStart(char) {
        if (char === '-') {
            // We've read the sequence '<!---' at this point (3 dashes)
            state = 15 /* CommentStartDash */;
        }
        else if (char === '>') {
            // At this point, we'll assume the comment wasn't a real comment
            // so we'll just emit it as data. We basically read the sequence
            // '<!-->'
            resetToDataState();
        }
        else {
            // Any other char, take it as part of the comment
            state = 16 /* Comment */;
        }
    }
    // We've read the sequence '<!---' at this point (3 dashes)
    // https://www.w3.org/TR/html51/syntax.html#comment-start-dash-state
    function stateCommentStartDash(char) {
        if (char === '-') {
            // We've read '<!----' (4 dashes) at this point
            state = 18 /* CommentEnd */;
        }
        else if (char === '>') {
            // At this point, we'll assume the comment wasn't a real comment
            // so we'll just emit it as data. We basically read the sequence
            // '<!--->'
            resetToDataState();
        }
        else {
            // Anything else, take it as a valid comment
            state = 16 /* Comment */;
        }
    }
    // Currently reading the comment's text (data)
    // https://www.w3.org/TR/html51/syntax.html#comment-state
    function stateComment(char) {
        if (char === '-') {
            state = 17 /* CommentEndDash */;
        }
        else {
            // Any other character, stay in the Comment state
        }
    }
    // When we've read the first dash inside a comment, it may signal the
    // end of the comment if we read another dash
    // https://www.w3.org/TR/html51/syntax.html#comment-end-dash-state
    function stateCommentEndDash(char) {
        if (char === '-') {
            state = 18 /* CommentEnd */;
        }
        else {
            // Wasn't a dash, must still be part of the comment
            state = 16 /* Comment */;
        }
    }
    // After we've read two dashes inside a comment, it may signal the end of
    // the comment if we then read a '>' char
    // https://www.w3.org/TR/html51/syntax.html#comment-end-state
    function stateCommentEnd(char) {
        if (char === '>') {
            emitTagAndPreviousTextNode();
        }
        else if (char === '!') {
            state = 19 /* CommentEndBang */;
        }
        else if (char === '-') {
            // A 3rd '-' has been read: stay in the CommentEnd state
        }
        else {
            // Anything else, switch back to the comment state since we didn't
            // read the full "end comment" sequence (i.e. '-->')
            state = 16 /* Comment */;
        }
    }
    // We've read the sequence '--!' inside of a comment
    // https://www.w3.org/TR/html51/syntax.html#comment-end-bang-state
    function stateCommentEndBang(char) {
        if (char === '-') {
            // We read the sequence '--!-' inside of a comment. The last dash
            // could signify that the comment is going to close
            state = 17 /* CommentEndDash */;
        }
        else if (char === '>') {
            // End of comment with the sequence '--!>'
            emitTagAndPreviousTextNode();
        }
        else {
            // The '--!' was not followed by a '>', continue reading the
            // comment's text
            state = 16 /* Comment */;
        }
    }
    /**
     * For DOCTYPES in particular, we don't care about the attributes. Just
     * advance to the '>' character and emit the tag, unless we find a '<'
     * character in which case we'll start a new tag.
     *
     * Example doctype tag:
     *    <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
     *
     * Actual spec: https://www.w3.org/TR/html51/syntax.html#doctype-state
     */
    function stateDoctype(char) {
        if (char === '>') {
            emitTagAndPreviousTextNode();
        }
        else if (char === '<') {
            startNewTag();
        }
        else {
            // stay in the Doctype state
        }
    }
    /**
     * Resets the state back to the Data state, and removes the current tag.
     *
     * We'll generally run this function whenever a "parse error" is
     * encountered, where the current tag that is being read no longer looks
     * like a real HTML tag.
     */
    function resetToDataState() {
        state = 0 /* Data */;
        currentTag = noCurrentTag;
    }
    /**
     * Starts a new HTML tag at the current index, ignoring any previous HTML
     * tag that was being read.
     *
     * We'll generally run this function whenever we read a new '<' character,
     * including when we read a '<' character inside of an HTML tag that we were
     * previously reading.
     */
    function startNewTag() {
        state = 1 /* TagOpen */;
        currentTag = new CurrentTag({ idx: charIdx });
    }
    /**
     * Once we've decided to emit an open tag, that means we can also emit the
     * text node before it.
     */
    function emitTagAndPreviousTextNode() {
        var textBeforeTag = html.slice(currentDataIdx, currentTag.idx);
        // `textBeforeTag` is empty when the html tag was the first element in
        // the html string, or when two tags are next to each other, in which
        // case we should not emit a text node
        if (textBeforeTag) {
            onText(textBeforeTag, currentDataIdx);
        }
        if (currentTag.type === 'comment') {
            onComment(currentTag.idx);
        }
        else if (currentTag.type === 'doctype') {
            onDoctype(currentTag.idx);
        }
        else {
            if (currentTag.isOpening) {
                onOpenTag(currentTag.name, currentTag.idx);
            }
            if (currentTag.isClosing) {
                // note: self-closing tags will emit both opening and closing
                onCloseTag(currentTag.name, currentTag.idx);
            }
        }
        // Since we just emitted a tag, reset to the data state for the next char
        resetToDataState();
        currentDataIdx = charIdx + 1;
    }
    /**
     * Emits the text from the current data start index up to (but not
     * including) the current character index.
     */
    function emitText() {
        var text = html.slice(currentDataIdx, charIdx);
        onText(text, currentDataIdx);
        currentDataIdx = charIdx + 1;
    }
    /**
     * Captures the tag name from the start of the tag to the current character
     * index, and converts it to lower case
     */
    function captureTagName() {
        // Skip '<' for an opening tag, or '</' for a closing tag
        var startIdx = currentTag.idx + (currentTag.isClosing ? 2 : 1);
        return html.slice(startIdx, charIdx).toLowerCase();
    }
    /**
     * Causes the main loop to re-consume the current character, such as after
     * encountering a "parse error" that changed state and needs to reconsume
     * the same character in that new state.
     */
    function reconsumeCurrentCharacter() {
        charIdx--;
    }
}
|
---|
| 625 | exports.parseHtml = parseHtml;
|
---|
/**
 * Plain record describing the tag currently being read by the parser.
 *
 * Fields set by the constructor from the optional `cfg` object:
 *   - idx:       `cfg.idx` when it is not `undefined`, otherwise -1
 *   - type:      `cfg.type` when truthy, otherwise 'tag'
 *   - name:      `cfg.name` when truthy, otherwise ''
 *   - isOpening: `cfg.isOpening` coerced to a boolean
 *   - isClosing: `cfg.isClosing` coerced to a boolean
 */
var CurrentTag = /** @class */ (function () {
    function CurrentTag(cfg) {
        // A missing config behaves like an empty one, so every field gets
        // its default value
        var options = cfg === undefined ? {} : cfg;
        var idx = options.idx;
        var type = options.type;
        var name = options.name;
        if (idx === undefined) {
            this.idx = -1;
        }
        else {
            this.idx = idx;
        }
        this.type = type ? type : 'tag';
        this.name = name ? name : '';
        this.isOpening = Boolean(options.isOpening);
        this.isClosing = Boolean(options.isClosing);
    }
    return CurrentTag;
})();
|
---|
| 637 | //# sourceMappingURL=parse-html.js.map |
---|