[79a0317] | 1 | /*!
|
---|
| 2 | * HTML Parser By John Resig (ejohn.org)
|
---|
| 3 | * Modified by Juriy "kangax" Zaytsev
|
---|
| 4 | * Original code by Erik Arvidsson, Mozilla Public License
|
---|
| 5 | * http://erik.eae.net/simplehtmlparser/simplehtmlparser.js
|
---|
| 6 | */
|
---|
| 7 |
|
---|
/*
 * // Use like so:
 * await new HTMLParser(htmlString, {
 *   start: function(tag, attrs, unary) {},
 *   end: function(tag) {},
 *   chars: function(text) {},
 *   comment: function(text) {}
 * }).parse();
 *
 * // or to get an XML string:
 * HTMLtoXML(htmlString);
 *
 * // or to get an XML DOM Document
 * HTMLtoDOM(htmlString);
 *
 * // or to inject into an existing document/DOM node
 * HTMLtoDOM(htmlString, document);
 * HTMLtoDOM(htmlString, document.body);
 *
 */
|
---|
| 28 |
|
---|
| 29 | /* global ActiveXObject, DOMDocument */
|
---|
| 30 |
|
---|
| 31 | 'use strict';
|
---|
| 32 |
|
---|
| 33 | var createMapFromString = require('./utils').createMapFromString;
|
---|
| 34 | var replaceAsync = require('./utils').replaceAsync;
|
---|
| 35 |
|
---|
/**
 * Builds a lookup from a comma-separated list of names by delegating to
 * `createMapFromString`, always passing `true` as its second argument.
 *
 * @param {string} values - Comma-separated list of names.
 * @returns {*} Whatever `createMapFromString` returns; the rest of this file
 *   calls the result as a predicate, e.g. `empty(tagName)`.
 */
function makeMap(values) {
  var secondArgument = true;
  var lookup = createMapFromString(values, secondArgument);
  return lookup;
}
|
---|
| 39 |
|
---|
// Regular expressions for parsing tags and attributes.

// Attribute name: one or more characters that are not whitespace, quotes,
// angle brackets, a slash or an equals sign.
const singleAttrIdentifier = /([^\s"'<>/=]+)/;

// Tokens accepted between an attribute name and its value. Handlers can add
// more through `customAttrAssign` (see joinSingleAttrAssigns).
const singleAttrAssigns = [/=/];

// Regex sources for the three attribute-value forms, each with one capture.
const singleAttrValues = [
  // attr value double quotes
  /"([^"]*)"+/.source,
  // attr value, single quotes
  /'([^']*)'+/.source,
  // attr value, no quotes
  /([^ \t\n\f\r"'`=<>]+)/.source
];

// Regex source that captures a (optionally prefixed) qualified name.
// https://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-QName
const qnameCapture = (function() {
  // based on https://www.npmjs.com/package/ncname
  var combiningChar = '\\u0300-\\u0345\\u0360\\u0361\\u0483-\\u0486\\u0591-\\u05A1\\u05A3-\\u05B9\\u05BB-\\u05BD\\u05BF\\u05C1\\u05C2\\u05C4\\u064B-\\u0652\\u0670\\u06D6-\\u06E4\\u06E7\\u06E8\\u06EA-\\u06ED\\u0901-\\u0903\\u093C\\u093E-\\u094D\\u0951-\\u0954\\u0962\\u0963\\u0981-\\u0983\\u09BC\\u09BE-\\u09C4\\u09C7\\u09C8\\u09CB-\\u09CD\\u09D7\\u09E2\\u09E3\\u0A02\\u0A3C\\u0A3E-\\u0A42\\u0A47\\u0A48\\u0A4B-\\u0A4D\\u0A70\\u0A71\\u0A81-\\u0A83\\u0ABC\\u0ABE-\\u0AC5\\u0AC7-\\u0AC9\\u0ACB-\\u0ACD\\u0B01-\\u0B03\\u0B3C\\u0B3E-\\u0B43\\u0B47\\u0B48\\u0B4B-\\u0B4D\\u0B56\\u0B57\\u0B82\\u0B83\\u0BBE-\\u0BC2\\u0BC6-\\u0BC8\\u0BCA-\\u0BCD\\u0BD7\\u0C01-\\u0C03\\u0C3E-\\u0C44\\u0C46-\\u0C48\\u0C4A-\\u0C4D\\u0C55\\u0C56\\u0C82\\u0C83\\u0CBE-\\u0CC4\\u0CC6-\\u0CC8\\u0CCA-\\u0CCD\\u0CD5\\u0CD6\\u0D02\\u0D03\\u0D3E-\\u0D43\\u0D46-\\u0D48\\u0D4A-\\u0D4D\\u0D57\\u0E31\\u0E34-\\u0E3A\\u0E47-\\u0E4E\\u0EB1\\u0EB4-\\u0EB9\\u0EBB\\u0EBC\\u0EC8-\\u0ECD\\u0F18\\u0F19\\u0F35\\u0F37\\u0F39\\u0F3E\\u0F3F\\u0F71-\\u0F84\\u0F86-\\u0F8B\\u0F90-\\u0F95\\u0F97\\u0F99-\\u0FAD\\u0FB1-\\u0FB7\\u0FB9\\u20D0-\\u20DC\\u20E1\\u302A-\\u302F\\u3099\\u309A';
  var digit = '0-9\\u0660-\\u0669\\u06F0-\\u06F9\\u0966-\\u096F\\u09E6-\\u09EF\\u0A66-\\u0A6F\\u0AE6-\\u0AEF\\u0B66-\\u0B6F\\u0BE7-\\u0BEF\\u0C66-\\u0C6F\\u0CE6-\\u0CEF\\u0D66-\\u0D6F\\u0E50-\\u0E59\\u0ED0-\\u0ED9\\u0F20-\\u0F29';
  var extender = '\\xB7\\u02D0\\u02D1\\u0387\\u0640\\u0E46\\u0EC6\\u3005\\u3031-\\u3035\\u309D\\u309E\\u30FC-\\u30FE';
  // NOTE: this literal must stay on a single line; splitting it breaks the
  // `\u115F-\u1161` range and makes the whole declaration a syntax error.
  var letter = 'A-Za-z\\xC0-\\xD6\\xD8-\\xF6\\xF8-\\u0131\\u0134-\\u013E\\u0141-\\u0148\\u014A-\\u017E\\u0180-\\u01C3\\u01CD-\\u01F0\\u01F4\\u01F5\\u01FA-\\u0217\\u0250-\\u02A8\\u02BB-\\u02C1\\u0386\\u0388-\\u038A\\u038C\\u038E-\\u03A1\\u03A3-\\u03CE\\u03D0-\\u03D6\\u03DA\\u03DC\\u03DE\\u03E0\\u03E2-\\u03F3\\u0401-\\u040C\\u040E-\\u044F\\u0451-\\u045C\\u045E-\\u0481\\u0490-\\u04C4\\u04C7\\u04C8\\u04CB\\u04CC\\u04D0-\\u04EB\\u04EE-\\u04F5\\u04F8\\u04F9\\u0531-\\u0556\\u0559\\u0561-\\u0586\\u05D0-\\u05EA\\u05F0-\\u05F2\\u0621-\\u063A\\u0641-\\u064A\\u0671-\\u06B7\\u06BA-\\u06BE\\u06C0-\\u06CE\\u06D0-\\u06D3\\u06D5\\u06E5\\u06E6\\u0905-\\u0939\\u093D\\u0958-\\u0961\\u0985-\\u098C\\u098F\\u0990\\u0993-\\u09A8\\u09AA-\\u09B0\\u09B2\\u09B6-\\u09B9\\u09DC\\u09DD\\u09DF-\\u09E1\\u09F0\\u09F1\\u0A05-\\u0A0A\\u0A0F\\u0A10\\u0A13-\\u0A28\\u0A2A-\\u0A30\\u0A32\\u0A33\\u0A35\\u0A36\\u0A38\\u0A39\\u0A59-\\u0A5C\\u0A5E\\u0A72-\\u0A74\\u0A85-\\u0A8B\\u0A8D\\u0A8F-\\u0A91\\u0A93-\\u0AA8\\u0AAA-\\u0AB0\\u0AB2\\u0AB3\\u0AB5-\\u0AB9\\u0ABD\\u0AE0\\u0B05-\\u0B0C\\u0B0F\\u0B10\\u0B13-\\u0B28\\u0B2A-\\u0B30\\u0B32\\u0B33\\u0B36-\\u0B39\\u0B3D\\u0B5C\\u0B5D\\u0B5F-\\u0B61\\u0B85-\\u0B8A\\u0B8E-\\u0B90\\u0B92-\\u0B95\\u0B99\\u0B9A\\u0B9C\\u0B9E\\u0B9F\\u0BA3\\u0BA4\\u0BA8-\\u0BAA\\u0BAE-\\u0BB5\\u0BB7-\\u0BB9\\u0C05-\\u0C0C\\u0C0E-\\u0C10\\u0C12-\\u0C28\\u0C2A-\\u0C33\\u0C35-\\u0C39\\u0C60\\u0C61\\u0C85-\\u0C8C\\u0C8E-\\u0C90\\u0C92-\\u0CA8\\u0CAA-\\u0CB3\\u0CB5-\\u0CB9\\u0CDE\\u0CE0\\u0CE1\\u0D05-\\u0D0C\\u0D0E-\\u0D10\\u0D12-\\u0D28\\u0D2A-\\u0D39\\u0D60\\u0D61\\u0E01-\\u0E2E\\u0E30\\u0E32\\u0E33\\u0E40-\\u0E45\\u0E81\\u0E82\\u0E84\\u0E87\\u0E88\\u0E8A\\u0E8D\\u0E94-\\u0E97\\u0E99-\\u0E9F\\u0EA1-\\u0EA3\\u0EA5\\u0EA7\\u0EAA\\u0EAB\\u0EAD\\u0EAE\\u0EB0\\u0EB2\\u0EB3\\u0EBD\\u0EC0-\\u0EC4\\u0F40-\\u0F47\\u0F49-\\u0F69\\u10A0-\\u10C5\\u10D0-\\u10F6\\u1100\\u1102\\u1103\\u1105-\\u1107\\u1109\\u110B\\u110C\\u110E-\\u1112\\u113C\\u113E\\u1140\\u114C\\u114E\\u1150\\u1154\\u1155\\u1159\\u115F-\\u1161\\u1163\\u1165\\u1167\\u1169\\u116D\\u116E\\u1172\\u1173\\u1175\\u119E\\u11A8\\u11AB\\u11AE\\u11AF\\u11B7\\u11B8\\u11BA\\u11BC-\\u11C2\\u11EB\\u11F0\\u11F9\\u1E00-\\u1E9B\\u1EA0-\\u1EF9\\u1F00-\\u1F15\\u1F18-\\u1F1D\\u1F20-\\u1F45\\u1F48-\\u1F4D\\u1F50-\\u1F57\\u1F59\\u1F5B\\u1F5D\\u1F5F-\\u1F7D\\u1F80-\\u1FB4\\u1FB6-\\u1FBC\\u1FBE\\u1FC2-\\u1FC4\\u1FC6-\\u1FCC\\u1FD0-\\u1FD3\\u1FD6-\\u1FDB\\u1FE0-\\u1FEC\\u1FF2-\\u1FF4\\u1FF6-\\u1FFC\\u2126\\u212A\\u212B\\u212E\\u2180-\\u2182\\u3007\\u3021-\\u3029\\u3041-\\u3094\\u30A1-\\u30FA\\u3105-\\u312C\\u4E00-\\u9FA5\\uAC00-\\uD7A3';
  // NCName: a letter or underscore followed by any number of name characters.
  var ncname = '[' + letter + '_][' + letter + digit + '\\.\\-_' + combiningChar + extender + ']*';
  // QName: optional `prefix:` followed by the local name, in one capture group.
  return '((?:' + ncname + '\\:)?' + ncname + ')';
})();

// `<` followed by a qualified tag name (captured).
const startTagOpen = new RegExp('^<' + qnameCapture);

// Optional whitespace, an optional self-closing slash (captured), then `>`.
const startTagClose = /^\s*(\/?)>/;

// `</name ...>`; anything between the name and `>` is tolerated.
const endTag = new RegExp('^<\\/' + qnameCapture + '[^>]*>');

// `<!DOCTYPE ...>`, case-insensitive.
const doctype = /^<!DOCTYPE\s?[^>]+>/i;
|
---|
| 65 |
|
---|
// Feature detection: true when the engine hands an unmatched optional capture
// group to a replace callback as '' instead of leaving it undefined.
var IS_REGEX_CAPTURING_BROKEN = (function() {
  var reportsEmptyString = false;
  'x'.replace(/x(.)?/g, function(wholeMatch, optionalGroup) {
    reportsEmptyString = optionalGroup === '';
  });
  return reportsEmptyString;
})();
|
---|
| 70 |
|
---|
// Elements the parser always reports as unary (never pushed on the open-tag stack).
const empty = makeMap('area,base,basefont,br,col,embed,frame,hr,img,input,isindex,keygen,link,meta,param,source,track,wbr');

// Inline elements; outside html5 mode, open inline elements are closed when a
// non-inline start tag arrives.
const inline = makeMap('a,abbr,acronym,applet,b,basefont,bdo,big,br,button,cite,code,del,dfn,em,font,i,iframe,img,input,ins,kbd,label,map,noscript,object,q,s,samp,script,select,small,span,strike,strong,sub,sup,svg,textarea,tt,u,var');

// Elements that may intentionally be left open: a new start tag of the same
// name closes the one that is currently open.
const closeSelf = makeMap('colgroup,dd,dt,li,option,p,td,tfoot,th,thead,tr,source');

// Boolean attributes: a missing value is filled in with the attribute's own
// name, as in disabled='disabled'.
const fillAttrs = makeMap('checked,compact,declare,defer,disabled,ismap,multiple,nohref,noresize,noshade,nowrap,readonly,selected');

// Special elements whose content is consumed as raw text up to the matching end tag.
const special = makeMap('script,style');

// Elements that are not phrasing content; in html5 mode they close an open <p>.
// HTML5 tags https://html.spec.whatwg.org/multipage/indices.html#elements-3
// Phrasing Content https://html.spec.whatwg.org/multipage/dom.html#phrasing-content
const nonPhrasing = makeMap('address,article,aside,base,blockquote,body,caption,col,colgroup,dd,details,dialog,div,dl,dt,fieldset,figcaption,figure,footer,form,h1,h2,h3,h4,h5,h6,head,header,hgroup,hr,html,legend,li,menuitem,meta,ol,optgroup,option,param,rp,rt,source,style,summary,tbody,td,tfoot,th,thead,title,tr,track,ul');

// Cache of "raw text up to </tag>" regexes, keyed by lower-cased tag name.
const reCache = {};
|
---|
| 92 |
|
---|
/**
 * Builds the regular expression used to match a single attribute at the
 * start of the remaining tag text.
 *
 * The plain form captures five groups: name, assignment token, double-quoted
 * value, single-quoted value and unquoted value. When
 * `handler.customAttrSurround` is set, one alternative per [open, close]
 * regex pair is placed ahead of the plain form; each of those wraps the plain
 * form in an opening and a closing capture, for seven groups per alternative.
 *
 * @param {Object} handler - Parser options; reads `customAttrSurround` here
 *   and is passed on to `joinSingleAttrAssigns`.
 * @returns {RegExp} Regex anchored at the start of input that skips leading whitespace.
 */
function attrForHandler(handler) {
  var assignAlternatives = joinSingleAttrAssigns(handler);
  var valueAlternatives = singleAttrValues.join('|');
  var plainAttr = singleAttrIdentifier.source +
    '(?:\\s*(' + assignAlternatives + ')' +
    '[ \\t\\n\\f\\r]*(?:' + valueAlternatives + '))?';

  var surround = handler.customAttrSurround;
  if (!surround) {
    return new RegExp('^\\s*' + plainAttr);
  }

  var alternatives = [];
  for (var idx = 0; idx < surround.length; idx++) {
    var opener = surround[idx][0].source;
    var closer = surround[idx][1].source;
    alternatives.push('(?:(' + opener + ')\\s*' + plainAttr + '\\s*(' + closer + '))');
  }
  // The unwrapped attribute is always the final alternative.
  alternatives.push('(?:' + plainAttr + ')');

  return new RegExp('^\\s*' + '(?:' + alternatives.join('|') + ')');
}
|
---|
| 111 |
|
---|
/**
 * Produces the regex alternation of every accepted attribute assignment
 * token: the built-in ones followed by `handler.customAttrAssign`, if any.
 *
 * @param {Object} handler - Parser options; `customAttrAssign` may hold extra
 *   RegExp objects.
 * @returns {string} Regex source with each token in its own non-capturing
 *   group, joined with `|`.
 */
function joinSingleAttrAssigns(handler) {
  var extraAssigns = handler.customAttrAssign || [];
  var allAssigns = singleAttrAssigns.concat(extraAssigns);
  var wrapped = [];
  for (var idx = 0; idx < allAssigns.length; idx++) {
    wrapped.push('(?:' + allAssigns[idx].source + ')');
  }
  return wrapped.join('|');
}
|
---|
| 119 |
|
---|
/**
 * Callback-driven HTML tokenizer.
 *
 * The constructor only stores its arguments; all work happens in `parse()`,
 * which walks the markup from left to right and reports what it finds to the
 * `handler` callbacks. Every callback is optional and every call is awaited,
 * so callbacks may be async and still observe events strictly in document order.
 *
 * Callbacks invoked on `handler`:
 *   - start(tagName, attrs, unary, unarySlash[, flag])
 *   - end(tagName, attrs, implicit)
 *   - chars(text[, prevTag, nextTag])
 *   - comment(text[, nonStandard])
 *   - doctype(text)
 *
 * Options read from `handler`: `html5`, `partialMarkup`,
 * `continueOnParseError`, `customAttrSurround` and (through `attrForHandler`)
 * `customAttrAssign`.
 */
class HTMLParser {
  /**
   * @param {string} html - Markup to tokenize.
   * @param {Object} handler - Callbacks and options, see the class description.
   */
  constructor(html, handler) {
    this.html = html;
    this.handler = handler;
  }

  /**
   * Tokenizes `this.html`, awaiting each handler callback in turn.
   *
   * @returns {Promise<void>} Resolves once the input is consumed and, unless
   *   `handler.partialMarkup` is set, every still-open element has been closed.
   * @throws {Error} 'Parse Error: …' when an iteration cannot consume any input.
   */
  async parse() {
    let html = this.html;
    const handler = this.handler;

    // `stack` holds the currently open elements; `lastTag` is the innermost one.
    var stack = [], lastTag;
    var attribute = attrForHandler(handler);
    var last, prevTag, nextTag;
    while (html) {
      // Remembered so the end of the loop can detect a lack of progress.
      last = html;
      // Make sure we're not in a script or style element
      if (!lastTag || !special(lastTag)) {
        var textEnd = html.indexOf('<');
        if (textEnd === 0) {
          // Comment:
          if (/^<!--/.test(html)) {
            var commentEnd = html.indexOf('-->');

            if (commentEnd >= 0) {
              if (handler.comment) {
                await handler.comment(html.substring(4, commentEnd));
              }
              html = html.substring(commentEnd + 3);
              prevTag = '';
              continue;
            }
          }

          // https://en.wikipedia.org/wiki/Conditional_comment#Downlevel-revealed_conditional_comment
          if (/^<!\[/.test(html)) {
            var conditionalEnd = html.indexOf(']>');

            if (conditionalEnd >= 0) {
              if (handler.comment) {
                await handler.comment(html.substring(2, conditionalEnd + 1), true /* non-standard */);
              }
              html = html.substring(conditionalEnd + 2);
              prevTag = '';
              continue;
            }
          }

          // Doctype:
          var doctypeMatch = html.match(doctype);
          if (doctypeMatch) {
            if (handler.doctype) {
              // Awaited like every other callback so an async handler cannot
              // fall behind the events that follow it.
              await handler.doctype(doctypeMatch[0]);
            }
            html = html.substring(doctypeMatch[0].length);
            prevTag = '';
            continue;
          }

          // End tag:
          var endTagMatch = html.match(endTag);
          if (endTagMatch) {
            html = html.substring(endTagMatch[0].length);
            // Used only to invoke parseEndTag with the match and its capture;
            // the replaced string is discarded.
            await replaceAsync(endTagMatch[0], endTag, parseEndTag);
            prevTag = '/' + endTagMatch[1].toLowerCase();
            continue;
          }

          // Start tag:
          var startTagMatch = parseStartTag(html);
          if (startTagMatch) {
            html = startTagMatch.rest;
            await handleStartTag(startTagMatch);
            prevTag = startTagMatch.tagName.toLowerCase();
            continue;
          }

          // Treat `<` as text
          if (handler.continueOnParseError) {
            textEnd = html.indexOf('<', 1);
          }
        }

        var text;
        if (textEnd >= 0) {
          text = html.substring(0, textEnd);
          html = html.substring(textEnd);
        }
        else {
          text = html;
          html = '';
        }

        // next tag
        var nextTagMatch = parseStartTag(html);
        if (nextTagMatch) {
          nextTag = nextTagMatch.tagName;
        }
        else {
          nextTagMatch = html.match(endTag);
          if (nextTagMatch) {
            nextTag = '/' + nextTagMatch[1];
          }
          else {
            nextTag = '';
          }
        }

        if (handler.chars) {
          await handler.chars(text, prevTag, nextTag);
        }
        prevTag = '';
      }
      else {
        // Inside a special element: take everything up to its end tag as raw text.
        var stackedTag = lastTag.toLowerCase();
        var reStackedTag = reCache[stackedTag] || (reCache[stackedTag] = new RegExp('([\\s\\S]*?)</' + stackedTag + '[^>]*>', 'i'));

        html = await replaceAsync(html, reStackedTag, async(_, text) => {
          if (stackedTag !== 'script' && stackedTag !== 'style' && stackedTag !== 'noscript') {
            // Unwrap comments and CDATA sections, keeping only their content.
            text = text
              .replace(/<!--([\s\S]*?)-->/g, '$1')
              .replace(/<!\[CDATA\[([\s\S]*?)]]>/g, '$1');
          }

          if (handler.chars) {
            await handler.chars(text);
          }

          return '';
        });

        await parseEndTag('</' + stackedTag + '>', stackedTag);
      }

      // Nothing was consumed in this iteration; bail out instead of looping forever.
      if (html === last) {
        throw new Error('Parse Error: ' + html);
      }
    }

    if (!handler.partialMarkup) {
      // Clean up any remaining tags
      await parseEndTag();
    }

    /**
     * Tries to read a complete start tag from the beginning of `input`.
     *
     * @param {string} input - Remaining markup.
     * @returns {Object|undefined} `{ tagName, attrs, unarySlash, rest }`, where
     *   `attrs` holds the raw attribute regex matches and `rest` is the text
     *   after the tag; undefined when `input` does not begin with a complete
     *   start tag.
     */
    function parseStartTag(input) {
      var start = input.match(startTagOpen);
      if (start) {
        var match = {
          tagName: start[1],
          attrs: []
        };
        input = input.slice(start[0].length);
        var end, attr;
        // Collect attributes until the closing `>` / `/>` is reached.
        while (!(end = input.match(startTagClose)) && (attr = input.match(attribute))) {
          input = input.slice(attr[0].length);
          match.attrs.push(attr);
        }
        if (end) {
          match.unarySlash = end[1];
          match.rest = input.slice(end[0].length);
          return match;
        }
      }
    }

    /**
     * Closes `tagName` (and everything opened inside it) if it is on the stack.
     *
     * @param {string} tagName
     * @returns {Promise<true|undefined>} true when the tag was found and closed.
     */
    async function closeIfFound(tagName) {
      if (findTag(tagName) >= 0) {
        await parseEndTag('', tagName);
        return true;
      }
    }

    /**
     * Applies the implicit-close rules for a start tag, converts its raw
     * attribute matches into attribute records, updates the stack and reports
     * the tag through `handler.start`.
     *
     * @param {Object} match - Result of `parseStartTag`.
     * @returns {Promise<void>}
     */
    async function handleStartTag(match) {
      var tagName = match.tagName;
      var unarySlash = match.unarySlash;

      if (handler.html5) {
        if (lastTag === 'p' && nonPhrasing(tagName)) {
          await parseEndTag('', lastTag);
        }
        else if (tagName === 'tbody') {
          await closeIfFound('thead');
        }
        else if (tagName === 'tfoot') {
          if (!await closeIfFound('tbody')) {
            await closeIfFound('thead');
          }
        }
        // A <col> with no open <colgroup> gets one synthesized around it.
        if (tagName === 'col' && findTag('colgroup') < 0) {
          lastTag = 'colgroup';
          stack.push({ tag: lastTag, attrs: [] });
          if (handler.start) {
            await handler.start(lastTag, [], false, '');
          }
        }
      }

      if (!handler.html5 && !inline(tagName)) {
        while (lastTag && inline(lastTag)) {
          await parseEndTag('', lastTag);
        }
      }

      if (closeSelf(tagName) && lastTag === tagName) {
        await parseEndTag('', tagName);
      }

      var unary = empty(tagName) || tagName === 'html' && lastTag === 'head' || !!unarySlash;

      var attrs = match.attrs.map(function(args) {
        var name, value, customOpen, customClose, customAssign, quote;
        var ncp = 7; // number of captured parts, scalar

        // hackish work around FF bug https://bugzilla.mozilla.org/show_bug.cgi?id=369778
        if (IS_REGEX_CAPTURING_BROKEN && args[0].indexOf('""') === -1) {
          if (args[3] === '') { delete args[3]; }
          if (args[4] === '') { delete args[4]; }
          if (args[5] === '') { delete args[5]; }
        }

        // Reads assignment token and value starting at capture `index`; sets
        // `customAssign` and `value` and returns the quote character in use.
        function populate(index) {
          customAssign = args[index];
          value = args[index + 1];
          if (typeof value !== 'undefined') {
            return '"';
          }
          value = args[index + 2];
          if (typeof value !== 'undefined') {
            return '\'';
          }
          value = args[index + 3];
          if (typeof value === 'undefined' && fillAttrs(name)) {
            value = name;
          }
          return '';
        }

        // Each custom-surround alternative contributes `ncp` captures
        // (open, name, assign, three value forms, close); find the one that matched.
        var j = 1;
        if (handler.customAttrSurround) {
          for (var i = 0, l = handler.customAttrSurround.length; i < l; i++, j += ncp) {
            name = args[j + 1];
            if (name) {
              quote = populate(j + 2);
              customOpen = args[j];
              customClose = args[j + 6];
              break;
            }
          }
        }

        // No surround alternative matched: `j` now points at the plain form.
        if (!name && (name = args[j])) {
          quote = populate(j + 1);
        }

        return {
          name: name,
          value: value,
          customAssign: customAssign || '=',
          customOpen: customOpen || '',
          customClose: customClose || '',
          quote: quote || ''
        };
      });

      if (!unary) {
        stack.push({ tag: tagName, attrs: attrs });
        lastTag = tagName;
        unarySlash = '';
      }

      if (handler.start) {
        await handler.start(tagName, attrs, unary, unarySlash);
      }
    }

    /**
     * Finds the innermost open element with the given name, ignoring case.
     *
     * @param {string} tagName
     * @returns {number} Its index in `stack`, or -1 when it is not open.
     */
    function findTag(tagName) {
      var pos;
      var needle = tagName.toLowerCase();
      for (pos = stack.length - 1; pos >= 0; pos--) {
        if (stack[pos].tag.toLowerCase() === needle) {
          break;
        }
      }
      return pos;
    }

    /**
     * Closes open elements and reports each through `handler.end`.
     *
     * With a `tagName`, closes the innermost matching element plus everything
     * opened inside it; without one, closes every open element. An end tag
     * with no matching open element is ignored, except for `</br>` and `</p>`,
     * which are reported as a unary <br> and an empty <p> respectively.
     *
     * @param {string} [tag] - Literal end-tag text; empty or missing when the
     *   close is implied rather than written in the markup.
     * @param {string} [tagName] - Name of the element to close.
     * @returns {Promise<void>}
     */
    async function parseEndTag(tag, tagName) {
      var pos;

      // Find the closest opened tag of the same type
      if (tagName) {
        pos = findTag(tagName);
      }
      // If no tag name is provided, clean shop
      else {
        pos = 0;
      }

      if (pos >= 0) {
        // Close all the open elements, up the stack
        for (var i = stack.length - 1; i >= pos; i--) {
          if (handler.end) {
            // Third argument is true when the element was not closed by its
            // own literal end tag. Awaited so async handlers see closes in order.
            await handler.end(stack[i].tag, stack[i].attrs, i > pos || !tag);
          }
        }

        // Remove the open elements from the stack
        stack.length = pos;
        lastTag = pos && stack[pos - 1].tag;
      }
      else if (tagName.toLowerCase() === 'br') {
        if (handler.start) {
          await handler.start(tagName, [], true, '');
        }
      }
      else if (tagName.toLowerCase() === 'p') {
        if (handler.start) {
          // NOTE(review): the trailing `true` presumably marks the tag as
          // synthesized by the parser — confirm against the handler implementation.
          await handler.start(tagName, [], false, '', true);
        }
        if (handler.end) {
          await handler.end(tagName, []);
        }
      }
    }
  }
}
|
---|
| 446 |
|
---|
| 447 | exports.HTMLParser = HTMLParser;
|
---|
| 448 | exports.HTMLtoXML = function(html) {
|
---|
| 449 | var results = '';
|
---|
| 450 |
|
---|
| 451 | new HTMLParser(html, {
|
---|
| 452 | start: function(tag, attrs, unary) {
|
---|
| 453 | results += '<' + tag;
|
---|
| 454 |
|
---|
| 455 | for (var i = 0, len = attrs.length; i < len; i++) {
|
---|
| 456 | results += ' ' + attrs[i].name + '="' + (attrs[i].value || '').replace(/"/g, '"') + '"';
|
---|
| 457 | }
|
---|
| 458 |
|
---|
| 459 | results += (unary ? '/' : '') + '>';
|
---|
| 460 | },
|
---|
| 461 | end: function(tag) {
|
---|
| 462 | results += '</' + tag + '>';
|
---|
| 463 | },
|
---|
| 464 | chars: function(text) {
|
---|
| 465 | results += text;
|
---|
| 466 | },
|
---|
| 467 | comment: function(text) {
|
---|
| 468 | results += '<!--' + text + '-->';
|
---|
| 469 | },
|
---|
| 470 | ignore: function(text) {
|
---|
| 471 | results += text;
|
---|
| 472 | }
|
---|
| 473 | });
|
---|
| 474 |
|
---|
| 475 | return results;
|
---|
| 476 | };
|
---|
| 477 |
|
---|
| 478 | exports.HTMLtoDOM = function(html, doc) {
|
---|
| 479 | // There can be only one of these elements
|
---|
| 480 | var one = {
|
---|
| 481 | html: true,
|
---|
| 482 | head: true,
|
---|
| 483 | body: true,
|
---|
| 484 | title: true
|
---|
| 485 | };
|
---|
| 486 |
|
---|
| 487 | // Enforce a structure for the document
|
---|
| 488 | var structure = {
|
---|
| 489 | link: 'head',
|
---|
| 490 | base: 'head'
|
---|
| 491 | };
|
---|
| 492 |
|
---|
| 493 | if (doc) {
|
---|
| 494 | doc = doc.ownerDocument || doc.getOwnerDocument && doc.getOwnerDocument() || doc;
|
---|
| 495 | }
|
---|
| 496 | else if (typeof DOMDocument !== 'undefined') {
|
---|
| 497 | doc = new DOMDocument();
|
---|
| 498 | }
|
---|
| 499 | else if (typeof document !== 'undefined' && document.implementation && document.implementation.createDocument) {
|
---|
| 500 | doc = document.implementation.createDocument('', '', null);
|
---|
| 501 | }
|
---|
| 502 | else if (typeof ActiveX !== 'undefined') {
|
---|
| 503 | doc = new ActiveXObject('Msxml.DOMDocument');
|
---|
| 504 | }
|
---|
| 505 |
|
---|
| 506 | var elems = [],
|
---|
| 507 | documentElement = doc.documentElement ||
|
---|
| 508 | doc.getDocumentElement && doc.getDocumentElement();
|
---|
| 509 |
|
---|
| 510 | // If we're dealing with an empty document then we
|
---|
| 511 | // need to pre-populate it with the HTML document structure
|
---|
| 512 | if (!documentElement && doc.createElement) {
|
---|
| 513 | (function() {
|
---|
| 514 | var html = doc.createElement('html');
|
---|
| 515 | var head = doc.createElement('head');
|
---|
| 516 | head.appendChild(doc.createElement('title'));
|
---|
| 517 | html.appendChild(head);
|
---|
| 518 | html.appendChild(doc.createElement('body'));
|
---|
| 519 | doc.appendChild(html);
|
---|
| 520 | })();
|
---|
| 521 | }
|
---|
| 522 |
|
---|
| 523 | // Find all the unique elements
|
---|
| 524 | if (doc.getElementsByTagName) {
|
---|
| 525 | for (var i in one) {
|
---|
| 526 | one[i] = doc.getElementsByTagName(i)[0];
|
---|
| 527 | }
|
---|
| 528 | }
|
---|
| 529 |
|
---|
| 530 | // If we're working with a document, inject contents into
|
---|
| 531 | // the body element
|
---|
| 532 | var curParentNode = one.body;
|
---|
| 533 |
|
---|
| 534 | new HTMLParser(html, {
|
---|
| 535 | start: function(tagName, attrs, unary) {
|
---|
| 536 | // If it's a pre-built element, then we can ignore
|
---|
| 537 | // its construction
|
---|
| 538 | if (one[tagName]) {
|
---|
| 539 | curParentNode = one[tagName];
|
---|
| 540 | return;
|
---|
| 541 | }
|
---|
| 542 |
|
---|
| 543 | var elem = doc.createElement(tagName);
|
---|
| 544 |
|
---|
| 545 | for (var attr in attrs) {
|
---|
| 546 | elem.setAttribute(attrs[attr].name, attrs[attr].value);
|
---|
| 547 | }
|
---|
| 548 |
|
---|
| 549 | if (structure[tagName] && typeof one[structure[tagName]] !== 'boolean') {
|
---|
| 550 | one[structure[tagName]].appendChild(elem);
|
---|
| 551 | }
|
---|
| 552 | else if (curParentNode && curParentNode.appendChild) {
|
---|
| 553 | curParentNode.appendChild(elem);
|
---|
| 554 | }
|
---|
| 555 |
|
---|
| 556 | if (!unary) {
|
---|
| 557 | elems.push(elem);
|
---|
| 558 | curParentNode = elem;
|
---|
| 559 | }
|
---|
| 560 | },
|
---|
| 561 | end: function(/* tag */) {
|
---|
| 562 | elems.length -= 1;
|
---|
| 563 |
|
---|
| 564 | // Init the new parentNode
|
---|
| 565 | curParentNode = elems[elems.length - 1];
|
---|
| 566 | },
|
---|
| 567 | chars: function(text) {
|
---|
| 568 | curParentNode.appendChild(doc.createTextNode(text));
|
---|
| 569 | },
|
---|
| 570 | comment: function(/* text */) {
|
---|
| 571 | // create comment node
|
---|
| 572 | },
|
---|
| 573 | ignore: function(/* text */) {
|
---|
| 574 | // What to do here?
|
---|
| 575 | }
|
---|
| 576 | });
|
---|
| 577 |
|
---|
| 578 | return doc;
|
---|
| 579 | };
|
---|
| 580 |
|
---|
| 581 | exports.endTag = endTag;
|
---|