[d24f17c] | 1 | 'use strict'
|
---|
| 2 |
|
---|
| 3 | var legacy = require('character-entities-legacy')
|
---|
| 4 | var invalid = require('character-reference-invalid')
|
---|
| 5 | var decimal = require('is-decimal')
|
---|
| 6 | var hexadecimal = require('is-hexadecimal')
|
---|
| 7 | var alphanumerical = require('is-alphanumerical')
|
---|
| 8 | var decodeEntity = require('./decode-entity')
|
---|
| 9 |
|
---|
| 10 | module.exports = parseEntities
|
---|
| 11 |
|
---|
// Shortcuts to built-ins used by the parser.
var own = Object.prototype.hasOwnProperty
var fromCharCode = String.fromCharCode
// `Function.prototype` is callable and returns `undefined`, so it doubles as a
// no-op handler.
var noop = Function.prototype

// Fallback value for every supported option; used whenever the caller passes
// `null` or `undefined` for that option.
var defaults = {
  warning: null,
  reference: null,
  text: null,
  warningContext: null,
  referenceContext: null,
  textContext: null,
  position: {},
  additional: null,
  attribute: false,
  nonTerminated: true
}

// Character codes, ordered by code point.
var tab = 9 // Horizontal tab, '\t'
var lineFeed = 10 // Line feed, '\n'
var formFeed = 12 // Form feed, '\f'
var space = 32 // Space, ' '
var numberSign = 35 // Number sign, '#'
var ampersand = 38 // Ampersand, '&'
var semicolon = 59 // Semicolon, ';'
var lessThan = 60 // Less-than sign, '<'
var equalsTo = 61 // Equals sign, '='
var uppercaseX = 88 // Latin capital letter 'X'
var lowercaseX = 120 // Latin small letter 'x'
var replacementCharacter = 65533 // U+FFFD REPLACEMENT CHARACTER

// Kinds of character reference.
var name = 'named'
var hexa = 'hexadecimal'
var deci = 'decimal'

// Radix passed to `parseInt` for each numeric reference kind.
// The keys are the values of `hexa` and `deci` above.
var bases = {hexadecimal: 16, decimal: 10}
| 54 |
|
---|
// Character test per reference kind.
// A reference of a given kind may only contain characters accepted by its
// test; the first rejected character marks the end of the reference (the
// closing semicolon is not strictly required).
// The keys are the values of `name`, `deci`, and `hexa`.
var tests = {
  named: alphanumerical,
  decimal: decimal,
  hexadecimal: hexadecimal
}
| 64 |
|
---|
// Numeric codes identifying each kind of warning.
var namedNotTerminated = 1
var numericNotTerminated = 2
var namedEmpty = 3
var numericEmpty = 4
var namedUnknown = 5
var numericDisallowed = 6
var numericProhibited = 7

// Human-readable reason for each warning code above, keyed by that code.
var messages = {
  // `namedNotTerminated`
  1: 'Named character references must be terminated by a semicolon',
  // `numericNotTerminated`
  2: 'Numeric character references must be terminated by a semicolon',
  // `namedEmpty`
  3: 'Named character references cannot be empty',
  // `numericEmpty`
  4: 'Numeric character references cannot be empty',
  // `namedUnknown`
  5: 'Named character references must be known',
  // `numericDisallowed`
  6: 'Numeric character references cannot be disallowed',
  // `numericProhibited`
  7: 'Numeric character references cannot be outside the permissible Unicode range'
}
| 88 |
|
---|
// Public entry point: normalise `options` into a complete settings object and
// hand it to `parse`.
//
// Every key of `defaults` is copied into the settings; an option that is
// missing, `null`, or `undefined` falls back to its default.
// When `position` carries `indent` or `start`, it is unpacked: `indent`
// (or an empty list) is stored on the settings as `indent`, and `start`
// replaces `position`.
// Otherwise `position` is passed along untouched.
//
// Returns whatever `parse` returns for `value` and those settings.
function parseEntities(value, options) {
  var given = options || {}
  var settings = {}
  var supplied
  var place
  var key

  for (key in defaults) {
    supplied = given[key]

    if (supplied === null || supplied === undefined) {
      settings[key] = defaults[key]
    } else {
      settings[key] = supplied
    }
  }

  place = settings.position

  if (place.indent || place.start) {
    settings.indent = place.indent || []
    settings.position = place.start
  }

  return parse(value, settings)
}
| 112 |
|
---|
// Parse entities.
//
// Walks `value` one UTF-16 code unit at a time, decoding named, decimal, and
// hexadecimal character references, and returns the decoded string.
//
// `settings` is expected to be the normalised object built by
// `parseEntities`.  The fields read here are:
//
// - `additional`: extra character (string or code) that, directly after `&`,
//   means "not a reference"
// - `nonTerminated`: when falsy, references without a closing semicolon are
//   left as plain text
// - `attribute`: when truthy, non-terminated named references followed by
//   `=` or an alphanumerical are left as plain text
// - `text`, `reference`, `warning`: optional handlers, invoked with
//   `textContext`, `referenceContext`, `warningContext` as `this`
// - `position`: starting point (`line`, `column`, `offset`), each optional
// - `indent`: optional list of columns, indexed by the number of line feeds
//   seen so far minus one, used as the column after a line feed
//
// Handlers receive:
//
// - `text(value, {start, end})`
// - `reference(value, {start, end}, source)`
// - `warning(reason, position, code)`
// eslint-disable-next-line complexity
function parse(value, settings) {
  var additional = settings.additional
  var nonTerminated = settings.nonTerminated
  var handleText = settings.text
  var handleReference = settings.reference
  var handleWarning = settings.warning
  var textContext = settings.textContext
  var referenceContext = settings.referenceContext
  var warningContext = settings.warningContext
  var pos = settings.position
  var indent = settings.indent || []
  var length = value.length
  var index = 0
  var lines = -1
  var column = pos.column || 1
  var line = pos.line || 1
  var queue = ''
  var result = []
  var entityCharacters
  var namedEntity
  var terminated
  var characters
  var character
  var reference
  var following
  var warning
  var reason
  var output
  var entity
  var begin
  var start
  var type
  var test
  var prev
  var next
  var diff
  var end

  if (typeof additional === 'string') {
    additional = additional.charCodeAt(0)
  }

  // Cache the current point.
  prev = now()

  // Wrap `handleWarning`.
  warning = handleWarning ? parseError : noop

  // Ensure the algorithm walks over the first character and the end
  // (inclusive).
  // At the end, `charCodeAt` yields `NaN`, which triggers the final flush.
  index--
  length++

  while (++index < length) {
    // If the previous character was a newline.
    if (character === lineFeed) {
      column = indent[lines] || 1
    }

    character = value.charCodeAt(index)

    if (character === ampersand) {
      following = value.charCodeAt(index + 1)

      // The behaviour depends on the identity of the next character.
      // `following !== following` is true only for `NaN` (end of input).
      if (
        following === tab ||
        following === lineFeed ||
        following === formFeed ||
        following === space ||
        following === ampersand ||
        following === lessThan ||
        following !== following ||
        (additional && following === additional)
      ) {
        // Not a character reference.
        // No characters are consumed, and nothing is returned.
        // This is not an error, either.
        queue += fromCharCode(character)
        column++

        continue
      }

      // `start` is the index right after `&`, `begin` the index of the first
      // character of the reference's name or digits, and `end` the index
      // right after the reference.
      start = index + 1
      begin = start
      end = start

      if (following === numberSign) {
        // Numerical entity.
        end = ++begin

        // The behaviour further depends on the next character.
        following = value.charCodeAt(end)

        if (following === uppercaseX || following === lowercaseX) {
          // ASCII hex digits.
          type = hexa
          end = ++begin
        } else {
          // ASCII digits.
          type = deci
        }
      } else {
        // Named entity.
        type = name
      }

      // Reset all per-candidate state.
      // `reference` must be cleared too: the branches below that do not
      // decode anything leave it untouched, and a value left over from an
      // earlier reference would otherwise be emitted a second time.
      reference = ''
      entityCharacters = ''
      entity = ''
      characters = ''
      test = tests[type]
      end--

      while (++end < length) {
        following = value.charCodeAt(end)

        if (!test(following)) {
          break
        }

        characters += fromCharCode(following)

        // Check if we can match a legacy named reference.
        // If so, we cache that as the last viable named reference.
        // This ensures we do not need to walk backwards later.
        if (type === name && own.call(legacy, characters)) {
          entityCharacters = characters
          entity = legacy[characters]
        }
      }

      terminated = value.charCodeAt(end) === semicolon

      if (terminated) {
        end++

        namedEntity = type === name ? decodeEntity(characters) : false

        if (namedEntity) {
          entityCharacters = characters
          entity = namedEntity
        }
      }

      diff = 1 + end - start

      if (!terminated && !nonTerminated) {
        // Empty.
      } else if (!characters) {
        // An empty (possible) entity is valid, unless it’s numeric (thus an
        // ampersand followed by an octothorp).
        if (type !== name) {
          warning(numericEmpty, diff)
        }
      } else if (type === name) {
        // An ampersand followed by anything unknown, and not terminated, is
        // invalid.
        if (terminated && !entity) {
          warning(namedUnknown, 1)
        } else {
          // If there’s something after an entity name which is not known, cap
          // the reference.
          if (entityCharacters !== characters) {
            end = begin + entityCharacters.length
            diff = 1 + end - begin
            terminated = false
          }

          // If the reference is not terminated, warn.
          if (!terminated) {
            reason = entityCharacters ? namedNotTerminated : namedEmpty

            if (settings.attribute) {
              following = value.charCodeAt(end)

              if (following === equalsTo) {
                warning(reason, diff)
                entity = null
              } else if (alphanumerical(following)) {
                entity = null
              } else {
                warning(reason, diff)
              }
            } else {
              warning(reason, diff)
            }
          }
        }

        reference = entity
      } else {
        if (!terminated) {
          // All non-terminated numeric entities are not rendered, and trigger a
          // warning.
          warning(numericNotTerminated, diff)
        }

        // When terminated and number, parse as either hexadecimal or decimal.
        reference = parseInt(characters, bases[type])

        // Trigger a warning when the parsed number is prohibited, and replace
        // with replacement character.
        if (prohibited(reference)) {
          warning(numericProhibited, diff)
          reference = fromCharCode(replacementCharacter)
        } else if (own.call(invalid, reference)) {
          // Trigger a warning when the parsed number is disallowed, and replace
          // by an alternative.
          warning(numericDisallowed, diff)
          reference = invalid[reference]
        } else {
          // Parse the number.
          output = ''

          // Trigger a warning when the parsed number should not be used.
          if (disallowed(reference)) {
            warning(numericDisallowed, diff)
          }

          // Stringify the number.
          // Code points above the BMP are split into a surrogate pair: the
          // high surrogate carries the top ten bits, the low surrogate the
          // bottom ten.
          if (reference > 0xffff) {
            reference -= 0x10000
            output += fromCharCode(((reference >>> 10) & 0x3ff) | 0xd800)
            reference = 0xdc00 | (reference & 0x3ff)
          }

          reference = output + fromCharCode(reference)
        }
      }

      // Found it!
      // First eat the queued characters as normal text, then eat an entity.
      if (reference) {
        flush()

        prev = now()
        index = end - 1
        column += end - start + 1
        result.push(reference)
        next = now()
        next.offset++

        if (handleReference) {
          handleReference.call(
            referenceContext,
            reference,
            {start: prev, end: next},
            value.slice(start - 1, end)
          )
        }

        prev = next
      } else {
        // If we could not find a reference, queue the checked characters (as
        // normal characters), and move the pointer to their end.
        // This is possible because we can be certain neither newlines nor
        // ampersands are included.
        characters = value.slice(start - 1, end)
        queue += characters
        column += characters.length
        index = end - 1
      }
    } else {
      // Handle anything other than an ampersand, including newlines and EOF.
      if (character === lineFeed) {
        line++
        lines++
        column = 0
      }

      // `character === character` is false only for `NaN` (end of input).
      if (character === character) {
        queue += fromCharCode(character)
        column++
      } else {
        flush()
      }
    }
  }

  // Return the reduced nodes.
  return result.join('')

  // Get current position.
  function now() {
    return {
      line: line,
      column: column,
      offset: index + (pos.offset || 0)
    }
  }

  // “Throw” a parse-error: a warning.
  // `offset` is added to both the column and the offset of the current point.
  function parseError(code, offset) {
    var position = now()

    position.column += offset
    position.offset += offset

    handleWarning.call(warningContext, messages[code], position, code)
  }

  // Flush `queue` (normal text).
  // Macro invoked before each entity and at the end of `value`.
  // Does nothing when `queue` is empty.
  function flush() {
    if (queue) {
      result.push(queue)

      if (handleText) {
        handleText.call(textContext, queue, {start: prev, end: now()})
      }

      queue = ''
    }
  }
}
| 434 |
|
---|
// Check if `code` is outside the permissible Unicode range: either within
// the surrogate block (U+D800 through U+DFFF) or above U+10FFFF.
function prohibited(code) {
  var isSurrogate = code >= 0xd800 && code <= 0xdfff
  var isTooLarge = code > 0x10ffff

  return isSurrogate || isTooLarge
}
| 439 |
|
---|
// Check if `code` is disallowed: one of the listed control-character ranges,
// U+FDD0 through U+FDEF, or any code whose low sixteen bits are 0xFFFE or
// 0xFFFF.
function disallowed(code) {
  var low = code & 0xffff

  if (code >= 0x0001 && code <= 0x0008) {
    return true
  }

  if (code === 0x000b) {
    return true
  }

  if (code >= 0x000d && code <= 0x001f) {
    return true
  }

  if (code >= 0x007f && code <= 0x009f) {
    return true
  }

  if (code >= 0xfdd0 && code <= 0xfdef) {
    return true
  }

  return low === 0xffff || low === 0xfffe
}