1 | 'use strict'
|
---|
2 |
|
---|
3 | var legacy = require('character-entities-legacy')
|
---|
4 | var invalid = require('character-reference-invalid')
|
---|
5 | var decimal = require('is-decimal')
|
---|
6 | var hexadecimal = require('is-hexadecimal')
|
---|
7 | var alphanumerical = require('is-alphanumerical')
|
---|
8 | var decodeEntity = require('./decode-entity')
|
---|
9 |
|
---|
10 | module.exports = parseEntities
|
---|
11 |
|
---|
// Shared helpers, looked up once when the module loads.

// `hasOwnProperty`, borrowed so it can be applied as `own.call(object, key)`.
var own = Object.prototype.hasOwnProperty

// Builds a string from UTF-16 code units.
var fromCharCode = String.fromCharCode

// `Function.prototype` is itself callable and returns `undefined`, so it
// serves as a do-nothing stand-in for a missing handler.
var noop = Function.prototype
|
---|
15 |
|
---|
// Default settings.
// `parseEntities` copies every key listed here onto the settings it hands to
// `parse`; a caller-supplied option wins unless it is `null` or `undefined`.
var defaults = {
  // Handler called for each parse warning as `(message, position, code)`.
  // When falsy, warnings are dropped.
  warning: null,
  // Handler called for each decoded reference as
  // `(decoded, {start, end}, source)`.
  reference: null,
  // Handler called for each run of plain text as `(text, {start, end})`.
  text: null,
  // Values used as `this` when calling the three handlers above.
  warningContext: null,
  referenceContext: null,
  textContext: null,
  // Where `value` starts: either a point (`line`, `column`, `offset` are
  // read), or an object carrying such a point as `start` and/or an `indent`
  // list of columns.
  position: {},
  // Extra character (a string, of which the first code unit is used, or a
  // character code): an ampersand directly followed by it is not treated as
  // the start of a reference.
  additional: null,
  // When truthy, a non-terminated named reference that is followed by `=` or
  // an alphanumerical character is left undecoded.
  attribute: false,
  // When falsy, references lacking their closing semicolon are left as text.
  nonTerminated: true
}
|
---|
29 |
|
---|
// Character codes (UTF-16 code units) the parser compares against, listed in
// ascending order.
var tab = 0x09 // '\t'
var lineFeed = 0x0a // '\n'
var formFeed = 0x0c // '\f'
var space = 0x20 // ' '
var numberSign = 0x23 // '#'
var ampersand = 0x26 // '&'
var semicolon = 0x3b // ';'
var lessThan = 0x3c // '<'
var equalsTo = 0x3d // '='
var uppercaseX = 0x58 // 'X'
var lowercaseX = 0x78 // 'x'
var replacementCharacter = 0xfffd // '�'
|
---|
43 |
|
---|
// Kinds of character reference the parser distinguishes.
// These strings double as the keys of the `bases` and `tests` maps.
var name = 'named'
var deci = 'decimal'
var hexa = 'hexadecimal'

// Radix handed to `parseInt` for each numeric reference type.
var bases = {hexadecimal: 16, decimal: 10}
|
---|
54 |
|
---|
// Character tests per reference type.
// Each kind of reference is made up of a different set of characters; `parse`
// keeps consuming while the test for the current type passes, which is how it
// finds the end of a reference without relying on the (optional) semicolon.
var tests = {
  named: alphanumerical,
  decimal: decimal,
  hexadecimal: hexadecimal
}
|
---|
64 |
|
---|
// Warning codes, handed to the `warning` handler as its third argument.
var namedNotTerminated = 1
var numericNotTerminated = 2
var namedEmpty = 3
var numericEmpty = 4
var namedUnknown = 5
var numericDisallowed = 6
var numericProhibited = 7

// Human-readable message for each warning code above, keyed by that code.
var messages = {
  // namedNotTerminated
  1: 'Named character references must be terminated by a semicolon',
  // numericNotTerminated
  2: 'Numeric character references must be terminated by a semicolon',
  // namedEmpty
  3: 'Named character references cannot be empty',
  // numericEmpty
  4: 'Numeric character references cannot be empty',
  // namedUnknown
  5: 'Named character references must be known',
  // numericDisallowed
  6: 'Numeric character references cannot be disallowed',
  // numericProhibited
  7: 'Numeric character references cannot be outside the permissible Unicode range'
}
|
---|
88 |
|
---|
// Wrap to ensure clean parameters are given to `parse`.
//
// Parameters:
// - `value`: string to decode.
// - `options`: optional object; only the keys present in `defaults` are read,
//   and an option that is `null` or `undefined` falls back to its default.
//
// `options.position` may be a point, or an object holding a point as `start`
// and/or a list of columns as `indent`.
// In the latter case it is split into `settings.position` (the point) and
// `settings.indent`.
//
// Returns whatever `parse` returns for the normalised settings.
function parseEntities(value, options) {
  var given = options || {}
  var settings = {}
  var position
  var option
  var key

  for (key in defaults) {
    option = given[key]
    settings[key] =
      option === null || option === undefined ? defaults[key] : option
  }

  position = settings.position

  if (position.indent || position.start) {
    settings.indent = position.indent || []
    // `indent` may be given without `start`: fall back to an empty point so
    // `parse` can still read `line` / `column` / `offset` from it instead of
    // throwing on `undefined`.
    settings.position = position.start || {}
  }

  return parse(value, settings)
}
|
---|
112 |
|
---|
// Parse entities.
//
// Walks `value` one UTF-16 code unit at a time (plus one step past the end, so
// that pending text gets flushed) and decodes named, decimal, and hexadecimal
// character references.
//
// Parameters:
// - `value`: string to decode.
// - `settings`: options as prepared by `parseEntities`; `position` must be an
//   object (its `line`, `column`, and `offset` are read), `indent` is optional.
//
// Returns the decoded string.
// The `text`, `reference`, and `warning` handlers on `settings`, when given,
// are called along the way with their respective `*Context` as `this`.
// eslint-disable-next-line complexity
function parse(value, settings) {
  var additional = settings.additional
  var nonTerminated = settings.nonTerminated
  var handleText = settings.text
  var handleReference = settings.reference
  var handleWarning = settings.warning
  var textContext = settings.textContext
  var referenceContext = settings.referenceContext
  var warningContext = settings.warningContext
  var pos = settings.position
  var indent = settings.indent || []
  var length = value.length
  var index = 0
  var lines = -1
  var column = pos.column || 1
  var line = pos.line || 1
  var queue = ''
  var result = []
  var entityCharacters
  var namedEntity
  var terminated
  var characters
  var character
  var reference
  var following
  var warning
  var reason
  var output
  var entity
  var begin
  var start
  var type
  var test
  var prev
  var next
  var diff
  var end

  // `additional` is compared against character codes below.
  if (typeof additional === 'string') {
    additional = additional.charCodeAt(0)
  }

  // Cache the current point.
  prev = now()

  // Wrap `handleWarning`.
  warning = handleWarning ? parseError : noop

  // Ensure the algorithm walks over the first character and the end
  // (inclusive).
  index--
  length++

  while (++index < length) {
    // If the previous character was a newline.
    if (character === lineFeed) {
      column = indent[lines] || 1
    }

    // `NaN` once `index` is past the end of `value`.
    character = value.charCodeAt(index)

    if (character === ampersand) {
      following = value.charCodeAt(index + 1)

      // The behaviour depends on the identity of the next character.
      if (
        following === tab ||
        following === lineFeed ||
        following === formFeed ||
        following === space ||
        following === ampersand ||
        following === lessThan ||
        // `NaN`: the ampersand is the last character.
        following !== following ||
        (additional && following === additional)
      ) {
        // Not a character reference.
        // No characters are consumed, and nothing is returned.
        // This is not an error, either.
        queue += fromCharCode(character)
        column++

        continue
      }

      // `start` is the index right after the ampersand, `begin` the index of
      // the first character of the reference’s name or digits, and `end` the
      // index up to which the reference runs (exclusive).
      start = index + 1
      begin = start
      end = start

      if (following === numberSign) {
        // Numerical entity.
        end = ++begin

        // The behaviour further depends on the next character.
        following = value.charCodeAt(end)

        if (following === uppercaseX || following === lowercaseX) {
          // ASCII hex digits.
          type = hexa
          end = ++begin
        } else {
          // ASCII digits.
          type = deci
        }
      } else {
        // Named entity.
        type = name
      }

      // Reset the per-reference state.
      // `reference` must be cleared too: two of the branches below assign
      // nothing to it, and a value left over from an earlier reference would
      // otherwise be emitted again in place of the current source text.
      entityCharacters = ''
      entity = ''
      characters = ''
      reference = ''
      test = tests[type]
      end--

      while (++end < length) {
        following = value.charCodeAt(end)

        if (!test(following)) {
          break
        }

        characters += fromCharCode(following)

        // Check if we can match a legacy named reference.
        // If so, we cache that as the last viable named reference.
        // This ensures we do not need to walk backwards later.
        if (type === name && own.call(legacy, characters)) {
          entityCharacters = characters
          entity = legacy[characters]
        }
      }

      terminated = value.charCodeAt(end) === semicolon

      if (terminated) {
        end++

        namedEntity = type === name ? decodeEntity(characters) : false

        if (namedEntity) {
          entityCharacters = characters
          entity = namedEntity
        }
      }

      diff = 1 + end - start

      if (!terminated && !nonTerminated) {
        // Empty.
      } else if (!characters) {
        // An empty (possible) entity is valid, unless it’s numeric (thus an
        // ampersand followed by an octothorp).
        if (type !== name) {
          warning(numericEmpty, diff)
        }
      } else if (type === name) {
        // An ampersand followed by anything unknown, and not terminated, is
        // invalid.
        if (terminated && !entity) {
          warning(namedUnknown, 1)
        } else {
          // If there’s something after an entity name which is not known, cap
          // the reference.
          if (entityCharacters !== characters) {
            end = begin + entityCharacters.length
            diff = 1 + end - begin
            terminated = false
          }

          // If the reference is not terminated, warn.
          if (!terminated) {
            reason = entityCharacters ? namedNotTerminated : namedEmpty

            if (settings.attribute) {
              following = value.charCodeAt(end)

              if (following === equalsTo) {
                warning(reason, diff)
                entity = null
              } else if (alphanumerical(following)) {
                entity = null
              } else {
                warning(reason, diff)
              }
            } else {
              warning(reason, diff)
            }
          }
        }

        reference = entity
      } else {
        if (!terminated) {
          // All non-terminated numeric entities are not rendered, and trigger a
          // warning.
          warning(numericNotTerminated, diff)
        }

        // When terminated and number, parse as either hexadecimal or decimal.
        reference = parseInt(characters, bases[type])

        // Trigger a warning when the parsed number is prohibited, and replace
        // with replacement character.
        if (prohibited(reference)) {
          warning(numericProhibited, diff)
          reference = fromCharCode(replacementCharacter)
        } else if (own.call(invalid, reference)) {
          // Trigger a warning when the parsed number is disallowed, and replace
          // by an alternative.
          warning(numericDisallowed, diff)
          reference = invalid[reference]
        } else {
          // Parse the number.
          output = ''

          // Trigger a warning when the parsed number should not be used.
          if (disallowed(reference)) {
            warning(numericDisallowed, diff)
          }

          // Stringify the number.
          // Code points above U+FFFF are emitted as a surrogate pair: the top
          // ten bits go in the high surrogate, the bottom ten in the low one.
          if (reference > 0xffff) {
            reference -= 0x10000
            output += fromCharCode(((reference >>> 10) & 0x3ff) | 0xd800)
            reference = 0xdc00 | (reference & 0x3ff)
          }

          reference = output + fromCharCode(reference)
        }
      }

      // Found it!
      // First eat the queued characters as normal text, then eat an entity.
      if (reference) {
        flush()

        prev = now()
        index = end - 1
        column += end - start + 1
        result.push(reference)
        next = now()
        // `index` points at the last consumed character; the end point is
        // one past it.
        next.offset++

        if (handleReference) {
          handleReference.call(
            referenceContext,
            reference,
            {start: prev, end: next},
            value.slice(start - 1, end)
          )
        }

        prev = next
      } else {
        // If we could not find a reference, queue the checked characters (as
        // normal characters), and move the pointer to their end.
        // This is possible because we can be certain neither newlines nor
        // ampersands are included.
        characters = value.slice(start - 1, end)
        queue += characters
        column += characters.length
        index = end - 1
      }
    } else {
      // Handle anything other than an ampersand, including newlines and EOF.
      if (character === lineFeed) {
        line++
        lines++
        column = 0
      }

      // Only `NaN` (EOF) is not equal to itself.
      if (character === character) {
        queue += fromCharCode(character)
        column++
      } else {
        flush()
      }
    }
  }

  // Return the reduced nodes.
  return result.join('')

  // Get current position.
  function now() {
    return {
      line: line,
      column: column,
      offset: index + (pos.offset || 0)
    }
  }

  // “Throw” a parse-error: a warning.
  // `offset` is added to both the column and the offset of the current point.
  function parseError(code, offset) {
    var position = now()

    position.column += offset
    position.offset += offset

    handleWarning.call(warningContext, messages[code], position, code)
  }

  // Flush `queue` (normal text).
  // Macro invoked before each entity and at the end of `value`.
  // Does nothing when `queue` is empty.
  function flush() {
    if (queue) {
      result.push(queue)

      if (handleText) {
        handleText.call(textContext, queue, {start: prev, end: now()})
      }

      queue = ''
    }
  }
}
|
---|
434 |
|
---|
// Check if `code` is outside the permissible unicode range: either above
// 0x10ffff, or within 0xd800–0xdfff (inclusive).
function prohibited(code) {
  if (code > 0x10ffff) {
    return true
  }

  return code >= 0xd800 && code <= 0xdfff
}
|
---|
439 |
|
---|
// Check if `code` is disallowed.
// Returns `true` when `code` falls in one of the listed ranges, or when its
// lowest sixteen bits are 0xfffe or 0xffff.
function disallowed(code) {
  var lowBits = code & 0xffff

  if (code >= 0x0001 && code <= 0x0008) {
    return true
  }

  if (code === 0x000b) {
    return true
  }

  if (code >= 0x000d && code <= 0x001f) {
    return true
  }

  if (code >= 0x007f && code <= 0x009f) {
    return true
  }

  if (code >= 0xfdd0 && code <= 0xfdef) {
    return true
  }

  return lowBits === 0xffff || lowBits === 0xfffe
}
|
---|