source: node_modules/parse-entities/index.js@ e48199a

main
Last change on this file since e48199a was d24f17c, checked in by Aleksandar Panovski <apano77@…>, 15 months ago

Initial commit

  • Property mode set to 100644
File size: 11.6 KB
Line 
'use strict'

// Lookup tables and character-code predicates used by the parser below.
// `legacy` maps names to values for references that may be decoded without a
// closing semicolon; `invalid` maps disallowed numeric references to their
// replacement; the three `is-*` predicates are applied to character codes.
var legacy = require('character-entities-legacy')
var invalid = require('character-reference-invalid')
var decimal = require('is-decimal')
var hexadecimal = require('is-hexadecimal')
var alphanumerical = require('is-alphanumerical')
var decodeEntity = require('./decode-entity')

module.exports = parseEntities

// Shared helpers.
var own = {}.hasOwnProperty
var fromCharCode = String.fromCharCode
// `Function.prototype` is callable, accepts any arguments and returns
// `undefined`, so it serves as a do-nothing warning handler.
var noop = Function.prototype
15
// Default settings.
// Every key listed here is copied onto the settings object handed to `parse`;
// an option that is `null` or `undefined` falls back to the value below.
var defaults = {
  // Handlers, called for warnings, decoded references, and plain text.
  warning: null,
  reference: null,
  text: null,
  // Values used as `this` when the matching handler is called.
  warningContext: null,
  referenceContext: null,
  textContext: null,
  // Either a point (`line`, `column`, `offset`) or an object with `start`
  // (a point) and/or `indent` (column to use at the start of later lines).
  position: {},
  // Extra character (string or character code) that, when it directly follows
  // an ampersand, stops that ampersand from being treated as a reference.
  additional: null,
  // Whether non-terminated named references are handled the attribute way
  // (left as text when followed by `=` or an alphanumerical).
  attribute: false,
  // Whether references without a closing semicolon are decoded at all.
  nonTerminated: true
}
29
// Characters.
// UTF-16 code units, compared against the result of `String#charCodeAt`.
var tab = 9 // '\t'
var lineFeed = 10 // '\n'
var formFeed = 12 // '\f'
var space = 32 // ' '
var ampersand = 38 // '&'
var semicolon = 59 // ';'
var lessThan = 60 // '<'
var equalsTo = 61 // '='
var numberSign = 35 // '#'
var uppercaseX = 88 // 'X'
var lowercaseX = 120 // 'x'
var replacementCharacter = 65533 // '�'
43
// Reference types.
var name = 'named'
var hexa = 'hexadecimal'
var deci = 'decimal'

// Map of bases.
// Radix handed to `parseInt` for each numeric reference type.
var bases = {}

bases[hexa] = 16
bases[deci] = 10
54
// Map of types to tests.
// Each type of character reference accepts different characters.
// This test is used to detect whether a reference has ended (as the semicolon
// is not strictly needed).
// Each test receives a character code.
var tests = {}

tests[name] = alphanumerical
tests[deci] = decimal
tests[hexa] = hexadecimal
64
// Warning types.
// Numeric codes, passed as the last argument to the warning handler.
var namedNotTerminated = 1
var numericNotTerminated = 2
var namedEmpty = 3
var numericEmpty = 4
var namedUnknown = 5
var numericDisallowed = 6
var numericProhibited = 7

// Warning messages.
// Human-readable reason for each warning code.
var messages = {}

messages[namedNotTerminated] =
  'Named character references must be terminated by a semicolon'
messages[numericNotTerminated] =
  'Numeric character references must be terminated by a semicolon'
messages[namedEmpty] = 'Named character references cannot be empty'
messages[numericEmpty] = 'Numeric character references cannot be empty'
messages[namedUnknown] = 'Named character references must be known'
messages[numericDisallowed] =
  'Numeric character references cannot be disallowed'
messages[numericProhibited] =
  'Numeric character references cannot be outside the permissible Unicode range'
88
// Wrap to ensure clean parameters are given to `parse`.
//
// Builds a settings object holding exactly the keys of `defaults`, using the
// caller's value for each key unless it is `null` or `undefined`, then
// normalises `position` and decodes `value`.
//
// `value` is the string to decode; `options` is optional.
// Returns the decoded string.
function parseEntities(value, options) {
  var given = options || {}
  var settings = {}
  var option
  var key

  for (key in defaults) {
    option = given[key]
    settings[key] =
      option === null || option === undefined ? defaults[key] : option
  }

  // `position` may be a position (`start` and/or `indent`) instead of a point.
  // Split it into the per-line indent list and the starting point.
  if (settings.position.indent || settings.position.start) {
    settings.indent = settings.position.indent || []
    // A position carrying only `indent` has no `start`: fall back to an empty
    // point so `parse` can still read `line`, `column`, and `offset` from it.
    settings.position = settings.position.start || {}
  }

  return parse(value, settings)
}
112
// Parse entities.
//
// Walks `value` one UTF-16 code unit at a time and decodes named, decimal, and
// hexadecimal character references.
// Plain text is collected in a queue and flushed right before each decoded
// reference and once more at the end of `value`.
//
// `settings` holds the keys of `defaults`, plus an optional `indent` list.
// Handlers, when given, are called as:
//
// * `text.call(textContext, text, {start, end})`
// * `reference.call(referenceContext, decoded, {start, end}, source)`
// * `warning.call(warningContext, message, point, code)`
//
// Returns the decoded string.
// eslint-disable-next-line complexity
function parse(value, settings) {
  var additional = settings.additional
  var nonTerminated = settings.nonTerminated
  var handleText = settings.text
  var handleReference = settings.reference
  var handleWarning = settings.warning
  var textContext = settings.textContext
  var referenceContext = settings.referenceContext
  var warningContext = settings.warningContext
  // Tolerate a missing point: every field read from it has a fallback.
  var pos = settings.position || {}
  var indent = settings.indent || []
  var length = value.length
  var index = 0
  var lines = -1
  var column = pos.column || 1
  var line = pos.line || 1
  var queue = ''
  var result = []
  var entityCharacters
  var namedEntity
  var terminated
  var characters
  var character
  var reference
  var following
  var warning
  var reason
  var output
  var entity
  var begin
  var start
  var type
  var test
  var prev
  var next
  var diff
  var end

  if (typeof additional === 'string') {
    additional = additional.charCodeAt(0)
  }

  // Cache the current point.
  prev = now()

  // Wrap `handleWarning`.
  warning = handleWarning ? parseError : noop

  // Ensure the algorithm walks over the first character and the end
  // (inclusive).
  // At the end, `charCodeAt` yields `NaN`, which triggers the final flush.
  index--
  length++

  while (++index < length) {
    // If the previous character was a newline.
    if (character === lineFeed) {
      column = indent[lines] || 1
    }

    character = value.charCodeAt(index)

    if (character === ampersand) {
      following = value.charCodeAt(index + 1)

      // The behaviour depends on the identity of the next character.
      if (
        following === tab ||
        following === lineFeed ||
        following === formFeed ||
        following === space ||
        following === ampersand ||
        following === lessThan ||
        // `NaN`: the ampersand is the last character.
        following !== following ||
        (additional && following === additional)
      ) {
        // Not a character reference.
        // No characters are consumed, and nothing is returned.
        // This is not an error, either.
        queue += fromCharCode(character)
        column++

        continue
      }

      start = index + 1
      begin = start
      end = start

      if (following === numberSign) {
        // Numerical entity.
        end = ++begin

        // The behaviour further depends on the next character.
        following = value.charCodeAt(end)

        if (following === uppercaseX || following === lowercaseX) {
          // ASCII hex digits.
          type = hexa
          end = ++begin
        } else {
          // ASCII digits.
          type = deci
        }
      } else {
        // Named entity.
        type = name
      }

      entityCharacters = ''
      entity = ''
      characters = ''
      // Reset the result for this candidate.
      // Some branches below never assign `reference` (empty references, and
      // non-terminated ones when `nonTerminated` is off); without this reset
      // they would emit the value decoded for the previous reference again.
      reference = ''
      test = tests[type]
      end--

      while (++end < length) {
        following = value.charCodeAt(end)

        if (!test(following)) {
          break
        }

        characters += fromCharCode(following)

        // Check if we can match a legacy named reference.
        // If so, we cache that as the last viable named reference.
        // This ensures we do not need to walk backwards later.
        if (type === name && own.call(legacy, characters)) {
          entityCharacters = characters
          entity = legacy[characters]
        }
      }

      terminated = value.charCodeAt(end) === semicolon

      if (terminated) {
        end++

        namedEntity = type === name ? decodeEntity(characters) : false

        if (namedEntity) {
          entityCharacters = characters
          entity = namedEntity
        }
      }

      diff = 1 + end - start

      if (!terminated && !nonTerminated) {
        // Empty.
      } else if (!characters) {
        // An empty (possible) entity is valid, unless it’s numeric (thus an
        // ampersand followed by an octothorp).
        if (type !== name) {
          warning(numericEmpty, diff)
        }
      } else if (type === name) {
        // An ampersand followed by anything unknown, and not terminated, is
        // invalid.
        if (terminated && !entity) {
          warning(namedUnknown, 1)
        } else {
          // If there’s something after an entity name which is not known, cap
          // the reference.
          if (entityCharacters !== characters) {
            end = begin + entityCharacters.length
            diff = 1 + end - begin
            terminated = false
          }

          // If the reference is not terminated, warn.
          if (!terminated) {
            reason = entityCharacters ? namedNotTerminated : namedEmpty

            if (settings.attribute) {
              following = value.charCodeAt(end)

              if (following === equalsTo) {
                warning(reason, diff)
                entity = null
              } else if (alphanumerical(following)) {
                entity = null
              } else {
                warning(reason, diff)
              }
            } else {
              warning(reason, diff)
            }
          }
        }

        reference = entity
      } else {
        if (!terminated) {
          // All non-terminated numeric entities are not rendered, and trigger a
          // warning.
          warning(numericNotTerminated, diff)
        }

        // When terminated and number, parse as either hexadecimal or decimal.
        reference = parseInt(characters, bases[type])

        // Trigger a warning when the parsed number is prohibited, and replace
        // with replacement character.
        if (prohibited(reference)) {
          warning(numericProhibited, diff)
          reference = fromCharCode(replacementCharacter)
        } else if (reference in invalid) {
          // Trigger a warning when the parsed number is disallowed, and replace
          // by an alternative.
          warning(numericDisallowed, diff)
          reference = invalid[reference]
        } else {
          // Parse the number.
          output = ''

          // Trigger a warning when the parsed number should not be used.
          if (disallowed(reference)) {
            warning(numericDisallowed, diff)
          }

          // Stringify the number.
          // Code points above U+FFFF are split into a surrogate pair: the top
          // ten bits go in the high surrogate, the bottom ten in the low one.
          if (reference > 0xffff) {
            reference -= 0x10000
            output += fromCharCode(((reference >>> 10) & 0x3ff) | 0xd800)
            reference = 0xdc00 | (reference & 0x3ff)
          }

          reference = output + fromCharCode(reference)
        }
      }

      // Found it!
      // First eat the queued characters as normal text, then eat an entity.
      if (reference) {
        flush()

        prev = now()
        index = end - 1
        column += end - start + 1
        result.push(reference)
        next = now()
        next.offset++

        if (handleReference) {
          handleReference.call(
            referenceContext,
            reference,
            {start: prev, end: next},
            value.slice(start - 1, end)
          )
        }

        prev = next
      } else {
        // If we could not find a reference, queue the checked characters (as
        // normal characters), and move the pointer to their end.
        // This is possible because we can be certain neither newlines nor
        // ampersands are included.
        characters = value.slice(start - 1, end)
        queue += characters
        column += characters.length
        index = end - 1
      }
    } else {
      // Handle anything other than an ampersand, including newlines and EOF.
      if (character === lineFeed) {
        line++
        lines++
        column = 0
      }

      // `NaN` (not equal to itself) marks the end of `value`.
      if (character === character) {
        queue += fromCharCode(character)
        column++
      } else {
        flush()
      }
    }
  }

  // Return the reduced nodes.
  return result.join('')

  // Get current position.
  function now() {
    return {
      line: line,
      column: column,
      offset: index + (pos.offset || 0)
    }
  }

  // “Throw” a parse-error: a warning.
  // `offset` is added to both the column and the offset of the current point.
  function parseError(code, offset) {
    var position = now()

    position.column += offset
    position.offset += offset

    handleWarning.call(warningContext, messages[code], position, code)
  }

  // Flush `queue` (normal text).
  // Macro invoked before each entity and at the end of `value`.
  // Does nothing when `queue` is empty.
  function flush() {
    if (queue) {
      result.push(queue)

      if (handleText) {
        handleText.call(textContext, queue, {start: prev, end: now()})
      }

      queue = ''
    }
  }
}
434
// Check if `code` is outside the permissible unicode range.
// That is: a surrogate (U+D800 through U+DFFF), or anything above U+10FFFF.
// Returns a boolean.
function prohibited(code) {
  return (code >= 0xd800 && code <= 0xdfff) || code > 0x10ffff
}
439
// Check if `code` is disallowed.
// Matches U+0001 through U+0008, U+000B, U+000D through U+001F, U+007F through
// U+009F, U+FDD0 through U+FDEF, and every code whose low 16 bits are 0xFFFE
// or 0xFFFF.
// Returns a boolean.
function disallowed(code) {
  return (
    (code >= 0x0001 && code <= 0x0008) ||
    code === 0x000b ||
    (code >= 0x000d && code <= 0x001f) ||
    (code >= 0x007f && code <= 0x009f) ||
    (code >= 0xfdd0 && code <= 0xfdef) ||
    (code & 0xffff) === 0xffff ||
    (code & 0xffff) === 0xfffe
  )
}
Note: See TracBrowser for help on using the repository browser.