1 | 'use strict';
|
---|
2 |
|
---|
3 | const generate = require('regjsgen').generate;
|
---|
4 | const parse = require('regjsparser').parse;
|
---|
5 | const regenerate = require('regenerate');
|
---|
6 | const unicodeMatchProperty = require('unicode-match-property-ecmascript');
|
---|
7 | const unicodeMatchPropertyValue = require('unicode-match-property-value-ecmascript');
|
---|
8 | const iuMappings = require('./data/iu-mappings.js');
|
---|
9 | const ESCAPE_SETS = require('./data/character-class-escape-sets.js');
|
---|
10 |
|
---|
// A Regenerate set containing every code point from U+0000 through U+10FFFF.
// Negated property escapes are computed by removing a set from a clone of it,
// and it doubles as the "dot matches everything" set.
const UNICODE_SET = regenerate().addRange(0x000000, 0x10FFFF);

// Without the `u` flag, the range stops at 0xFFFF.
// https://mths.be/es6#sec-pattern-semantics
const BMP_SET = regenerate().addRange(0x0000, 0xFFFF);

// The `LineTerminator` code points
// (https://mths.be/es6#sec-line-terminators).
const LINE_TERMINATOR_CODE_POINTS = [
	0x000A, // Line Feed <LF>
	0x000D, // Carriage Return <CR>
	0x2028, // Line Separator <LS>
	0x2029 // Paragraph Separator <PS>
];

// A Regenerate set containing all code points that are supposed to be matched
// by `/./u` (https://mths.be/es6#sec-atom): every Unicode code point minus the
// line terminators.
const DOT_SET_UNICODE = UNICODE_SET
	.clone()
	.remove(...LINE_TERMINATOR_CODE_POINTS);
|
---|
28 |
|
---|
// Looks up the precomputed set for a character class escape (the `character`
// key is passed straight to the table's `.get`).
//
// The table is chosen from the flags: `REGULAR` when `unicode` is falsy,
// otherwise `UNICODE_IGNORE_CASE` or `UNICODE` depending on `ignoreCase`.
// `ignoreCase` has no effect without `unicode`.
const getCharacterClassEscapeSet = (character, unicode, ignoreCase) => {
	let tableName = 'REGULAR';
	if (unicode) {
		tableName = ignoreCase ? 'UNICODE_IGNORE_CASE' : 'UNICODE';
	}
	return ESCAPE_SETS[tableName].get(character);
};
|
---|
38 |
|
---|
// Returns the shared set that `.` should match in Unicode mode: every code
// point when `dotAll` is truthy, otherwise every code point except the line
// terminators. The shared instance itself is returned, not a clone.
const getUnicodeDotSet = (dotAll) => {
	if (dotAll) {
		return UNICODE_SET;
	}
	return DOT_SET_UNICODE;
};
|
---|
42 |
|
---|
// Loads the precomputed set for a Unicode property from the
// `regenerate-unicode-properties` package.
//
// With a truthy `value` the module `<property>/<value>.js` is loaded;
// otherwise `property` is treated as a binary property and
// `Binary_Property/<property>.js` is loaded.
//
// Throws an `Error` if the module cannot be loaded. The underlying exception
// is attached as `error.cause` so the real reason is not lost.
const getUnicodePropertyValueSet = (property, value) => {
	const path = value ?
		`${ property }/${ value }` :
		`Binary_Property/${ property }`;
	try {
		return require(`regenerate-unicode-properties/${ path }.js`);
	} catch (exception) {
		// Without a value there is nothing meaningful to report as the
		// "value", so describe the failure as an unknown binary property
		// instead of printing `undefined`.
		const message = value ?
			`Failed to recognize value \`${ value }\` for property ` +
				`\`${ property }\`.` :
			`Failed to recognize binary property \`${ property }\`.`;
		const error = new Error(message);
		error.cause = exception;
		throw error;
	}
};
|
---|
56 |
|
---|
// Resolves a property escape that has no `=`, e.g. the `Lu` in `\p{Lu}`.
//
// The name is first tried as a `General_Category` value. If anything in that
// attempt throws, it is retried as a binary property. Errors from the second
// attempt propagate to the caller.
const handleLoneUnicodePropertyNameOrValue = (value) => {
	const GENERAL_CATEGORY = 'General_Category';
	try {
		// Note: `unicodeMatchPropertyValue` throws on invalid values.
		return getUnicodePropertyValueSet(
			GENERAL_CATEGORY,
			unicodeMatchPropertyValue(GENERAL_CATEGORY, value)
		);
	} catch (exception) {
		// Not a `General_Category` value; fall through to the binary
		// property lookup below.
	}
	// Note: `unicodeMatchProperty` throws on invalid properties.
	const binaryProperty = unicodeMatchProperty(value);
	return getUnicodePropertyValueSet(binaryProperty);
};
|
---|
70 |
|
---|
// Builds the set for the contents of a `\p{…}` / `\P{…}` escape.
//
// `value` is either a lone name (`Lu`, `Alphabetic`) or `Property=Value`.
// When `isNegative` is truthy the result is the complement within
// `UNICODE_SET`. The returned set is always a fresh object (a clone or a
// newly computed complement).
const getUnicodePropertyEscapeSet = (value, isNegative) => {
	const segments = value.split('=');
	const resolveSet = () => {
		if (segments.length === 1) {
			return handleLoneUnicodePropertyNameOrValue(segments[0]);
		}
		// The pattern consists of two parts, i.e. `Property=Value`.
		const property = unicodeMatchProperty(segments[0]);
		const propertyValue = unicodeMatchPropertyValue(property, segments[1]);
		return getUnicodePropertyValueSet(property, propertyValue);
	};
	const set = resolveSet();
	return isNegative ? UNICODE_SET.clone().remove(set) : set.clone();
};
|
---|
88 |
|
---|
// Given a range of code points, add the case-folded counterpart (as reported
// by `caseFold`) of each code point in that range to this set.
//
// `min` is always processed, even if it is greater than `max`. Returns the
// set itself so calls can be chained.
regenerate.prototype.iuAddRange = function(min, max) {
	let codePoint = min;
	for (;;) {
		const folded = caseFold(codePoint);
		if (folded) {
			this.add(folded);
		}
		// Advance first, then test, so the first code point is handled
		// unconditionally.
		if (!(++codePoint <= max)) {
			break;
		}
	}
	return this;
};
|
---|
101 |
|
---|
// Replaces the contents of the AST node `item` with the parse result of
// `pattern`, mutating `item` in place via `Object.assign` (properties that
// the new node does not define are left untouched).
//
// `pattern` is parsed with the `u` flag only when `config.useUnicodeFlag` is
// set. Unless the parsed node is a character class, group, or single value,
// it is wrapped in a non-capturing group first.
const update = (item, pattern) => {
	const parserFlags = config.useUnicodeFlag ? 'u' : '';
	const parsed = parse(pattern, parserFlags);
	const needsNoWrapping =
		['characterClass', 'group', 'value'].includes(parsed.type);
	const replacement = needsNoWrapping ? parsed : wrap(parsed, pattern);
	Object.assign(item, replacement);
};
|
---|
116 |
|
---|
// Builds a non-capturing group node (`behavior: 'ignore'`) whose only child
// is `tree` and whose raw source is `pattern` wrapped in `(?:…)`.
const wrap = (tree, pattern) => {
	const raw = `(?:${ pattern })`;
	return {
		type: 'group',
		behavior: 'ignore',
		body: [tree],
		raw
	};
};
|
---|
126 |
|
---|
// Returns the entry `iuMappings` holds for `codePoint`, or `false` when there
// is no entry (or the entry is falsy).
const caseFold = (codePoint) => {
	const mapped = iuMappings.get(codePoint);
	return mapped ? mapped : false;
};
|
---|
130 |
|
---|
// Rewrites a `characterClass` node in place and returns it.
//
// Every member of the class (single values, ranges, class escapes and
// property escapes) is collected into one Regenerate set, which is then
// serialized with `regenerateOptions` and re-parsed into the node via
// `update`.
//
// Negated classes are emitted as a negative lookahead over the members
// followed by an "any symbol" matcher, so that the engine's own `i` flag
// still applies to the excluded members.
//
// Throws an `Error` for an unknown member type.
const processCharacterClass = (characterClassItem, regenerateOptions) => {
	const set = regenerate();
	// Case-folded counterparts are only added by hand when emulating `/iu`
	// in output that will not carry the `u` flag itself.
	const addCaseFolds =
		config.ignoreCase && config.unicode && !config.useUnicodeFlag;
	for (const item of characterClassItem.body) {
		switch (item.type) {
			case 'value': {
				set.add(item.codePoint);
				if (addCaseFolds) {
					const folded = caseFold(item.codePoint);
					if (folded) {
						set.add(folded);
					}
				}
				break;
			}
			case 'characterClassRange': {
				const min = item.min.codePoint;
				const max = item.max.codePoint;
				set.addRange(min, max);
				if (addCaseFolds) {
					set.iuAddRange(min, max);
				}
				break;
			}
			case 'characterClassEscape':
				set.add(getCharacterClassEscapeSet(
					item.value,
					config.unicode,
					config.ignoreCase
				));
				break;
			case 'unicodePropertyEscape':
				set.add(getUnicodePropertyEscapeSet(item.value, item.negative));
				break;
			// The `default` clause is only here as a safeguard; it should never be
			// reached. Code coverage tools should ignore it.
			/* istanbul ignore next */
			default:
				throw new Error(`Unknown term type: ${ item.type }`);
		}
	}
	const members = set.toString(regenerateOptions);
	if (!characterClassItem.negative) {
		update(characterClassItem, members);
		return characterClassItem;
	}
	// `[\s\S]` consumes exactly one symbol only when the output is matched
	// the same way the source was written: either no `u` flag at all, or the
	// `u` flag kept in the output. When the `u` flag is being transpiled
	// away, `[\s\S]` would consume a single UTF-16 code unit and split astral
	// symbols in half, so a full code point matcher is used instead.
	const anySymbol = config.unicode && !config.useUnicodeFlag ?
		`(?:${ UNICODE_SET.toString(regenerateOptions) })` :
		'[\\s\\S]';
	update(characterClassItem, `(?!${ members })${ anySymbol }`);
	return characterClassItem;
};
|
---|
176 |
|
---|
// Turns a named back-reference node into a numbered one, in place: the
// `name` property is removed and `matchIndex` is set to `index`.
const updateNamedReference = (item, index) => {
	const reference = item;
	delete reference.name;
	reference.matchIndex = index;
};
|
---|
181 |
|
---|
// Throws an `Error` listing every group name that is still recorded in
// `groups.unmatchedReferences`; does nothing when there are none.
const assertNoUnmatchedReferences = (groups) => {
	const pendingNames = Object.keys(groups.unmatchedReferences);
	if (pendingNames.length === 0) {
		return;
	}
	throw new Error(`Unknown group names: ${pendingNames}`);
};
|
---|
188 |
|
---|
// Recursively rewrites one AST node according to the module-level `config`
// and returns it. Nodes are mutated in place; `groups` is mutated as well.
//
// - `item`: the AST node to process.
// - `regenerateOptions`: passed through to every Regenerate `toString` call.
// - `groups`: bookkeeping for capture groups. `lastIndex` counts the
//   capturing groups seen so far, `names` maps a group name to its index,
//   `unmatchedReferences` collects named references seen before their group,
//   and the optional `onNamedGroup(name, index)` callback is invoked for
//   every named group that gets rewritten.
//
// Throws an `Error` on a duplicate group name (when `config.namedGroup` is
// set) or on an unknown node type.
const processTerm = (item, regenerateOptions, groups) => {
	switch (item.type) {
		case 'dot':
			if (config.useDotAllFlag) {
				// The output keeps the `s` flag, so `.` is left untouched.
				break;
			} else if (config.unicode) {
				update(
					item,
					getUnicodeDotSet(config.dotAll).toString(regenerateOptions)
				);
			} else if (config.dotAll) {
				// TODO: consider changing this at the regenerate level.
				update(item, '[\\s\\S]');
			}
			break;
		case 'characterClass':
			item = processCharacterClass(item, regenerateOptions);
			break;
		case 'unicodePropertyEscape':
			if (config.unicodePropertyEscape) {
				update(
					item,
					getUnicodePropertyEscapeSet(item.value, item.negative)
						.toString(regenerateOptions)
				);
			}
			break;
		case 'characterClassEscape':
			update(
				item,
				getCharacterClassEscapeSet(
					item.value,
					config.unicode,
					config.ignoreCase
				).toString(regenerateOptions)
			);
			break;
		case 'group':
			// Only capturing groups take part in the numbering.
			if (item.behavior === 'normal') {
				groups.lastIndex++;
			}
			if (item.name && config.namedGroup) {
				const name = item.name.value;

				if (groups.names[name]) {
					throw new Error(
						`Multiple groups with the same name (${ name }) are not allowed.`
					);
				}

				const index = groups.lastIndex;
				// Dropping the name turns this into a plain numbered group.
				delete item.name;

				groups.names[name] = index;
				if (groups.onNamedGroup) {
					groups.onNamedGroup.call(null, name, index);
				}

				// Resolve references that appeared before this group.
				if (groups.unmatchedReferences[name]) {
					groups.unmatchedReferences[name].forEach(reference => {
						updateNamedReference(reference, index);
					});
					delete groups.unmatchedReferences[name];
				}
			}
			/* falls through */
		case 'alternative':
		case 'disjunction':
		case 'quantifier':
			item.body = item.body.map(term => {
				return processTerm(term, regenerateOptions, groups);
			});
			break;
		case 'value': {
			const codePoint = item.codePoint;
			const set = regenerate(codePoint);
			// Case-folded counterparts are only added by hand when emulating
			// `/iu` in output that will not carry the `u` flag itself.
			if (config.ignoreCase && config.unicode && !config.useUnicodeFlag) {
				const folded = caseFold(codePoint);
				if (folded) {
					set.add(folded);
				}
			}
			update(item, set.toString(regenerateOptions));
			break;
		}
		case 'reference':
			// References without a `name` are left as they are.
			if (item.name) {
				const name = item.name.value;
				const index = groups.names[name];
				if (index) {
					updateNamedReference(item, index);
					break;
				}

				if (!groups.unmatchedReferences[name]) {
					groups.unmatchedReferences[name] = [];
				}
				// Keep track of references used before the corresponding group.
				groups.unmatchedReferences[name].push(item);
			}
			break;
		case 'anchor':
		case 'empty':
			// Nothing to do here.
			break;
		// The `default` clause is only here as a safeguard; it should never be
		// reached. Code coverage tools should ignore it.
		/* istanbul ignore next */
		default:
			throw new Error(`Unknown term type: ${ item.type }`);
	}
	return item;
};
|
---|
302 |
|
---|
// Module-level settings shared by the rewriting helpers. Every field is
// overwritten at the start of each `rewritePattern` call.
const config = {
	// Derived from the pattern's flags (`i`, `u`, and `s`; `dotAll` also
	// requires the `dotAllFlag` option).
	ignoreCase: false,
	unicode: false,
	dotAll: false,
	// Copied from the caller-supplied options.
	useDotAllFlag: false,
	useUnicodeFlag: false,
	unicodePropertyEscape: false,
	namedGroup: false
};
|
---|
// Rewrites the regular expression source `pattern` and returns the new
// source string.
//
// - `pattern`: the regular expression source to rewrite.
// - `flags`: the flag string; `u`, `i` and `s` are inspected here. May be
//   omitted.
// - `options`: optional object. The properties read here are `lookbehind`,
//   `dotAllFlag`, `namedGroup`, `useDotAllFlag`, `useUnicodeFlag`,
//   `unicodePropertyEscape` and `onNamedGroup`.
//
// Side effect: the module-level `config` object is overwritten on every call.
//
// Throws an `Error` when both `dotAllFlag` and `useDotAllFlag` are truthy, and
// (via `assertNoUnmatchedReferences`) when a named reference has no matching
// group.
const rewritePattern = (pattern, flags, options) => {
	// Both helpers keep the short-circuit value when `flags` / `options` is
	// missing, exactly like a plain `x && x.y` expression would.
	const hasFlag = (flag) => flags && flags.includes(flag);
	const readOption = (name) => options && options[name];

	config.unicode = hasFlag('u');
	const regjsparserFeatures = {
		unicodePropertyEscape: config.unicode,
		namedGroups: true,
		lookbehind: readOption('lookbehind')
	};
	config.ignoreCase = hasFlag('i');
	const supportDotAllFlag = readOption('dotAllFlag');
	config.dotAll = supportDotAllFlag && hasFlag('s');
	config.namedGroup = readOption('namedGroup');
	config.useDotAllFlag = readOption('useDotAllFlag');
	config.useUnicodeFlag = readOption('useUnicodeFlag');
	config.unicodePropertyEscape = readOption('unicodePropertyEscape');
	if (supportDotAllFlag && config.useDotAllFlag) {
		throw new Error('`useDotAllFlag` and `dotAllFlag` cannot both be true!');
	}

	const regenerateOptions = {
		hasUnicodeFlag: config.useUnicodeFlag,
		bmpOnly: !config.unicode
	};
	const groups = {
		onNamedGroup: readOption('onNamedGroup'),
		lastIndex: 0,
		names: Object.create(null), // { [name]: index }
		unmatchedReferences: Object.create(null) // { [name]: Array<reference> }
	};

	const tree = parse(pattern, flags, regjsparserFeatures);
	// Note: `processTerm` mutates `tree` and `groups`.
	processTerm(tree, regenerateOptions, groups);
	assertNoUnmatchedReferences(groups);
	return generate(tree);
};
|
---|
345 |
|
---|
346 | module.exports = rewritePattern;
|
---|