[6a3a178] | 1 | 'use strict';
|
---|
| 2 |
|
---|
| 3 | const generate = require('regjsgen').generate;
|
---|
| 4 | const parse = require('regjsparser').parse;
|
---|
| 5 | const regenerate = require('regenerate');
|
---|
| 6 | const unicodeMatchProperty = require('unicode-match-property-ecmascript');
|
---|
| 7 | const unicodeMatchPropertyValue = require('unicode-match-property-value-ecmascript');
|
---|
| 8 | const iuMappings = require('./data/iu-mappings.js');
|
---|
| 9 | const ESCAPE_SETS = require('./data/character-class-escape-sets.js');
|
---|
| 10 |
|
---|
// Regenerate set holding every Unicode code point (U+0000 through U+10FFFF).
// Negated Unicode property escapes are computed by subtracting from a clone
// of this set, and it doubles as the `dotAll` version of the Unicode dot set.
const UNICODE_SET = regenerate().addRange(0x0, 0x10FFFF);

// Regenerate set restricted to U+0000 through U+FFFF: without the `u` flag,
// the range stops at 0xFFFF.
// https://mths.be/es6#sec-pattern-semantics
const BMP_SET = regenerate().addRange(0x0, 0xFFFF);

// Regenerate set containing all code points that are supposed to be matched
// by `/./u`, i.e. every Unicode code point minus the `LineTerminator`s.
// https://mths.be/es6#sec-atom
// https://mths.be/es6#sec-line-terminators
const DOT_SET_UNICODE = UNICODE_SET.clone().remove(
	0x000A, // Line Feed <LF>
	0x000D, // Carriage Return <CR>
	0x2028, // Line Separator <LS>
	0x2029 // Paragraph Separator <PS>
);
|
---|
| 28 |
|
---|
/**
 * Looks up the precomputed set for a character class escape.
 *
 * @param {string} character Key passed to the lookup table's `get()`.
 * @param {*} unicode Truthy to use one of the Unicode tables.
 * @param {*} ignoreCase Truthy (together with `unicode`) to use the
 *   Unicode ignore-case table.
 * @returns {*} Whatever the selected `ESCAPE_SETS` table returns for
 *   `character`.
 */
const getCharacterClassEscapeSet = (character, unicode, ignoreCase) => {
	// `ignoreCase` only matters in Unicode mode.
	if (!unicode) {
		return ESCAPE_SETS.REGULAR.get(character);
	}
	const table = ignoreCase ?
		ESCAPE_SETS.UNICODE_IGNORE_CASE :
		ESCAPE_SETS.UNICODE;
	return table.get(character);
};
|
---|
| 38 |
|
---|
/**
 * Picks the set that `.` should expand to in Unicode mode.
 *
 * @param {*} dotAll Truthy when `.` must also match line terminators.
 * @returns {Object} `UNICODE_SET` when `dotAll` is truthy, otherwise
 *   `DOT_SET_UNICODE`. The shared set itself is returned, not a clone.
 */
const getUnicodeDotSet = (dotAll) => {
	if (dotAll) {
		return UNICODE_SET;
	}
	return DOT_SET_UNICODE;
};
|
---|
| 42 |
|
---|
/**
 * Loads the set for a Unicode property from the
 * `regenerate-unicode-properties` package.
 *
 * @param {string} property Property name; used as a path segment.
 * @param {string} [value] Property value. When falsy, `property` is looked
 *   up under `Binary_Property/` instead of `<property>/<value>`.
 * @returns {*} The export of the required module.
 * @throws {Error} When the module cannot be loaded for any reason.
 */
const getUnicodePropertyValueSet = (property, value) => {
	let path;
	if (value) {
		path = `${ property }/${ value }`;
	} else {
		path = `Binary_Property/${ property }`;
	}
	try {
		return require(`regenerate-unicode-properties/${ path }.js`);
	} catch (exception) {
		// Any load failure is reported as an unrecognized property value.
		const message =
			`Failed to recognize value \`${ value }\` for property \`${ property }\`.`;
		throw new Error(message);
	}
};
|
---|
| 56 |
|
---|
/**
 * Resolves the contents of a property escape that has no `=` in it.
 * Such a name could be a `General_Category` value or a binary property.
 *
 * @param {string} value The lone name found in the escape.
 * @returns {*} The set returned by `getUnicodePropertyValueSet`.
 * @throws {Error} Whatever the binary-property lookup throws when the name
 *   is neither a `General_Category` value nor a binary property.
 */
const handleLoneUnicodePropertyNameOrValue = (value) => {
	// First attempt: treat the name as a `General_Category` value.
	// Note: `unicodeMatchPropertyValue` throws on invalid values.
	try {
		return getUnicodePropertyValueSet(
			'General_Category',
			unicodeMatchPropertyValue('General_Category', value)
		);
	} catch (exception) {
		// Deliberately ignored: any failure above falls through to the
		// binary property lookup below.
	}
	// It’s not a `General_Category` value, so check if it’s a binary
	// property. Note: `unicodeMatchProperty` throws on invalid properties.
	return getUnicodePropertyValueSet(unicodeMatchProperty(value));
};
|
---|
| 70 |
|
---|
/**
 * Builds a fresh set for the contents of a Unicode property escape.
 *
 * @param {string} value Either a lone name or a `Property=Value` pair.
 * @param {*} isNegative Truthy to return the complement within
 *   `UNICODE_SET`.
 * @returns {Object} A new set; the set loaded for the property is cloned
 *   (or subtracted from a clone of `UNICODE_SET`) rather than returned.
 */
const getUnicodePropertyEscapeSet = (value, isNegative) => {
	const parts = value.split('=');
	let propertySet;
	if (parts.length === 1) {
		propertySet = handleLoneUnicodePropertyNameOrValue(parts[0]);
	} else {
		// The pattern consists of two parts, i.e. `Property=Value`.
		const canonicalProperty = unicodeMatchProperty(parts[0]);
		const canonicalValue = unicodeMatchPropertyValue(
			canonicalProperty,
			parts[1]
		);
		propertySet = getUnicodePropertyValueSet(
			canonicalProperty,
			canonicalValue
		);
	}
	return isNegative ?
		UNICODE_SET.clone().remove(propertySet) :
		propertySet.clone();
};
|
---|
| 88 |
|
---|
// Given a range of code points, add any case-folded code points in that range
// to a set. Every code point from `min` through `max` is passed to `caseFold`
// and each truthy result is added to the receiver, which is then returned so
// that calls can be chained.
regenerate.prototype.iuAddRange = function(min, max) {
	let codePoint = min;
	// `do…while` on purpose: `min` itself is always processed, even when it is
	// not less than or equal to `max`.
	do {
		const folded = caseFold(codePoint);
		if (folded) {
			this.add(folded);
		}
	} while (++codePoint <= max);
	return this;
};
|
---|
| 101 |
|
---|
/**
 * Replaces the contents of an AST node with the parsed form of `pattern`.
 *
 * @param {Object} item Node to overwrite; the properties of the new tree
 *   are copied onto it with `Object.assign`.
 * @param {string} pattern Regular expression source to parse. It is parsed
 *   with the `u` flag only when `config.useUnicodeFlag` is truthy.
 * @returns {undefined}
 */
const update = (item, pattern) => {
	const parsed = parse(pattern, config.useUnicodeFlag ? 'u' : '');
	// These node types can replace `item` directly; anything else is wrapped
	// in a non-capturing group first.
	const selfContained = ['characterClass', 'group', 'value'];
	const replacement = selfContained.includes(parsed.type) ?
		parsed :
		wrap(parsed, pattern);
	Object.assign(item, replacement);
};
|
---|
| 116 |
|
---|
/**
 * Wraps a parsed tree in a non-capturing group node.
 *
 * @param {Object} tree The node that becomes the only element of the
 *   group's body.
 * @param {string} pattern Source text of `tree`, used to build `raw`.
 * @returns {Object} A new `group` node with `behavior: 'ignore'`.
 */
const wrap = (tree, pattern) => {
	const raw = `(?:${ pattern })`;
	return {
		type: 'group',
		behavior: 'ignore',
		body: [tree],
		raw
	};
};
|
---|
| 126 |
|
---|
/**
 * Looks up the `iuMappings` entry for a code point.
 *
 * @param {number} codePoint Key to look up.
 * @returns {*} The mapped value when it is truthy, otherwise `false`.
 */
const caseFold = (codePoint) => {
	const mapped = iuMappings.get(codePoint);
	return mapped ? mapped : false;
};
|
---|
| 130 |
|
---|
/**
 * Rewrites a character class node by collecting everything it lists into a
 * single Regenerate set and replacing the node with that set's serialization.
 *
 * @param {Object} characterClassItem The `characterClass` node; it is
 *   overwritten in place through `update`.
 * @param {Object} regenerateOptions Options passed to the set's `toString()`.
 * @returns {Object} The same `characterClassItem`.
 * @throws {Error} On a body item of an unknown type.
 */
const processCharacterClass = (characterClassItem, regenerateOptions) => {
	const set = regenerate();
	// Case-folded counterparts are only added for `iu` patterns whose output
	// does not keep the `u` flag.
	const shouldCaseFold = () => {
		return config.ignoreCase && config.unicode && !config.useUnicodeFlag;
	};
	for (const item of characterClassItem.body) {
		switch (item.type) {
			case 'value': {
				const codePoint = item.codePoint;
				set.add(codePoint);
				if (shouldCaseFold()) {
					const folded = caseFold(codePoint);
					if (folded) {
						set.add(folded);
					}
				}
				break;
			}
			case 'characterClassRange': {
				const first = item.min.codePoint;
				const last = item.max.codePoint;
				set.addRange(first, last);
				if (shouldCaseFold()) {
					set.iuAddRange(first, last);
				}
				break;
			}
			case 'characterClassEscape': {
				const escapeSet = getCharacterClassEscapeSet(
					item.value,
					config.unicode,
					config.ignoreCase
				);
				set.add(escapeSet);
				break;
			}
			case 'unicodePropertyEscape': {
				set.add(getUnicodePropertyEscapeSet(item.value, item.negative));
				break;
			}
			// The `default` clause is only here as a safeguard; it should never be
			// reached. Code coverage tools should ignore it.
			/* istanbul ignore next */
			default:
				throw new Error(`Unknown term type: ${ item.type }`);
		}
	}
	const source = set.toString(regenerateOptions);
	// A negated class becomes a negative lookahead on the collected set
	// followed by `[\s\S]`.
	// NOTE(review): without the `u` flag on the output, `[\s\S]` consumes a
	// single UTF-16 code unit — confirm this is intended for astral input.
	const replacement = characterClassItem.negative ?
		`(?!${ source })[\\s\\S]` :
		source;
	update(characterClassItem, replacement);
	return characterClassItem;
};
|
---|
| 176 |
|
---|
/**
 * Turns a named reference node into a numbered one, in place.
 *
 * @param {Object} item Reference node; its `name` property is removed.
 * @param {number} index Value stored as the node's `matchIndex`.
 * @returns {undefined}
 */
const updateNamedReference = (item, index) => {
	// Drop the name first, then point the reference at the group index.
	delete item.name;
	item.matchIndex = index;
};
|
---|
| 181 |
|
---|
/**
 * Fails when any named reference was never resolved to a group.
 *
 * @param {Object} groups Bookkeeping object; only the own enumerable keys
 *   of `groups.unmatchedReferences` are inspected.
 * @returns {undefined}
 * @throws {Error} Listing the leftover names when there is at least one.
 */
const assertNoUnmatchedReferences = (groups) => {
	const leftoverNames = Object.keys(groups.unmatchedReferences);
	if (leftoverNames.length === 0) {
		return;
	}
	// The array is interpolated as-is, so the names end up comma-separated.
	throw new Error(`Unknown group names: ${leftoverNames}`);
};
|
---|
| 188 |
|
---|
/**
 * Rewrites one AST node in place according to the module-level `config`,
 * recursing into the bodies of groups, alternatives, disjunctions and
 * quantifiers.
 *
 * @param {Object} item The node to process; it is mutated.
 * @param {Object} regenerateOptions Options passed to `toString()` of every
 *   Regenerate set that gets serialized.
 * @param {Object} groups Bookkeeping shared across the whole pattern:
 *   `lastIndex` (count of `normal` groups seen so far), `names`
 *   (`{ [name]: index }`), `unmatchedReferences`
 *   (`{ [name]: Array<reference> }`) and an optional `onNamedGroup` callback.
 * @returns {Object} The processed node (the same object that was passed in).
 * @throws {Error} When a group name is seen twice while `config.namedGroup`
 *   is enabled, or when the node type is unknown.
 */
const processTerm = (item, regenerateOptions, groups) => {
	switch (item.type) {
		case 'dot':
			if (config.useDotAllFlag) {
				// Leave the dot untouched.
				break;
			} else if (config.unicode) {
				update(
					item,
					getUnicodeDotSet(config.dotAll).toString(regenerateOptions)
				);
			} else if (config.dotAll) {
				// TODO: consider changing this at the regenerate level.
				update(item, '[\\s\\S]');
			}
			break;
		case 'characterClass':
			item = processCharacterClass(item, regenerateOptions);
			break;
		case 'unicodePropertyEscape':
			if (config.unicodePropertyEscape) {
				update(
					item,
					getUnicodePropertyEscapeSet(item.value, item.negative)
						.toString(regenerateOptions)
				);
			}
			break;
		case 'characterClassEscape':
			update(
				item,
				getCharacterClassEscapeSet(
					item.value,
					config.unicode,
					config.ignoreCase
				).toString(regenerateOptions)
			);
			break;
		case 'group':
			if (item.behavior === 'normal') {
				groups.lastIndex++;
			}
			if (item.name && config.namedGroup) {
				const name = item.name.value;

				if (groups.names[name]) {
					throw new Error(
						`Multiple groups with the same name (${ name }) are not allowed.`
					);
				}

				// Strip the name and remember which index it stood for.
				const index = groups.lastIndex;
				delete item.name;

				groups.names[name] = index;
				if (groups.onNamedGroup) {
					groups.onNamedGroup.call(null, name, index);
				}

				// Resolve references that appeared before this group.
				if (groups.unmatchedReferences[name]) {
					groups.unmatchedReferences[name].forEach(reference => {
						updateNamedReference(reference, index);
					});
					delete groups.unmatchedReferences[name];
				}
			}
			/* falls through */
		case 'alternative':
		case 'disjunction':
		case 'quantifier':
			item.body = item.body.map(term => {
				return processTerm(term, regenerateOptions, groups);
			});
			break;
		case 'value': {
			const codePoint = item.codePoint;
			const set = regenerate(codePoint);
			if (config.ignoreCase && config.unicode && !config.useUnicodeFlag) {
				const folded = caseFold(codePoint);
				if (folded) {
					set.add(folded);
				}
			}
			update(item, set.toString(regenerateOptions));
			break;
		}
		case 'reference':
			if (item.name) {
				const name = item.name.value;
				const index = groups.names[name];
				// A falsy lookup means no group with this name was recorded yet.
				if (index) {
					updateNamedReference(item, index);
					break;
				}

				if (!groups.unmatchedReferences[name]) {
					groups.unmatchedReferences[name] = [];
				}
				// Keep track of references used before the corresponding group.
				groups.unmatchedReferences[name].push(item);
			}
			break;
		case 'anchor':
		case 'empty':
			// Nothing to do here. (`group` is handled by its own clause above.)
			break;
		// The `default` clause is only here as a safeguard; it should never be
		// reached. Code coverage tools should ignore it.
		/* istanbul ignore next */
		default:
			throw new Error(`Unknown term type: ${ item.type }`);
	}
	return item;
};
|
---|
| 302 |
|
---|
// Module-level settings read by the processing functions. Every field is
// overwritten by `rewritePattern` at the start of each call.
const config = {
	// Derived from the `i` flag.
	ignoreCase: false,
	// Derived from the `u` flag.
	unicode: false,
	// Derived from the `s` flag, and only when `options.dotAllFlag` is set.
	dotAll: false,
	// The remaining fields mirror the option of the same name.
	useDotAllFlag: false,
	useUnicodeFlag: false,
	unicodePropertyEscape: false,
	namedGroup: false
};
|
---|
/**
 * Rewrites a regular expression pattern according to its flags and the
 * requested options.
 *
 * Note: this overwrites every field of the module-level `config` object.
 *
 * @param {string} pattern Source of the regular expression.
 * @param {string} [flags] Flags of the regular expression; `u`, `i` and `s`
 *   are inspected here.
 * @param {Object} [options] Recognized keys: `lookbehind`, `dotAllFlag`,
 *   `namedGroup`, `useDotAllFlag`, `useUnicodeFlag`,
 *   `unicodePropertyEscape` and `onNamedGroup` (called with `name, index`
 *   for each named group that gets processed).
 * @returns {*} The result of regjsgen's `generate` for the rewritten tree.
 * @throws {Error} When `dotAllFlag` and `useDotAllFlag` are both truthy, or
 *   when a named reference is left without a matching group.
 */
const rewritePattern = (pattern, flags, options) => {
	// Both helpers keep the short-circuit result of the `x && x.y` idiom, so a
	// missing `flags`/`options` yields that falsy value itself.
	const hasFlag = (flag) => flags && flags.includes(flag);
	const option = (name) => options && options[name];

	config.unicode = hasFlag('u');
	const regjsparserFeatures = {
		unicodePropertyEscape: config.unicode,
		namedGroups: true,
		lookbehind: option('lookbehind')
	};
	config.ignoreCase = hasFlag('i');
	const supportDotAllFlag = option('dotAllFlag');
	config.dotAll = supportDotAllFlag && hasFlag('s');
	config.namedGroup = option('namedGroup');
	config.useDotAllFlag = option('useDotAllFlag');
	config.useUnicodeFlag = option('useUnicodeFlag');
	config.unicodePropertyEscape = option('unicodePropertyEscape');
	if (supportDotAllFlag && config.useDotAllFlag) {
		throw new Error('`useDotAllFlag` and `dotAllFlag` cannot both be true!');
	}

	const regenerateOptions = {
		hasUnicodeFlag: config.useUnicodeFlag,
		bmpOnly: !config.unicode
	};
	const groups = {
		onNamedGroup: option('onNamedGroup'),
		lastIndex: 0,
		names: Object.create(null), // { [name]: index }
		unmatchedReferences: Object.create(null) // { [name]: Array<reference> }
	};

	const tree = parse(pattern, flags, regjsparserFeatures);
	// Note: `processTerm` mutates `tree` and `groups`.
	processTerm(tree, regenerateOptions, groups);
	assertNoUnmatchedReferences(groups);
	return generate(tree);
};
|
---|
| 345 |
|
---|
| 346 | module.exports = rewritePattern;
|
---|