'use strict';
const generate = require('regjsgen').generate; const parse = require('regjsparser').parse; const regenerate = require('regenerate'); const unicodeMatchProperty = require('unicode-match-property-ecmascript'); const unicodeMatchPropertyValue = require('unicode-match-property-value-ecmascript'); const iuMappings = require('./data/iu-mappings.js'); const ESCAPE_SETS = require('./data/character-class-escape-sets.js');
// Prepare a Regenerate set containing all code points, used for negative
// character classes (if any).
// NOTE: this set is shared module-wide; the code in this file always calls
// `.clone()` before removing anything from it.
const UNICODE_SET = regenerate().addRange(0x0, 0x10FFFF);

// Without the `u` flag, the range stops at 0xFFFF.
// mths.be/es6#sec-pattern-semantics
const BMP_SET = regenerate().addRange(0x0, 0xFFFF);

// Prepare a Regenerate set containing all code points that are supposed to be
// matched by `/./u`. mths.be/es6#sec-atom
const DOT_SET_UNICODE = UNICODE_SET.clone() // all Unicode code points
	.remove(
		// minus `LineTerminator`s (https://mths.be/es6#sec-line-terminators):
		0x000A, // Line Feed <LF>
		0x000D, // Carriage Return <CR>
		0x2028, // Line Separator <LS>
		0x2029 // Paragraph Separator <PS>
	);
// Look up the precomputed set for a character class escape.
//
// `character` is the key used for the lookup (presumably the escape letter,
// e.g. `d` or `W` -- confirm against `./data/character-class-escape-sets.js`).
// `unicode` and `ignoreCase` select which of the three tables is consulted:
// without `unicode` the `REGULAR` table is used regardless of `ignoreCase`.
// Returns whatever the selected table's `.get()` yields for `character`.
const getCharacterClassEscapeSet = (character, unicode, ignoreCase) => {
	if (!unicode) {
		return ESCAPE_SETS.REGULAR.get(character);
	}
	const table = ignoreCase ?
		ESCAPE_SETS.UNICODE_IGNORE_CASE :
		ESCAPE_SETS.UNICODE;
	return table.get(character);
};
// Pick the set that `.` should match in Unicode mode: every code point when
// `dotAll` is truthy, otherwise every code point except line terminators.
// The shared module-level set is returned as-is (not cloned).
const getUnicodeDotSet = (dotAll) => {
	if (dotAll) {
		return UNICODE_SET;
	}
	return DOT_SET_UNICODE;
};
const getUnicodePropertyValueSet = (property, value) => {
const path = value ? `${ property }/${ value }` : `Binary_Property/${ property }`; try { return require(`regenerate-unicode-properties/${ path }.js`); } catch (exception) { throw new Error( `Failed to recognize value \`${ value }\` for property ` + `\`${ property }\`.` ); }
};
// Resolve a property escape that has no `=` in it, e.g. `\p{Foo}`.
//
// `value` is first tried as a `General_Category` value; if that fails it is
// treated as the name of a binary property. Returns the set produced by
// `getUnicodePropertyValueSet`. Any error from the binary-property lookup
// propagates to the caller.
const handleLoneUnicodePropertyNameOrValue = (value) => {
	// It could be a `General_Category` value or a binary property.
	// Note: `unicodeMatchPropertyValue` throws on invalid values.
	try {
		const property = 'General_Category';
		const category = unicodeMatchPropertyValue(property, value);
		return getUnicodePropertyValueSet(property, category);
	} catch (exception) {
		// Deliberately ignored: a failure here only means we should fall back
		// to the binary property lookup below. Note that this also covers a
		// failure to load the category's data file.
	}
	// It’s not a `General_Category` value, so check if it’s a binary
	// property. Note: `unicodeMatchProperty` throws on invalid properties.
	const property = unicodeMatchProperty(value);
	return getUnicodePropertyValueSet(property);
};
// Build the set for a Unicode property escape.
//
// `value` is the text between the braces: either a lone name (`Foo`) or a
// `Property=Value` pair. When `isNegative` is truthy the result is the
// complement within all Unicode code points.
// Always returns a fresh set (a clone or a newly derived one), so callers may
// mutate it. Errors from the property/value matchers and the set loader
// propagate unchanged.
const getUnicodePropertyEscapeSet = (value, isNegative) => {
	const parts = value.split('=');
	const firstPart = parts[0];
	let set;
	if (parts.length === 1) {
		set = handleLoneUnicodePropertyNameOrValue(firstPart);
	} else {
		// The pattern consists of two parts, i.e. `Property=Value`.
		const property = unicodeMatchProperty(firstPart);
		const propertyValue = unicodeMatchPropertyValue(property, parts[1]);
		set = getUnicodePropertyValueSet(property, propertyValue);
	}
	if (isNegative) {
		return UNICODE_SET.clone().remove(set);
	}
	return set.clone();
};
// Given a range of code points, add any case-folded code points in that range
// to a set.
//
// Installed on `regenerate.prototype` so it can be called on any set created
// by `regenerate()`. `min` and `max` are inclusive code points. `min` is
// always processed, even if `max` is smaller. Returns the set itself so calls
// can be chained.
regenerate.prototype.iuAddRange = function(min, max) {
	let codePoint = min;
	do {
		const folded = caseFold(codePoint);
		if (folded) {
			this.add(folded);
		}
	} while (++codePoint <= max);
	return this;
};
// Replace the contents of AST node `item` with the parse tree of `pattern`.
//
// `pattern` is parsed with the `u` flag only when `config.useUnicodeFlag` is
// set. If the parsed root is not a character class, group, or value, it is
// wrapped in a non-capturing group. The result is copied onto `item` with
// `Object.assign`, so `item` keeps its identity and any properties that the
// new tree does not overwrite. Returns nothing.
const update = (item, pattern) => {
	let tree = parse(pattern, config.useUnicodeFlag ? 'u' : '');
	switch (tree.type) {
		case 'characterClass':
		case 'group':
		case 'value':
			// No wrapping needed.
			break;
		default:
			// Wrap the pattern in a non-capturing group.
			tree = wrap(tree, pattern);
	}
	Object.assign(item, tree);
};
// Wrap the pattern in a non-capturing group.
//
// Returns a new group node with behavior `ignore` whose body is `[tree]` and
// whose `raw` text is `pattern` surrounded by `(?:` and `)`. Neither argument
// is modified.
const wrap = (tree, pattern) => {
	return {
		'type': 'group',
		'behavior': 'ignore',
		'body': [tree],
		'raw': `(?:${ pattern })`
	};
};
// Look up `codePoint` in the `iu` mappings table. Returns the mapped entry
// when there is a truthy one, and `false` otherwise.
const caseFold = (codePoint) => {
	const folded = iuMappings.get(codePoint);
	return folded ? folded : false;
};
// Rewrite a character class node into an equivalent pattern.
//
// Collects every member of `characterClassItem.body` into one set (adding
// case-folded code points when `i` and `u` are being transpiled away),
// complements the set if the class is negated, and then replaces the node's
// contents with the pattern generated from that set.
// `regenerateOptions` is passed straight to the set's `toString()`.
// Returns the same `characterClassItem` object, mutated in place.
// Throws an `Error` for a member of an unknown type.
const processCharacterClass = (characterClassItem, regenerateOptions) => {
	let set = regenerate();
	// Extra case-folded code points are only needed when both `i` and `u` are
	// set on the input but the output will not carry the `u` flag.
	const shouldCaseFold =
		config.ignoreCase && config.unicode && !config.useUnicodeFlag;
	for (const item of characterClassItem.body) {
		switch (item.type) {
			case 'value': {
				set.add(item.codePoint);
				if (shouldCaseFold) {
					const folded = caseFold(item.codePoint);
					if (folded) {
						set.add(folded);
					}
				}
				break;
			}
			case 'characterClassRange': {
				const min = item.min.codePoint;
				const max = item.max.codePoint;
				set.addRange(min, max);
				if (shouldCaseFold) {
					set.iuAddRange(min, max);
				}
				break;
			}
			case 'characterClassEscape':
				set.add(getCharacterClassEscapeSet(
					item.value,
					config.unicode,
					config.ignoreCase
				));
				break;
			case 'unicodePropertyEscape':
				set.add(getUnicodePropertyEscapeSet(item.value, item.negative));
				break;
			// The `default` clause is only here as a safeguard; it should never be
			// reached. Code coverage tools should ignore it.
			/* istanbul ignore next */
			default:
				throw new Error(`Unknown term type: ${ item.type }`);
		}
	}
	if (characterClassItem.negative) {
		// Complement within the full Unicode range, or within the BMP when the
		// `u` flag is absent.
		set = (config.unicode ? UNICODE_SET : BMP_SET).clone().remove(set);
	}
	update(characterClassItem, set.toString(regenerateOptions));
	return characterClassItem;
};
// Turn a named back-reference node into a numbered one, in place: the `name`
// property is removed and `matchIndex` is set to `index`.
const updateNamedReference = (item, index) => {
	delete item.name;
	Object.assign(item, { matchIndex: index });
};
// Throw if any named reference was never matched to a group.
//
// `groups.unmatchedReferences` maps group names to the references still
// waiting for them. If it has any own enumerable keys, an `Error` listing
// those names is thrown; otherwise nothing happens.
const assertNoUnmatchedReferences = (groups) => {
	const pendingNames = Object.keys(groups.unmatchedReferences);
	if (pendingNames.length === 0) {
		return;
	}
	throw new Error(`Unknown group names: ${pendingNames}`);
};
// Rewrite one AST node (and, recursively, its children) according to the
// current `config`.
//
// `item` is the node to process; most cases mutate it in place.
// `regenerateOptions` is forwarded to every set's `toString()`.
// `groups` carries the state shared across the whole pattern: `lastIndex`
// (number of capturing groups seen so far), `names` (name -> index),
// `unmatchedReferences` (name -> references seen before their group), and an
// optional `onNamedGroup(name, index)` callback.
// Returns the processed node.
// Throws an `Error` on a duplicate group name or an unknown node type.
const processTerm = (item, regenerateOptions, groups) => {
	switch (item.type) {
		case 'dot':
			if (config.unicode) {
				update(
					item,
					getUnicodeDotSet(config.dotAll).toString(regenerateOptions)
				);
			} else if (config.dotAll) {
				// TODO: consider changing this at the regenerate level.
				update(item, '[\\s\\S]');
			}
			break;
		case 'characterClass':
			item = processCharacterClass(item, regenerateOptions);
			break;
		case 'unicodePropertyEscape':
			update(
				item,
				getUnicodePropertyEscapeSet(item.value, item.negative)
					.toString(regenerateOptions)
			);
			break;
		case 'characterClassEscape':
			update(
				item,
				getCharacterClassEscapeSet(
					item.value,
					config.unicode,
					config.ignoreCase
				).toString(regenerateOptions)
			);
			break;
		case 'group':
			// Only groups with `normal` behavior advance the group counter.
			if (item.behavior === 'normal') {
				groups.lastIndex++;
			}
			if (item.name) {
				const name = item.name.value;
				if (groups.names[name]) {
					throw new Error(
						`Multiple groups with the same name (${ name }) are not allowed.`
					);
				}
				const index = groups.lastIndex;
				// Dropping the name turns this into a plain numbered group.
				delete item.name;
				groups.names[name] = index;
				if (groups.onNamedGroup) {
					groups.onNamedGroup.call(null, name, index);
				}
				// Resolve references that appeared before this group.
				if (groups.unmatchedReferences[name]) {
					groups.unmatchedReferences[name].forEach(reference => {
						updateNamedReference(reference, index);
					});
					delete groups.unmatchedReferences[name];
				}
			}
			/* falls through */
		case 'alternative':
		case 'disjunction':
		case 'quantifier':
			item.body = item.body.map(term => {
				return processTerm(term, regenerateOptions, groups);
			});
			break;
		case 'value': {
			const codePoint = item.codePoint;
			const set = regenerate(codePoint);
			if (config.ignoreCase && config.unicode && !config.useUnicodeFlag) {
				const folded = caseFold(codePoint);
				if (folded) {
					set.add(folded);
				}
			}
			update(item, set.toString(regenerateOptions));
			break;
		}
		case 'reference':
			if (item.name) {
				const name = item.name.value;
				const index = groups.names[name];
				if (index) {
					updateNamedReference(item, index);
					break;
				}
				if (!groups.unmatchedReferences[name]) {
					groups.unmatchedReferences[name] = [];
				}
				// Keep track of references used before the corresponding group.
				groups.unmatchedReferences[name].push(item);
			}
			break;
		case 'anchor':
		case 'empty':
			// Nothing to do here.
			break;
		// The `default` clause is only here as a safeguard; it should never be
		// reached. Code coverage tools should ignore it.
		/* istanbul ignore next */
		default:
			throw new Error(`Unknown term type: ${ item.type }`);
	}
	return item;
};
// Settings for the pattern currently being rewritten. This is module-level
// state: `rewritePattern` overwrites every field at the start of each call and
// the helper functions in this file read it.
const config = {
	'ignoreCase': false,
	'unicode': false,
	'dotAll': false,
	'useUnicodeFlag': false
};

// Rewrite a regular expression pattern and return the generated source.
//
// `pattern` is the pattern source and `flags` the flag string (may be omitted).
// `options` (may be omitted) is read for these keys:
//   - `unicodePropertyEscape`, `namedGroup`, `lookbehind`: forwarded to the
//     parser as feature switches (note the parser's key is `namedGroups`).
//   - `dotAllFlag`: the `s` flag is only honored when this is truthy.
//   - `useUnicodeFlag`: generate output intended to be used with the `u` flag.
//   - `onNamedGroup(name, index)`: called for each named group encountered.
// Throws if the pattern references a group name that is never defined; errors
// from parsing or processing propagate as well.
const rewritePattern = (pattern, flags, options) => {
	const regjsparserFeatures = {
		'unicodePropertyEscape': options && options.unicodePropertyEscape,
		'namedGroups': options && options.namedGroup,
		'lookbehind': options && options.lookbehind
	};
	// Coerce to real booleans so `config` never holds `undefined` or `''`
	// when `flags` or `options` is missing.
	config.ignoreCase = Boolean(flags && flags.includes('i'));
	config.unicode = Boolean(flags && flags.includes('u'));
	const supportDotAllFlag = options && options.dotAllFlag;
	config.dotAll = Boolean(supportDotAllFlag && flags && flags.includes('s'));
	config.useUnicodeFlag = Boolean(options && options.useUnicodeFlag);
	const regenerateOptions = {
		'hasUnicodeFlag': config.useUnicodeFlag,
		'bmpOnly': !config.unicode
	};
	const groups = {
		'onNamedGroup': options && options.onNamedGroup,
		'lastIndex': 0,
		'names': Object.create(null), // { [name]: index }
		'unmatchedReferences': Object.create(null) // { [name]: Array<reference> }
	};
	const tree = parse(pattern, flags, regjsparserFeatures);
	// Note: `processTerm` mutates `tree` and `groups`.
	processTerm(tree, regenerateOptions, groups);
	assertNoUnmatchedReferences(groups);
	return generate(tree);
};
module.exports = rewritePattern;