// Mirror of https://github.com/Combodo/iTop.git (synced 2026-03-04 08:34:11 +01:00).
// 673 lines, 16 KiB, JavaScript.
/*! @orchidjs/unicode-variants | https://github.com/orchidjs/unicode-variants | Apache License (v2) */
'use strict';

// Flag the CommonJS `exports` object with `__esModule` so interop loaders
// treat this file as a transpiled ES module.
Object.defineProperty(exports, '__esModule', { value: true });
/**
 * Convert an array of pattern strings into one regular expression fragment.
 * Empty strings are dropped before the fragment is built.
 *
 * ex ['ab','a'] => (?:ab|a)
 * ex ['a','b'] => [ab]
 * ex ['a'] => a
 * ex [] => ''
 *
 * @param {string[]} chars
 * @return {string}
 */
const arrayToPattern = chars => {
  const patterns = chars.filter(Boolean);

  if (patterns.length < 2) {
    return patterns[0] || '';
  }

  // A character class is only valid when every entry is a single code point;
  // anything longer has to become an alternation.
  return maxValueLength(patterns) === 1
    ? '[' + patterns.join('') + ']'
    : '(?:' + patterns.join('|') + ')';
};
/**
 * Concatenate pattern fragments, collapsing each run of identical adjacent
 * fragments into a `{n}` quantifier.
 *
 * ex ['a','b'] => ab
 * ex ['a','a','b'] => a{2}b
 *
 * NOTE(review): the quantifier is appended as text, so it repeats the whole
 * fragment only when that fragment is a single regex atom (one character,
 * a class or a group).
 *
 * @param {string[]} array
 * @return {string}
 */
const sequencePattern = array => {
  // Without any duplicate there can be no run to collapse.
  if (!hasDuplicates(array)) {
    return array.join('');
  }

  let pattern = '';
  let run_length = 0;

  // Emit the quantifier for the run that just ended, if it was longer than 1.
  const flushRun = () => {
    if (run_length > 1) {
      pattern += '{' + run_length + '}';
    }
  };

  array.forEach((fragment, i) => {
    if (fragment === array[i - 1]) {
      run_length++;
      return;
    }

    flushRun();
    pattern += fragment;
    run_length = 1;
  });

  flushRun();
  return pattern;
};
/**
 * Convert a set of pattern strings into a regular expression fragment.
 * The set is turned into an array (in iteration order) and handed to
 * arrayToPattern().
 *
 * @param {Set<string>} chars
 * @return {string}
 */
const setToPattern = chars => arrayToPattern(toArray(chars));
/**
 * Tell whether an array contains the same value more than once.
 * Values are compared the way a Set compares them.
 *
 * https://stackoverflow.com/questions/7376598/in-javascript-how-do-i-check-if-an-array-has-duplicate-values
 * @param {any[]} array
 * @return {boolean}
 */
const hasDuplicates = array => new Set(array).size !== array.length;
/**
 * Backslash-escape the regular expression syntax characters
 * $ ( ) * + . ? [ ] ^ { | } \ in a value. The value is converted to a
 * string first. Other characters (for example `-` and `/`) are left alone.
 *
 * https://stackoverflow.com/questions/63006601/why-does-u-throw-an-invalid-escape-error
 * @param {string} str
 * @return {string}
 */
const escape_regex = str => {
  return (str + '').replace(/([\$\(\)\*\+\.\?\[\]\^\{\|\}\\])/gu, '\\$1');
};
/**
 * Return the length of the longest string in the array, as measured by
 * unicodeLength(). An empty array yields 0.
 *
 * @param {string[]} array
 * @return {number}
 */
const maxValueLength = array => {
  return array.reduce((longest, value) => Math.max(longest, unicodeLength(value)), 0);
};
/**
 * Count the elements toArray() produces for a string. With the Array.from
 * based toArray() in this file that is the number of code points, so a
 * surrogate pair counts once.
 *
 * @param {string} str
 * @return {number}
 */
const unicodeLength = str => toArray(str).length;
/**
 * Convert an iterable or array-like value into an array via Array.from.
 * Strings are therefore split by code point rather than by UTF-16 unit.
 *
 * @param {any} p
 * @return {any[]}
 */
const toArray = p => Array.from(p);
/**
 * Get all possible combinations of substrings that add up to the given string.
 *
 * ex 'abc' => [['abc'],['a','bc'],['ab','c'],['a','b','c']]
 *
 * The string is split by UTF-16 unit. An empty string yields [['']].
 *
 * https://stackoverflow.com/questions/30169587/find-all-the-combination-of-substrings-that-add-up-to-the-given-string
 * @param {string} input
 * @return {string[][]}
 */
const allSubstrings = input => {
  // `<= 1` also ends the recursion for '', which would otherwise never
  // reach the base case and overflow the stack.
  if (input.length <= 1) return [[input]];

  const first = input.charAt(0);
  /** @type {string[][]} */
  const result = [];

  for (const tail of allSubstrings(input.substring(1))) {
    // Either glue the first character onto the tail's leading piece ...
    result.push([first + tail[0], ...tail.slice(1)]);
    // ... or keep it as a piece of its own.
    result.push([first, ...tail]);
  }

  return result;
};
/**
 * @typedef {{[key:string]:string}} TUnicodeMap
 * @typedef {{[key:string]:Set<string>}} TUnicodeSets
 * @typedef {Array<[number,number]>} TCodePoints
 * @typedef {{folded:string,composed:string,code_point:number}} TCodePointObj
 * @typedef {{start:number,end:number,length:number,substr:string}} TSequencePart
 */

/**
 * Default inclusive [min, max] code point ranges used to build the map.
 * @type {TCodePoints}
 */
const code_points = [[0, 65535]];

// Character class of marks removed while folding: the combining diacritical
// marks U+0300-U+036F plus U+00B7, U+02BE and U+02BC.
const accent_pat = '[\u0300-\u036F\u{b7}\u{2be}\u{2bc}]';

/**
 * Folded string => pattern of its unicode variants. Stays undefined until
 * initialize() runs.
 * @type {TUnicodeMap}
 */
exports.unicode_map = void 0;

/**
 * Anchored regex matching any multi-character folded key; assigned by generateMap().
 * @type {RegExp}
 */
let multi_char_reg;

// Folded strings longer than this are skipped by generator().
const max_char_length = 3;
/**
 * Single unicode character => latin replacement. Filled below by inverting
 * latin_condensed.
 * @type {TUnicodeMap}
 */
const latin_convert = {};

/**
 * Latin replacement => string of unicode characters that fold to it.
 * @type {TUnicodeMap}
 */
const latin_condensed = {
  '/': '⁄∕',
  '0': '߀',
  "a": "ⱥɐɑ",
  "aa": "ꜳ",
  "ae": "æǽǣ",
  "ao": "ꜵ",
  "au": "ꜷ",
  "av": "ꜹꜻ",
  "ay": "ꜽ",
  "b": "ƀɓƃ",
  "c": "ꜿƈȼↄ",
  "d": "đɗɖᴅƌꮷԁɦ",
  "e": "ɛǝᴇɇ",
  "f": "ꝼƒ",
  "g": "ǥɠꞡᵹꝿɢ",
  "h": "ħⱨⱶɥ",
  "i": "ɨı",
  "j": "ɉȷ",
  "k": "ƙⱪꝁꝃꝅꞣ",
  "l": "łƚɫⱡꝉꝇꞁɭ",
  "m": "ɱɯϻ",
  "n": "ꞥƞɲꞑᴎлԉ",
  "o": "øǿɔɵꝋꝍᴑ",
  "oe": "œ",
  "oi": "ƣ",
  "oo": "ꝏ",
  "ou": "ȣ",
  "p": "ƥᵽꝑꝓꝕρ",
  "q": "ꝗꝙɋ",
  "r": "ɍɽꝛꞧꞃ",
  "s": "ßȿꞩꞅʂ",
  "t": "ŧƭʈⱦꞇ",
  "th": "þ",
  "tz": "ꜩ",
  "u": "ʉ",
  "v": "ʋꝟʌ",
  "vy": "ꝡ",
  "w": "ⱳ",
  "y": "ƴɏỿ",
  "z": "ƶȥɀⱬꝣ",
  "hv": "ƕ"
};

// Invert the condensed table: every character of each value maps back to its
// latin key. Iterating the string with for...of walks code points, so a
// character outside the BMP would stay in one piece.
for (const latin in latin_condensed) {
  const unicode = latin_condensed[latin] || '';

  for (const char of unicode) {
    latin_convert[char] = latin;
  }
}

// Matches every convertible character as well as the marks in accent_pat.
const convert_pat = new RegExp(Object.keys(latin_convert).join('|') + '|' + accent_pat, 'gu');
/**
|
||
* Initialize the unicode_map from the give code point ranges
|
||
*
|
||
* @param {TCodePoints=} _code_points
|
||
*/
|
||
|
||
const initialize = _code_points => {
|
||
if (exports.unicode_map !== undefined) return;
|
||
exports.unicode_map = generateMap(_code_points || code_points);
|
||
};
|
||
/**
 * Helper that unicode-normalizes a string, defaulting to the NFKD form.
 *
 * https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/normalize
 * @param {string} str
 * @param {string} form
 * @return {string}
 */
const normalize = (str, form = 'NFKD') => str.normalize(form);
/**
 * Remove accents without reordering the string.
 * Each element produced by toArray() is folded on its own by _asciifold(),
 * because calling str.normalize('NFKD') on \u{594}\u{595}\u{596} as a whole
 * becomes \u{596}\u{594}\u{595}.
 *
 * via https://github.com/krisk/Fuse/issues/133#issuecomment-318692703
 * @param {string} str
 * @return {string}
 */
const asciifold = str => {
  let folded = '';

  for (const char of toArray(str)) {
    folded += _asciifold(char);
  }

  return folded;
};
/**
 * Fold one string: NFKD-normalize it, lowercase it, replace every
 * convert_pat match with its latin_convert entry (matches without an entry,
 * such as the accent marks, are removed) and recompose the result as NFC.
 *
 * @param {string} str
 * @return {string}
 */
const _asciifold = str => {
  const folded = normalize(str)
    .toLowerCase()
    .replace(convert_pat, (/** @type {string} */ char) => latin_convert[char] || '');

  return normalize(folded, 'NFC');
};
/**
 * Generate a list of unicode variants from the list of code points.
 * Every code point of every inclusive [min, max] range is folded with
 * asciifold(); a variant is yielded only when folding changed the character.
 *
 * @param {TCodePoints} code_points
 * @yield {TCodePointObj}
 */
function* generator(code_points) {
  for (const [code_point_min, code_point_max] of code_points) {
    for (let i = code_point_min; i <= code_point_max; i++) {
      // fromCodePoint keeps code points above 0xFFFF intact; fromCharCode
      // would truncate them to 16 bits and disagree with `code_point`.
      const composed = String.fromCodePoint(i);
      const folded = asciifold(composed);

      // folding changed nothing but the case
      if (folded === composed.toLowerCase()) {
        continue;
      }

      // Skip folded strings longer than max_char_length because the
      // resulting regex patterns would be long,
      // e.g. code points 65018 and 65019 fold to 18 and 8 characters.
      if (folded.length > max_char_length) {
        continue;
      }

      // the character folds away entirely (e.g. a bare accent mark)
      if (folded.length === 0) {
        continue;
      }

      yield {
        folded: folded,
        composed: composed,
        code_point: i
      };
    }
  }
}
/**
 * Generate the unicode sets from the list of code points: for every folded
 * string, the set of regex-escaped strings that fold to it (the folded
 * string itself included).
 *
 * @param {TCodePoints} code_points
 * @return {TUnicodeSets}
 */
const generateSets = code_points => {
  /** @type {TUnicodeSets} */
  const unicode_sets = {};

  /**
   * Add `to_add` to the set of `folded`, unless the pattern built from the
   * current set already matches it case-insensitively.
   *
   * @param {string} folded
   * @param {string} to_add
   */
  const addMatching = (folded, to_add) => {
    /** @type {Set<string>} */
    const folded_set = unicode_sets[folded] || new Set();
    const patt = new RegExp('^' + setToPattern(folded_set) + '$', 'iu');

    if (patt.test(to_add)) {
      return;
    }

    folded_set.add(escape_regex(to_add));
    unicode_sets[folded] = folded_set;
  };

  for (const value of generator(code_points)) {
    addMatching(value.folded, value.folded);
    addMatching(value.folded, value.composed);
  }

  return unicode_sets;
};
/**
 * Generate a unicode map from the list of code points.
 * ae => (?:(?:ae|Æ|Ǽ|Ǣ)|(?:A|Ⓐ|A...)(?:E|ɛ|Ⓔ...))
 *
 * Side effect: assigns the module-level multi_char_reg, an anchored regex
 * that matches any folded key longer than one character, longest
 * alternatives first.
 *
 * @param {TCodePoints} code_points
 * @return {TUnicodeMap}
 */
const generateMap = code_points => {
  /** @type {TUnicodeSets} */
  const unicode_sets = generateSets(code_points);

  /** @type {TUnicodeMap} */
  const unicode_map = {};

  /** @type {string[]} */
  const multi_char = [];

  for (const folded in unicode_sets) {
    const set = unicode_sets[folded];

    if (set) {
      unicode_map[folded] = setToPattern(set);
    }

    if (folded.length > 1) {
      multi_char.push(escape_regex(folded));
    }
  }

  // longest first, so the alternation prefers the longest match
  multi_char.sort((a, b) => b.length - a.length);
  multi_char_reg = new RegExp('^' + arrayToPattern(multi_char), 'u');

  return unicode_map;
};
/**
 * Map each element of an array from its folded value to the pattern of all
 * possible unicode matches. Elements without an entry in exports.unicode_map
 * are kept as they are.
 *
 * Returns '' when fewer than `min_replacement` characters were replaced.
 * exports.unicode_map must have been built (see initialize()).
 *
 * @param {string[]} strings
 * @param {number} min_replacement
 * @return {string}
 */
const mapSequence = (strings, min_replacement = 1) => {
  const unicode_map = exports.unicode_map;
  let chars_replaced = 0;

  const patterns = strings.map(str => {
    // Own-property lookup only: a plain `unicode_map[str]` would also
    // resolve inherited names such as 'constructor'.
    const mapped = Object.prototype.hasOwnProperty.call(unicode_map, str)
      ? unicode_map[str]
      : undefined;

    if (!mapped) {
      return str;
    }

    chars_replaced += str.length;
    return mapped;
  });

  if (chars_replaced >= min_replacement) {
    return sequencePattern(patterns);
  }

  return '';
};
/**
 * Split a short string into all possible substring combinations and convert
 * each of them to a pattern. A combination is kept only if min_replacement
 * is met; that threshold is raised to at least `str.length - 1`.
 *
 * 'abc'
 *   => [['abc'],['a','bc'],['ab','c'],['a','b','c']]
 *   => ['abc-pattern','a-bc-pattern'...]
 *
 * @param {string} str
 * @param {number} min_replacement
 * @return {string}
 */
const substringsToPattern = (str, min_replacement = 1) => {
  const required = Math.max(min_replacement, str.length - 1);
  const patterns = allSubstrings(str).map(sub_pat => mapSequence(sub_pat, required));

  return arrayToPattern(patterns);
};
/**
 * Convert an array of sequences into a pattern: one alternative per sequence,
 * each built from the patterns of that sequence's substrings.
 *
 * A replacement is required (min_replacement = 1) only when there is more
 * than one sequence.
 *
 * @param {Sequence[]} sequences
 * @param {boolean} all When false, the last substring of every sequence is left out.
 * @return {string}
 */
const sequencesToPattern = (sequences, all = true) => {
  const min_replacement = sequences.length > 1 ? 1 : 0;

  return arrayToPattern(sequences.map(sequence => {
    const len = all ? sequence.length() : sequence.length() - 1;
    const seq = [];

    for (let j = 0; j < len; j++) {
      seq.push(substringsToPattern(sequence.substrs[j] || '', min_replacement));
    }

    return sequencePattern(seq);
  }));
};
/**
 * Return true if the sequence is already in the sequences.
 *
 * A candidate counts as equivalent when it has the same start and end,
 * its substrings join to the same text, and none of its multi-character
 * parts partially overlaps a multi-character part of the needle.
 *
 * @param {Sequence} needle_seq
 * @param {Sequence[]} sequences
 * @return {boolean}
 */
const inSequences = (needle_seq, sequences) => {
  const needle_parts = needle_seq.parts;
  const needle_text = needle_seq.substrs.join('');

  /**
   * True when `part` partially overlaps one of the needle's parts.
   * The needle parts are scanned in order: an identical part (same start
   * and substr) ends the scan with false, an overlap ends it with true.
   *
   * @param {TSequencePart} part
   */
  const overlapsNeedle = part => {
    for (const needle_part of needle_parts) {
      if (needle_part.start === part.start && needle_part.substr === part.substr) {
        return false;
      }

      // single characters cannot straddle a boundary
      if (part.length === 1 || needle_part.length === 1) {
        continue;
      }

      // check for overlapping parts
      // a = ['::=','==']
      // b = ['::','===']
      // a = ['r','sm']
      // b = ['rs','m']
      if (part.start < needle_part.start && part.end > needle_part.start) {
        return true;
      }

      if (needle_part.start < part.start && needle_part.end > part.start) {
        return true;
      }
    }

    return false;
  };

  for (const seq of sequences) {
    if (seq.start !== needle_seq.start || seq.end !== needle_seq.end) {
      continue;
    }

    if (seq.substrs.join('') !== needle_text) {
      continue;
    }

    if (seq.parts.some(overlapsNeedle)) {
      continue;
    }

    return true;
  }

  return false;
};
/**
 * An ordered list of parts (substrings with their positions) that together
 * describe one way of splitting a stretch of the input string.
 */
class Sequence {
  constructor() {
    /** @type {TSequencePart[]} */
    this.parts = [];

    /** @type {string[]} */
    this.substrs = [];

    // NOTE(review): start begins at 0 and is only ever lowered by add(),
    // so it stays 0 for non-negative part positions.
    this.start = 0;
    this.end = 0;
  }

  /**
   * Append a part and widen start/end to cover it. Does nothing when
   * `part` is undefined.
   *
   * @param {TSequencePart|undefined} part
   */
  add(part) {
    if (part) {
      this.parts.push(part);
      this.substrs.push(part.substr);
      this.start = Math.min(part.start, this.start);
      this.end = Math.max(part.end, this.end);
    }
  }

  /**
   * @return {TSequencePart|undefined} the most recently added part
   */
  last() {
    return this.parts[this.parts.length - 1];
  }

  /**
   * @return {number} the number of parts
   */
  length() {
    return this.parts.length;
  }

  /**
   * Build a copy of this sequence whose last part is cut off at `position`:
   * every part but the last is copied, then the last part is replaced by the
   * portion of `last_piece.substr` that lies before `position`.
   * The sequence must contain at least one part.
   *
   * @param {number} position
   * @param {TSequencePart} last_piece
   * @return {Sequence}
   */
  clone(position, last_piece) {
    const clone = new Sequence();

    // Copy the parts so the clone never shares part objects with this sequence.
    const parts = this.parts.map(part => Object.assign({}, part));
    const last_part = parts.pop();

    for (const part of parts) {
      clone.add(part);
    }

    const last_substr = last_piece.substr.substring(0, position - last_part.start);
    const clone_last_len = last_substr.length;

    clone.add({
      start: last_part.start,
      end: last_part.start + clone_last_len,
      length: clone_last_len,
      substr: last_substr
    });

    return clone;
  }
}
/**
 * Expand a regular expression pattern to include unicode variants
 * 	eg /a/ becomes /aⓐaẚàáâầấẫẩãāăằắẵẳȧǡäǟảåǻǎȁȃạậặḁąⱥɐɑAⒶAÀÁÂẦẤẪẨÃĀĂẰẮẴẲȦǠÄǞẢÅǺǍȀȂẠẬẶḀĄȺⱯ/
 *
 * The string is folded with asciifold() and walked one UTF-16 unit at a
 * time. At every position either the multi-character folded key matched by
 * multi_char_reg or the single character is appended to the sequences being
 * tracked; overlapping multi-character matches fork additional sequences.
 *
 * Issue:
 *  ﺊﺋ [ 'ﺊ = \\u{fe8a}', 'ﺋ = \\u{fe8b}' ]
 *	becomes:	ئئ [ 'ي = \\u{64a}', 'ٔ = \\u{654}', 'ي = \\u{64a}', 'ٔ = \\u{654}' ]
 *
 *	İIJ = IIJ = ⅡJ
 *
 * 	1/2/4
 *
 * @param {string} str
 * @return {string}
 */
const getPattern = str => {
  initialize();

  const folded = asciifold(str);
  let pattern = '';
  let sequences = [new Sequence()];

  for (let i = 0; i < folded.length; i++) {
    const substr = folded.substring(i);
    const match = substr.match(multi_char_reg);
    const char = folded.substring(i, i + 1);
    const match_str = match ? match[0] : null;

    // loop through sequences
    // add either the char or multi_match
    let overlapping = [];

    // '1' = multi-character match appended, '2' = single character appended,
    // '3' = nothing appended
    const added_types = new Set();

    for (const sequence of sequences) {
      const last_piece = sequence.last();

      if (!last_piece || last_piece.length === 1 || last_piece.end <= i) {
        // if we have a multi match
        if (match_str) {
          const len = match_str.length;
          sequence.add({
            start: i,
            end: i + len,
            length: len,
            substr: match_str
          });
          added_types.add('1');
        } else {
          sequence.add({
            start: i,
            end: i + 1,
            length: 1,
            substr: char
          });
          added_types.add('2');
        }
      } else if (match_str) {
        // The last multi-character piece still covers position i: fork a
        // copy whose last piece is cut at i and continue it with this match.
        const clone = sequence.clone(i, last_piece);
        const len = match_str.length;
        clone.add({
          start: i,
          end: i + len,
          length: len,
          substr: match_str
        });
        overlapping.push(clone);
      } else {
        // don't add char
        // adding would create invalid patterns: 234 => [2,34,4]
        added_types.add('3');
      }
    }

    // if we have overlapping
    if (overlapping.length > 0) {
      // ['ii','iii'] before ['i','i','iii']
      overlapping = overlapping.sort((a, b) => {
        return a.length() - b.length();
      });

      for (const clone of overlapping) {
        // don't add if we already have an equivalent sequence
        if (inSequences(clone, sequences)) {
          continue;
        }

        sequences.push(clone);
      }

      continue;
    }

    // if we haven't done anything unique
    // clean up the patterns
    // helps keep patterns smaller
    // if str = 'r₨㎧aarss', pattern will be 446 instead of 655
    if (i > 0 && added_types.size === 1 && !added_types.has('3')) {
      // Emit everything except each sequence's last part, then restart from
      // a single sequence holding the last part of the first sequence.
      pattern += sequencesToPattern(sequences, false);

      const new_seq = new Sequence();
      const old_seq = sequences[0];

      if (old_seq) {
        new_seq.add(old_seq.last());
      }

      sequences = [new_seq];
    }
  }

  pattern += sequencesToPattern(sequences, true);
  return pattern;
};
// Public CommonJS API of the module.
exports._asciifold = _asciifold;
exports.asciifold = asciifold;
exports.code_points = code_points;
exports.escape_regex = escape_regex;
exports.generateMap = generateMap;
exports.generateSets = generateSets;
exports.generator = generator;
exports.getPattern = getPattern;
exports.initialize = initialize;
exports.mapSequence = mapSequence;
exports.normalize = normalize;
exports.substringsToPattern = substringsToPattern;
//# sourceMappingURL=index.js.map