/**
 * @typedef {{[key:string]:string}} TUnicodeMap
 * @typedef {{[key:string]:Set}} TUnicodeSets
 * @typedef {[[number,number]]} TCodePoints
 * @typedef {{folded:string,composed:string,code_point:number}} TCodePointObj
 * @typedef {{start:number,end:number,length:number,substr:string}} TSequencePart
 */

import { setToPattern, arrayToPattern, escape_regex, sequencePattern, toArray } from './regex.mjs';
import { allSubstrings } from './strings.mjs';

/**
 * Default inclusive code point ranges used by initialize() when no ranges are passed.
 * @type {TCodePoints}
 */
export const code_points = [[ 0, 65535 ]];

// Character class of marks that are stripped while folding:
// U+0300-U+036F plus U+00B7, U+02BE and U+02BC.
const accent_pat = '[\u0300-\u036F\u{b7}\u{2be}\u{2bc}]';

/**
 * Map of folded string => regex pattern matching its unicode variants.
 * Stays undefined until initialize() has been called.
 * @type {TUnicodeMap}
 */
export let unicode_map;

/**
 * Anchored regex matching any multi-character folded key; assigned by generateMap().
 * @type {RegExp}
 */
let multi_char_reg;

// Folded strings longer than this (in UTF-16 units) are skipped by generator().
const max_char_length = 3;

// Largest valid Unicode code point; String.fromCodePoint() throws above it.
const max_code_point = 0x10FFFF;

/**
 * Single unicode character => latin replacement; built from latin_condensed below.
 * @type {TUnicodeMap}
 */
const latin_convert = {};

/**
 * Latin replacement => string of characters that should fold to it.
 * @type {TUnicodeMap}
 */
const latin_condensed = {
	'/': '⁄∕',
	'0': '߀',
	"a": "ⱥɐɑ",
	"aa": "ꜳ",
	"ae": "æǽǣ",
	"ao": "ꜵ",
	"au": "ꜷ",
	"av": "ꜹꜻ",
	"ay": "ꜽ",
	"b": "ƀɓƃ",
	"c": "ꜿƈȼↄ",
	"d": "đɗɖᴅƌꮷԁɦ",
	"e": "ɛǝᴇɇ",
	"f": "ꝼƒ",
	"g": "ǥɠꞡᵹꝿɢ",
	"h": "ħⱨⱶɥ",
	"i": "ɨı",
	"j": "ɉȷ",
	"k": "ƙⱪꝁꝃꝅꞣ",
	"l": "łƚɫⱡꝉꝇꞁɭ",
	"m": "ɱɯϻ",
	"n": "ꞥƞɲꞑᴎлԉ",
	"o": "øǿɔɵꝋꝍᴑ",
	"oe": "œ",
	"oi": "ƣ",
	"oo": "ꝏ",
	"ou": "ȣ",
	"p": "ƥᵽꝑꝓꝕρ",
	"q": "ꝗꝙɋ",
	"r": "ɍɽꝛꞧꞃ",
	"s": "ßȿꞩꞅʂ",
	"t": "ŧƭʈⱦꞇ",
	"th": "þ",
	"tz": "ꜩ",
	"u": "ʉ",
	"v": "ʋꝟʌ",
	"vy": "ꝡ",
	"w": "ⱳ",
	"y": "ƴɏỿ",
	"z": "ƶȥɀⱬꝣ",
	"hv": "ƕ"
};

// Expand the condensed table: every character of each value maps back to its key.
for( const latin in latin_condensed ){
	const unicode = latin_condensed[latin] || '';
	for( const char of unicode ){
		latin_convert[char] = latin;
	}
}

// Matches any character that has a latin replacement, or any accent from accent_pat.
const convert_pat = new RegExp(Object.keys(latin_convert).join('|')+'|'+accent_pat,'gu');


/**
 * Initialize the unicode_map from the given code point ranges.
 * Does nothing when unicode_map has already been built.
 *
 * @param {TCodePoints=} _code_points ranges to use; defaults to the exported code_points
 */
export const initialize = (_code_points) => {
	if( unicode_map !== undefined ) return;
	unicode_map = generateMap(_code_points || code_points );
};


/**
 * Helper method for normalizing a string
 * https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/normalize
 * @param {string} str
 * @param {string} form normalization form passed to String.prototype.normalize
 * @return {string}
 */
export const normalize = (str,form='NFKD') => str.normalize(form);


/**
 * Remove accents without reordering string
 * calling str.normalize('NFKD') on \u{594}\u{595}\u{596} becomes \u{596}\u{594}\u{595}
 * via https://github.com/krisk/Fuse/issues/133#issuecomment-318692703
 *
 * Each element returned by toArray(str) is folded on its own with _asciifold()
 * and the results are concatenated in the original order.
 *
 * @param {string} str
 * @return {string}
 */
export const asciifold = (str) => {
	return toArray(str).reduce(
		/**
		 * @param {string} result
		 * @param {string} char
		 */
		(result, char) =>{
			return result + _asciifold(char);
		},
		''
	);
};

/**
 * Fold a string: NFKD-normalize, lower-case, replace every match of convert_pat
 * with its latin_convert entry (matches without an entry, i.e. the accents,
 * are removed) and finally recompose with NFC.
 *
 * @param {string} str
 * @return {string}
 */
export const _asciifold = (str) => {
	str = normalize(str)
		.toLowerCase()
		.replace(convert_pat,(/** @type {string} */ char) => {
			return latin_convert[char] || '';
		});

	return normalize(str,'NFC');
};


/**
 * Generate a list of unicode variants from the list of code points.
 *
 * Characters are built with String.fromCodePoint() so ranges above U+FFFF
 * produce the real character rather than a 16-bit truncated one.
 * Each range is clamped to 0..0x10FFFF, the span fromCodePoint() accepts.
 * Range bounds are expected to be integers.
 *
 * A code point is skipped when its folded form equals its lower-cased form,
 * is empty, or is longer than max_char_length.
 *
 * @param {TCodePoints} code_points inclusive [min,max] ranges
 * @yield {TCodePointObj}
 */
export function* generator(code_points){

	for(const [code_point_min, code_point_max] of code_points){

		const first = Math.max(code_point_min, 0);
		const last = Math.min(code_point_max, max_code_point);

		for(let i = first; i <= last; i++){

			const composed = String.fromCodePoint(i);
			const folded = asciifold(composed);

			// folding changed nothing but (possibly) the case
			if( folded === composed.toLowerCase() ){
				continue;
			}

			// skip when folded is a string longer than 3 characters long
			// bc the resulting regex patterns will be long
			// eg:
			// folded صلى الله عليه وسلم length 18 code point 65018
			// folded جل جلاله length 8 code point 65019
			if( folded.length > max_char_length ){
				continue;
			}

			if( folded.length === 0 ){
				continue;
			}

			yield {folded:folded,composed:composed,code_point:i};
		}
	}
}


/**
 * Generate the unicode sets from the list of code points.
 * The result maps each folded string to a Set of regex-escaped strings
 * (the folded string itself and its composed variants).
 *
 * @param {TCodePoints} code_points
 * @return {TUnicodeSets}
 */
export const generateSets = (code_points) => {

	/** @type {{[key:string]:Set}} */
	const unicode_sets = {};

	/**
	 * Add to_add to the set of folded unless the set's current pattern
	 * already matches it (case-insensitively).
	 *
	 * @param {string} folded
	 * @param {string} to_add
	 */
	const addMatching = (folded,to_add) => {

		/** @type {Set} */
		const folded_set = unicode_sets[folded] || new Set();

		const patt = new RegExp( '^'+setToPattern(folded_set)+'$','iu');
		if( to_add.match(patt) ){
			return;
		}

		folded_set.add(escape_regex(to_add));
		unicode_sets[folded] = folded_set;
	};

	for( const value of generator(code_points) ){
		addMatching(value.folded,value.folded);
		addMatching(value.folded,value.composed);
	}

	return unicode_sets;
};


/**
 * Generate a unicode map from the list of code points
 * ae => (?:(?:ae|Æ|Ǽ|Ǣ)|(?:A|Ⓐ|Ａ...)(?:E|ɛ|Ⓔ...))
 *
 * Side effect: rebuilds the module-level multi_char_reg from every folded key
 * that is longer than one character.
 *
 * @param {TCodePoints} code_points
 * @return {TUnicodeMap}
 */
export const generateMap = (code_points) => {

	/** @type {TUnicodeSets} */
	const unicode_sets = generateSets(code_points);

	/** @type {TUnicodeMap} */
	const unicode_map = {};

	/** @type {string[]} */
	let multi_char = [];

	for( const folded in unicode_sets ){

		const set = unicode_sets[folded];
		if( set ){
			unicode_map[folded] = setToPattern(set);
		}

		if( folded.length > 1 ){
			multi_char.push(escape_regex(folded));
		}
	}

	// longest alternatives first
	multi_char.sort((a, b) => b.length - a.length );
	const multi_char_patt = arrayToPattern(multi_char);
	multi_char_reg = new RegExp('^'+multi_char_patt,'u');

	return unicode_map;
};


/**
 * Map each element of an array from its folded value to all possible unicode matches
 *
 * Relies on unicode_map, so initialize() must have been called beforehand.
 *
 * @param {string[]} strings
 * @param {number} min_replacement minimum number of characters that must have been replaced
 * @return {string} the sequence pattern, or '' when fewer than min_replacement characters were replaced
 */
export const mapSequence = (strings,min_replacement=1) =>{
	let chars_replaced = 0;

	const mapped = strings.map((str)=>{
		if( unicode_map[str] ){
			chars_replaced += str.length;
		}
		return unicode_map[str] || str;
	});

	if( chars_replaced >= min_replacement ){
		return sequencePattern(mapped);
	}

	return '';
};

/**
 * Convert a short string and split it into all possible patterns
 * Keep a pattern only if min_replacement is met
 *
 * 'abc'
 * 		=> [['abc'],['ab','c'],['a','bc'],['a','b','c']]
 *		=> ['abc-pattern','ab-c-pattern'...]
 *
 * The effective minimum is never lower than str.length-1.
 *
 * @param {string} str
 * @param {number} min_replacement
 * @return {string}
 */
export const substringsToPattern = (str,min_replacement=1) => {

	const min = Math.max(min_replacement,str.length-1);

	return arrayToPattern(
		allSubstrings(str).map( (sub_pat) =>{
			return mapSequence(sub_pat,min);
		})
	);
};

/**
 * Convert an array of sequences into a pattern
 * [{start:0,end:3,length:3,substr:'iii'}...] => (?:iii...)
 *
 * @param {Sequence[]} sequences
 * @param {boolean} all when false, the last part of every sequence is left out
 * @return {string}
 */
const sequencesToPattern = (sequences,all=true) => {

	const min_replacement = sequences.length > 1 ? 1 : 0;

	return arrayToPattern(
		sequences.map( (sequence) =>{
			const seq = [];
			const len = all ? sequence.length() : sequence.length() - 1;
			for( let j = 0; j < len; j++){
				seq.push(substringsToPattern(sequence.substrs[j]||'',min_replacement));
			}

			return sequencePattern(seq);
		})
	);
};

/**
 * Return true if the sequence is already in the sequences
 *
 * A candidate counts as equivalent when it has the same start and end, the
 * same joined substrings, and none of its parts overlaps a part of needle_seq
 * in a different way.
 *
 * @param {Sequence} needle_seq
 * @param {Sequence[]} sequences
 * @return {boolean}
 */
const inSequences = (needle_seq, sequences) => {

	for(const seq of sequences){

		if( seq.start !== needle_seq.start || seq.end !== needle_seq.end ){
			continue;
		}

		if( seq.substrs.join('') !== needle_seq.substrs.join('') ){
			continue;
		}

		const needle_parts = needle_seq.parts;

		/**
		 * Returns true for a part of seq that conflicts with a part of needle_seq
		 * @param {TSequencePart} part
		 */
		const filter = (part) =>{

			for(const needle_part of needle_parts){

				// identical part: no conflict
				if( needle_part.start === part.start && needle_part.substr === part.substr ){
					return false;
				}

				if( part.length === 1 || needle_part.length === 1 ){
					continue;
				}

				// check for overlapping parts
				// a = ['::=','==']
				// b = ['::','===']
				// a = ['r','sm']
				// b = ['rs','m']
				if( part.start < needle_part.start && part.end > needle_part.start ){
					return true;
				}

				if( needle_part.start < part.start && needle_part.end > part.start ){
					return true;
				}
			}

			return false;
		};

		const filtered = seq.parts.filter(filter);

		if( filtered.length > 0 ){
			continue;
		}

		return true;
	}

	return false;
};

/**
 * An ordered list of parts (substrings with their positions) of the folded string.
 */
class Sequence{

	constructor(){

		/** @type {TSequencePart[]} */
		this.parts = [];

		/** @type {string[]} */
		this.substrs = [];
		this.start = 0;
		this.end = 0;
	}

	/**
	 * Append a part and widen start/end to include it. Ignored when part is falsy.
	 * @param {TSequencePart|undefined} part
	 */
	add(part){
		if( part ){
			this.parts.push(part);
			this.substrs.push(part.substr);
			this.start = Math.min(part.start,this.start);
			this.end = Math.max(part.end,this.end);
		}
	}

	/**
	 * @return {TSequencePart|undefined} the most recently added part
	 */
	last(){
		return this.parts[this.parts.length-1];
	}

	/**
	 * @return {number} the number of parts
	 */
	length(){
		return this.parts.length;
	}

	/**
	 * Copy this sequence, replacing its last part with a version of last_piece
	 * truncated so that it ends at position.
	 * Expects the sequence to hold at least one part.
	 *
	 * @param {number} position
	 * @param {TSequencePart} last_piece
	 * @return {Sequence}
	 */
	clone(position, last_piece){
		const clone = new Sequence();

		// deep copy so the clone never shares part objects with this sequence
		const parts = JSON.parse(JSON.stringify(this.parts));
		const last_part = parts.pop();
		for( const part of parts ){
			clone.add(part);
		}

		const last_substr = last_piece.substr.substring(0,position-last_part.start);
		const clone_last_len = last_substr.length;
		clone.add({start:last_part.start,end:last_part.start+clone_last_len,length:clone_last_len,substr:last_substr});

		return clone;
	}

}

/**
 * Expand a regular expression pattern to include unicode variants
 * 	eg /a/ becomes /aⓐａẚàáâầấẫẩãāăằắẵẳȧǡäǟảåǻǎȁȃạậặḁąⱥɐɑAⒶＡÀÁÂẦẤẪẨÃĀĂẰẮẴẲȦǠÄǞẢÅǺǍȀȂẠẬẶḀĄȺⱯ/
 *
 * Issue:
 *  ﺊﺋ [ 'ﺊ = \\u{fe8a}', 'ﺋ = \\u{fe8b}' ]
 *	becomes:	ئئ [ 'ي = \\u{64a}', 'ٔ = \\u{654}', 'ي = \\u{64a}', 'ٔ = \\u{654}' ]
 *
 *	İĲ = IIJ = ⅡJ
 *
 * 	1/2/4
 *
 * Calls initialize() first, so the default code point ranges are used unless
 * initialize() was already called with custom ranges.
 *
 * @param {string} str
 * @return {string|undefined} as written, always a string (possibly empty)
 */
export const getPattern = (str) => {
	initialize();

	str = asciifold(str);

	let pattern = '';
	let sequences = [new Sequence()];

	for( let i = 0; i < str.length; i++ ){

		const substr = str.substring(i);
		const match = substr.match(multi_char_reg);
		const char = str.substring(i,i+1);
		const match_str = match ? match[0] : null;

		// loop through sequences
		// add either the char or multi_match
		let overlapping = [];

		// '1' = multi-char match appended, '2' = single char appended, '3' = nothing appended
		const added_types = new Set();

		for(const sequence of sequences){

			const last_piece = sequence.last();

			if( !last_piece || last_piece.length === 1 || last_piece.end <= i ){

				// if we have a multi match
				if( match_str ){
					const len = match_str.length;
					sequence.add({start:i,end:i+len,length:len,substr:match_str});
					added_types.add('1');
				}else{
					sequence.add({start:i,end:i+1,length:1,substr:char});
					added_types.add('2');
				}

			}else if( match_str ){

				// the match starts inside the last multi-char part: branch off a clone
				const clone = sequence.clone(i,last_piece);

				const len = match_str.length;
				clone.add({start:i,end:i+len,length:len,substr:match_str});

				overlapping.push(clone);

			}else{
				// don't add char
				// adding would create invalid patterns: 234 => [2,34,4]
				added_types.add('3');
			}
		}

		// if we have overlapping
		if( overlapping.length > 0 ){

			// ['ii','iii'] before ['i','i','iii']
			overlapping.sort((a,b)=>{
				return a.length() - b.length();
			});

			for( const clone of overlapping){

				// don't add if we already have an equivalent sequence
				if( inSequences(clone, sequences) ){
					continue;
				}

				sequences.push(clone);
			}

			continue;
		}

		// if we haven't done anything unique
		// clean up the patterns
		// helps keep patterns smaller
		// if str = 'r₨㎧aarss', pattern will be 446 instead of 655
		if( i > 0 && added_types.size === 1 && !added_types.has('3') ){
			pattern += sequencesToPattern(sequences,false);
			const new_seq = new Sequence();
			const old_seq = sequences[0];
			if( old_seq ){
				new_seq.add(old_seq.last());
			}
			sequences = [new_seq];
		}

	}

	pattern += sequencesToPattern(sequences,true);

	return pattern;
};

export { escape_regex };