1 /*
  2  * NormString.js - ilib normalized string subclass definition
  3  *
  4  * Copyright © 2013-2015, 2018, JEDLSoft
  5  *
  6  * Licensed under the Apache License, Version 2.0 (the "License");
  7  * you may not use this file except in compliance with the License.
  8  * You may obtain a copy of the License at
  9  *
 10  *     http://www.apache.org/licenses/LICENSE-2.0
 11  *
 12  * Unless required by applicable law or agreed to in writing, software
 13  * distributed under the License is distributed on an "AS IS" BASIS,
 14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15  *
 16  * See the License for the specific language governing permissions and
 17  * limitations under the License.
 18  */
 19 
 20 // !data ccc nfd nfc nfkd nfkc
 21 
 22 var ilib = require("./ilib.js");
 23 var Utils = require("./Utils.js");
 24 var JSUtils = require("./JSUtils.js");
 25 
 26 var IString = require("./IString.js");
 27 var GlyphString = require("./GlyphString.js");
 28 
 29 /**
 30  * @class
 31  * Create a new normalized string instance. This string inherits from
 32  * the GlyphString class, and adds the normalize method. It can be
 33  * used anywhere that a normal Javascript string is used. <p>
 34  *
 35  * The options parameter is optional, and may contain any combination
 36  * of the following properties:<p>
 37  *
 38  * <ul>
 39  * <li><i>onLoad</i> - a callback function to call when the locale data are
 40  * fully loaded. When the onLoad option is given, this object will attempt to
 41  * load any missing locale data using the ilib loader callback.
 42  * When the constructor is done (even if the data is already preassembled), the
 43  * onLoad function is called with the current instance as a parameter, so this
 44  * callback can be used with preassembled or dynamic loading or a mix of the two.
 45  *
 46  * <li><i>sync</i> - tell whether to load any missing locale data synchronously or
 47  * asynchronously. If this option is given as "false", then the "onLoad"
 48  * callback must be given, as the instance returned from this constructor will
 49  * not be usable for a while.
 50  *
 51  * <li><i>loadParams</i> - an object containing parameters to pass to the
 52  * loader callback function when locale data is missing. The parameters are not
 53  * interpretted or modified in any way. They are simply passed along. The object
 54  * may contain any property/value pairs as long as the calling code is in
 55  * agreement with the loader callback function as to what those parameters mean.
 56  * </ul>
 57  *
 58  * @constructor
 59  * @extends GlyphString
 60  * @param {string|IString=} str initialize this instance with this string
 61  * @param {Object=} options options governing the way this instance works
 62  */
 63 var NormString = function (str, options) {
 64     GlyphString.call(this, str, options);
 65 };
 66 
 67 NormString.prototype = new GlyphString("", {noinstance:true});
 68 NormString.prototype.parent = GlyphString;
 69 NormString.prototype.constructor = NormString;
 70 
 71 /**
 72  * Initialize the normalized string routines statically. This
 73  * is intended to be called in a dynamic-load version of ilib
 74  * to load the data needed to normalize strings before any instances
 75  * of NormString are created.<p>
 76  *
 77  * The options parameter may contain any of the following properties:
 78  *
 79  * <ul>
 80  * <li><i>form</i> - {string} the normalization form to load
 81  * <li><i>script</i> - {string} load the normalization for this script. If the
 82  * script is given as "all" then the normalization data for all scripts
 83  * is loaded at the same time
 84  * <li><i>sync</i> - {boolean} whether to load the files synchronously or not
 85  * <li><i>loadParams</i> - {Object} parameters to the loader function
 86  * <li><i>onLoad</i> - {function()} a function to call when the
 87  * files are done being loaded
 88  * </ul>
 89  *
 90  * @param {Object} options an object containing properties that govern
 91  * how to initialize the data
 92  */
 93 
 94 NormString.init = function(options) {
 95     var form = "nfkc";
 96     var script = "all";
 97     var sync = true;
 98     var loadParams = undefined;
 99 
100     if (options) {
101         if (options.form) {
102             form = options.form;
103         }
104         if (options.script) {
105             script = options.script;
106         }
107         if (options.loadParams) {
108             loadParams = options.loadParams;
109         }
110         if (typeof(options.sync) === 'boolean') {
111             sync = options.sync;
112         }
113     }
114 
115     var formDependencies = {
116         "nfd": ["nfd"],
117         "nfc": ["nfc", "nfd"],
118         "nfkd": ["nfkd", "nfd"],
119         "nfkc": ["nfc", "nfkd", "nfd"]
120     };
121     var files = [];
122     var forms = formDependencies[form];
123     var toLoad = [];
124     forms.forEach(function(f) {
125         if (!ilib.data.norm[f] || JSUtils.isEmpty(ilib.data.norm[f])) {
126             files.push(f + "/" + script + ".json");
127             toLoad.push(f);
128         }
129     });
130 
131     if (files.length || !ilib.data.ccc || JSUtils.isEmpty(ilib.data.ccc)) {
132         Utils.loadData({
133             object: "NormString",
134             name: "ccc.json",
135             locale: "-",
136             nonlocale: true,
137             sync: sync,
138             loadParams: loadParams,
139             callback: ilib.bind(this, function(normdata){
140                 ilib.data.ccc = normdata;
141 
142                 if (files.length) {
143                     //console.log("loading files " + JSON.stringify(files));
144                     Utils._callLoadData(files, sync, loadParams, function(arr) {
145                         for (var i = 0; i < arr.length; i++) {
146                             if (typeof(arr[i]) !== 'undefined') {
147                                 ilib.extend(ilib.data.norm[toLoad[i]], arr[i]);
148                             }
149                         }
150                         if (options && typeof(options.onLoad) === 'function') {
151                            options.onLoad(this);
152                         }
153                     });
154                 } else {
155                     if (options && typeof(options.onLoad) === 'function') {
156                         options.onLoad(this);
157                     }
158                 }
159             })
160         })
161     } else {
162         if (options && typeof(options.onLoad) === 'function') {
163             options.onLoad(this);
164         }
165     }
166 }
167 
168 /**
169  * Algorithmically decompose a precomposed Korean syllabic Hangul
170  * character into its individual combining Jamo characters. The given
171  * character must be in the range of Hangul characters U+AC00 to U+D7A3.
172  *
173  * @private
174  * @static
175  * @param {number} cp code point of a Korean Hangul character to decompose
176  * @return {string} the decomposed string of Jamo characters
177  */
178 NormString._decomposeHangul = function (cp) {
179     var sindex = cp - 0xAC00;
180     var result = String.fromCharCode(0x1100 + sindex / 588) +
181             String.fromCharCode(0x1161 + (sindex % 588) / 28);
182     var t = sindex % 28;
183     if (t !== 0) {
184         result += String.fromCharCode(0x11A7 + t);
185     }
186     return result;
187 };
188 
189 /**
190  * Expand one character according to the given canonical and
191  * compatibility mappings.
192  *
193  * @private
194  * @static
195  * @param {string} ch character to map
196  * @param {Object} canon the canonical mappings to apply
197  * @param {Object=} compat the compatibility mappings to apply, or undefined
198  * if only the canonical mappings are needed
199  * @return {string} the mapped character
200  */
201 NormString._expand = function (ch, canon, compat) {
202     var i,
203         expansion = "",
204         n = ch.charCodeAt(0);
205     if (GlyphString._isHangul(n)) {
206         expansion = NormString._decomposeHangul(n);
207     } else {
208         var result = canon[ch];
209         if (!result && compat) {
210             result = compat[ch];
211         }
212         if (result && result !== ch) {
213             for (i = 0; i < result.length; i++) {
214                 expansion += NormString._expand(result[i], canon, compat);
215             }
216         } else {
217             expansion = ch;
218         }
219     }
220     return expansion;
221 };
222 
223 /**
224  * Perform the Unicode Normalization Algorithm upon the string and return
225  * the resulting new string. The current string is not modified.
226  *
227  * <h2>Forms</h2>
228  *
229  * The forms of possible normalizations are defined by the <a
230  * href="http://www.unicode.org/reports/tr15/">Unicode Standard
231  * Annex (UAX) 15</a>. The form parameter is a string that may have one
232  * of the following values:
233  *
234  * <ul>
235  * <li>nfd - Canonical decomposition. This decomposes characters into
236  * their exactly equivalent forms. For example, "ü" would decompose
237  * into a "u" followed by the combining diaeresis character.
238  * <li>nfc - Canonical decomposition followed by canonical composition.
239  * This decomposes and then recomposes character into their shortest
240  * exactly equivalent forms by recomposing as many combining characters
241  * as possible. For example, "ü" followed by a combining
242  * macron character would decompose into a "u" followed by the combining
243  * macron characters the combining diaeresis character, and then be recomposed into
244  * the u with macron and diaeresis "ṻ" character. The reason that
245  * the "nfc" form decomposes and then recomposes is that combining characters
246  * have a specific order under the Unicode Normalization Algorithm, and
247  * partly composed characters such as the "ü" followed by combining
248  * marks may change the order of the combining marks when decomposed and
249  * recomposed.
250  * <li>nfkd - Compatibility decomposition. This decomposes characters
251  * into compatible forms that may not be exactly equivalent semantically,
252  * as well as performing canonical decomposition as well.
253  * For example, the "œ" ligature character decomposes to the two
254  * characters "oe" because they are compatible even though they are not
255  * exactly the same semantically.
256  * <li>nfkc - Compatibility decomposition followed by canonical composition.
257  * This decomposes characters into compatible forms, then recomposes
258  * characters using the canonical composition. That is, it breaks down
259  * characters into the compatible forms, and then recombines all combining
260  * marks it can with their base characters. For example, the character
261  * "ǽ" would be normalized to "aé" by first decomposing
262  * the character into "a" followed by "e" followed by the combining acute accent
263  * combining mark, and then recomposed to an "a" followed by the "e"
264  * with acute accent.
265  * </ul>
266  *
267  * <h2>Operation</h2>
268  *
269  * Two strings a and b can be said to be canonically equivalent if
270  * normalize(a) = normalize(b)
271  * under the nfc normalization form. Two strings can be said to be compatible if
272  * normalize(a) = normalize(b) under the nfkc normalization form.<p>
273  *
274  * The canonical normalization is often used to see if strings are
275  * equivalent to each other, and thus is useful when implementing parsing
276  * algorithms or exact matching algorithms. It can also be used to ensure
277  * that any string output produces a predictable sequence of characters.<p>
278  *
279  * Compatibility normalization
280  * does not always preserve the semantic meaning of all the characters,
281  * although this is sometimes the behaviour that you are after. It is useful,
282  * for example, when doing searches of user-input against text in documents
283  * where the matches are supposed to "fuzzy". In this case, both the query
284  * string and the document string would be mapped to their compatibility
285  * normalized forms, and then compared.<p>
286  *
287  * Compatibility normalization also does not guarantee round-trip conversion
288  * to and from legacy character sets as the normalization is "lossy". It is
289  * akin to doing a lower- or upper-case conversion on text -- after casing,
290  * you cannot tell what case each character is in the original string. It is
291  * good for matching and searching, but it rarely good for output because some
292  * distinctions or meanings in the original text have been lost.<p>
293  *
294  * Note that W3C normalization for HTML also escapes and unescapes
295  * HTML character entities such as "&uuml;" for u with diaeresis. This
296  * method does not do such escaping or unescaping. If normalization is required
297  * for HTML strings with entities, unescaping should be performed on the string
298  * prior to calling this method.<p>
299  *
300  * <h2>Data</h2>
301  *
302  * Normalization requires a fair amount of mapping data, much of which you may
303  * not need for the characters expected in your texts. It is possible to assemble
304  * a copy of ilib that saves space by only including normalization data for
305  * those scripts that you expect to encounter in your data.<p>
306  *
307  * The normalization data is organized by normalization form and within there
308  * by script. To include the normalization data for a particular script with
309  * a particular normalization form, use the following require:
310  *
311  * <pre><code>
312  * NormString.init({
313  *   form: "<form>",
314  *   script: "<script>"
315  * });
316  * </code></pre>
317  *
318  * Where <form> is the normalization form ("nfd", "nfc", "nfkd", or "nfkc"), and
319  * <script> is the ISO 15924 code for the script you would like to
320  * support. Example: to load in the NFC data for Cyrillic, you would use:
321  *
322  * <pre><code>
323  * NormString.init({
324  *   form: "nfc",
325  *   script: "Cyrl"
326  * });
327  * </code></pre>
328  *
329  * Note that because certain normalization forms include others in their algorithm,
330  * their data also depends on the data for the other forms. For example, if you
331  * include the "nfc" data for a script, you will automatically get the "nfd" data
332  * for that same script as well because the NFC algorithm does NFD normalization
333  * first. Here are the dependencies:<p>
334  *
335  * <ul>
336  * <li>NFD -> no dependencies
337  * <li>NFC -> NFD
338  * <li>NFKD -> NFD
339  * <li>NFKC -> NFKD, NFD, NFC
340  * </ul>
341  *
342  * A special value for the script dependency is "all" which will cause the data for
343  * all scripts
344  * to be loaded for that normalization form. This would be useful if you know that
345  * you are going to normalize a lot of multilingual text or cannot predict which scripts
346  * will appear in the input. Because the NFKC form depends on all others, you can
347  * get all of the data for all forms automatically by depending on "nfkc/all.js".
348  * Note that the normalization data for practically all script automatically depend
349  * on data for the Common script (code "Zyyy") which contains all of the characters
350  * that are commonly used in many different scripts. Examples of characters in the
351  * Common script are the ASCII punctuation characters, or the ASCII Arabic
352  * numerals "0" through "9".<p>
353  *
354  * By default, none of the data for normalization is automatically
355  * included in the preassembled ilib files. (For size "full".)
356  * If you would like to normalize strings, you must assemble
357  * your own copy of ilib and explicitly include the normalization data
358  * for those scripts. This normalization method will
359  * produce output, even without the normalization data. However, the output will be
360  * simply the same thing as its input for all scripts
361  * except Korean Hangul and Jamo, which are decomposed and recomposed
362  * algorithmically and therefore do not rely on data.<p>
363  *
364  * If characters are encountered for which there are no normalization data, they
365  * will be passed through to the output string unmodified.
366  *
367  * @param {string} form The normalization form requested
368  * @return {IString} a new instance of an IString that has been normalized
369  * according to the requested form. The current instance is not modified.
370  */
371 NormString.prototype.normalize = function (form) {
372     var i;
373 
374     if (typeof(form) !== 'string' || this.str.length === 0) {
375         return new IString(this.str);
376     }
377 
378     var nfc = false,
379         nfkd = false;
380 
381     switch (form) {
382     default:
383         break;
384 
385     case "nfc":
386         nfc = true;
387         break;
388 
389     case "nfkd":
390         nfkd = true;
391         break;
392 
393     case "nfkc":
394         nfkd = true;
395         nfc = true;
396         break;
397     }
398 
399     // decompose
400     var ch, it, decomp = "";
401 
402     if (nfkd) {
403         it = IString.prototype.charIterator.call(this);
404         while (it.hasNext()) {
405             ch = it.next();
406             decomp += NormString._expand(ch, ilib.data.norm.nfd, ilib.data.norm.nfkd);
407         }
408     } else {
409         it = IString.prototype.charIterator.call(this);
410         while (it.hasNext()) {
411             ch = it.next();
412             decomp += NormString._expand(ch, ilib.data.norm.nfd);
413         }
414     }
415 
416     // now put the combining marks in a fixed order by
417     // sorting on the combining class
418     function compareByCCC(left, right) {
419         return ilib.data.ccc[left] - ilib.data.ccc[right];
420     }
421 
422     function ccc(c) {
423         return ilib.data.ccc[c] || 0;
424     }
425 
426     function sortChars(arr, comp) {
427         // qt/qml's Javascript engine re-arranges entries that are equal to
428         // each other. Technically, that is a correct behaviour, but it is
429         // not desirable. All the other engines leave equivalent entries
430         // where they are. This bubblesort emulates what the other engines
431         // do. Fortunately, the arrays we are sorting are a max of 5 or 6
432         // entries, so performance is not a big deal here.
433         if (ilib._getPlatform() === "qt") {
434             var tmp;
435             for (var i = arr.length-1; i > 0; i--) {
436                 for (var j = 0; j < i; j++) {
437                     if (comp(arr[j], arr[j+1]) > 0) {
438                         tmp = arr[j];
439                         arr[j] = arr[j+1];
440                         arr[j+1] = tmp;
441                     }
442                 }
443             }
444             return arr;
445         } else {
446             return arr.sort(comp);
447         }
448     }
449 
450     var dstr = new IString(decomp);
451     it = dstr.charIterator();
452     var end, testChar, cpArray = [];
453 
454     // easier to deal with as an array of chars
455     while (it.hasNext()) {
456         cpArray.push(it.next());
457     }
458 
459     i = 0;
460     while (i < cpArray.length) {
461         if (typeof(ilib.data.ccc[cpArray[i]]) !== 'undefined' && ccc(cpArray[i]) !== 0) {
462             // found a non-starter... rearrange all the non-starters until the next starter
463             end = i+1;
464             while (end < cpArray.length &&
465                     typeof(ilib.data.ccc[cpArray[end]]) !== 'undefined' &&
466                     ccc(cpArray[end]) !== 0) {
467                 end++;
468             }
469 
470             // simple sort of the non-starter chars
471             if (end - i > 1) {
472                 cpArray = cpArray.slice(0,i).concat(sortChars(cpArray.slice(i, end), compareByCCC), cpArray.slice(end));
473             }
474         }
475         i++;
476     }
477 
478     if (nfc) {
479         i = 0;
480         while (i < cpArray.length) {
481             if (typeof(ilib.data.ccc[cpArray[i]]) === 'undefined' || ilib.data.ccc[cpArray[i]] === 0) {
482                 // found a starter... find all the non-starters until the next starter. Must include
483                 // the next starter because under some odd circumstances, two starters sometimes recompose
484                 // together to form another character
485                 end = i+1;
486                 var notdone = true;
487                 while (end < cpArray.length && notdone) {
488                     if (typeof(ilib.data.ccc[cpArray[end]]) !== 'undefined' &&
489                         ilib.data.ccc[cpArray[end]] !== 0) {
490                         if (ccc(cpArray[end-1]) < ccc(cpArray[end])) {
491                             // not blocked
492                             testChar = GlyphString._compose(cpArray[i], cpArray[end]);
493                             if (typeof(testChar) !== 'undefined') {
494                                 cpArray[i] = testChar;
495 
496                                 // delete the combining char
497                                 cpArray.splice(end,1);
498 
499                                 // restart the iteration, just in case there is more to recompose with the new char
500                                 end = i;
501                             }
502                         }
503                         end++;
504                     } else {
505                         // found the next starter. See if this can be composed with the previous starter
506                         testChar = GlyphString._compose(cpArray[i], cpArray[end]);
507                         if (ccc(cpArray[end-1]) === 0 && typeof(testChar) !== 'undefined') {
508                             // not blocked and there is a mapping
509                             cpArray[i] = testChar;
510 
511                             // delete the combining char
512                             cpArray.splice(end,1);
513 
514                             // restart the iteration, just in case there is more to recompose with the new char
515                             end = i+1;
516                         } else {
517                             // finished iterating
518                             notdone = false;
519                         }
520                     }
521                 }
522             }
523             i++;
524         }
525     }
526 
527     return new IString(cpArray.length > 0 ? cpArray.join("") : "");
528 };
529 
530 /**
531  * @override
532  * Return an iterator that will step through all of the characters
533  * in the string one at a time, taking care to step through decomposed
534  * characters and through surrogate pairs in UTF-16 encoding
535  * properly. <p>
536  *
537  * The NormString class will return decomposed Unicode characters
538  * as a single unit that a user might see on the screen. If the
539  * next character in the iteration is a base character and it is
540  * followed by combining characters, the base and all its following
541  * combining characters are returned as a single unit.<p>
542  *
543  * The standard Javascript String's charAt() method only
544  * returns information about a particular 16-bit character in the
545  * UTF-16 encoding scheme.
546  * If the index is pointing to a low- or high-surrogate character,
547  * it will return that surrogate character rather
548  * than the surrogate pair which represents a character
549  * in the supplementary planes.<p>
550  *
551  * The iterator instance returned has two methods, hasNext() which
552  * returns true if the iterator has more characters to iterate through,
553  * and next() which returns the next character.<p>
554  *
555  * @return {Object} an iterator
556  * that iterates through all the characters in the string
557  */
558 NormString.prototype.charIterator = function() {
559     var it = IString.prototype.charIterator.call(this);
560 
561     /**
562      * @constructor
563      */
564     function _chiterator (istring) {
565         /**
566          * @private
567          */
568         var ccc = function(c) {
569             return ilib.data.ccc[c] || 0;
570         };
571 
572         this.index = 0;
573         this.hasNext = function () {
574             return !!this.nextChar || it.hasNext();
575         };
576         this.next = function () {
577             var ch = this.nextChar || it.next(),
578                 prevCcc = ccc(ch),
579                 nextCcc,
580                 composed = ch;
581 
582             this.nextChar = undefined;
583 
584             if (ilib.data.ccc &&
585                     (typeof(ilib.data.ccc[ch]) === 'undefined' || ccc(ch) === 0)) {
586                 // found a starter... find all the non-starters until the next starter. Must include
587                 // the next starter because under some odd circumstances, two starters sometimes recompose
588                 // together to form another character
589                 var notdone = true;
590                 while (it.hasNext() && notdone) {
591                     this.nextChar = it.next();
592                     nextCcc = ccc(this.nextChar);
593                     if (typeof(ilib.data.ccc[this.nextChar]) !== 'undefined' && nextCcc !== 0) {
594                         ch += this.nextChar;
595                         this.nextChar = undefined;
596                     } else {
597                         // found the next starter. See if this can be composed with the previous starter
598                         var testChar = GlyphString._compose(composed, this.nextChar);
599                         if (prevCcc === 0 && typeof(testChar) !== 'undefined') {
600                             // not blocked and there is a mapping
601                             composed = testChar;
602                             ch += this.nextChar;
603                             this.nextChar = undefined;
604                         } else {
605                             // finished iterating, leave this.nextChar for the next next() call
606                             notdone = false;
607                         }
608                     }
609                     prevCcc = nextCcc;
610                 }
611             }
612             return ch;
613         };
614     };
615     return new _chiterator(this);
616 };
617 
618 module.exports = NormString;
619