1 /* 2 * NormString.js - ilib normalized string subclass definition 3 * 4 * Copyright © 2013-2015, 2018, JEDLSoft 5 * 6 * Licensed under the Apache License, Version 2.0 (the "License"); 7 * you may not use this file except in compliance with the License. 8 * You may obtain a copy of the License at 9 * 10 * http://www.apache.org/licenses/LICENSE-2.0 11 * 12 * Unless required by applicable law or agreed to in writing, software 13 * distributed under the License is distributed on an "AS IS" BASIS, 14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 * 16 * See the License for the specific language governing permissions and 17 * limitations under the License. 18 */ 19 20 // !data ccc nfd nfc nfkd nfkc 21 22 var ilib = require("./ilib.js"); 23 var Utils = require("./Utils.js"); 24 var JSUtils = require("./JSUtils.js"); 25 26 var IString = require("./IString.js"); 27 var GlyphString = require("./GlyphString.js"); 28 29 /** 30 * @class 31 * Create a new normalized string instance. This string inherits from 32 * the GlyphString class, and adds the normalize method. It can be 33 * used anywhere that a normal Javascript string is used. <p> 34 * 35 * The options parameter is optional, and may contain any combination 36 * of the following properties:<p> 37 * 38 * <ul> 39 * <li><i>onLoad</i> - a callback function to call when the locale data are 40 * fully loaded. When the onLoad option is given, this object will attempt to 41 * load any missing locale data using the ilib loader callback. 42 * When the constructor is done (even if the data is already preassembled), the 43 * onLoad function is called with the current instance as a parameter, so this 44 * callback can be used with preassembled or dynamic loading or a mix of the two. 45 * 46 * <li><i>sync</i> - tell whether to load any missing locale data synchronously or 47 * asynchronously. If this option is given as "false", then the "onLoad" 48 * callback must be given, as the instance returned from this constructor will 49 * not be usable for a while. 50 * 51 * <li><i>loadParams</i> - an object containing parameters to pass to the 52 * loader callback function when locale data is missing. The parameters are not 53 * interpretted or modified in any way. They are simply passed along. The object 54 * may contain any property/value pairs as long as the calling code is in 55 * agreement with the loader callback function as to what those parameters mean. 56 * </ul> 57 * 58 * @constructor 59 * @extends GlyphString 60 * @param {string|IString=} str initialize this instance with this string 61 * @param {Object=} options options governing the way this instance works 62 */ 63 var NormString = function (str, options) { 64 GlyphString.call(this, str, options); 65 }; 66 67 NormString.prototype = new GlyphString("", {noinstance:true}); 68 NormString.prototype.parent = GlyphString; 69 NormString.prototype.constructor = NormString; 70 71 /** 72 * Initialize the normalized string routines statically. This 73 * is intended to be called in a dynamic-load version of ilib 74 * to load the data needed to normalize strings before any instances 75 * of NormString are created.<p> 76 * 77 * The options parameter may contain any of the following properties: 78 * 79 * <ul> 80 * <li><i>form</i> - {string} the normalization form to load 81 * <li><i>script</i> - {string} load the normalization for this script. If the 82 * script is given as "all" then the normalization data for all scripts 83 * is loaded at the same time 84 * <li><i>sync</i> - {boolean} whether to load the files synchronously or not 85 * <li><i>loadParams</i> - {Object} parameters to the loader function 86 * <li><i>onLoad</i> - {function()} a function to call when the 87 * files are done being loaded 88 * </ul> 89 * 90 * @param {Object} options an object containing properties that govern 91 * how to initialize the data 92 */ 93 94 NormString.init = function(options) { 95 var form = "nfkc"; 96 var script = "all"; 97 var sync = true; 98 var loadParams = undefined; 99 100 if (options) { 101 if (options.form) { 102 form = options.form; 103 } 104 if (options.script) { 105 script = options.script; 106 } 107 if (options.loadParams) { 108 loadParams = options.loadParams; 109 } 110 if (typeof(options.sync) === 'boolean') { 111 sync = options.sync; 112 } 113 } 114 115 var formDependencies = { 116 "nfd": ["nfd"], 117 "nfc": ["nfc", "nfd"], 118 "nfkd": ["nfkd", "nfd"], 119 "nfkc": ["nfc", "nfkd", "nfd"] 120 }; 121 var files = []; 122 var forms = formDependencies[form]; 123 var toLoad = []; 124 forms.forEach(function(f) { 125 if (!ilib.data.norm[f] || JSUtils.isEmpty(ilib.data.norm[f])) { 126 files.push(f + "/" + script + ".json"); 127 toLoad.push(f); 128 } 129 }); 130 131 if (files.length || !ilib.data.ccc || JSUtils.isEmpty(ilib.data.ccc)) { 132 Utils.loadData({ 133 object: "NormString", 134 name: "ccc.json", 135 locale: "-", 136 nonlocale: true, 137 sync: sync, 138 loadParams: loadParams, 139 callback: ilib.bind(this, function(normdata){ 140 ilib.data.ccc = normdata; 141 142 if (files.length) { 143 //console.log("loading files " + JSON.stringify(files)); 144 Utils._callLoadData(files, sync, loadParams, function(arr) { 145 for (var i = 0; i < arr.length; i++) { 146 if (typeof(arr[i]) !== 'undefined') { 147 ilib.extend(ilib.data.norm[toLoad[i]], arr[i]); 148 } 149 } 150 if (options && typeof(options.onLoad) === 'function') { 151 options.onLoad(this); 152 } 153 }); 154 } else { 155 if (options && typeof(options.onLoad) === 'function') { 156 options.onLoad(this); 157 } 158 } 159 }) 160 }) 161 } else { 162 if (options && typeof(options.onLoad) === 'function') { 163 options.onLoad(this); 164 } 165 } 166 } 167 168 /** 169 * Algorithmically decompose a precomposed Korean syllabic Hangul 170 * character into its individual combining Jamo characters. The given 171 * character must be in the range of Hangul characters U+AC00 to U+D7A3. 172 * 173 * @private 174 * @static 175 * @param {number} cp code point of a Korean Hangul character to decompose 176 * @return {string} the decomposed string of Jamo characters 177 */ 178 NormString._decomposeHangul = function (cp) { 179 var sindex = cp - 0xAC00; 180 var result = String.fromCharCode(0x1100 + sindex / 588) + 181 String.fromCharCode(0x1161 + (sindex % 588) / 28); 182 var t = sindex % 28; 183 if (t !== 0) { 184 result += String.fromCharCode(0x11A7 + t); 185 } 186 return result; 187 }; 188 189 /** 190 * Expand one character according to the given canonical and 191 * compatibility mappings. 192 * 193 * @private 194 * @static 195 * @param {string} ch character to map 196 * @param {Object} canon the canonical mappings to apply 197 * @param {Object=} compat the compatibility mappings to apply, or undefined 198 * if only the canonical mappings are needed 199 * @return {string} the mapped character 200 */ 201 NormString._expand = function (ch, canon, compat) { 202 var i, 203 expansion = "", 204 n = ch.charCodeAt(0); 205 if (GlyphString._isHangul(n)) { 206 expansion = NormString._decomposeHangul(n); 207 } else { 208 var result = canon[ch]; 209 if (!result && compat) { 210 result = compat[ch]; 211 } 212 if (result && result !== ch) { 213 for (i = 0; i < result.length; i++) { 214 expansion += NormString._expand(result[i], canon, compat); 215 } 216 } else { 217 expansion = ch; 218 } 219 } 220 return expansion; 221 }; 222 223 /** 224 * Perform the Unicode Normalization Algorithm upon the string and return 225 * the resulting new string. The current string is not modified. 226 * 227 * <h2>Forms</h2> 228 * 229 * The forms of possible normalizations are defined by the <a 230 * href="http://www.unicode.org/reports/tr15/">Unicode Standard 231 * Annex (UAX) 15</a>. The form parameter is a string that may have one 232 * of the following values: 233 * 234 * <ul> 235 * <li>nfd - Canonical decomposition. This decomposes characters into 236 * their exactly equivalent forms. For example, "ü" would decompose 237 * into a "u" followed by the combining diaeresis character. 238 * <li>nfc - Canonical decomposition followed by canonical composition. 239 * This decomposes and then recomposes character into their shortest 240 * exactly equivalent forms by recomposing as many combining characters 241 * as possible. For example, "ü" followed by a combining 242 * macron character would decompose into a "u" followed by the combining 243 * macron characters the combining diaeresis character, and then be recomposed into 244 * the u with macron and diaeresis "ṻ" character. The reason that 245 * the "nfc" form decomposes and then recomposes is that combining characters 246 * have a specific order under the Unicode Normalization Algorithm, and 247 * partly composed characters such as the "ü" followed by combining 248 * marks may change the order of the combining marks when decomposed and 249 * recomposed. 250 * <li>nfkd - Compatibility decomposition. This decomposes characters 251 * into compatible forms that may not be exactly equivalent semantically, 252 * as well as performing canonical decomposition as well. 253 * For example, the "œ" ligature character decomposes to the two 254 * characters "oe" because they are compatible even though they are not 255 * exactly the same semantically. 256 * <li>nfkc - Compatibility decomposition followed by canonical composition. 257 * This decomposes characters into compatible forms, then recomposes 258 * characters using the canonical composition. That is, it breaks down 259 * characters into the compatible forms, and then recombines all combining 260 * marks it can with their base characters. For example, the character 261 * "ǽ" would be normalized to "aé" by first decomposing 262 * the character into "a" followed by "e" followed by the combining acute accent 263 * combining mark, and then recomposed to an "a" followed by the "e" 264 * with acute accent. 265 * </ul> 266 * 267 * <h2>Operation</h2> 268 * 269 * Two strings a and b can be said to be canonically equivalent if 270 * normalize(a) = normalize(b) 271 * under the nfc normalization form. Two strings can be said to be compatible if 272 * normalize(a) = normalize(b) under the nfkc normalization form.<p> 273 * 274 * The canonical normalization is often used to see if strings are 275 * equivalent to each other, and thus is useful when implementing parsing 276 * algorithms or exact matching algorithms. It can also be used to ensure 277 * that any string output produces a predictable sequence of characters.<p> 278 * 279 * Compatibility normalization 280 * does not always preserve the semantic meaning of all the characters, 281 * although this is sometimes the behaviour that you are after. It is useful, 282 * for example, when doing searches of user-input against text in documents 283 * where the matches are supposed to "fuzzy". In this case, both the query 284 * string and the document string would be mapped to their compatibility 285 * normalized forms, and then compared.<p> 286 * 287 * Compatibility normalization also does not guarantee round-trip conversion 288 * to and from legacy character sets as the normalization is "lossy". It is 289 * akin to doing a lower- or upper-case conversion on text -- after casing, 290 * you cannot tell what case each character is in the original string. It is 291 * good for matching and searching, but it rarely good for output because some 292 * distinctions or meanings in the original text have been lost.<p> 293 * 294 * Note that W3C normalization for HTML also escapes and unescapes 295 * HTML character entities such as "ü" for u with diaeresis. This 296 * method does not do such escaping or unescaping. If normalization is required 297 * for HTML strings with entities, unescaping should be performed on the string 298 * prior to calling this method.<p> 299 * 300 * <h2>Data</h2> 301 * 302 * Normalization requires a fair amount of mapping data, much of which you may 303 * not need for the characters expected in your texts. It is possible to assemble 304 * a copy of ilib that saves space by only including normalization data for 305 * those scripts that you expect to encounter in your data.<p> 306 * 307 * The normalization data is organized by normalization form and within there 308 * by script. To include the normalization data for a particular script with 309 * a particular normalization form, use the following require: 310 * 311 * <pre><code> 312 * NormString.init({ 313 * form: "<form>", 314 * script: "<script>" 315 * }); 316 * </code></pre> 317 * 318 * Where <form> is the normalization form ("nfd", "nfc", "nfkd", or "nfkc"), and 319 * <script> is the ISO 15924 code for the script you would like to 320 * support. Example: to load in the NFC data for Cyrillic, you would use: 321 * 322 * <pre><code> 323 * NormString.init({ 324 * form: "nfc", 325 * script: "Cyrl" 326 * }); 327 * </code></pre> 328 * 329 * Note that because certain normalization forms include others in their algorithm, 330 * their data also depends on the data for the other forms. For example, if you 331 * include the "nfc" data for a script, you will automatically get the "nfd" data 332 * for that same script as well because the NFC algorithm does NFD normalization 333 * first. Here are the dependencies:<p> 334 * 335 * <ul> 336 * <li>NFD -> no dependencies 337 * <li>NFC -> NFD 338 * <li>NFKD -> NFD 339 * <li>NFKC -> NFKD, NFD, NFC 340 * </ul> 341 * 342 * A special value for the script dependency is "all" which will cause the data for 343 * all scripts 344 * to be loaded for that normalization form. This would be useful if you know that 345 * you are going to normalize a lot of multilingual text or cannot predict which scripts 346 * will appear in the input. Because the NFKC form depends on all others, you can 347 * get all of the data for all forms automatically by depending on "nfkc/all.js". 348 * Note that the normalization data for practically all script automatically depend 349 * on data for the Common script (code "Zyyy") which contains all of the characters 350 * that are commonly used in many different scripts. Examples of characters in the 351 * Common script are the ASCII punctuation characters, or the ASCII Arabic 352 * numerals "0" through "9".<p> 353 * 354 * By default, none of the data for normalization is automatically 355 * included in the preassembled ilib files. (For size "full".) 356 * If you would like to normalize strings, you must assemble 357 * your own copy of ilib and explicitly include the normalization data 358 * for those scripts. This normalization method will 359 * produce output, even without the normalization data. However, the output will be 360 * simply the same thing as its input for all scripts 361 * except Korean Hangul and Jamo, which are decomposed and recomposed 362 * algorithmically and therefore do not rely on data.<p> 363 * 364 * If characters are encountered for which there are no normalization data, they 365 * will be passed through to the output string unmodified. 366 * 367 * @param {string} form The normalization form requested 368 * @return {IString} a new instance of an IString that has been normalized 369 * according to the requested form. The current instance is not modified. 370 */ 371 NormString.prototype.normalize = function (form) { 372 var i; 373 374 if (typeof(form) !== 'string' || this.str.length === 0) { 375 return new IString(this.str); 376 } 377 378 var nfc = false, 379 nfkd = false; 380 381 switch (form) { 382 default: 383 break; 384 385 case "nfc": 386 nfc = true; 387 break; 388 389 case "nfkd": 390 nfkd = true; 391 break; 392 393 case "nfkc": 394 nfkd = true; 395 nfc = true; 396 break; 397 } 398 399 // decompose 400 var ch, it, decomp = ""; 401 402 if (nfkd) { 403 it = IString.prototype.charIterator.call(this); 404 while (it.hasNext()) { 405 ch = it.next(); 406 decomp += NormString._expand(ch, ilib.data.norm.nfd, ilib.data.norm.nfkd); 407 } 408 } else { 409 it = IString.prototype.charIterator.call(this); 410 while (it.hasNext()) { 411 ch = it.next(); 412 decomp += NormString._expand(ch, ilib.data.norm.nfd); 413 } 414 } 415 416 // now put the combining marks in a fixed order by 417 // sorting on the combining class 418 function compareByCCC(left, right) { 419 return ilib.data.ccc[left] - ilib.data.ccc[right]; 420 } 421 422 function ccc(c) { 423 return ilib.data.ccc[c] || 0; 424 } 425 426 function sortChars(arr, comp) { 427 // qt/qml's Javascript engine re-arranges entries that are equal to 428 // each other. Technically, that is a correct behaviour, but it is 429 // not desirable. All the other engines leave equivalent entries 430 // where they are. This bubblesort emulates what the other engines 431 // do. Fortunately, the arrays we are sorting are a max of 5 or 6 432 // entries, so performance is not a big deal here. 433 if (ilib._getPlatform() === "qt") { 434 var tmp; 435 for (var i = arr.length-1; i > 0; i--) { 436 for (var j = 0; j < i; j++) { 437 if (comp(arr[j], arr[j+1]) > 0) { 438 tmp = arr[j]; 439 arr[j] = arr[j+1]; 440 arr[j+1] = tmp; 441 } 442 } 443 } 444 return arr; 445 } else { 446 return arr.sort(comp); 447 } 448 } 449 450 var dstr = new IString(decomp); 451 it = dstr.charIterator(); 452 var end, testChar, cpArray = []; 453 454 // easier to deal with as an array of chars 455 while (it.hasNext()) { 456 cpArray.push(it.next()); 457 } 458 459 i = 0; 460 while (i < cpArray.length) { 461 if (typeof(ilib.data.ccc[cpArray[i]]) !== 'undefined' && ccc(cpArray[i]) !== 0) { 462 // found a non-starter... rearrange all the non-starters until the next starter 463 end = i+1; 464 while (end < cpArray.length && 465 typeof(ilib.data.ccc[cpArray[end]]) !== 'undefined' && 466 ccc(cpArray[end]) !== 0) { 467 end++; 468 } 469 470 // simple sort of the non-starter chars 471 if (end - i > 1) { 472 cpArray = cpArray.slice(0,i).concat(sortChars(cpArray.slice(i, end), compareByCCC), cpArray.slice(end)); 473 } 474 } 475 i++; 476 } 477 478 if (nfc) { 479 i = 0; 480 while (i < cpArray.length) { 481 if (typeof(ilib.data.ccc[cpArray[i]]) === 'undefined' || ilib.data.ccc[cpArray[i]] === 0) { 482 // found a starter... find all the non-starters until the next starter. Must include 483 // the next starter because under some odd circumstances, two starters sometimes recompose 484 // together to form another character 485 end = i+1; 486 var notdone = true; 487 while (end < cpArray.length && notdone) { 488 if (typeof(ilib.data.ccc[cpArray[end]]) !== 'undefined' && 489 ilib.data.ccc[cpArray[end]] !== 0) { 490 if (ccc(cpArray[end-1]) < ccc(cpArray[end])) { 491 // not blocked 492 testChar = GlyphString._compose(cpArray[i], cpArray[end]); 493 if (typeof(testChar) !== 'undefined') { 494 cpArray[i] = testChar; 495 496 // delete the combining char 497 cpArray.splice(end,1); 498 499 // restart the iteration, just in case there is more to recompose with the new char 500 end = i; 501 } 502 } 503 end++; 504 } else { 505 // found the next starter. See if this can be composed with the previous starter 506 testChar = GlyphString._compose(cpArray[i], cpArray[end]); 507 if (ccc(cpArray[end-1]) === 0 && typeof(testChar) !== 'undefined') { 508 // not blocked and there is a mapping 509 cpArray[i] = testChar; 510 511 // delete the combining char 512 cpArray.splice(end,1); 513 514 // restart the iteration, just in case there is more to recompose with the new char 515 end = i+1; 516 } else { 517 // finished iterating 518 notdone = false; 519 } 520 } 521 } 522 } 523 i++; 524 } 525 } 526 527 return new IString(cpArray.length > 0 ? cpArray.join("") : ""); 528 }; 529 530 /** 531 * @override 532 * Return an iterator that will step through all of the characters 533 * in the string one at a time, taking care to step through decomposed 534 * characters and through surrogate pairs in UTF-16 encoding 535 * properly. <p> 536 * 537 * The NormString class will return decomposed Unicode characters 538 * as a single unit that a user might see on the screen. If the 539 * next character in the iteration is a base character and it is 540 * followed by combining characters, the base and all its following 541 * combining characters are returned as a single unit.<p> 542 * 543 * The standard Javascript String's charAt() method only 544 * returns information about a particular 16-bit character in the 545 * UTF-16 encoding scheme. 546 * If the index is pointing to a low- or high-surrogate character, 547 * it will return that surrogate character rather 548 * than the surrogate pair which represents a character 549 * in the supplementary planes.<p> 550 * 551 * The iterator instance returned has two methods, hasNext() which 552 * returns true if the iterator has more characters to iterate through, 553 * and next() which returns the next character.<p> 554 * 555 * @return {Object} an iterator 556 * that iterates through all the characters in the string 557 */ 558 NormString.prototype.charIterator = function() { 559 var it = IString.prototype.charIterator.call(this); 560 561 /** 562 * @constructor 563 */ 564 function _chiterator (istring) { 565 /** 566 * @private 567 */ 568 var ccc = function(c) { 569 return ilib.data.ccc[c] || 0; 570 }; 571 572 this.index = 0; 573 this.hasNext = function () { 574 return !!this.nextChar || it.hasNext(); 575 }; 576 this.next = function () { 577 var ch = this.nextChar || it.next(), 578 prevCcc = ccc(ch), 579 nextCcc, 580 composed = ch; 581 582 this.nextChar = undefined; 583 584 if (ilib.data.ccc && 585 (typeof(ilib.data.ccc[ch]) === 'undefined' || ccc(ch) === 0)) { 586 // found a starter... find all the non-starters until the next starter. Must include 587 // the next starter because under some odd circumstances, two starters sometimes recompose 588 // together to form another character 589 var notdone = true; 590 while (it.hasNext() && notdone) { 591 this.nextChar = it.next(); 592 nextCcc = ccc(this.nextChar); 593 if (typeof(ilib.data.ccc[this.nextChar]) !== 'undefined' && nextCcc !== 0) { 594 ch += this.nextChar; 595 this.nextChar = undefined; 596 } else { 597 // found the next starter. See if this can be composed with the previous starter 598 var testChar = GlyphString._compose(composed, this.nextChar); 599 if (prevCcc === 0 && typeof(testChar) !== 'undefined') { 600 // not blocked and there is a mapping 601 composed = testChar; 602 ch += this.nextChar; 603 this.nextChar = undefined; 604 } else { 605 // finished iterating, leave this.nextChar for the next next() call 606 notdone = false; 607 } 608 } 609 prevCcc = nextCcc; 610 } 611 } 612 return ch; 613 }; 614 }; 615 return new _chiterator(this); 616 }; 617 618 module.exports = NormString; 619