1 /* 2 * Collator.js - Collation routines 3 * 4 * Copyright © 2013-2015, 2018, JEDLSoft 5 * 6 * Licensed under the Apache License, Version 2.0 (the "License"); 7 * you may not use this file except in compliance with the License. 8 * You may obtain a copy of the License at 9 * 10 * http://www.apache.org/licenses/LICENSE-2.0 11 * 12 * Unless required by applicable law or agreed to in writing, software 13 * distributed under the License is distributed on an "AS IS" BASIS, 14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 * 16 * See the License for the specific language governing permissions and 17 * limitations under the License. 18 */ 19 20 // !data collation 21 22 var ilib = require("./ilib.js"); 23 var Utils = require("./Utils.js"); 24 var JSUtils = require("./JSUtils.js"); 25 var Locale = require("./Locale.js"); 26 var INumber = require("./INumber.js"); 27 var isPunct = require("./isPunct.js"); 28 var NormString = require("./NormString.js"); 29 var CodePointSource = require("./CodePointSource.js"); 30 var ElementIterator = require("./ElementIterator.js"); 31 var GlyphString = require("./GlyphString.js"); 32 33 /** 34 * @class 35 * A class that implements a locale-sensitive comparator function 36 * for use with sorting function. The comparator function 37 * assumes that the strings it is comparing contain Unicode characters 38 * encoded in UTF-16.<p> 39 * 40 * Collations usually depend only on the language, because most collation orders 41 * are shared between locales that speak the same language. There are, however, a 42 * number of instances where a locale collates differently than other locales 43 * that share the same language. There are also a number of instances where a 44 * locale collates differently based on the script used. This object can handle 45 * these cases automatically if a full locale is specified in the options rather 46 * than just a language code.<p> 47 * 48 * <h2>Options</h2> 49 * 50 * The options parameter can contain any of the following properties: 51 * 52 * <ul> 53 * <li><i>locale</i> - String|Locale. The locale which the comparator function 54 * will collate with. Default: the current iLib locale. 55 * 56 * <li><i>sensitivity</i> - String. Sensitivity or strength of collator. This is one of 57 * "primary", "base", "secondary", "accent", "tertiary", "case", "quaternary", or 58 * "variant". Default: "primary" 59 * <ol> 60 * <li>base or primary - Only the primary distinctions between characters are significant. 61 * Another way of saying that is that the collator will be case-, accent-, and 62 * variation-insensitive, and only distinguish between the base characters 63 * <li>case or secondary - Both the primary and secondary distinctions between characters 64 * are significant. That is, the collator will be accent- and variation-insensitive 65 * and will distinguish between base characters and character case. 66 * <li>accent or tertiary - The primary, secondary, and tertiary distinctions between 67 * characters are all significant. That is, the collator will be 68 * variation-insensitive, but accent-, case-, and base-character-sensitive. 69 * <li>variant or quaternary - All distinctions between characters are significant. That is, 70 * the algorithm is base character-, case-, accent-, and variation-sensitive. 71 * </ol> 72 * 73 * <li><i>upperFirst</i> - boolean. When collating case-sensitively in a script that 74 * has the concept of case, put upper-case 75 * characters first, otherwise lower-case will come first. Warning: some browsers do 76 * not implement this feature or at least do not implement it properly, so if you are 77 * using the native collator with this option, you may get different results in different 78 * browsers. To guarantee the same results, set useNative to false to use the ilib 79 * collator implementation. This of course will be somewhat slower, but more 80 * predictable. Default: true 81 * 82 * <li><i>reverse</i> - boolean. Return the list sorted in reverse order. When the 83 * upperFirst option is also set to true, upper-case characters would then come at 84 * the end of the list. Default: false. 85 * 86 * <li><i>scriptOrder</i> - string. When collating strings in multiple scripts, 87 * this property specifies what order those scripts should be sorted. The default 88 * Unicode Collation Algorithm (UCA) already has a default order for scripts, but 89 * this can be tailored via this property. The value of this option is a 90 * space-separated list of ISO 15924 scripts codes. If a code is specified in this 91 * property, its default data must be included using the JS assembly tool. If the 92 * data is not included, the ordering for the script will be ignored. Default: 93 * the default order defined by the UCA. 94 * 95 * <li><i>style</i> - The value of the style parameter is dependent on the locale. 96 * For some locales, there are different styles of collating strings depending 97 * on what kind of strings are being collated or what the preference of the user 98 * is. For example, in German, there is a phonebook order and a dictionary ordering 99 * that sort the same array of strings slightly differently. 100 * The static method {@link Collator#getAvailableStyles} will return a list of styles that ilib 101 * currently knows about for any given locale. If the value of the style option is 102 * not recognized for a locale, it will be ignored. Default style is "standard".<p> 103 * 104 * <li><i>usage</i> - Whether this collator will be used for searching or sorting. 105 * Valid values are simply the strings "sort" or "search". When used for sorting, 106 * it is good idea if a collator produces a stable sort. That is, the order of the 107 * sorted array of strings should not depend on the order of the strings in the 108 * input array. As such, when a collator is supposed to act case insensitively, 109 * it nonetheless still distinguishes between case after all other criteria 110 * are satisfied so that strings that are distinguished only by case do not sort 111 * randomly. For searching, we would like to match two strings that different only 112 * by case, so the collator must return equals in that situation instead of 113 * further distinguishing by case. Default is "sort". 114 * 115 * <li><i>numeric</i> - Treat the left and right strings as if they started with 116 * numbers and sort them numerically rather than lexically. 117 * 118 * <li><i>ignorePunctuation</i> - Skip punctuation characters when comparing the 119 * strings. 120 * 121 * <li>onLoad - a callback function to call when the collator object is fully 122 * loaded. When the onLoad option is given, the collator object will attempt to 123 * load any missing locale data using the ilib loader callback. 124 * When the constructor is done (even if the data is already preassembled), the 125 * onLoad function is called with the current instance as a parameter, so this 126 * callback can be used with preassembled or dynamic loading or a mix of the two. 127 * 128 * <li>sync - tell whether to load any missing locale data synchronously or 129 * asynchronously. If this option is given as "false", then the "onLoad" 130 * callback must be given, as the instance returned from this constructor will 131 * not be usable for a while. 132 * 133 * <li><i>loadParams</i> - an object containing parameters to pass to the 134 * loader callback function when locale data is missing. The parameters are not 135 * interpretted or modified in any way. They are simply passed along. The object 136 * may contain any property/value pairs as long as the calling code is in 137 * agreement with the loader callback function as to what those parameters mean. 138 * 139 * <li><i>useNative</i> - when this option is true, use the native Intl object 140 * provided by the Javascript engine, if it exists, to implement this class. If 141 * it doesn't exist, or if this parameter is false, then this class uses a pure 142 * Javascript implementation, which is slower and uses a lot more memory, but 143 * works everywhere that ilib works. Default is "true". 144 * </ul> 145 * 146 * <h2>Operation</h2> 147 * 148 * The Collator constructor returns a collator object tailored with the above 149 * options. The object contains an internal compare() method which compares two 150 * strings according to those options. This can be used directly to compare 151 * two strings, but is not useful for passing to the javascript sort function 152 * because then it will not have its collation data available. Instead, use the 153 * getComparator() method to retrieve a function that is bound to the collator 154 * object. (You could also bind it yourself using ilib.bind()). The bound function 155 * can be used with the standard Javascript array sorting algorithm, or as a 156 * comparator with your own sorting algorithm.<p> 157 * 158 * Example using the standard Javascript array sorting call with the bound 159 * function:<p> 160 * 161 * <code> 162 * <pre> 163 * var arr = ["ö", "oe", "ü", "o", "a", "ae", "u", "ß", "ä"]; 164 * var collator = new Collator({locale: 'de-DE', style: "dictionary"}); 165 * arr.sort(collator.getComparator()); 166 * console.log(JSON.stringify(arr)); 167 * </pre> 168 * </code> 169 * <p> 170 * 171 * Would give the output:<p> 172 * 173 * <code> 174 * <pre> 175 * ["a", "ae", "ä", "o", "oe", "ö", "ß", "u", "ü"] 176 * </pre> 177 * </code> 178 * 179 * When sorting an array of Javascript objects according to one of the 180 * string properties of the objects, wrap the collator's compare function 181 * in your own comparator function that knows the structure of the objects 182 * being sorted:<p> 183 * 184 * <code> 185 * <pre> 186 * var collator = new Collator({locale: 'de-DE'}); 187 * var myComparator = function (collator) { 188 * var comparator = collator.getComparator(); 189 * // left and right are your own objects 190 * return function (left, right) { 191 * return comparator(left.x.y.textProperty, right.x.y.textProperty); 192 * }; 193 * }; 194 * arr.sort(myComparator(collator)); 195 * </pre> 196 * </code> 197 * <p> 198 * 199 * <h2>Sort Keys</h2> 200 * 201 * The collator class also has a method to retrieve the sort key for a 202 * string. The sort key is an array of values that represent how each 203 * character in the string should be collated according to the characteristics 204 * of the collation algorithm and the given options. Thus, sort keys can be 205 * compared directly value-for-value with other sort keys that were generated 206 * by the same collator, and the resulting ordering is guaranteed to be the 207 * same as if the original strings were compared by the collator. 208 * Sort keys generated by different collators are not guaranteed to give 209 * any reasonable results when compared together unless the two collators 210 * were constructed with 211 * exactly the same options and therefore end up representing the exact same 212 * collation sequence.<p> 213 * 214 * A good rule of thumb is that you would use a sort key if you had 10 or more 215 * items to sort or if your array might be resorted arbitrarily. For example, if your 216 * user interface was displaying a table with 100 rows in it, and each row had 217 * 4 sortable text columns which could be sorted in acending or descending order, 218 * the recommended practice would be to generate a sort key for each of the 4 219 * sortable fields in each row and store that in the Javascript representation of the 220 * table data. Then, when the user clicks on a column header to resort the 221 * table according to that column, the resorting would be relatively quick 222 * because it would only be comparing arrays of values, and not recalculating 223 * the collation values for each character in each string for every comparison.<p> 224 * 225 * For tables that are large, it is usually a better idea to do the sorting 226 * on the server side, especially if the table is the result of a database 227 * query. In this case, the table is usually a view of the cursor of a large 228 * results set, and only a few entries are sent to the front end at a time. 229 * In order to sort the set efficiently, it should be done on the database 230 * level instead. 231 * 232 * <h2>Data</h2> 233 * 234 * Doing correct collation entails a huge amount of mapping data, much of which is 235 * not necessary when collating in one language with one script, which is the most 236 * common case. Thus, ilib implements a number of ways to include the data you 237 * need or leave out the data you don't need using the JS assembly tool: 238 * 239 * <ol> 240 * <li>Full multilingual data - if you are sorting multilingual data and need to collate 241 * text written in multiple scripts, you can use the directive "!data collation/ducet" to 242 * load in the full collation data. This allows the collator to perform the entire 243 * Unicode Collation Algorithm (UCA) based on the Default Unicode Collation Element 244 * Table (DUCET). The data is very large, on the order of multiple megabytes, but 245 * sometimes it is necessary. 246 * <li>A few scripts - if you are sorting text written in only a few scripts, you may 247 * want to include only the data for those scripts. Each ISO 15924 script code has its 248 * own data available in a separate file, so you can use the data directive to include 249 * only the data for the scripts you need. For example, use 250 * "!data collation/Latn" to retrieve the collation information for the Latin script. 251 * Because the "ducet" table mentioned in the previous point is a superset of the 252 * tables for all other scripts, you do not need to include explicitly the data for 253 * any particular script when using "ducet". That is, you either include "ducet" or 254 * you include a specific list of scripts. 255 * <li>Only one script - if you are sorting text written only in one script, you can 256 * either include the data directly as in the previous point, or you can rely on the 257 * locale to include the correct data for you. In this case, you can use the directive 258 * "!data collate" to load in the locale's collation data for its most common script. 259 * </ol> 260 * 261 * With any of the above ways of including the data, the collator will only perform the 262 * correct language-sensitive sorting for the given locale. All other scripts will be 263 * sorted in the default manner according to the UCA. For example, if you include the 264 * "ducet" data and pass in "de-DE" (German for Germany) as the locale spec, then 265 * only the Latin script (the default script for German) will be sorted according to 266 * German rules. All other scripts in the DUCET, such as Japanese or Arabic, will use 267 * the default UCA collation rules.<p> 268 * 269 * If this collator encounters a character for which it has no collation data, it will 270 * sort those characters by pure Unicode value after all characters for which it does have 271 * collation data. For example, if you only loaded in the German collation data (ie. the 272 * data for the Latin script tailored to German) to sort a list of person names, but that 273 * list happens to include the names of a few Japanese people written in Japanese 274 * characters, the Japanese names will sort at the end of the list after all German names, 275 * and will sort according to the Unicode values of the characters. 276 * 277 * @constructor 278 * @param {Object} options options governing how the resulting comparator 279 * function will operate 280 */ 281 var Collator = function(options) { 282 var sync = true, 283 loadParams = undefined, 284 useNative = true; 285 286 // defaults 287 /** 288 * @private 289 * @type {Locale} 290 */ 291 this.locale = new Locale(ilib.getLocale()); 292 293 /** @private */ 294 this.caseFirst = "upper"; 295 /** @private */ 296 this.sensitivity = "variant"; 297 /** @private */ 298 this.level = 4; 299 /** @private */ 300 this.usage = "sort"; 301 /** @private */ 302 this.reverse = false; 303 /** @private */ 304 this.numeric = false; 305 /** @private */ 306 this.style = "default"; 307 /** @private */ 308 this.ignorePunctuation = false; 309 310 if (options) { 311 if (options.locale) { 312 this.locale = (typeof(options.locale) === 'string') ? new Locale(options.locale) : options.locale; 313 } 314 if (options.sensitivity) { 315 switch (options.sensitivity) { 316 case 'primary': 317 case 'base': 318 this.sensitivity = "base"; 319 this.level = 1; 320 break; 321 case 'secondary': 322 case 'accent': 323 this.sensitivity = "accent"; 324 this.level = 2; 325 break; 326 case 'tertiary': 327 case 'case': 328 this.sensitivity = "case"; 329 this.level = 3; 330 break; 331 case 'quaternary': 332 case 'variant': 333 this.sensitivity = "variant"; 334 this.level = 4; 335 break; 336 } 337 } 338 if (typeof(options.upperFirst) !== 'undefined') { 339 this.caseFirst = options.upperFirst ? "upper" : "lower"; 340 } 341 342 if (typeof(options.ignorePunctuation) !== 'undefined') { 343 this.ignorePunctuation = options.ignorePunctuation; 344 } 345 if (typeof(options.sync) !== 'undefined') { 346 sync = !!options.sync; 347 } 348 349 loadParams = options.loadParams; 350 if (typeof(options.useNative) !== 'undefined') { 351 useNative = options.useNative; 352 } 353 354 if (options.usage === "sort" || options.usage === "search") { 355 this.usage = options.usage; 356 } 357 358 if (typeof(options.reverse) === 'boolean') { 359 this.reverse = options.reverse; 360 } 361 362 if (typeof(options.numeric) === 'boolean') { 363 this.numeric = options.numeric; 364 } 365 366 if (typeof(options.style) === 'string') { 367 this.style = options.style; 368 } 369 } else { 370 options = {sync: true}; 371 } 372 373 if (this.usage === "sort") { 374 // produces a stable sort 375 this.level = 4; 376 } 377 378 if (useNative && typeof(Intl) !== 'undefined' && Intl) { 379 // this engine is modern and supports the new Intl object! 380 //console.log("implemented natively"); 381 /** 382 * @private 383 * @type {{compare:function(string,string)}} 384 */ 385 this.collator = new Intl.Collator(this.locale.getSpec(), { 386 sensitivity: this.sensitivity, 387 caseFirst: this.caseFirst, 388 ignorePunctuation: this.ignorePunctuation, 389 numeric: this.numeric, 390 usage: this.usage 391 }); 392 393 if (options && typeof(options.onLoad) === 'function') { 394 options.onLoad(this); 395 } 396 } else { 397 //console.log("implemented in pure JS"); 398 399 // else implement in pure Javascript 400 Utils.loadData({ 401 object: "Collator", 402 locale: this.locale, 403 name: "collation.json", 404 replace: true, 405 sync: sync, 406 loadParams: loadParams, 407 callback: ilib.bind(this, function (collation) { 408 if (!collation) { 409 collation = ilib.data.collation; 410 } 411 this._initCollation(collation); 412 if (this.ignorePunctuation) { 413 isPunct._init(sync, loadParams, ilib.bind(this, function() { 414 this._init(options); 415 })); 416 } else { 417 this._init(options); 418 } 419 }) 420 }); 421 } 422 }; 423 424 Collator.prototype = { 425 /** 426 * @private 427 */ 428 _init: function(options) { 429 if (this.numeric) { 430 // Create a fake INumber instance now to guarantee that the locale data 431 // is loaded so we can create sync INumber instances later, even in async mode 432 new INumber("1", { 433 sync: options.sync, 434 loadParams: options.loadParams, 435 onLoad: function(n) { 436 if (typeof(options.onLoad) === 'function') { 437 options.onLoad(this); 438 } 439 } 440 }) 441 } else { 442 if (typeof(options.onLoad) === 'function') { 443 options.onLoad(this); 444 } 445 } 446 }, 447 448 /** 449 * @private 450 * Bit pack an array of values into a single number 451 * @param {number|null|Array.<number>} arr array of values to bit pack 452 * @param {number} offset offset for the start of this map 453 */ 454 _pack: function (arr, offset) { 455 var value = 0; 456 if (arr) { 457 if (typeof(arr) === 'number') { 458 arr = [ arr ]; 459 } 460 for (var i = 0; i < this.level; i++) { 461 var thisLevel = (typeof(arr[i]) !== "undefined" ? arr[i] : 0); 462 if (i === 0) { 463 thisLevel += offset; 464 } 465 if (i > 0) { 466 value <<= this.collation.bits[i]; 467 } 468 if (i === 2 && this.caseFirst === "lower") { 469 // sort the lower case first instead of upper 470 value = value | (1 - thisLevel); 471 } else { 472 value = value | thisLevel; 473 } 474 } 475 } 476 return value; 477 }, 478 479 /** 480 * @private 481 * Return the rule packed into an array of collation elements. 482 * @param {Array.<number|null|Array.<number>>} rule 483 * @param {number} offset 484 * @return {Array.<number>} a bit-packed array of numbers 485 */ 486 _packRule: function(rule, offset) { 487 if (ilib.isArray(rule[0])) { 488 var ret = []; 489 for (var i = 0; i < rule.length; i++) { 490 ret.push(this._pack(rule[i], offset)); 491 } 492 return ret; 493 } else { 494 return [ this._pack(rule, offset) ]; 495 } 496 }, 497 498 /** 499 * @private 500 */ 501 _addChars: function (str, offset) { 502 var gs = new GlyphString(str); 503 var it = gs.charIterator(); 504 var c; 505 506 while (it.hasNext()) { 507 c = it.next(); 508 if (c === "'") { 509 // escape a sequence of chars as one collation element 510 c = ""; 511 var x = ""; 512 while (it.hasNext() && x !== "'") { 513 c += x; 514 x = it.next(); 515 } 516 } 517 this.lastMap++; 518 this.map[c] = this._packRule([this.lastMap], offset); 519 } 520 }, 521 522 /** 523 * @private 524 */ 525 _addRules: function(rules, start) { 526 var p; 527 for (var r in rules.map) { 528 if (r) { 529 this.map[r] = this._packRule(rules.map[r], start); 530 p = typeof(rules.map[r][0]) === 'number' ? rules.map[r][0] : rules.map[r][0][0]; 531 this.lastMap = Math.max(p + start, this.lastMap); 532 } 533 } 534 535 if (typeof(rules.ranges) !== 'undefined') { 536 // for each range, everything in the range goes in primary sequence from the start 537 for (var i = 0; i < rules.ranges.length; i++) { 538 var range = rules.ranges[i]; 539 540 this.lastMap = range.start; 541 if (typeof(range.chars) === "string") { 542 this._addChars(range.chars, start); 543 } else { 544 for (var k = 0; k < range.chars.length; k++) { 545 this._addChars(range.chars[k], start); 546 } 547 } 548 } 549 } 550 }, 551 552 /** 553 * @private 554 */ 555 _initCollation: function(rules) { 556 var rule = this.style; 557 while (typeof(rule) === 'string') { 558 rule = rules[rule]; 559 } 560 561 if (!rule) { 562 rule = "default"; 563 564 while (typeof(rule) === 'string') { 565 rule = rules[rule]; 566 } 567 } 568 if (!rule) { 569 this.map = {}; 570 return; 571 } 572 573 /** 574 * @private 575 * @type {{scripts:Array.<string>,bits:Array.<number>,maxes:Array.<number>,bases:Array.<number>,map:Object.<string,Array.<number|null|Array.<number>>>}} 576 */ 577 this.collation = rule; 578 this.map = {}; 579 this.lastMap = -1; 580 this.keysize = this.collation.keysize[this.level-1]; 581 this.defaultRule = rules["default"]; 582 583 if (typeof(this.collation.inherit) !== 'undefined') { 584 for (var i = 0; i < this.collation.inherit.length; i++) { 585 if (this.collation.inherit === 'this') { 586 continue; 587 } 588 var col = this.collation.inherit[i]; 589 rule = typeof(col) === 'object' ? col.name : col; 590 if (rules[rule]) { 591 this._addRules(rules[rule], col.start || this.lastMap+1); 592 } 593 } 594 } 595 this._addRules(this.collation, this.lastMap+1); 596 }, 597 598 /** 599 * @private 600 */ 601 _basicCompare: function(left, right) { 602 var l = (left instanceof NormString) ? left : new NormString(left), 603 r = (right instanceof NormString) ? right : new NormString(right), 604 lelements, 605 relements, 606 diff; 607 608 if (this.numeric) { 609 var lvalue = new INumber(left, {locale: this.locale}); 610 var rvalue = new INumber(right, {locale: this.locale}); 611 if (!isNaN(lvalue.valueOf()) && !isNaN(rvalue.valueOf())) { 612 diff = lvalue.valueOf() - rvalue.valueOf(); 613 if (diff) { 614 return diff; 615 } else { 616 // skip the numeric part and compare the rest lexically 617 l = new NormString(left.substring(lvalue.parsed.length)); 618 r = new NormString(right.substring(rvalue.parsed.length)); 619 } 620 } 621 // else if they aren't both numbers, then let the code below take care of the lexical comparison instead 622 } 623 624 lelements = new ElementIterator(new CodePointSource(l, this.ignorePunctuation), this.map, this.keysize); 625 relements = new ElementIterator(new CodePointSource(r, this.ignorePunctuation), this.map, this.keysize); 626 627 while (lelements.hasNext() && relements.hasNext()) { 628 diff = lelements.next() - relements.next(); 629 if (diff) { 630 return diff; 631 } 632 } 633 if (!lelements.hasNext() && !relements.hasNext()) { 634 return 0; 635 } else if (lelements.hasNext()) { 636 return 1; 637 } else { 638 return -1; 639 } 640 }, 641 642 /** 643 * Compare two strings together according to the rules of this 644 * collator instance. Do not use this function directly with 645 * Array.sort, as it will not have its collation data available 646 * and therefore will not function properly. Use the function 647 * returned by getComparator() instead. 648 * 649 * @param {string} left the left string to compare 650 * @param {string} right the right string to compare 651 * @return {number} a negative number if left comes before right, a 652 * positive number if right comes before left, and zero if left and 653 * right are equivalent according to this collator 654 */ 655 compare: function (left, right) { 656 // last resort: use the "C" locale 657 if (this.collator) { 658 // implemented by the core engine 659 return this.collator.compare(left, right); 660 } 661 662 var ret = this._basicCompare(left, right); 663 return this.reverse ? -ret : ret; 664 }, 665 666 /** 667 * Return a comparator function that can compare two strings together 668 * according to the rules of this collator instance. The function 669 * returns a negative number if the left 670 * string comes before right, a positive number if the right string comes 671 * before the left, and zero if left and right are equivalent. If the 672 * reverse property was given as true to the collator constructor, this 673 * function will 674 * switch the sign of those values to cause sorting to happen in the 675 * reverse order. 676 * 677 * @return {function(...)|undefined} a comparator function that 678 * can compare two strings together according to the rules of this 679 * collator instance 680 */ 681 getComparator: function() { 682 // bind the function to this instance so that we have the collation 683 // rules available to do the work 684 if (this.collator) { 685 // implemented by the core engine 686 return this.collator.compare; 687 } 688 689 return ilib.bind(this, this.compare); 690 }, 691 692 /** 693 * Return a sort key string for the given string. The sort key 694 * string is a list of values that represent each character 695 * in the original string. The sort key 696 * values for any particular character consists of 3 numbers that 697 * encode the primary, secondary, and tertiary characteristics 698 * of that character. The values of each characteristic are 699 * modified according to the strength of this collator instance 700 * to give the correct collation order. The idea is that this 701 * sort key string is directly comparable byte-for-byte to 702 * other sort key strings generated by this collator without 703 * any further knowledge of the collation rules for the locale. 704 * More formally, if a < b according to the rules of this collation, 705 * then it is guaranteed that sortkey(a) < sortkey(b) when compared 706 * byte-for-byte. The sort key string can therefore be used 707 * without the collator to sort an array of strings efficiently 708 * because the work of determining the applicability of various 709 * collation rules is done once up-front when generating 710 * the sort key.<p> 711 * 712 * The sort key string can be treated as a regular, albeit somewhat 713 * odd-looking, string. That is, it can be pass to regular 714 * Javascript functions without problems. 715 * 716 * @param {string} str the original string to generate the sort key for 717 * @return {string} a sort key string for the given string 718 */ 719 sortKey: function (str) { 720 if (!str) { 721 return ""; 722 } 723 724 if (this.collator) { 725 // native, no sort keys available 726 return str; 727 } 728 729 if (this.numeric) { 730 var v = new INumber(str, {locale: this.locale}); 731 var s = isNaN(v.valueOf()) ? "" : v.valueOf().toString(16); 732 return JSUtils.pad(s, 16); 733 } else { 734 var n = (typeof(str) === "string") ? new NormString(str) : str, 735 ret = "", 736 lelements = new ElementIterator(new CodePointSource(n, this.ignorePunctuation), this.map, this.keysize), 737 element; 738 739 while (lelements.hasNext()) { 740 element = lelements.next(); 741 if (this.reverse) { 742 // for reverse, take the bitwise inverse 743 element = (1 << this.keysize) - element; 744 } 745 ret += JSUtils.pad(element.toString(16), this.keysize/4); 746 } 747 } 748 return ret; 749 } 750 }; 751 752 /** 753 * Retrieve the list of collation style names that are available for the 754 * given locale. This list varies depending on the locale, and depending 755 * on whether or not the data for that locale was assembled into this copy 756 * of ilib. 757 * 758 * @param {Locale|string=} locale The locale for which the available 759 * styles are being sought 760 * @return Array.<string> an array of style names that are available for 761 * the given locale 762 */ 763 Collator.getAvailableStyles = function (locale) { 764 return [ "standard" ]; 765 }; 766 767 /** 768 * Retrieve the list of ISO 15924 script codes that are available in this 769 * copy of ilib. This list varies depending on whether or not the data for 770 * various scripts was assembled into this copy of ilib. If the "ducet" 771 * data is assembled into this copy of ilib, this method will report the 772 * entire list of scripts as being available. If a collator instance is 773 * instantiated with a script code that is not on the list returned by this 774 * function, it will be ignored and text in that script will be sorted by 775 * numeric Unicode values of the characters. 776 * 777 * @return Array.<string> an array of ISO 15924 script codes that are 778 * available 779 */ 780 Collator.getAvailableScripts = function () { 781 return [ "Latn" ]; 782 }; 783 784 785 /** 786 * Return a default collation style 787 * 788 * @returns {string} default collation style such as 'latin', 'korean' etc */ 789 Collator.prototype.getDefaultCollatorStyle = function () { 790 return this.defaultRule; 791 }; 792 793 module.exports = Collator; 794