1 /*
  2  * Collator.js - Collation routines
  3  *
  4  * Copyright © 2013-2015, 2018, JEDLSoft
  5  *
  6  * Licensed under the Apache License, Version 2.0 (the "License");
  7  * you may not use this file except in compliance with the License.
  8  * You may obtain a copy of the License at
  9  *
 10  *     http://www.apache.org/licenses/LICENSE-2.0
 11  *
 12  * Unless required by applicable law or agreed to in writing, software
 13  * distributed under the License is distributed on an "AS IS" BASIS,
 14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15  *
 16  * See the License for the specific language governing permissions and
 17  * limitations under the License.
 18  */
 19 
 20 // !data collation
 21 
 22 var ilib = require("./ilib.js");
 23 var Utils = require("./Utils.js");
 24 var JSUtils = require("./JSUtils.js");
 25 var Locale = require("./Locale.js");
 26 var INumber = require("./INumber.js");
 27 var isPunct = require("./isPunct.js");
 28 var NormString = require("./NormString.js");
 29 var CodePointSource = require("./CodePointSource.js");
 30 var ElementIterator = require("./ElementIterator.js");
 31 var GlyphString = require("./GlyphString.js");
 32 
 33 /**
 34  * @class
 * A class that implements a locale-sensitive comparator function
 * for use with a sorting function. The comparator function
 * assumes that the strings it is comparing contain Unicode characters
 * encoded in UTF-16.<p>
 39  *
 40  * Collations usually depend only on the language, because most collation orders
 41  * are shared between locales that speak the same language. There are, however, a
 42  * number of instances where a locale collates differently than other locales
 43  * that share the same language. There are also a number of instances where a
 44  * locale collates differently based on the script used. This object can handle
 45  * these cases automatically if a full locale is specified in the options rather
 46  * than just a language code.<p>
 47  *
 48  * <h2>Options</h2>
 49  *
 50  * The options parameter can contain any of the following properties:
 51  *
 52  * <ul>
 53  * <li><i>locale</i> - String|Locale. The locale which the comparator function
 54  * will collate with. Default: the current iLib locale.
 55  *
 * <li><i>sensitivity</i> - String. Sensitivity or strength of collator. This is one of
 * "primary", "base", "secondary", "accent", "tertiary", "case", "quaternary", or
 * "variant". Default: "variant"
 59  *   <ol>
 60  *   <li>base or primary - Only the primary distinctions between characters are significant.
 61  *   Another way of saying that is that the collator will be case-, accent-, and
 62  *   variation-insensitive, and only distinguish between the base characters
 *   <li>accent or secondary - Both the primary and secondary distinctions between characters
 *   are significant. That is, the collator will be case- and variation-insensitive
 *   and will distinguish between base characters and accents.
 *   <li>case or tertiary - The primary, secondary, and tertiary distinctions between
 *   characters are all significant. That is, the collator will be
 *   variation-insensitive, but accent-, case-, and base-character-sensitive.
 69  *   <li>variant or quaternary - All distinctions between characters are significant. That is,
 70  *   the algorithm is base character-, case-, accent-, and variation-sensitive.
 71  *   </ol>
 72  *
 73  * <li><i>upperFirst</i> - boolean. When collating case-sensitively in a script that
 74  * has the concept of case, put upper-case
 75  * characters first, otherwise lower-case will come first. Warning: some browsers do
 76  * not implement this feature or at least do not implement it properly, so if you are
 77  * using the native collator with this option, you may get different results in different
 78  * browsers. To guarantee the same results, set useNative to false to use the ilib
 79  * collator implementation. This of course will be somewhat slower, but more
 80  * predictable. Default: true
 81  *
 82  * <li><i>reverse</i> - boolean. Return the list sorted in reverse order. When the
 83  * upperFirst option is also set to true, upper-case characters would then come at
 84  * the end of the list. Default: false.
 85  *
 86  * <li><i>scriptOrder</i> - string. When collating strings in multiple scripts,
 87  * this property specifies what order those scripts should be sorted. The default
 88  * Unicode Collation Algorithm (UCA) already has a default order for scripts, but
 89  * this can be tailored via this property. The value of this option is a
 90  * space-separated list of ISO 15924 scripts codes. If a code is specified in this
 91  * property, its default data must be included using the JS assembly tool. If the
 92  * data is not included, the ordering for the script will be ignored. Default:
 93  * the default order defined by the UCA.
 94  *
 95  * <li><i>style</i> - The value of the style parameter is dependent on the locale.
 96  * For some locales, there are different styles of collating strings depending
 97  * on what kind of strings are being collated or what the preference of the user
 98  * is. For example, in German, there is a phonebook order and a dictionary ordering
 99  * that sort the same array of strings slightly differently.
100  * The static method {@link Collator#getAvailableStyles} will return a list of styles that ilib
101  * currently knows about for any given locale. If the value of the style option is
102  * not recognized for a locale, it will be ignored. Default style is "standard".<p>
103  *
 * <li><i>usage</i> - Whether this collator will be used for searching or sorting.
 * Valid values are simply the strings "sort" or "search". When used for sorting,
 * it is a good idea if a collator produces a stable sort. That is, the order of the
 * sorted array of strings should not depend on the order of the strings in the
 * input array. As such, when a collator is supposed to act case insensitively,
 * it nonetheless still distinguishes between case after all other criteria
 * are satisfied so that strings that are distinguished only by case do not sort
 * randomly. For searching, we would like to match two strings that differ only
 * by case, so the collator must return equals in that situation instead of
 * further distinguishing by case. Default is "sort".
114  *
115  * <li><i>numeric</i> - Treat the left and right strings as if they started with
116  * numbers and sort them numerically rather than lexically.
117  *
118  * <li><i>ignorePunctuation</i> - Skip punctuation characters when comparing the
119  * strings.
120  *
121  * <li>onLoad - a callback function to call when the collator object is fully
122  * loaded. When the onLoad option is given, the collator object will attempt to
123  * load any missing locale data using the ilib loader callback.
124  * When the constructor is done (even if the data is already preassembled), the
125  * onLoad function is called with the current instance as a parameter, so this
126  * callback can be used with preassembled or dynamic loading or a mix of the two.
127  *
128  * <li>sync - tell whether to load any missing locale data synchronously or
129  * asynchronously. If this option is given as "false", then the "onLoad"
130  * callback must be given, as the instance returned from this constructor will
131  * not be usable for a while.
132  *
 * <li><i>loadParams</i> - an object containing parameters to pass to the
 * loader callback function when locale data is missing. The parameters are not
 * interpreted or modified in any way. They are simply passed along. The object
 * may contain any property/value pairs as long as the calling code is in
 * agreement with the loader callback function as to what those parameters mean.
138  *
139  * <li><i>useNative</i> - when this option is true, use the native Intl object
140  * provided by the Javascript engine, if it exists, to implement this class. If
141  * it doesn't exist, or if this parameter is false, then this class uses a pure
142  * Javascript implementation, which is slower and uses a lot more memory, but
143  * works everywhere that ilib works. Default is "true".
144  * </ul>
145  *
146  * <h2>Operation</h2>
147  *
148  * The Collator constructor returns a collator object tailored with the above
149  * options. The object contains an internal compare() method which compares two
150  * strings according to those options. This can be used directly to compare
151  * two strings, but is not useful for passing to the javascript sort function
152  * because then it will not have its collation data available. Instead, use the
153  * getComparator() method to retrieve a function that is bound to the collator
154  * object. (You could also bind it yourself using ilib.bind()). The bound function
155  * can be used with the standard Javascript array sorting algorithm, or as a
156  * comparator with your own sorting algorithm.<p>
157  *
158  * Example using the standard Javascript array sorting call with the bound
159  * function:<p>
160  *
161  * <code>
162  * <pre>
163  * var arr = ["ö", "oe", "ü", "o", "a", "ae", "u", "ß", "ä"];
164  * var collator = new Collator({locale: 'de-DE', style: "dictionary"});
165  * arr.sort(collator.getComparator());
166  * console.log(JSON.stringify(arr));
167  * </pre>
168  * </code>
169  * <p>
170  *
171  * Would give the output:<p>
172  *
173  * <code>
174  * <pre>
175  * ["a", "ae", "ä", "o", "oe", "ö", "ß", "u", "ü"]
176  * </pre>
177  * </code>
178  *
179  * When sorting an array of Javascript objects according to one of the
180  * string properties of the objects, wrap the collator's compare function
181  * in your own comparator function that knows the structure of the objects
182  * being sorted:<p>
183  *
184  * <code>
185  * <pre>
186  * var collator = new Collator({locale: 'de-DE'});
187  * var myComparator = function (collator) {
188  *   var comparator = collator.getComparator();
189  *   // left and right are your own objects
190  *   return function (left, right) {
191  *       return comparator(left.x.y.textProperty, right.x.y.textProperty);
192  *   };
193  * };
194  * arr.sort(myComparator(collator));
195  * </pre>
196  * </code>
197  * <p>
198  *
199  * <h2>Sort Keys</h2>
200  *
201  * The collator class also has a method to retrieve the sort key for a
202  * string. The sort key is an array of values that represent how each
203  * character in the string should be collated according to the characteristics
204  * of the collation algorithm and the given options. Thus, sort keys can be
205  * compared directly value-for-value with other sort keys that were generated
206  * by the same collator, and the resulting ordering is guaranteed to be the
207  * same as if the original strings were compared by the collator.
208  * Sort keys generated by different collators are not guaranteed to give
209  * any reasonable results when compared together unless the two collators
210  * were constructed with
211  * exactly the same options and therefore end up representing the exact same
212  * collation sequence.<p>
213  *
 * A good rule of thumb is that you would use a sort key if you had 10 or more
 * items to sort or if your array might be resorted arbitrarily. For example, if your
 * user interface was displaying a table with 100 rows in it, and each row had
 * 4 sortable text columns which could be sorted in ascending or descending order,
 * the recommended practice would be to generate a sort key for each of the 4
 * sortable fields in each row and store that in the Javascript representation of the
 * table data. Then, when the user clicks on a column header to resort the
 * table according to that column, the resorting would be relatively quick
 * because it would only be comparing arrays of values, and not recalculating
 * the collation values for each character in each string for every comparison.<p>
224  *
225  * For tables that are large, it is usually a better idea to do the sorting
226  * on the server side, especially if the table is the result of a database
227  * query. In this case, the table is usually a view of the cursor of a large
228  * results set, and only a few entries are sent to the front end at a time.
229  * In order to sort the set efficiently, it should be done on the database
230  * level instead.
231  *
232  * <h2>Data</h2>
233  *
234  * Doing correct collation entails a huge amount of mapping data, much of which is
235  * not necessary when collating in one language with one script, which is the most
236  * common case. Thus, ilib implements a number of ways to include the data you
237  * need or leave out the data you don't need using the JS assembly tool:
238  *
239  * <ol>
240  * <li>Full multilingual data - if you are sorting multilingual data and need to collate
241  * text written in multiple scripts, you can use the directive "!data collation/ducet" to
242  * load in the full collation data.  This allows the collator to perform the entire
243  * Unicode Collation Algorithm (UCA) based on the Default Unicode Collation Element
244  * Table (DUCET). The data is very large, on the order of multiple megabytes, but
245  * sometimes it is necessary.
246  * <li>A few scripts - if you are sorting text written in only a few scripts, you may
247  * want to include only the data for those scripts. Each ISO 15924 script code has its
248  * own data available in a separate file, so you can use the data directive to include
249  * only the data for the scripts you need. For example, use
250  * "!data collation/Latn" to retrieve the collation information for the Latin script.
251  * Because the "ducet" table mentioned in the previous point is a superset of the
252  * tables for all other scripts, you do not need to include explicitly the data for
253  * any particular script when using "ducet". That is, you either include "ducet" or
254  * you include a specific list of scripts.
255  * <li>Only one script - if you are sorting text written only in one script, you can
256  * either include the data directly as in the previous point, or you can rely on the
257  * locale to include the correct data for you. In this case, you can use the directive
258  * "!data collate" to load in the locale's collation data for its most common script.
259  * </ol>
260  *
261  * With any of the above ways of including the data, the collator will only perform the
262  * correct language-sensitive sorting for the given locale. All other scripts will be
263  * sorted in the default manner according to the UCA. For example, if you include the
264  * "ducet" data and pass in "de-DE" (German for Germany) as the locale spec, then
265  * only the Latin script (the default script for German) will be sorted according to
266  * German rules. All other scripts in the DUCET, such as Japanese or Arabic, will use
267  * the default UCA collation rules.<p>
268  *
269  * If this collator encounters a character for which it has no collation data, it will
270  * sort those characters by pure Unicode value after all characters for which it does have
271  * collation data. For example, if you only loaded in the German collation data (ie. the
272  * data for the Latin script tailored to German) to sort a list of person names, but that
273  * list happens to include the names of a few Japanese people written in Japanese
274  * characters, the Japanese names will sort at the end of the list after all German names,
275  * and will sort according to the Unicode values of the characters.
276  *
277  * @constructor
278  * @param {Object} options options governing how the resulting comparator
279  * function will operate
280  */
var Collator = function(options) {
    // every accepted sensitivity name, mapped to [canonical name, numeric level]
    var strengths = {
        "primary":    ["base", 1],
        "base":       ["base", 1],
        "secondary":  ["accent", 2],
        "accent":     ["accent", 2],
        "tertiary":   ["case", 3],
        "case":       ["case", 3],
        "quaternary": ["variant", 4],
        "variant":    ["variant", 4]
    };
    var loadSync = true,
        loaderParams,
        preferNative = true,
        strength;

    // Start from the defaults; anything valid in the options overrides them below.
    /**
     * @private
     * @type {Locale}
     */
    this.locale = new Locale(ilib.getLocale());

    /** @private */
    this.caseFirst = "upper";
    /** @private */
    this.sensitivity = "variant";
    /** @private */
    this.level = 4;
    /** @private */
    this.usage = "sort";
    /** @private */
    this.reverse = false;
    /** @private */
    this.numeric = false;
    /** @private */
    this.style = "default";
    /** @private */
    this.ignorePunctuation = false;

    if (!options) {
        // the rest of the initialization always expects an options object
        options = {sync: true};
    } else {
        if (options.locale) {
            // accept either a locale spec string or a ready-made Locale instance
            if (typeof(options.locale) === 'string') {
                this.locale = new Locale(options.locale);
            } else {
                this.locale = options.locale;
            }
        }

        // unrecognized sensitivity names leave the defaults untouched
        if (typeof(options.sensitivity) === 'string' &&
                Object.prototype.hasOwnProperty.call(strengths, options.sensitivity)) {
            strength = strengths[options.sensitivity];
            this.sensitivity = strength[0];
            this.level = strength[1];
        }

        if (typeof(options.upperFirst) !== 'undefined') {
            this.caseFirst = options.upperFirst ? "upper" : "lower";
        }

        if (typeof(options.ignorePunctuation) !== 'undefined') {
            this.ignorePunctuation = options.ignorePunctuation;
        }

        if (typeof(options.sync) !== 'undefined') {
            loadSync = !!options.sync;
        }

        loaderParams = options.loadParams;

        if (typeof(options.useNative) !== 'undefined') {
            preferNative = options.useNative;
        }

        if (options.usage === "sort" || options.usage === "search") {
            this.usage = options.usage;
        }

        // reverse and numeric are only honoured when they are real booleans
        if (typeof(options.reverse) === 'boolean') {
            this.reverse = options.reverse;
        }

        if (typeof(options.numeric) === 'boolean') {
            this.numeric = options.numeric;
        }

        if (typeof(options.style) === 'string') {
            this.style = options.style;
        }
    }

    if (this.usage === "sort") {
        // sorting always compares at the full level so that the sort is stable
        this.level = 4;
    }

    if (!preferNative || typeof(Intl) === 'undefined' || !Intl) {
        // No native collator requested or available: load the collation data
        // and do the work in pure Javascript.
        Utils.loadData({
            object: "Collator",
            locale: this.locale,
            name: "collation.json",
            replace: true,
            sync: loadSync,
            loadParams: loaderParams,
            callback: ilib.bind(this, function (collation) {
                // fall back to whatever collation data is already assembled into ilib
                this._initCollation(collation || ilib.data.collation);

                if (!this.ignorePunctuation) {
                    this._init(options);
                    return;
                }

                // the punctuation data has to be loaded before we can skip punctuation
                isPunct._init(loadSync, loaderParams, ilib.bind(this, function() {
                    this._init(options);
                }));
            })
        });
        return;
    }

    // The engine supplies Intl, so delegate the comparison to the native collator.
    /**
     * @private
     * @type {{compare:function(string,string)}}
     */
    this.collator = new Intl.Collator(this.locale.getSpec(), {
        sensitivity: this.sensitivity,
        caseFirst: this.caseFirst,
        ignorePunctuation: this.ignorePunctuation,
        numeric: this.numeric,
        usage: this.usage
    });

    if (typeof(options.onLoad) === 'function') {
        options.onLoad(this);
    }
};
423 
Collator.prototype = {
    /**
     * @private
     * Finish the initialization of this instance and then notify the caller
     * through the onLoad callback, if one was given. When numeric collation
     * is turned on, a throw-away INumber instance is created first and the
     * notification is deferred to its onLoad callback.
     *
     * @param {Object} options the options object the constructor was working with
     */
    _init: function(options) {
        if (this.numeric) {
            // Create a fake INumber instance now to guarantee that the locale data
            // is loaded so we can create sync INumber instances later, even in async mode
            new INumber("1", {
                sync: options.sync,
                loadParams: options.loadParams,
                // bind the callback so that the caller's onLoad receives this
                // collator rather than whatever "this" INumber calls us with
                onLoad: ilib.bind(this, function() {
                    if (typeof(options.onLoad) === 'function') {
                        options.onLoad(this);
                    }
                })
            });
        } else {
            if (typeof(options.onLoad) === 'function') {
                options.onLoad(this);
            }
        }
    },

    /**
     * @private
     * Bit pack an array of values into a single number. Only the first
     * this.level entries of the array are used, and missing entries count
     * as zero.
     *
     * @param {number|null|Array.<number>} arr array of values to bit pack
     * @param {number} offset offset for the start of this map. It is added
     * to the first value only
     * @return {number} the packed value, or 0 if arr is null or otherwise falsy
     */
    _pack: function (arr, offset) {
        var value = 0;
        if (arr) {
            if (typeof(arr) === 'number') {
                // a bare number is shorthand for an array with only a first value
                arr = [ arr ];
            }
            for (var i = 0; i < this.level; i++) {
                var thisLevel = (typeof(arr[i]) !== "undefined" ? arr[i] : 0);
                if (i === 0) {
                    thisLevel += offset;
                }
                if (i > 0) {
                    // make room for this level below the ones already packed
                    value <<= this.collation.bits[i];
                }
                if (i === 2 && this.caseFirst === "lower") {
                    // sort the lower case first instead of upper
                    value = value | (1 - thisLevel);
                } else {
                    value = value | thisLevel;
                }
            }
        }
        return value;
    },

    /**
     * @private
     * Return the rule packed into an array of collation elements. A rule
     * whose first entry is itself an array is treated as a list of elements
     * and each one is packed separately. Any other rule is packed as a
     * single element.
     *
     * @param {Array.<number|null|Array.<number>>} rule
     * @param {number} offset
     * @return {Array.<number>} a bit-packed array of numbers
     */
    _packRule: function(rule, offset) {
        if (ilib.isArray(rule[0])) {
            var ret = [];
            for (var i = 0; i < rule.length; i++) {
                ret.push(this._pack(rule[i], offset));
            }
            return ret;
        } else {
            return [ this._pack(rule, offset) ];
        }
    },

    /**
     * @private
     * Add each character of the given string to the map, giving each one
     * the next value in sequence after this.lastMap. Characters enclosed
     * in apostrophes are gathered together and added as a single entry.
     *
     * @param {string} str the characters to add
     * @param {number} offset offset handed through to the packing of each entry
     */
    _addChars: function (str, offset) {
        var gs = new GlyphString(str);
        var it = gs.charIterator();
        var c;

        while (it.hasNext()) {
            c = it.next();
            if (c === "'") {
                // escape a sequence of chars as one collation element
                c = "";
                var x = "";
                while (it.hasNext() && x !== "'") {
                    c += x;
                    x = it.next();
                }
            }
            this.lastMap++;
            this.map[c] = this._packRule([this.lastMap], offset);
        }
    },

    /**
     * @private
     * Add the explicit mappings and the character ranges of a set of
     * rules to the map of this collator, and keep this.lastMap up to date.
     *
     * @param {Object} rules the rules to add, with a "map" property and
     * optionally a "ranges" property
     * @param {number} start offset at which the values of these rules start
     */
    _addRules: function(rules, start) {
        var p;
        for (var r in rules.map) {
            if (r) {
                this.map[r] = this._packRule(rules.map[r], start);
                // the first value of the (first element of the) rule
                p = typeof(rules.map[r][0]) === 'number' ? rules.map[r][0] : rules.map[r][0][0];
                this.lastMap = Math.max(p + start, this.lastMap);
            }
        }

        if (typeof(rules.ranges) !== 'undefined') {
            // for each range, everything in the range goes in primary sequence from the start
            for (var i = 0; i < rules.ranges.length; i++) {
                var range = rules.ranges[i];

                this.lastMap = range.start;
                if (typeof(range.chars) === "string") {
                    this._addChars(range.chars, start);
                } else {
                    for (var k = 0; k < range.chars.length; k++) {
                        this._addChars(range.chars[k], start);
                    }
                }
            }
        }
    },

    /**
     * @private
     * Build the collation map for the style of this collator out of the
     * given collation data. String entries in the data are aliases that
     * are followed until a rule set is found. If the requested style does
     * not resolve, the "default" entry is used, and if that does not
     * resolve either, the map is left empty.
     *
     * @param {Object} rules the collation data, keyed by style name
     */
    _initCollation: function(rules) {
        if (!rules) {
            // no collation data at all: same fallback as an unresolvable style
            this.map = {};
            return;
        }

        var rule = this.style;
        while (typeof(rule) === 'string') {
            rule = rules[rule];
        }

        if (!rule) {
            rule = "default";

            while (typeof(rule) === 'string') {
                rule = rules[rule];
            }
        }
        if (!rule) {
            this.map = {};
            return;
        }

        /**
         * @private
         * @type {{scripts:Array.<string>,bits:Array.<number>,maxes:Array.<number>,bases:Array.<number>,map:Object.<string,Array.<number|null|Array.<number>>>}}
         */
        this.collation = rule;
        this.map = {};
        this.lastMap = -1;
        this.keysize = this.collation.keysize[this.level-1];
        this.defaultRule = rules["default"];

        if (typeof(this.collation.inherit) !== 'undefined') {
            for (var i = 0; i < this.collation.inherit.length; i++) {
                var col = this.collation.inherit[i];
                if (col === 'this') {
                    // the rules of this collation itself are always added last, below
                    continue;
                }
                rule = typeof(col) === 'object' ? col.name : col;
                if (rules[rule]) {
                    this._addRules(rules[rule], col.start || this.lastMap+1);
                }
            }
        }
        this._addRules(this.collation, this.lastMap+1);
    },

    /**
     * @private
     * Compare two strings with the pure Javascript implementation, without
     * taking the reverse setting into account.
     *
     * @param {string} left the left string to compare
     * @param {string} right the right string to compare
     * @return {number} a negative number if left comes first, a positive
     * number if right comes first, and zero if they are equivalent
     */
    _basicCompare: function(left, right) {
        var l = (left instanceof NormString) ? left : new NormString(left),
            r = (right instanceof NormString) ? right : new NormString(right),
            lelements,
            relements,
            diff;

        if (this.numeric) {
            var lvalue = new INumber(left, {locale: this.locale});
            var rvalue = new INumber(right, {locale: this.locale});
            if (!isNaN(lvalue.valueOf()) && !isNaN(rvalue.valueOf())) {
                diff = lvalue.valueOf() - rvalue.valueOf();
                if (diff) {
                    return diff;
                } else {
                    // skip the numeric part and compare the rest lexically
                    l = new NormString(left.substring(lvalue.parsed.length));
                    r = new NormString(right.substring(rvalue.parsed.length));
                }
            }
            // else if they aren't both numbers, then let the code below take care of the lexical comparison instead
        }

        lelements = new ElementIterator(new CodePointSource(l, this.ignorePunctuation), this.map, this.keysize);
        relements = new ElementIterator(new CodePointSource(r, this.ignorePunctuation), this.map, this.keysize);

        while (lelements.hasNext() && relements.hasNext()) {
            diff = lelements.next() - relements.next();
            if (diff) {
                return diff;
            }
        }

        // one string is a prefix of the other: the shorter one comes first
        if (!lelements.hasNext() && !relements.hasNext()) {
            return 0;
        } else if (lelements.hasNext()) {
            return 1;
        } else {
            return -1;
        }
    },

    /**
     * Compare two strings together according to the rules of this
     * collator instance. Do not use this function directly with
     * Array.sort, as it will not have its collation data available
     * and therefore will not function properly. Use the function
     * returned by getComparator() instead.<p>
     *
     * If the reverse property was given as true to the collator
     * constructor, the sign of the result is switched, no matter
     * whether the comparison is done natively or in pure Javascript.
     *
     * @param {string} left the left string to compare
     * @param {string} right the right string to compare
     * @return {number} a negative number if left comes before right, a
     * positive number if right comes before left, and zero if left and
     * right are equivalent according to this collator
     */
    compare: function (left, right) {
        var ret;

        if (this.collator) {
            // implemented by the core engine
            ret = this.collator.compare(left, right);
        } else {
            ret = this._basicCompare(left, right);
        }

        return this.reverse ? -ret : ret;
    },

    /**
     * Return a comparator function that can compare two strings together
     * according to the rules of this collator instance. The function
     * returns a negative number if the left
     * string comes before right, a positive number if the right string comes
     * before the left, and zero if left and right are equivalent. If the
     * reverse property was given as true to the collator constructor, this
     * function will
     * switch the sign of those values to cause sorting to happen in the
     * reverse order.
     *
     * @return {function(...)|undefined} a comparator function that
     * can compare two strings together according to the rules of this
     * collator instance
     */
    getComparator: function() {
        if (this.collator && !this.reverse) {
            // implemented by the core engine, which needs no further help
            return this.collator.compare;
        }

        // bind the function to this instance so that we have the collation
        // rules and the reverse setting available to do the work
        return ilib.bind(this, this.compare);
    },

    /**
     * Return a sort key string for the given string. The sort key
     * string is a list of values that represent each character
     * in the original string. The values are
     * modified according to the strength of this collator instance
     * to give the correct collation order. The idea is that this
     * sort key string is directly comparable byte-for-byte to
     * other sort key strings generated by this collator without
     * any further knowledge of the collation rules for the locale.
     * More formally, if a < b according to the rules of this collation,
     * then it is guaranteed that sortkey(a) < sortkey(b) when compared
     * byte-for-byte. The sort key string can therefore be used
     * without the collator to sort an array of strings efficiently
     * because the work of determining the applicability of various
     * collation rules is done once up-front when generating
     * the sort key.<p>
     *
     * The sort key string can be treated as a regular, albeit somewhat
     * odd-looking, string. That is, it can be passed to regular
     * Javascript functions without problems.<p>
     *
     * When this collator is implemented natively, no sort keys are
     * available and the string itself is returned instead.
     *
     * @param {string} str the original string to generate the sort key for
     * @return {string} a sort key string for the given string
     */
    sortKey: function (str) {
        if (!str) {
            return "";
        }

        if (this.collator) {
            // native, no sort keys available
            return str;
        }

        if (this.numeric) {
            // the key is the hex form of the numeric value, or of nothing at
            // all if the string does not parse as a number
            var v = new INumber(str, {locale: this.locale});
            var s = isNaN(v.valueOf()) ? "" : v.valueOf().toString(16);
            return JSUtils.pad(s, 16);
        } else {
            var n = (typeof(str) === "string") ? new NormString(str) : str,
                ret = "",
                lelements = new ElementIterator(new CodePointSource(n, this.ignorePunctuation), this.map, this.keysize),
                element;

            while (lelements.hasNext()) {
                element = lelements.next();
                if (this.reverse) {
                    // For reverse, count down from the top of the key range instead.
                    // Math.pow is used because a shift wraps around at 32 bits and
                    // gives the wrong answer for key sizes of 31 bits or more.
                    // NOTE(review): an element of 0 maps to 2^keysize, which needs one
                    // more hex digit than keysize/4 -- confirm whether 0 can occur here.
                    element = Math.pow(2, this.keysize) - element;
                }
                ret += JSUtils.pad(element.toString(16), this.keysize/4);
            }
        }
        return ret;
    }
};
751 
/**
 * Retrieve the list of collation style names that are available for the
 * given locale. The list is meant to vary with the locale and with the
 * data that was assembled into this copy of ilib, but this implementation
 * reports only the "standard" style for every locale.
 *
 * @param {Locale|string=} locale The locale for which the available
 * styles are being sought
 * @return {Array.<string>} an array of style names that are available for
 * the given locale
 */
Collator.getAvailableStyles = function (locale) {
    // a fresh array on every call, so callers may modify what they get back
    var styles = [];
    styles.push("standard");
    return styles;
};
766 
/**
 * Retrieve the list of ISO 15924 script codes that are available in this
 * copy of ilib. The list is meant to vary with the script data that was
 * assembled into this copy of ilib, but this implementation reports only
 * the Latin script. If a collator instance is instantiated with a script
 * code that is not on the list returned by this function, it will be
 * ignored and text in that script will be sorted by numeric Unicode
 * values of the characters.
 *
 * @return {Array.<string>} an array of ISO 15924 script codes that are
 * available
 */
Collator.getAvailableScripts = function () {
    // a fresh array on every call, so callers may modify what they get back
    var scripts = [];
    scripts.push("Latn");
    return scripts;
};
783 
784 
/**
 * Return the default collation style of this collator. This is the
 * entry stored under "default" in the collation data that this instance
 * was initialized with. It is only recorded when a rule set was resolved
 * from collation data by the pure Javascript implementation, so the
 * result is undefined otherwise.
 *
 * @returns {string} default collation style such as 'latin', 'korean' etc
 */
Collator.prototype.getDefaultCollatorStyle = function () {
    var style = this.defaultRule;
    return style;
};
792 
793 module.exports = Collator;
794