1 /*
  2  * LocaleMatcher.js - Locale matcher definition
  3  *
  4  * Copyright © 2013-2015, 2018-2019, JEDLSoft
  5  *
  6  * Licensed under the Apache License, Version 2.0 (the "License");
  7  * you may not use this file except in compliance with the License.
  8  * You may obtain a copy of the License at
  9  *
 10  *     http://www.apache.org/licenses/LICENSE-2.0
 11  *
 12  * Unless required by applicable law or agreed to in writing, software
 13  * distributed under the License is distributed on an "AS IS" BASIS,
 14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15  *
 16  * See the License for the specific language governing permissions and
 17  * limitations under the License.
 18  */
 19 
 20 // !data localematch
 21 
 22 var ilib = require("../index.js");
 23 var Utils = require("./Utils.js");
 24 var Locale = require("./Locale.js");
 25 
 26 var componentWeights = [
 27     0.5,   // language
 28     0.2,   // script
 29     0.25,  // region
 30     0.05   // variant
 31 ];
 32 
 33 // these are languages where you have to put the script all the time,
 34 // as none of the scripts are default for the language
 35 var multiScriptLanguages = {
 36     "az": true,   // Azerbaijani
 37     "kk": true,   // Kazakh
 38     "ku": true,   // Kurdish
 39     "ky": true,   // Kyrgyz
 40     "pa": true,   // Panjabi
 41     "sr": true,   // Serbian
 42     "tg": true,   // Tajik
 43     "uz": true,   // Uzbek
 44     "zh": true    // Chinese
 45 };
 46 
 47 /**
 48  * @class
 49  * Create a new locale matcher instance. This is used
 50  * to see which locales can be matched with each other in
 51  * various ways.<p>
 52  *
 53  * The options object may contain any of the following properties:
 54  *
 55  * <ul>
 56  * <li><i>locale</i> - the locale instance or locale spec to match
 57  *
 58  * <li><i>onLoad</i> - a callback function to call when the locale matcher object is fully
 59  * loaded. When the onLoad option is given, the locale matcher object will attempt to
 60  * load any missing locale data using the ilib loader callback.
 61  * When the constructor is done (even if the data is already preassembled), the
 62  * onLoad function is called with the current instance as a parameter, so this
 63  * callback can be used with preassembled or dynamic loading or a mix of the two.
 64  *
 65  * <li><i>sync</i> - tell whether to load any missing locale data synchronously or
 66  * asynchronously. If this option is given as "false", then the "onLoad"
 67  * callback must be given, as the instance returned from this constructor will
 68  * not be usable for a while.
 69  *
 70  * <li><i>loadParams</i> - an object containing parameters to pass to the
 71  * loader callback function when locale data is missing. The parameters are not
 * interpreted or modified in any way. They are simply passed along. The object
 73  * may contain any property/value pairs as long as the calling code is in
 74  * agreement with the loader callback function as to what those parameters mean.
 75  * </ul>
 76  *
 77  *
 78  * @constructor
 79  * @param {Object} options parameters to initialize this matcher
 80  */
 81 var LocaleMatcher = function(options) {
 82     var sync = true,
 83         loadParams = undefined;
 84 
 85     this.locale = new Locale();
 86 
 87     if (options) {
 88         if (typeof(options.locale) !== 'undefined') {
 89             this.locale = (typeof(options.locale) === 'string') ? new Locale(options.locale) : options.locale;
 90         }
 91 
 92         if (typeof(options.sync) !== 'undefined') {
 93             sync = !!options.sync;
 94         }
 95 
 96         if (typeof(options.loadParams) !== 'undefined') {
 97             loadParams = options.loadParams;
 98         }
 99     }
100 
101     if (typeof(ilib.data.localematch) === 'undefined') {
102         Utils.loadData({
103             object: "LocaleMatcher",
104             locale: "-",
105             name: "localematch.json",
106             sync: sync,
107             loadParams: loadParams,
108             callback: ilib.bind(this, function (info) {
109                 if (!info) {
110                     info = {};
111                 }
112                 /** @type {Object.<string,string>} */
113                 this.info = info;
114                 if (options && typeof(options.onLoad) === 'function') {
115                     options.onLoad(this);
116                 }
117             })
118         });
119     } else {
120         this.info = ilib.data.localematch;
121         if (options && typeof(options.onLoad) === 'function') {
122             options.onLoad(this);
123         }
124     }
125 };
126 
127 
128 LocaleMatcher.prototype = {
129     /**
130      * Return the locale used to construct this instance.
131      * @return {Locale|undefined} the locale for this matcher
132      */
133     getLocale: function() {
134         return this.locale;
135     },
136 
137     /**
138      * @private
139      * Do the work
140      */
141     _getLikelyLocale: function(locale) {
142         // already full specified
143         if (locale.language && locale.script && locale.region) return locale;
144 
145         if (typeof(this.info.likelyLocales[locale.getSpec()]) === 'undefined') {
146             // try various partials before giving up
147             var partial = this.info.likelyLocales[new Locale(locale.language, undefined, locale.region).getSpec()];
148             if (typeof(partial) !== 'undefined') return new Locale(partial);
149 
150             partial = this.info.likelyLocales[new Locale(locale.language, locale.script, undefined).getSpec()];
151             if (typeof(partial) !== 'undefined') return new Locale(partial);
152 
153             partial = this.info.likelyLocales[new Locale(locale.language, undefined, undefined).getSpec()];
154             if (typeof(partial) !== 'undefined') return new Locale(partial);
155 
156             partial = this.info.likelyLocales[new Locale(undefined, locale.script, locale.region).getSpec()];
157             if (typeof(partial) !== 'undefined') return new Locale(partial);
158 
159             partial = this.info.likelyLocales[new Locale(undefined, undefined, locale.region).getSpec()];
160             if (typeof(partial) !== 'undefined') return new Locale(partial);
161 
162             partial = this.info.likelyLocales[new Locale(undefined, locale.script, undefined).getSpec()];
163             if (typeof(partial) !== 'undefined') return new Locale(partial);
164 
165             return locale;
166         }
167 
168         return new Locale(this.info.likelyLocales[locale.getSpec()]);
169     },
170 
171     /**
172      * Return an Locale instance that is fully specified based on partial information
173      * given to the constructor of this locale matcher instance. For example, if the locale
174      * spec given to this locale matcher instance is simply "ru" (for the Russian language),
175      * then it will fill in the missing region and script tags and return a locale with
176      * the specifier "ru-Cyrl-RU". (ie. Russian language, Cyrillic, Russian Federation).
177      * Any one or two of the language, script, or region parts may be left unspecified,
178      * and the other one or two parts will be filled in automatically. If this
179      * class has no information about the given locale, then the locale of this
180      * locale matcher instance is returned unchanged.
181      *
182      * @returns {Locale} the most likely completion of the partial locale given
183      * to the constructor of this locale matcher instance
184      */
185     getLikelyLocale: function () {
186         return this._getLikelyLocale(this.locale);
187     },
188 
189     /**
190      * Return an Locale instance that is specified based on partial information
191      * given to the constructor of this locale matcher instance but which leaves out any
192      * part of the locale specifier that is so common that it is understood. For example,
193      * if the locale
194      * spec given to this locale matcher instance is simply "ru" (for the Russian language),
195      * then it will fill in the missing region and/or script tags and return a locale with
196      * the specifier "ru-RU". (ie. Russian language, Russian Federation). Note that the
197      * default script "Cyrl" is left out because the vast majority of text written in
198      * Russian is written with the Cyrllic script, so that part of the locale is understood
199      * and is commonly left out.<p>
200      *
201      * Any one or two of the language, script, or region parts may be left unspecified,
202      * and the other one or two parts will be filled in automatically. If this
203      * class has no information about the given locale, then the locale of this
204      * locale matcher instance is returned unchanged.<p>
205      *
206      * This method returns the same information as getLikelyLocale but with the very common
207      * parts left out.
208      *
209      * @returns {Locale} the most likely "minimal" completion of the partial locale given
210      * to the constructor of this locale matcher instance where the commonly understood
211      * parts are left out.
212      */
213     getLikelyLocaleMinimal: function() {
214         var fullLocale = this._getLikelyLocale(this.locale);
215         var langLocale = this._getLikelyLocale(new Locale(fullLocale.language));
216         return fullLocale.script === langLocale.script && !multiScriptLanguages[fullLocale.language] ?
217             new Locale(fullLocale.language, undefined, fullLocale.region) :
218             fullLocale;
219     },
220 
221     /**
222      * Return the degree that the given locale matches the current locale of this
223      * matcher. This method returns an integer from 0 to 100. A value of 100 is
224      * a 100% match, meaning that the two locales are exactly equivalent to each
225      * other. (eg. "ja-JP" and "ja-JP") A value of 0 means that there 0% match or
226      * that the two locales have nothing in common. (eg. "en-US" and "ja-JP") <p>
227      *
228      * Locale matching is not the same as equivalence, as the degree of matching
229      * is returned. (See Locale.equals for equivalence.)<p>
230      *
231      * The match score is calculated based on matching the 4 locale components,
232      * weighted by importance:
233      *
234      * <ul>
235      * <li> language - this accounts for 50% of the match score
236      * <li> region - accounts for 25% of the match score
237      * <li> script - accounts for 20% of the match score
238      * <li> variant - accounts for 5% of the match score
239      * </ul>
240      *
241      * The score is affected by the following things:
242      *
243      * <ul>
244      * <li> A large language score is given when the language components of the locales
245      * match exactly.
246      * <li> Higher language scores are given when the languages are linguistically
247      * close to each other, such as dialects.
248      * <li> A small score is given when two languages are in the same
249      * linguistic family, but one is not a dialect of the other, such as German
250      * and Dutch.
251      * <li> A large region score is given when two locales share the same region.
252      * <li> A smaller region score is given when one region is contained within
253      * another. For example, Hong Kong is part of China, so a moderate score is
254      * given instead of a full score.
255      * <li> A small score is given if two regions are geographically close to
256      * each other or are tied by history. For example, Ireland and Great Britain
257      * are both adjacent and tied by history, so they receive a moderate score.
258      * <li> A high script score is given if the two locales share the same script.
259      * The legibility of a common script means that there is some small kinship of the
260      * different languages.
261      * <li> A high variant score is given if the two locales share the same
262      * variant. Full score is given when both locales have no variant at all.
263      * <li> Locale components that are unspecified in both locales are given high
264      * scores.
265      * <li> Locales where a particular locale component is missing in only one
266      * locale can still match when the default for that locale component matches
267      * the component in the other locale. The
268      * default value for the missing component is determined using the likely locales
269      * data. (See getLikelyLocale()) For example, "en-US" and "en-Latn-US" receive
270      * a high script score because the default script for "en" is "Latn".
271      * </ul>
272      *
273      * The intention of this method is that it can be used to determine
274      * compatibility of locales. For example, when a user signs up for an
275      * account on a web site, the locales that the web site supports and
276      * the locale of the user's browser may differ, and the site needs to
277      * pick the best locale to show the user. Let's say the
278      * web site supports a selection of European languages such as "it-IT",
279      * "fr-FR", "de-DE", and "en-GB". The user's
280      * browser may be set to "it-CH". The web site code can then match "it-CH"
281      * against each of the supported locales to find the one with the
282      * highest score. In
283      * this case, the best match would be "it-IT" because it shares a
284      * language and script in common with "it-CH" and differs only in the region
285      * component. It is not a 100% match, but it is pretty good. The web site
286      * may decide if the match scores all fall
287      * below a chosen threshold (perhaps 50%?), it should show the user the
288      * default language "en-GB", because that is probably a better choice
289      * than any other supported locale.<p>
290      *
291      * @param {Locale} locale the other locale to match against the current one
292      * @return {number} an integer from 0 to 100 that indicates the degree to
293      * which these locales match each other
294      */
295     match: function(locale) {
296         var other = new Locale(locale);
297         var scores = [0, 0, 0, 0];
298         var thisfull, otherfull, i;
299 
300         if (this.locale.language === other.language) {
301             scores[0] = 100;
302         } else {
303             if (!this.locale.language || !other.language) {
304                 // check for default language
305                 thisfull = this.getLikelyLocale();
306                 otherfull = new Locale(this.info.likelyLocales[other.getSpec()] || other.getSpec());
307                 if (thisfull.language === otherfull.language) {
308                     scores[0] = 100;
309                 }
310             } else {
311                 // check for macro languages
312                 var mlthis = this.info.macroLanguagesReverse[this.locale.language] || this.locale.language;
313                 var mlother = this.info.macroLanguagesReverse[other.language] || other.language;
314                 if (mlthis === mlother) {
315                     scores[0] = 90;
316                 } else {
317                     // check for mutual intelligibility
318                     var pair = this.locale.language + "-" + other.language;
319                     scores[0] = this.info.mutualIntelligibility[pair] || 0;
320                 }
321             }
322         }
323 
324         if (this.locale.script === other.script) {
325             scores[1] = 100;
326         } else {
327             if (!this.locale.script || !other.script) {
328                 // check for default script
329                 thisfull = this.locale.script ? this.locale : new Locale(this.info.likelyLocales[this.locale.language]);
330                 otherfull = other.script ? other : new Locale(this.info.likelyLocales[other.language]);
331                 if (thisfull.script === otherfull.script) {
332                     scores[1] = 100;
333                 }
334             }
335         }
336 
337         if (this.locale.region === other.region) {
338             scores[2] = 100;
339         } else {
340             if (!this.locale.region || !other.region) {
341                 // check for default region
342                 thisfull = this.getLikelyLocale();
343                 otherfull = new Locale(this.info.likelyLocales[other.getSpec()] || other.getSpec());
344                 if (thisfull.region === otherfull.region) {
345                     scores[2] = 100;
346                 }
347             } else {
348                 // check for containment
349                 var containers = this.info.territoryContainmentReverse[this.locale.region] || [];
350                 // end at 1 because 0 is "001" which is "the whole world" -- which is not useful
351                 for (i = containers.length-1; i > 0; i--) {
352                     var container = this.info.territoryContainment[containers[i]];
353                     if (container && container.indexOf(other.region) > -1) {
354                         // same area only accounts for 20% of the region score
355                         scores[2] = ((i+1) * 100 / containers.length) * 0.2;
356                         break;
357                     }
358                 }
359             }
360         }
361 
362         if (this.locale.variant === other.variant) {
363             scores[3] = 100;
364         }
365 
366         var total = 0;
367 
368         for (i = 0; i < 4; i++) {
369             total += scores[i] * componentWeights[i];
370         }
371 
372         return Math.round(total);
373     },
374 
375     /**
376      * Return the macrolanguage associated with this locale. If the
377      * locale's language is not part of a macro-language, then the
378      * locale's language is returned as-is.
379      *
380      * @returns {string} the ISO code for the macrolanguage associated
381      * with this locale, or language of the locale
382      */
383     getMacroLanguage: function() {
384         return this.info.macroLanguagesReverse[this.locale.language] || this.locale.language;
385     },
386 
387     /**
388      * @private
389      * Return the containment array for the given region code.
390      */
391     _getRegionContainment: function(region) {
392         return this.info.territoryContainmentReverse[region] || []
393     },
394 
395     /**
396      * Return the list of regions that this locale is contained within. Regions are
397      * nested, so locales can be in multiple regions. (eg. US is in Northern North
398      * America, North America, the Americas, the World.) Most regions are specified
399      * using UN.49 region numbers, though some, like "EU", are letters. If the
400      * locale is underspecified, this method will use the most likely locale method
401      * to get the region first. For example, the locale "ja" (Japanese) is most
402      * likely "ja-JP" (Japanese for Japan), and the region containment info for Japan
403      * is returned.
404      *
405      * @returns {Array.<string>} an array of region specifiers that this locale is within
406      */
407     getRegionContainment: function() {
408         var region = this.locale.region || this.getLikelyLocale().region;
409         return this._getRegionContainment(region);
410     },
411 
412     /**
413      * Find the smallest region that contains both the current locale and the other locale.
414      * If the current or other locales are underspecified, this method will use the most
415      * likely locale method
416      * to get their regions first. For example, the locale "ja" (Japanese) is most
417      * likely "ja-JP" (Japanese for Japan), and the region containment info for Japan
418      * is checked against the other locale's region containment info.
419      *
420      * @param {string|Locale} otherLocale a locale specifier or a Locale instance to
421      * compare against
422      * @returns {string} the region specifier of the smallest region containing both the
423      * current locale and other locale
424      */
425     smallestCommonRegion: function(otherLocale) {
426         if (typeof(otherLocale) === "undefined") return "001";
427 
428         var thisRegion = this.locale.region || this.getLikelyLocale().region;
429         var otherLoc = typeof(otherLocale) === "string" ? new Locale(otherLocale) : otherLocale;
430         var otherRegion = this._getLikelyLocale(otherLoc).region;
431 
432         var thisRegions = this._getRegionContainment(thisRegion);
433         var otherRegions = this._getRegionContainment(otherRegion);
434 
435         // region containment arrays are arranged from largest to smallest, so start
436         // at the end of the array
437         for (var i = thisRegions.length-1; i > 0; i--) {
438             if (otherRegions.indexOf(thisRegions[i]) > -1) {
439                 return thisRegions[i];
440             }
441         }
442 
443         // this default should never be reached because the world should be common to all regions
444         return "001";
445     }
446 };
447 
448 module.exports = LocaleMatcher;
449