/*
 * LocaleMatcher.js - Locale matcher definition
 *
 * Copyright © 2013-2015, 2018-2019, JEDLSoft
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// !data localematch

var ilib = require("../index.js");
var Utils = require("./Utils.js");
var Locale = require("./Locale.js");

// Relative weight of each locale component in the total match score.
// The order here must stay in step with the "scores" array built in
// LocaleMatcher.prototype.match().
var componentWeights = [
    0.5,   // language
    0.2,   // script
    0.25,  // region
    0.05   // variant
];

// these are languages where you have to put the script all the time,
// as none of the scripts are default for the language
var multiScriptLanguages = {
    "az": true, // Azerbaijani
    "kk": true, // Kazakh
    "ku": true, // Kurdish
    "ky": true, // Kyrgyz
    "pa": true, // Panjabi
    "sr": true, // Serbian
    "tg": true, // Tajik
    "uz": true, // Uzbek
    "zh": true  // Chinese
};

/**
 * @class
 * Create a new locale matcher instance. This is used
 * to see which locales can be matched with each other in
 * various ways.<p>
 *
 * The options object may contain any of the following properties:
 *
 * <ul>
 * <li><i>locale</i> - the locale instance or locale spec to match
 *
 * <li><i>onLoad</i> - a callback function to call when the locale matcher object is fully
 * loaded. When the onLoad option is given, the locale matcher object will attempt to
 * load any missing locale data using the ilib loader callback.
 * When the constructor is done (even if the data is already preassembled), the
 * onLoad function is called with the current instance as a parameter, so this
 * callback can be used with preassembled or dynamic loading or a mix of the two.
 *
 * <li><i>sync</i> - tell whether to load any missing locale data synchronously or
 * asynchronously. If this option is given as "false", then the "onLoad"
 * callback must be given, as the instance returned from this constructor will
 * not be usable for a while.
 *
 * <li><i>loadParams</i> - an object containing parameters to pass to the
 * loader callback function when locale data is missing. The parameters are not
 * interpreted or modified in any way. They are simply passed along. The object
 * may contain any property/value pairs as long as the calling code is in
 * agreement with the loader callback function as to what those parameters mean.
 * </ul>
 *
 *
 * @constructor
 * @param {Object} options parameters to initialize this matcher
 */
var LocaleMatcher = function(options) {
    var sync = true,
        loadParams = undefined,
        self = this;

    this.locale = new Locale();

    if (options) {
        if (typeof(options.locale) !== 'undefined') {
            this.locale = (typeof(options.locale) === 'string') ? new Locale(options.locale) : options.locale;
        }

        if (typeof(options.sync) !== 'undefined') {
            sync = !!options.sync;
        }

        if (typeof(options.loadParams) !== 'undefined') {
            loadParams = options.loadParams;
        }
    }

    // Single completion path shared by the preassembled and the dynamically
    // loaded cases: record the data and then notify the caller, if requested.
    function finish(info) {
        /** @type {Object} */
        self.info = info || {};
        if (options && typeof(options.onLoad) === 'function') {
            options.onLoad(self);
        }
    }

    if (typeof(ilib.data.localematch) === 'undefined') {
        Utils.loadData({
            object: "LocaleMatcher",
            locale: "-",
            name: "localematch.json",
            sync: sync,
            loadParams: loadParams,
            callback: finish
        });
    } else {
        finish(ilib.data.localematch);
    }
};


LocaleMatcher.prototype = {
    /**
     * Return the locale used to construct this instance.
     * @return {Locale|undefined} the locale for this matcher
     */
    getLocale: function() {
        return this.locale;
    },

    /**
     * @private
     * Return the named lookup table from the loaded locale matching data.
     * When the data could not be loaded, the constructor stores an empty
     * object, so a table may legitimately be absent. In that case an empty
     * table is returned so that callers degrade to "no information" instead
     * of throwing.
     *
     * @param {string} name name of the table within the locale matching data
     * @return {Object} the requested table, or an empty object if it is absent
     */
    _getTable: function(name) {
        return this.info[name] || {};
    },

    /**
     * @private
     * Look up the most likely full locale for the given locale. The exact spec
     * is tried first, followed by progressively less specific combinations of
     * the language, script, and region. The first entry found in the likely
     * locales table wins.
     *
     * NOTE(review): the partial keys below are built by passing
     * (language, script, region) positionally to the Locale constructor, exactly
     * as this code has always done. Confirm that this agrees with the parameter
     * order declared in Locale.js.
     *
     * @param {Locale} locale the locale to complete
     * @return {Locale} the likely locale, or the given locale itself when it is
     * already fully specified or when nothing is known about it
     */
    _getLikelyLocale: function(locale) {
        // already fully specified
        if (locale.language && locale.script && locale.region) return locale;

        var likely = this._getTable("likelyLocales");
        var exact = likely[locale.getSpec()];
        if (typeof(exact) !== 'undefined') return new Locale(exact);

        // try various partials before giving up, from most to least specific
        var partials = [
            [locale.language, undefined, locale.region],
            [locale.language, locale.script, undefined],
            [locale.language, undefined, undefined],
            [undefined, locale.script, locale.region],
            [undefined, undefined, locale.region],
            [undefined, locale.script, undefined]
        ];

        for (var i = 0; i < partials.length; i++) {
            var parts = partials[i];
            var partial = likely[new Locale(parts[0], parts[1], parts[2]).getSpec()];
            if (typeof(partial) !== 'undefined') return new Locale(partial);
        }

        return locale;
    },

    /**
     * Return a Locale instance that is fully specified based on partial information
     * given to the constructor of this locale matcher instance. For example, if the locale
     * spec given to this locale matcher instance is simply "ru" (for the Russian language),
     * then it will fill in the missing region and script tags and return a locale with
     * the specifier "ru-Cyrl-RU". (ie. Russian language, Cyrillic, Russian Federation).
     * Any one or two of the language, script, or region parts may be left unspecified,
     * and the other one or two parts will be filled in automatically. If this
     * class has no information about the given locale, then the locale of this
     * locale matcher instance is returned unchanged.
     *
     * @returns {Locale} the most likely completion of the partial locale given
     * to the constructor of this locale matcher instance
     */
    getLikelyLocale: function () {
        return this._getLikelyLocale(this.locale);
    },

    /**
     * Return a Locale instance that is specified based on partial information
     * given to the constructor of this locale matcher instance but which leaves out any
     * part of the locale specifier that is so common that it is understood. For example,
     * if the locale
     * spec given to this locale matcher instance is simply "ru" (for the Russian language),
     * then it will fill in the missing region and/or script tags and return a locale with
     * the specifier "ru-RU". (ie. Russian language, Russian Federation). Note that the
     * default script "Cyrl" is left out because the vast majority of text written in
     * Russian is written with the Cyrillic script, so that part of the locale is understood
     * and is commonly left out.<p>
     *
     * Any one or two of the language, script, or region parts may be left unspecified,
     * and the other one or two parts will be filled in automatically. If this
     * class has no information about the given locale, then the locale of this
     * locale matcher instance is returned unchanged.<p>
     *
     * This method returns the same information as getLikelyLocale but with the very common
     * parts left out. The script is never dropped for languages that have no single
     * default script (see the multiScriptLanguages table).
     *
     * @returns {Locale} the most likely "minimal" completion of the partial locale given
     * to the constructor of this locale matcher instance where the commonly understood
     * parts are left out.
     */
    getLikelyLocaleMinimal: function() {
        var fullLocale = this._getLikelyLocale(this.locale);
        var langLocale = this._getLikelyLocale(new Locale(fullLocale.language));
        var scriptIsDefault = fullLocale.script === langLocale.script &&
            !multiScriptLanguages[fullLocale.language];

        if (!scriptIsDefault) {
            return fullLocale;
        }

        // NOTE(review): arguments are passed positionally as (language, <omitted>, region),
        // unchanged from the previous implementation; confirm against the parameter
        // order declared in Locale.js.
        return new Locale(fullLocale.language, undefined, fullLocale.region);
    },

    /**
     * Return the degree that the given locale matches the current locale of this
     * matcher. This method returns an integer from 0 to 100. A value of 100 is
     * a 100% match, meaning that the two locales are exactly equivalent to each
     * other. (eg. "ja-JP" and "ja-JP") A value of 0 means that there is a 0% match or
     * that the two locales have nothing in common. (eg. "en-US" and "ja-JP") <p>
     *
     * Locale matching is not the same as equivalence, as the degree of matching
     * is returned. (See Locale.equals for equivalence.)<p>
     *
     * The match score is calculated based on matching the 4 locale components,
     * weighted by importance:
     *
     * <ul>
     * <li> language - this accounts for 50% of the match score
     * <li> region - accounts for 25% of the match score
     * <li> script - accounts for 20% of the match score
     * <li> variant - accounts for 5% of the match score
     * </ul>
     *
     * The score is affected by the following things:
     *
     * <ul>
     * <li> A large language score is given when the language components of the locales
     * match exactly.
     * <li> Higher language scores are given when the languages are linguistically
     * close to each other, such as dialects.
     * <li> A small score is given when two languages are in the same
     * linguistic family, but one is not a dialect of the other, such as German
     * and Dutch.
     * <li> A large region score is given when two locales share the same region.
     * <li> A smaller region score is given when one region is contained within
     * another. For example, Hong Kong is part of China, so a moderate score is
     * given instead of a full score.
     * <li> A small score is given if two regions are geographically close to
     * each other or are tied by history. For example, Ireland and Great Britain
     * are both adjacent and tied by history, so they receive a moderate score.
     * <li> A high script score is given if the two locales share the same script.
     * The legibility of a common script means that there is some small kinship of the
     * different languages.
     * <li> A high variant score is given if the two locales share the same
     * variant. Full score is given when both locales have no variant at all.
     * <li> Locale components that are unspecified in both locales are given high
     * scores.
     * <li> Locales where a particular locale component is missing in only one
     * locale can still match when the default for that locale component matches
     * the component in the other locale. The
     * default value for the missing component is determined using the likely locales
     * data. (See getLikelyLocale()) For example, "en-US" and "en-Latn-US" receive
     * a high script score because the default script for "en" is "Latn".
     * </ul>
     *
     * The intention of this method is that it can be used to determine
     * compatibility of locales. For example, when a user signs up for an
     * account on a web site, the locales that the web site supports and
     * the locale of the user's browser may differ, and the site needs to
     * pick the best locale to show the user. Let's say the
     * web site supports a selection of European languages such as "it-IT",
     * "fr-FR", "de-DE", and "en-GB". The user's
     * browser may be set to "it-CH". The web site code can then match "it-CH"
     * against each of the supported locales to find the one with the
     * highest score. In
     * this case, the best match would be "it-IT" because it shares a
     * language and script in common with "it-CH" and differs only in the region
     * component. It is not a 100% match, but it is pretty good. The web site
     * may decide if the match scores all fall
     * below a chosen threshold (perhaps 50%?), it should show the user the
     * default language "en-GB", because that is probably a better choice
     * than any other supported locale.<p>
     *
     * If the locale matching data could not be loaded, only components that are
     * literally equal contribute to the score.
     *
     * @param {Locale|string} locale the other locale to match against the current one
     * @return {number} an integer from 0 to 100 that indicates the degree to
     * which these locales match each other
     */
    match: function(locale) {
        var other = new Locale(locale);
        // per-component scores, in the same order as componentWeights:
        // language, script, region, variant
        var scores = [0, 0, 0, 0];
        var likely = this._getTable("likelyLocales");
        var thisfull, otherfull, i;

        if (this.locale.language === other.language) {
            scores[0] = 100;
        } else if (!this.locale.language || !other.language) {
            // check for default language
            thisfull = this.getLikelyLocale();
            otherfull = new Locale(likely[other.getSpec()] || other.getSpec());
            if (thisfull.language === otherfull.language) {
                scores[0] = 100;
            }
        } else {
            // check for macro languages
            var macroLanguages = this._getTable("macroLanguagesReverse");
            var mlthis = macroLanguages[this.locale.language] || this.locale.language;
            var mlother = macroLanguages[other.language] || other.language;
            if (mlthis === mlother) {
                scores[0] = 90;
            } else {
                // check for mutual intelligibility
                var pair = this.locale.language + "-" + other.language;
                scores[0] = this._getTable("mutualIntelligibility")[pair] || 0;
            }
        }

        if (this.locale.script === other.script) {
            scores[1] = 100;
        } else if (!this.locale.script || !other.script) {
            // check for default script, which is keyed by the language alone
            thisfull = this.locale.script ? this.locale : new Locale(likely[this.locale.language]);
            otherfull = other.script ? other : new Locale(likely[other.language]);
            if (thisfull.script === otherfull.script) {
                scores[1] = 100;
            }
        }

        if (this.locale.region === other.region) {
            scores[2] = 100;
        } else if (!this.locale.region || !other.region) {
            // check for default region
            thisfull = this.getLikelyLocale();
            otherfull = new Locale(likely[other.getSpec()] || other.getSpec());
            if (thisfull.region === otherfull.region) {
                scores[2] = 100;
            }
        } else {
            // check for containment
            var containers = this._getRegionContainment(this.locale.region);
            var containment = this._getTable("territoryContainment");
            // end at 1 because 0 is "001" which is "the whole world" -- which is not useful
            for (i = containers.length-1; i > 0; i--) {
                var container = containment[containers[i]];
                if (container && container.indexOf(other.region) > -1) {
                    // same area only accounts for 20% of the region score. Smaller
                    // (more deeply nested) common areas have higher indices and so
                    // earn a larger share of that 20%.
                    scores[2] = ((i+1) * 100 / containers.length) * 0.2;
                    break;
                }
            }
        }

        if (this.locale.variant === other.variant) {
            scores[3] = 100;
        }

        var total = 0;

        for (i = 0; i < 4; i++) {
            total += scores[i] * componentWeights[i];
        }

        return Math.round(total);
    },

    /**
     * Return the macrolanguage associated with this locale. If the
     * locale's language is not part of a macro-language, then the
     * locale's language is returned as-is.
     *
     * @returns {string} the ISO code for the macrolanguage associated
     * with this locale, or language of the locale
     */
    getMacroLanguage: function() {
        return this._getTable("macroLanguagesReverse")[this.locale.language] || this.locale.language;
    },

    /**
     * @private
     * Return the containment array for the given region code.
     *
     * @param {string} region the region code to look up
     * @return {Array.<string>} the regions containing the given region, or an
     * empty array if nothing is known about the given region
     */
    _getRegionContainment: function(region) {
        return this._getTable("territoryContainmentReverse")[region] || [];
    },

    /**
     * Return the list of regions that this locale is contained within. Regions are
     * nested, so locales can be in multiple regions. (eg. US is in Northern North
     * America, North America, the Americas, the World.) Most regions are specified
     * using UN M.49 region numbers, though some, like "EU", are letters. If the
     * locale is underspecified, this method will use the most likely locale method
     * to get the region first. For example, the locale "ja" (Japanese) is most
     * likely "ja-JP" (Japanese for Japan), and the region containment info for Japan
     * is returned.
     *
     * @returns {Array.<string>} an array of region specifiers that this locale is within
     */
    getRegionContainment: function() {
        var region = this.locale.region || this.getLikelyLocale().region;
        return this._getRegionContainment(region);
    },

    /**
     * Find the smallest region that contains both the current locale and the other locale.
     * If the current or other locales are underspecified, this method will use the most
     * likely locale method
     * to get their regions first. For example, the locale "ja" (Japanese) is most
     * likely "ja-JP" (Japanese for Japan), and the region containment info for Japan
     * is checked against the other locale's region containment info.
     *
     * @param {string|Locale} otherLocale a locale specifier or a Locale instance to
     * compare against. When it is undefined or null, the whole world ("001") is returned.
     * @returns {string} the region specifier of the smallest region containing both the
     * current locale and other locale
     */
    smallestCommonRegion: function(otherLocale) {
        if (typeof(otherLocale) === "undefined" || otherLocale === null) return "001";

        var thisRegion = this.locale.region || this.getLikelyLocale().region;
        var otherLoc = typeof(otherLocale) === "string" ? new Locale(otherLocale) : otherLocale;
        var otherRegion = this._getLikelyLocale(otherLoc).region;

        var thisRegions = this._getRegionContainment(thisRegion);
        var otherRegions = this._getRegionContainment(otherRegion);

        // region containment arrays are arranged from largest to smallest, so start
        // at the end of the array. Index 0 is not examined because it is "001"
        // (the whole world), which is the fallback returned below anyway.
        for (var i = thisRegions.length-1; i > 0; i--) {
            if (otherRegions.indexOf(thisRegions[i]) > -1) {
                return thisRegions[i];
            }
        }

        // reached when the world is the only region the two locales have in common,
        // or when there is no containment information for one of the regions
        return "001";
    }
};

module.exports = LocaleMatcher;