// Source: LocaleMatcher.js

/*
 * LocaleMatcher.js - Locale matcher definition
 *
 * Copyright © 2013-2015, 2018-2019, JEDLSoft
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// !data localematch

var ilib = require("../index.js");
var Utils = require("./Utils.js");
var Locale = require("./Locale.js");

// Share of the overall match score contributed by each locale component.
// The entries are indexed in this order: language, script, region, variant.
var componentWeights = [0.5, 0.2, 0.25, 0.05];

// Languages that are written in more than one script, none of which can be
// treated as the default. The script always has to be spelled out for these
// languages, so it is never dropped from a locale spec.
var multiScriptLanguages = [
    "az",   // Azerbaijani
    "kk",   // Kazakh
    "ku",   // Kurdish
    "ky",   // Kyrgyz
    "pa",   // Panjabi
    "sr",   // Serbian
    "tg",   // Tajik
    "uz",   // Uzbek
    "zh"    // Chinese
].reduce(function(table, language) {
    table[language] = true;
    return table;
}, {});

/**
 * @class
 * Create a new locale matcher instance. A locale matcher is used to find out
 * in which ways locales can be matched with each other.<p>
 *
 * The options object may contain any of the following properties:
 *
 * <ul>
 * <li><i>locale</i> - the locale instance or locale spec string to match.
 * A string is converted into a Locale instance; any other value is stored
 * as given. When it is left out, a Locale constructed without arguments is used.
 *
 * <li><i>onLoad</i> - a callback function to call when the locale matcher object
 * is fully loaded. It is called with the current instance as its only parameter,
 * both when the locale match data was already available and when it had to be
 * requested from the loader, so the same callback works with preassembled
 * data, dynamically loaded data, or a mix of the two.
 *
 * <li><i>sync</i> - tell whether to load any missing locale data synchronously or
 * asynchronously. The default is synchronous. If this option is given as "false",
 * then the "onLoad" callback must be given, as the instance returned from this
 * constructor will not be usable for a while.
 *
 * <li><i>loadParams</i> - an object containing parameters to pass to the
 * loader callback function when locale data is missing. The parameters are not
 * interpreted or modified in any way. They are simply passed along. The object
 * may contain any property/value pairs as long as the calling code is in
 * agreement with the loader callback function as to what those parameters mean.
 * </ul>
 *
 * @constructor
 * @param {Object} options parameters to initialize this matcher
 */
var LocaleMatcher = function(options) {
    // normalize so that the individual settings can be read unconditionally
    var opts = options || {};

    // start with the default locale and replace it if the caller gave one
    this.locale = new Locale();
    var requested = opts.locale;
    if (typeof(requested) === 'string') {
        this.locale = new Locale(requested);
    } else if (typeof(requested) !== 'undefined') {
        this.locale = requested;
    }

    var loadSync = (typeof(opts.sync) === 'undefined') ? true : !!opts.sync;
    var loaderParams = opts.loadParams;

    // tell the caller that the given instance is ready to use
    var announce = function(instance) {
        if (typeof(opts.onLoad) === 'function') {
            opts.onLoad(instance);
        }
    };

    // the locale match data is already there, so there is nothing to load
    if (typeof(ilib.data.localematch) !== 'undefined') {
        this.info = ilib.data.localematch;
        announce(this);
        return;
    }

    Utils.loadData({
        object: "LocaleMatcher",
        locale: "-",
        name: "localematch.json",
        sync: loadSync,
        loadParams: loaderParams,
        callback: ilib.bind(this, function (loaded) {
            // fall back to an empty object when the loader found nothing
            /** @type {Object.<string,string>} */
            this.info = loaded || {};
            announce(this);
        })
    });
};


LocaleMatcher.prototype = {
    /**
     * Return the locale used to construct this instance.
     * @return {Locale|undefined} the locale for this matcher
     */
    getLocale: function() {
        return this.locale;
    },

    /**
     * Complete a partially specified locale using the likely locales data.
     * The full spec of the locale is looked up first. If there is no entry
     * for it, progressively less specific combinations of its components are
     * tried, in this order: language-region, language-script, language,
     * script-region, region, script.<p>
     *
     * When one of the less specific combinations matches, only the components
     * that are missing from the given locale are taken from the matching entry.
     * Components the caller specified explicitly (including the variant) are
     * always preserved.
     *
     * @private
     * @param {Locale} locale the locale to complete
     * @returns {Locale} the completed locale, or the given locale itself if it
     * is already fully specified or nothing is known about it
     */
    _getLikelyLocale: function(locale) {
        // already fully specified, so there is nothing to fill in
        if (locale.language && locale.script && locale.region) return locale;

        // without likely locales data (eg. when the locale data could not be
        // loaded) nothing can be filled in, so return the locale unchanged
        var likelyLocales = this.info && this.info.likelyLocales;
        if (!likelyLocales) return locale;

        var exact = likelyLocales[locale.getSpec()];
        if (typeof(exact) !== 'undefined') return new Locale(exact);

        // join the components that are present into a "-" separated spec
        var joinParts = function(parts) {
            var present = [];
            for (var p = 0; p < parts.length; p++) {
                if (parts[p]) present.push(parts[p]);
            }
            return present.join("-");
        };

        var language = locale.language,
            script = locale.script,
            region = locale.region;

        // The lookup keys are assembled as strings rather than through the
        // positional Locale constructor, so they do not depend on the order
        // of that constructor's arguments or on any normalization it applies.
        var partialKeys = [
            joinParts([language, region]),
            joinParts([language, script]),
            joinParts([language]),
            joinParts([script, region]),
            joinParts([region]),
            joinParts([script])
        ];

        for (var i = 0; i < partialKeys.length; i++) {
            var key = partialKeys[i];
            // a key is empty when none of its components are present
            if (!key || typeof(likelyLocales[key]) === 'undefined') continue;

            var found = new Locale(likelyLocales[key]);
            // only fill in what is missing; never override what was given
            return new Locale(joinParts([
                language || found.language,
                script || found.script,
                region || found.region,
                locale.variant || found.variant
            ]));
        }

        return locale;
    },

    /**
     * Return a Locale instance that is fully specified based on partial information
     * given to the constructor of this locale matcher instance. For example, if the locale
     * spec given to this locale matcher instance is simply "ru" (for the Russian language),
     * then it will fill in the missing region and script tags and return a locale with
     * the specifier "ru-Cyrl-RU". (ie. Russian language, Cyrillic, Russian Federation).
     * Any one or two of the language, script, or region parts may be left unspecified,
     * and the other one or two parts will be filled in automatically. If this
     * class has no information about the given locale, then the locale of this
     * locale matcher instance is returned unchanged.
     *
     * @returns {Locale} the most likely completion of the partial locale given
     * to the constructor of this locale matcher instance
     */
    getLikelyLocale: function () {
        return this._getLikelyLocale(this.locale);
    },

    /**
     * Return a Locale instance that is specified based on partial information
     * given to the constructor of this locale matcher instance but which leaves out any
     * part of the locale specifier that is so common that it is understood. For example,
     * if the locale
     * spec given to this locale matcher instance is simply "ru" (for the Russian language),
     * then it will fill in the missing region and/or script tags and return a locale with
     * the specifier "ru-RU". (ie. Russian language, Russian Federation). Note that the
     * default script "Cyrl" is left out because the vast majority of text written in
     * Russian is written with the Cyrillic script, so that part of the locale is understood
     * and is commonly left out.<p>
     *
     * The script is never left out for languages that are listed as having
     * multiple scripts, because no script can be assumed for them.<p>
     *
     * Any one or two of the language, script, or region parts may be left unspecified,
     * and the other one or two parts will be filled in automatically. If this
     * class has no information about the given locale, then the locale of this
     * locale matcher instance is returned unchanged.<p>
     *
     * This method returns the same information as getLikelyLocale but with the very common
     * parts left out.
     *
     * @returns {Locale} the most likely "minimal" completion of the partial locale given
     * to the constructor of this locale matcher instance where the commonly understood
     * parts are left out.
     */
    getLikelyLocaleMinimal: function() {
        var fullLocale = this._getLikelyLocale(this.locale);
        var langLocale = this._getLikelyLocale(new Locale(fullLocale.language));

        // the script can only be dropped when it is the default one for the language
        if (fullLocale.script !== langLocale.script || multiScriptLanguages[fullLocale.language]) {
            return fullLocale;
        }

        // Build the result from a spec string so that the region is parsed into
        // the region component no matter how the positional arguments of the
        // Locale constructor are ordered.
        var parts = [];
        if (fullLocale.language) parts.push(fullLocale.language);
        if (fullLocale.region) parts.push(fullLocale.region);
        return new Locale(parts.length ? parts.join("-") : undefined);
    },

    /**
     * Return the degree that the given locale matches the current locale of this
     * matcher. This method returns an integer from 0 to 100. A value of 100 is
     * a 100% match, meaning that the two locales are exactly equivalent to each
     * other. (eg. "ja-JP" and "ja-JP") A value of 0 means that the two locales
     * have nothing in common. Note that components that are unspecified in both
     * locales count as matching, so locales such as "en-US" and "ja-JP", which
     * both lack a script and a variant, still receive a low non-zero score.<p>
     *
     * Locale matching is not the same as equivalence, as the degree of matching
     * is returned. (See Locale.equals for equivalence.)<p>
     *
     * The match score is calculated based on matching the 4 locale components,
     * weighted by importance:
     *
     * <ul>
     * <li> language - this accounts for 50% of the match score
     * <li> region - accounts for 25% of the match score
     * <li> script - accounts for 20% of the match score
     * <li> variant - accounts for 5% of the match score
     * </ul>
     *
     * The score is affected by the following things:
     *
     * <ul>
     * <li> A large language score is given when the language components of the locales
     * match exactly.
     * <li> Higher language scores are given when the languages are linguistically
     * close to each other, such as dialects.
     * <li> A small score is given when two languages are in the same
     * linguistic family, but one is not a dialect of the other, such as German
     * and Dutch.
     * <li> A large region score is given when two locales share the same region.
     * <li> A smaller region score is given when one region is contained within
     * another. For example, Hong Kong is part of China, so a moderate score is
     * given instead of a full score.
     * <li> A small score is given if two regions are geographically close to
     * each other or are tied by history. For example, Ireland and Great Britain
     * are both adjacent and tied by history, so they receive a moderate score.
     * <li> A high script score is given if the two locales share the same script.
     * The legibility of a common script means that there is some small kinship of the
     * different languages.
     * <li> A high variant score is given if the two locales share the same
     * variant. Full score is given when both locales have no variant at all.
     * <li> Locale components that are unspecified in both locales are given high
     * scores.
     * <li> Locales where a particular locale component is missing in only one
     * locale can still match when the default for that locale component matches
     * the component in the other locale. The
     * default value for the missing component is determined using the likely locales
     * data. (See getLikelyLocale()) For example, "en-US" and "en-Latn-US" receive
     * a high script score because the default script for "en" is "Latn".
     * </ul>
     *
     * Both locales are completed in the same way, so matching A against B
     * fills in missing components exactly like matching B against A does.<p>
     *
     * The intention of this method is that it can be used to determine
     * compatibility of locales. For example, when a user signs up for an
     * account on a web site, the locales that the web site supports and
     * the locale of the user's browser may differ, and the site needs to
     * pick the best locale to show the user. Let's say the
     * web site supports a selection of European languages such as "it-IT",
     * "fr-FR", "de-DE", and "en-GB". The user's
     * browser may be set to "it-CH". The web site code can then match "it-CH"
     * against each of the supported locales to find the one with the
     * highest score. In
     * this case, the best match would be "it-IT" because it shares a
     * language and script in common with "it-CH" and differs only in the region
     * component. It is not a 100% match, but it is pretty good. The web site
     * may decide if the match scores all fall
     * below a chosen threshold (perhaps 50%?), it should show the user the
     * default language "en-GB", because that is probably a better choice
     * than any other supported locale.<p>
     *
     * @param {string|Locale} locale the other locale to match against the current one
     * @return {number} an integer from 0 to 100 that indicates the degree to
     * which these locales match each other
     */
    match: function(locale) {
        var other = new Locale(locale);
        var info = this.info || {};
        var scores = [0, 0, 0, 0];
        var thisfull, otherfull, i;

        if (this.locale.language === other.language) {
            scores[0] = 100;
        } else if (!this.locale.language || !other.language) {
            // check for default language
            thisfull = this._getLikelyLocale(this.locale);
            otherfull = this._getLikelyLocale(other);
            if (thisfull.language === otherfull.language) {
                scores[0] = 100;
            }
        } else {
            // check for macro languages
            var macroLanguages = info.macroLanguagesReverse || {};
            var mlthis = macroLanguages[this.locale.language] || this.locale.language;
            var mlother = macroLanguages[other.language] || other.language;
            if (mlthis === mlother) {
                scores[0] = 90;
            } else {
                // check for mutual intelligibility
                var intelligibility = info.mutualIntelligibility || {};
                var pair = this.locale.language + "-" + other.language;
                scores[0] = intelligibility[pair] || 0;
            }
        }

        if (this.locale.script === other.script) {
            scores[1] = 100;
        } else if (!this.locale.script || !other.script) {
            // check for default script
            thisfull = this._getLikelyLocale(this.locale);
            otherfull = this._getLikelyLocale(other);
            if (thisfull.script === otherfull.script) {
                scores[1] = 100;
            }
        }

        if (this.locale.region === other.region) {
            scores[2] = 100;
        } else if (!this.locale.region || !other.region) {
            // check for default region
            thisfull = this._getLikelyLocale(this.locale);
            otherfull = this._getLikelyLocale(other);
            if (thisfull.region === otherfull.region) {
                scores[2] = 100;
            }
        } else {
            // check for containment
            var containedIn = info.territoryContainmentReverse || {};
            var contains = info.territoryContainment || {};
            var containers = containedIn[this.locale.region] || [];
            // end at 1 because 0 is "001" which is "the whole world" -- which is not useful
            for (i = containers.length-1; i > 0; i--) {
                var container = contains[containers[i]];
                if (container && container.indexOf(other.region) > -1) {
                    // same area only accounts for 20% of the region score
                    scores[2] = ((i+1) * 100 / containers.length) * 0.2;
                    break;
                }
            }
        }

        if (this.locale.variant === other.variant) {
            scores[3] = 100;
        }

        var total = 0;

        for (i = 0; i < 4; i++) {
            total += scores[i] * componentWeights[i];
        }

        return Math.round(total);
    },

    /**
     * Return the macrolanguage associated with this locale. If the
     * locale's language is not part of a macro-language, then the
     * locale's language is returned as-is.
     *
     * @returns {string} the ISO code for the macrolanguage associated
     * with this locale, or language of the locale
     */
    getMacroLanguage: function() {
        var macroLanguages = (this.info && this.info.macroLanguagesReverse) || {};
        return macroLanguages[this.locale.language] || this.locale.language;
    },

    /**
     * Return the containment array for the given region code.
     * @private
     * @param {string|undefined} region the region code to look up
     * @returns {Array.<string>} the regions that contain the given region, or
     * an empty array if nothing is known about the region
     */
    _getRegionContainment: function(region) {
        var containedIn = (this.info && this.info.territoryContainmentReverse) || {};
        return containedIn[region] || [];
    },

    /**
     * Return the list of regions that this locale is contained within. Regions are
     * nested, so locales can be in multiple regions. (eg. US is in Northern North
     * America, North America, the Americas, the World.) Most regions are specified
     * using UN.49 region numbers, though some, like "EU", are letters. If the
     * locale is underspecified, this method will use the most likely locale method
     * to get the region first. For example, the locale "ja" (Japanese) is most
     * likely "ja-JP" (Japanese for Japan), and the region containment info for Japan
     * is returned.
     *
     * @returns {Array.<string>} an array of region specifiers that this locale is within
     */
    getRegionContainment: function() {
        var region = this.locale.region || this.getLikelyLocale().region;
        return this._getRegionContainment(region);
    },

    /**
     * Find the smallest region that contains both the current locale and the other locale.
     * If the current or other locales are underspecified, this method will use the most
     * likely locale method
     * to get their regions first. For example, the locale "ja" (Japanese) is most
     * likely "ja-JP" (Japanese for Japan), and the region containment info for Japan
     * is checked against the other locale's region containment info. A region that
     * is given explicitly in either locale is always used as-is.
     *
     * @param {string|Locale} otherLocale a locale specifier or a Locale instance to
     * compare against
     * @returns {string} the region specifier of the smallest region containing both the
     * current locale and other locale. This is "001" (the whole world) when no
     * other locale is given or when no smaller common region is found.
     */
    smallestCommonRegion: function(otherLocale) {
        if (typeof(otherLocale) === "undefined" || otherLocale === null) return "001";

        var thisRegion = this.locale.region || this.getLikelyLocale().region;
        var otherLoc = typeof(otherLocale) === "string" ? new Locale(otherLocale) : otherLocale;
        var otherRegion = otherLoc.region || this._getLikelyLocale(otherLoc).region;

        var thisRegions = this._getRegionContainment(thisRegion);
        var otherRegions = this._getRegionContainment(otherRegion);

        // region containment arrays are arranged from largest to smallest, so start
        // at the end of the array. Index 0 is skipped because it is the whole
        // world, which is what is returned below anyway.
        for (var i = thisRegions.length-1; i > 0; i--) {
            if (otherRegions.indexOf(thisRegions[i]) > -1) {
                return thisRegions[i];
            }
        }

        // nothing smaller than the whole world is common to both regions
        return "001";
    }
};

module.exports = LocaleMatcher;