1 /*
  2  * Charset.js - Return information about a particular character set
  3  *
  4  * Copyright © 2014-2015, 2018, JEDLSoft
  5  *
  6  * Licensed under the Apache License, Version 2.0 (the "License");
  7  * you may not use this file except in compliance with the License.
  8  * You may obtain a copy of the License at
  9  *
 10  *     http://www.apache.org/licenses/LICENSE-2.0
 11  *
 12  * Unless required by applicable law or agreed to in writing, software
 13  * distributed under the License is distributed on an "AS IS" BASIS,
 14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15  *
 16  * See the License for the specific language governing permissions and
 17  * limitations under the License.
 18  */
 19 
 20 // !data charset charsetaliases charset/ISO-8859-1 charset/ISO-8859-15 charset/UTF-8
 21 
 22 var ilib = require("./ilib.js");
 23 var Utils = require("./Utils.js");
 24 
 25 /**
 26  * @class
 27  * Create a new character set info instance. Charset instances give information about
 28  * a particular character set, such as whether or not it is single byte or multibyte,
 29  * and which languages commonly use that charset.<p>
 30  *
 31  * The optional options object holds extra parameters if they are necessary. The
 32  * current list of supported options are:
 33  *
 34  * <ul>
 35  * <li><i>name</i> - the name of the charset. This can be given as any commonly
 36  * used name for the character set, which is normalized to a standard IANA name
 37  * before its info is loaded. If a name is not given,
 38  * this class will return information about the base character set of Javascript,
 39  * which is currently Unicode as encoded in UTF-16.
 40  *
 41  * <li><i>onLoad</i> - a callback function to call when this object is fully
 42  * loaded. When the onLoad option is given, this class will attempt to
 43  * load any missing data using the ilib loader callback.
 44  * When the constructor is done (even if the data is already preassembled), the
 45  * onLoad function is called with the current instance as a parameter, so this
 46  * callback can be used with preassembled or dynamic loading or a mix of the two.
 47  *
 48  * <li><i>sync</i> - tell whether to load any missing data synchronously or
 49  * asynchronously. If this option is given as "false", then the "onLoad"
 50  * callback must be given, because the instance returned from this constructor will
 51  * not be usable for a while.
 52  *
 53  * <li><i>loadParams</i> - an object containing parameters to pass to the
 54  * loader callback function when data is missing. The parameters are not
 55  * interpretted or modified in any way. They are simply passed along. The object
 56  * may contain any property/value pairs as long as the calling code is in
 57  * agreement with the loader callback function as to what those parameters mean.
 58  * </ul>
 59  *
 60  * If this copy of ilib is pre-assembled and all the data is already available,
 61  * or if the data was already previously loaded, then this constructor will call
 62  * the onLoad callback immediately when the initialization is done.
 63  * If the onLoad option is not given, this class will only attempt to load any
 64  * missing data synchronously.
 65  *
 66  * @constructor
 67  * @see {ilib.setLoaderCallback} for information about registering a loader callback instance
 68  * @param {Object=} options options which govern the construction of this instance
 69  */
 70 var Charset = function(options) {
 71     var sync = true,
 72         loadParams = undefined;
 73     this.originalName = "UTF-8";
 74 
 75     if (options) {
 76         if (typeof(options.name) !== 'undefined') {
 77             this.originalName = options.name;
 78         }
 79 
 80         if (typeof(options.sync) !== 'undefined') {
 81             sync = !!options.sync;
 82         }
 83 
 84         if (typeof(options.loadParams) !== 'undefined') {
 85             loadParams = options.loadParams;
 86         }
 87     }
 88 
 89     // default data. A majority of charsets use this info
 90     this.info = {
 91         description: "default",
 92         min: 1,
 93         max: 1,
 94         bigendian: true,
 95         scripts: ["Latn"],
 96         locales: ["*"]
 97     };
 98 
 99     Utils.loadData({
100         object: "Charset",
101         locale: "-",
102         nonlocale: true,
103         name: "charsetaliases.json",
104         sync: sync,
105         loadParams: loadParams,
106         callback: ilib.bind(this, function (info) {
107             // first map the given original name to one of the standardized IANA names
108             if (info) {
109                 // recognize better by getting rid of extraneous crap and upper-casing
110                 // it so that the match is case-insensitive
111                 var n = this.originalName.replace(/[-_,:\+\.\(\)]/g, '').toUpperCase();
112                 this.name = info[n];
113             }
114             if (!this.name) {
115                 this.name = this.originalName;
116             }
117             Utils.loadData({
118                 object: "Charset",
119                 locale: "-",
120                 nonlocale: true,
121                 name: "charset/" + this.name + ".json",
122                 sync: sync,
123                 loadParams: loadParams,
124                 callback: ilib.bind(this, function (info) {
125                     if (info) {
126                         ilib.extend(this.info, info);
127                     }
128                     if (options && typeof(options.onLoad) === 'function') {
129                         options.onLoad(this);
130                     }
131                 })
132             });
133         })
134     });
135 };
136 
Charset.prototype = {
    /**
     * Give the normalized, standard name of this charset. Standard names are
     * the ones listed in the IANA registry of character set names at
     * <a href="http://www.iana.org/assignments/character-sets/character-sets.xhtml">http://www.iana.org/assignments/character-sets/character-sets.xhtml</a>.
     * When the name given to the constructor is not a known alias, that name
     * is returned as it was given.
     *
     * @returns {string} the name of the charset
     */
    getName: function () {
        var standardName = this.name;
        return standardName;
    },

    /**
     * Give the name exactly as it was passed to the constructor, before it
     * was normalized into the standard name that {@link #getName} returns.
     *
     * @returns {string} the original name that this instance was constructed with
     */
    getOriginalName: function() {
        var givenName = this.originalName;
        return givenName;
    },

    /**
     * Give a short description of the character set. The normalized name of
     * the charset is used instead when the charset has no description.
     *
     * @returns {string} a description of the character set
     */
    getDescription: function() {
        var text = this.info.description;
        if (text) {
            return text;
        }
        return this.getName();
    },

    /**
     * Give the smallest number of bytes that one character can occupy in this
     * charset. This is 1 for most charsets, but it may be 2 or more for
     * charsets such as Unicode encoded in UTF-16.
     *
     * @returns {number} the smallest number of bytes that a single character in
     * this charset uses
     */
    getMinCharWidth: function () {
        var details = this.info;
        return details.min;
    },

    /**
     * Give the largest number of bytes that one character can occupy in this
     * charset.
     *
     * @returns {number} the largest number of bytes that a single character in
     * this charset uses
     */
    getMaxCharWidth: function () {
        var details = this.info;
        return details.max;
    },

    /**
     * Tell whether this charset is multibyte, meaning that its characters do
     * not all have the same width: one character may take 1 byte while another
     * takes 2 or 3. The charset counts as multibyte when its maximum character
     * width is greater than its minimum character width.
     *
     * @returns {boolean} true if this is a multibyte charset, or false otherwise
     */
    isMultibyte: function() {
        var widest = this.getMaxCharWidth(),
            narrowest = this.getMinCharWidth();
        return widest > narrowest;
    },

    /**
     * Tell whether characters that are wider than 1 byte are stored in big
     * endian order rather than little endian order.
     *
     * @returns {boolean} true if this character set uses big endian order, or false
     * otherwise
     */
    isBigEndian: function() {
        var details = this.info;
        return details.bigendian;
    },

    /**
     * Give the ISO script codes of the scripts whose characters can be
     * encoded with this character set. The array that is returned is the one
     * held by this instance, not a copy of it.
     *
     * @returns {Array.<string>} an array of ISO script codes supported by this charset
     */
    getScripts: function() {
        var details = this.info;
        return details.scripts;
    }
};
222 
223 module.exports = Charset;