1 /* 2 * Charset.js - Return information about a particular character set 3 * 4 * Copyright © 2014-2015, 2018, JEDLSoft 5 * 6 * Licensed under the Apache License, Version 2.0 (the "License"); 7 * you may not use this file except in compliance with the License. 8 * You may obtain a copy of the License at 9 * 10 * http://www.apache.org/licenses/LICENSE-2.0 11 * 12 * Unless required by applicable law or agreed to in writing, software 13 * distributed under the License is distributed on an "AS IS" BASIS, 14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 * 16 * See the License for the specific language governing permissions and 17 * limitations under the License. 18 */ 19 20 // !data charset charsetaliases charset/ISO-8859-1 charset/ISO-8859-15 charset/UTF-8 21 22 var ilib = require("./ilib.js"); 23 var Utils = require("./Utils.js"); 24 25 /** 26 * @class 27 * Create a new character set info instance. Charset instances give information about 28 * a particular character set, such as whether or not it is single byte or multibyte, 29 * and which languages commonly use that charset.<p> 30 * 31 * The optional options object holds extra parameters if they are necessary. The 32 * current list of supported options are: 33 * 34 * <ul> 35 * <li><i>name</i> - the name of the charset. This can be given as any commonly 36 * used name for the character set, which is normalized to a standard IANA name 37 * before its info is loaded. If a name is not given, 38 * this class will return information about the base character set of Javascript, 39 * which is currently Unicode as encoded in UTF-16. 40 * 41 * <li><i>onLoad</i> - a callback function to call when this object is fully 42 * loaded. When the onLoad option is given, this class will attempt to 43 * load any missing data using the ilib loader callback. 44 * When the constructor is done (even if the data is already preassembled), the 45 * onLoad function is called with the current instance as a parameter, so this 46 * callback can be used with preassembled or dynamic loading or a mix of the two. 47 * 48 * <li><i>sync</i> - tell whether to load any missing data synchronously or 49 * asynchronously. If this option is given as "false", then the "onLoad" 50 * callback must be given, because the instance returned from this constructor will 51 * not be usable for a while. 52 * 53 * <li><i>loadParams</i> - an object containing parameters to pass to the 54 * loader callback function when data is missing. The parameters are not 55 * interpretted or modified in any way. They are simply passed along. The object 56 * may contain any property/value pairs as long as the calling code is in 57 * agreement with the loader callback function as to what those parameters mean. 58 * </ul> 59 * 60 * If this copy of ilib is pre-assembled and all the data is already available, 61 * or if the data was already previously loaded, then this constructor will call 62 * the onLoad callback immediately when the initialization is done. 63 * If the onLoad option is not given, this class will only attempt to load any 64 * missing data synchronously. 65 * 66 * @constructor 67 * @see {ilib.setLoaderCallback} for information about registering a loader callback instance 68 * @param {Object=} options options which govern the construction of this instance 69 */ 70 var Charset = function(options) { 71 var sync = true, 72 loadParams = undefined; 73 this.originalName = "UTF-8"; 74 75 if (options) { 76 if (typeof(options.name) !== 'undefined') { 77 this.originalName = options.name; 78 } 79 80 if (typeof(options.sync) !== 'undefined') { 81 sync = !!options.sync; 82 } 83 84 if (typeof(options.loadParams) !== 'undefined') { 85 loadParams = options.loadParams; 86 } 87 } 88 89 // default data. A majority of charsets use this info 90 this.info = { 91 description: "default", 92 min: 1, 93 max: 1, 94 bigendian: true, 95 scripts: ["Latn"], 96 locales: ["*"] 97 }; 98 99 Utils.loadData({ 100 object: "Charset", 101 locale: "-", 102 nonlocale: true, 103 name: "charsetaliases.json", 104 sync: sync, 105 loadParams: loadParams, 106 callback: ilib.bind(this, function (info) { 107 // first map the given original name to one of the standardized IANA names 108 if (info) { 109 // recognize better by getting rid of extraneous crap and upper-casing 110 // it so that the match is case-insensitive 111 var n = this.originalName.replace(/[-_,:\+\.\(\)]/g, '').toUpperCase(); 112 this.name = info[n]; 113 } 114 if (!this.name) { 115 this.name = this.originalName; 116 } 117 Utils.loadData({ 118 object: "Charset", 119 locale: "-", 120 nonlocale: true, 121 name: "charset/" + this.name + ".json", 122 sync: sync, 123 loadParams: loadParams, 124 callback: ilib.bind(this, function (info) { 125 if (info) { 126 ilib.extend(this.info, info); 127 } 128 if (options && typeof(options.onLoad) === 'function') { 129 options.onLoad(this); 130 } 131 }) 132 }); 133 }) 134 }); 135 }; 136 137 Charset.prototype = { 138 /** 139 * Return the standard normalized name of this charset. The list of standard names 140 * comes from the IANA registry of character set names at 141 * <a href="http://www.iana.org/assignments/character-sets/character-sets.xhtml">http://www.iana.org/assignments/character-sets/character-sets.xhtml</a>. 142 * 143 * @returns {string} the name of the charset 144 */ 145 getName: function () { 146 return this.name; 147 }, 148 149 /** 150 * Return the original name that this instance was constructed with before it was 151 * normalized to the standard name returned by {@link #getName}. 152 * 153 * @returns {string} the original name that this instance was constructed with 154 */ 155 getOriginalName: function() { 156 return this.originalName; 157 }, 158 159 /** 160 * Return a short description of the character set. 161 * 162 * @returns {string} a description of the character set 163 */ 164 getDescription: function() { 165 return this.info.description || this.getName(); 166 }, 167 168 /** 169 * Return the smallest number of bytes that a single character in this charset 170 * could use. For most charsets, this is 1, but for some charsets such as Unicode 171 * encoded in UTF-16, this may be 2 or more. 172 * @returns {number} the smallest number of bytes that a single character in 173 * this charset uses 174 */ 175 getMinCharWidth: function () { 176 return this.info.min; 177 }, 178 179 /** 180 * Return the largest number of bytes that a single character in this charset 181 * could use. 182 * @returns {number} the largest number of bytes that a single character in 183 * this charset uses 184 */ 185 getMaxCharWidth: function () { 186 return this.info.max; 187 }, 188 189 /** 190 * Return true if this is a multibyte character set, or false for a fixed 191 * width character set. A multibyte character set is one in which the characters 192 * have a variable width. That is, one character may use 1 byte and a different 193 * character might use 2 or 3 bytes. 194 * 195 * @returns {boolean} true if this is a multibyte charset, or false otherwise 196 */ 197 isMultibyte: function() { 198 return this.getMaxCharWidth() > this.getMinCharWidth(); 199 }, 200 201 /** 202 * Return whether or not characters larger than 1 byte use the big endian order 203 * or little endian. 204 * 205 * @returns {boolean} true if this character set uses big endian order, or false 206 * otherwise 207 */ 208 isBigEndian: function() { 209 return this.info.bigendian; 210 }, 211 212 /** 213 * Return an array of ISO script codes whose characters can be encoded with this 214 * character set. 215 * 216 * @returns {Array.<string>} an array of ISO script codes supported by this charset 217 */ 218 getScripts: function() { 219 return this.info.scripts; 220 } 221 }; 222 223 module.exports = Charset;