/*
 * Charmap.js - A character set mapping class
 *
 * Copyright © 2014-2015, 2018, JEDLSoft
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// !data charmaps charset/US-ASCII charset/ISO-10646-UCS-2 charset/ISO-8859-1 charset/ISO-8859-15 charmaps/ISO-8859-15 charmaps/ISO-8859-1 charset/ISO-8859-1

var ilib = require("./ilib.js");
var JSUtils = require("./JSUtils.js");
var IString = require("./IString.js");

/**
 * @class
 * Create a new default character set mapping instance. This class is the parent
 * class of all of the charmapping subclasses, and only implements basic US-ASCII
 * mapping. The subclasses implement all other charsets, some algorithmically, and
 * some in a table-based way. Use {@link CharmapFactory} to create the correct
 * subclass instance for the desired charmap.<p>
 *
 * All mappings are done to or from Unicode in the UTF-16 encoding, which is the base
 * character set and encoding used by Javascript itself. In order to convert
 * between two non-Unicode character sets, you must chain two charmap instances together
 * to first map to Unicode and then back to the second charset. <p>
 *
 * The options parameter controls which mapping is constructed and its behaviours. The
 * current list of supported options are:
 *
 * <ul>
 * <li><i>missing</i> - specify what to do if a mapping is missing for a particular
 * character. For example, if you are mapping Unicode characters to a particular native
 * character set that does not support particular Unicode characters, the mapper will
 * follow the behaviour specified in this property. Valid values are:
 * <ul>
 * <li><i>skip</i> - skip any characters that do not exist in the target charset
 * <li><i>placeholder</i> - put a static placeholder character in the output string
 * wherever there is an unknown character in the input string. Use the <i>placeholder</i>
 * parameter to specify which character to use in this case
 * <li><i>escape</i> - use an escape sequence to represent the unknown character
 * </ul>
 * The default value for the missing property if not otherwise specified is
 * "placeholder". Any other value is ignored and the default is kept.
 *
 * <li><i>placeholder</i> - specify the placeholder character to use when the
 * mapper cannot map a particular input character to the output string. If this
 * option is not specified, then the '?' (question mark) character is used.
 *
 * <li><i>escapeStyle</i> - what style of escape sequences should be used to
 * escape unknown characters in the input when mapping to native, and what
 * style of escape sequences should be parsed when mapping to Unicode. Valid
 * values are:
 * <ul>
 * <li><i>html</i> - Escape the characters as HTML hexadecimal numeric
 * character references. Eg. an "e" with an acute accent would be
 * "&#x00E9;"
 * <li><i>js</i> - Use the Javascript escape style. Eg. an "e" with an acute
 * accent would be "\u00E9". This can also be specified as "c#" as
 * it uses a similar escape syntax.
 * <li><i>c</i> - Use the C/C++ escape style, which is similar to the
 * Javascript style, but uses an "x" in place of the "u". Eg. an "e" with an
 * acute accent would be "\x00E9". This can also be specified as "c++".
 * <li><i>java</i> - Use the Java escape style. This is very similar to
 * the Javascript style, but the backslash has to be escaped twice. Eg. an
 * "e" with an acute accent would be "\\u00E9". This can also be specified
 * as "ruby", as Ruby uses a similar escape syntax with double backslashes.
 * <li><i>perl</i> - Use the Perl escape style. Eg. an "e" with an acute
 * accent would be "\N{U+00E9}"
 * </ul>
 * The default if this style is not specified (or not recognized) is "js"
 * for Javascript.
 *
 * <li><i>noinstance</i> - if truthy, the constructor returns immediately
 * without initializing any instance properties. (Presumably intended for
 * subclasses that only need a prototype object to inherit from.)
 * </ul>
 *
 * If this copy of ilib is pre-assembled and all the data is already available,
 * or if the data was already previously loaded, then this constructor will call
 * the onLoad callback immediately when the initialization is done.
 * If the onLoad option is not given, this class will only attempt to load any
 * missing data synchronously.
 * NOTE(review): this base constructor does not itself read an onLoad option;
 * the paragraph above presumably describes the subclasses and the factory.
 *
 * @constructor
 * @param {Object=} options options which govern the construction of this instance
 */
var Charmap = function(options) {
    if (options && options.noinstance) {
        return;
    }

    this.missing = "placeholder";
    this.placeholder = "?";
    this.escapeStyle = "js";
    // Only recomputed when _calcExpansionFactor() is called explicitly.
    this.expansionFactor = 1;

    if (options) {
        if (typeof(options.placeholder) !== 'undefined') {
            this.placeholder = options.placeholder;
        }

        // maps every accepted style name (including aliases) to its canonical name
        var escapes = {
            "html": "html",
            "js": "js",
            "c#": "js",
            "c": "c",
            "c++": "c",
            "java": "java",
            "ruby": "java",
            "perl": "perl"
        };

        if (typeof(options.escapeStyle) !== 'undefined') {
            if (typeof(escapes[options.escapeStyle]) !== 'undefined') {
                this.escapeStyle = escapes[options.escapeStyle];
            }
        }

        if (typeof(options.missing) !== 'undefined') {
            if (options.missing === "skip" || options.missing === "placeholder" || options.missing === "escape") {
                this.missing = options.missing;
            }
        }
    }
};

/**
 * A place for the algorithmic conversions to register themselves as
 * they are defined.
 *
 * @static
 * @private
 */
Charmap._algorithms = {};

Charmap.prototype = {
    /**
     * Return the standard name of this charmap. All charmaps map from
     * Unicode to the native charset, so the name returned from this
     * function corresponds to the native charset.
     *
     * @returns {string} the name reported by this instance's charset
     */
    getName: function () {
        return this.charset.getName();
    },

    /**
     * Write a single value, or each element of an array of values, into
     * the given array beginning at the given index.
     *
     * @private
     * @param {Array.<number>|Uint8Array} array the array to write into
     * @param {number} start index in the array of the first element to write
     * @param {Array.<number>|number} value the value or values to write
     * @returns {number} the number of elements written
     */
    writeNative: function (array, start, value) {
        // console.log("Charmap.writeNative: start " + start + " adding " + JSON.stringify(value));
        if (ilib.isArray(value)) {
            for (var i = 0; i < value.length; i++) {
                array[start+i] = value[i];
            }

            return value.length;
        } else {
            array[start] = value;
            return 1;
        }
    },

    /**
     * Write the UTF-16 code unit of each character of the given string
     * into the given array beginning at the given index.
     *
     * @private
     * @param {Array.<number>|Uint8Array} array the array to write into
     * @param {number} start index in the array of the first element to write
     * @param {string} string the string to write
     * @returns {number} the length of the string
     */
    writeNativeString: function (array, start, string) {
        // console.log("Charmap.writeNativeString: start " + start + " adding " + JSON.stringify(string));
        for (var i = 0; i < string.length; i++) {
            array[start+i] = string.charCodeAt(i);
        }
        return string.length;
    },

    /**
     * Calculate the maximum number of output elements that a single input
     * character can produce, given the charset's maximum character width and
     * the current settings for missing characters, and store the result in
     * this.expansionFactor. The escape sizes here must stay in sync with
     * the sequences generated by dealWithMissingChar.
     *
     * @private
     */
    _calcExpansionFactor: function() {
        var factor = 1;
        factor = Math.max(factor, this.charset.getMaxCharWidth());
        switch (this.missing) {
            case "placeholder":
                if (this.placeholder) {
                    factor = Math.max(factor, this.placeholder.length);
                }
                break;
            case "escape":
                switch (this.escapeStyle) {
                    case "html":
                        factor = Math.max(factor, 8); // &#xHHHH;
                        break;
                    case "c":
                        factor = Math.max(factor, 6); // \xHHHH
                        break;
                    case "java":
                        // two backslashes are emitted for this style
                        factor = Math.max(factor, 7); // \\uHHHH
                        break;
                    case "perl":
                        factor = Math.max(factor, 10); // \N{U+HHHH}
                        break;

                    default:
                        factor = Math.max(factor, 6); // \uHHHH
                        break;
                }
                break;
            default:
                break;
        }

        this.expansionFactor = factor;
    },

    /**
     * Produce the replacement text for a character that cannot be mapped,
     * according to the "missing", "placeholder" and "escapeStyle" settings
     * of this instance.
     *
     * @private
     * @param {string|number} c the unmappable character, either as a string
     * (only its first code unit is used) or as a numeric code
     * @returns {string} the empty string when skipping, the placeholder, or
     * an escape sequence containing the code as at least 4 upper-case
     * hexadecimal digits
     */
    dealWithMissingChar: function(c) {
        var seq = "";

        switch (this.missing) {
            case "skip":
                // do nothing
                break;

            case "escape":
                var num = (typeof(c) === 'string') ? c.charCodeAt(0) : c;
                var bigc = JSUtils.pad(num.toString(16), 4).toUpperCase();
                switch (this.escapeStyle) {
                    case "html":
                        seq = "&#x" + bigc + ";";
                        break;
                    case "c":
                        seq = "\\x" + bigc;
                        break;
                    case "java":
                        seq = "\\\\u" + bigc;
                        break;
                    case "perl":
                        seq = "\\N{U+" + bigc + "}";
                        break;

                    default:
                    case "js":
                        seq = "\\u" + bigc;
                        break;
                }
                break;

            default:
            case "placeholder":
                seq = this.placeholder;
                break;
        }

        return seq;
    },

    /**
     * Map a string to the native character set. This string may be
     * given as an intrinsic Javascript string object or an IString
     * object.<p>
     *
     * If this instance has an "algorithm" property, the work is delegated
     * to it. Otherwise the string is mapped as plain US-ASCII: code points
     * 0 through 127 are copied as-is and every other code point is replaced
     * with the result of dealWithMissingChar.<p>
     *
     * The returned array is allocated with a length of the string length
     * times this.expansionFactor, so it may end with unused zero bytes, and
     * output that does not fit in that length is cut off.
     *
     * @param {string|IString} string string to map to a different
     * character set.
     * @return {Uint8Array} An array of bytes representing the string
     * in the native character set
     */
    mapToNative: function(string) {
        if (!string) {
            return new Uint8Array(0);
        }

        if (this.algorithm) {
            return this.algorithm.mapToNative(string);
        }

        // the default algorithm is plain old ASCII
        var str = (string instanceof IString) ? string : new IString(string);

        // use IString's iterator so that we take care of walking through
        // the code points correctly, including the surrogate pairs
        var c, i = 0, it = str.iterator();
        var ret = new Uint8Array(str.length * this.expansionFactor);

        while (it.hasNext() && i < ret.length) {
            c = it.next();
            // 0x7F (DEL) is still US-ASCII, matching the test in mapToUnicode
            if (c < 128) {
                ret[i++] = c;
            } else {
                i += this.writeNativeString(ret, i, this.dealWithMissingChar(c));
            }
        }

        return ret;
    },

    /**
     * Map a native string to the standard Javascript charset of UTF-16.
     * This string may be given as an array of numbers where each number
     * represents a code point in the "from" charset, or as a Uint8Array
     * array of bytes representing the bytes of the string in order.<p>
     *
     * This default implementation treats the input as plain US-ASCII:
     * values below 128 are converted directly and every other value is
     * replaced with the result of dealWithMissingChar.
     *
     * @param {Array.<number>|Uint8Array} bytes bytes to map to
     * a Unicode string
     * @return {string} A string in the standard Javascript charset UTF-16
     */
    mapToUnicode: function(bytes) {
        var ret = "";
        var c, i = 0;

        while (i < bytes.length) {
            c = bytes[i];

            // the default algorithm is plain old ASCII
            if (c < 128) {
                ret += String.fromCharCode(c);
            } else {
                // The byte at "i" wasn't ASCII
                ret += this.dealWithMissingChar(c);
            }

            // advance exactly once per input element, whichever branch ran
            i++;
        }

        return ret;
    }
};

module.exports = Charmap;