/*
 * CharmapTable.js - A character set mapping class that maps using a trie table
 *
 * Copyright © 2014-2015, 2018, JEDLSoft
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// !data charmaps/ISO-8859-15 charset/ISO-8859-15

var ilib = require("./ilib.js");
var Utils = require("./Utils.js");
var Charset = require("./Charset.js");
var Charmap = require("./Charmap.js");
var IString = require("./IString.js");

/**
 * @class
 * Create a new character set mapping instance based on a trie table. Charmap
 * instances map strings to
 * other character sets. The charsets can be of any type, single-byte, multi-byte,
 * shifting, etc. <p>
 *
 * All mappings are done to or from Unicode in the UTF-16 encoding, which is the base
 * character set and encoding used by Javascript itself. In order to convert
 * between two non-Unicode character sets, you must chain two charmap instances together
 * to first map to Unicode and then back to the second charset. <p>
 *
 * The options parameter controls which mapping is constructed and its behaviours. The
 * current list of supported options is:
 *
 * <ul>
 * <li><i>charset</i> - the name of the native charset to map to or from. This can be
 * given as an {@link Charset} instance or as a string that contains any commonly used name
 * for the character set, which is normalized to a standard IANA name.
 * If a name is not given, this class will default to the Western European character
 * set called ISO-8859-15.
 *
 * <li><i>missing</i> - specify what to do if a mapping is missing for a particular
 * character. For example, if you are mapping Unicode characters to a particular native
 * character set that does not support particular Unicode characters, the mapper will
 * follow the behaviour specified in this property. Valid values are:
 * <ul>
 * <li><i>skip</i> - skip any characters that do not exist in the target charset
 * <li><i>placeholder</i> - put a static placeholder character in the output string
 * wherever there is an unknown character in the input string. Use the <i>placeholder</i>
 * parameter to specify which character to use in this case
 * <li><i>escape</i> - use an escape sequence to represent the unknown character
 * </ul>
 * The default value for the missing property if not otherwise specified is "escape"
 * so that information is not lost.
 *
 * <li><i>placeholder</i> - specify the placeholder character to use when the
 * mapper cannot map a particular input character to the output string. If this
 * option is not specified, then the '?' (question mark) character is used where
 * possible.
 *
 * <li><i>escapeStyle</i> - what style of escape sequences should be used to
 * escape unknown characters in the input when mapping to native, and what
 * style of escape sequences should be parsed when mapping to Unicode. Valid
 * values are:
 * <ul>
 * <li><i>html</i> - Escape the characters as HTML entities. This would use
 * the standard HTML 5.0 (or later) entity names where possible, and numeric
 * entities in all other cases. Eg. an "e" with an acute accent would be
 * "&eacute;"
 * <li><i>js</i> - Use the Javascript escape style. Eg. an "e" with an acute
 * accent would be "\u00E9". This can also be specified as "c#" as
 * it uses a similar escape syntax.
 * <li><i>c</i> - Use the C/C++ escape style, which is similar to the
 * Javascript style, but uses an "x" in place of the "u". Eg. an "e" with an
 * acute accent would be "\x00E9". This can also be specified as "c++".
 * <li><i>java</i> - Use the Java escape style. This is very similar to
 * the Javascript style, but the backslash has to be escaped twice. Eg. an
 * "e" with an acute accent would be "\\u00E9". This can also be specified
 * as "ruby", as Ruby uses a similar escape syntax with double backslashes.
 * <li><i>perl</i> - Use the Perl escape style. Eg. an "e" with an acute
 * accent would be "\N{U+00E9}"
 * </ul>
 * The default if this style is not specified is "js" for Javascript.
 *
 * <li><i>onLoad</i> - a callback function to call when this object is fully
 * loaded. When the onLoad option is given, this class will attempt to
 * load any missing data using the ilib loader callback.
 * When the constructor is done (even if the data is already preassembled), the
 * onLoad function is called with the current instance as a parameter, so this
 * callback can be used with preassembled or dynamic loading or a mix of the two.
 *
 * <li><i>sync</i> - tell whether to load any missing data synchronously or
 * asynchronously. If this option is given as "false", then the "onLoad"
 * callback must be given, because the instance returned from this constructor will
 * not be usable for a while.
 *
 * <li><i>loadParams</i> - an object containing parameters to pass to the
 * loader callback function when data is missing. The parameters are not
 * interpreted or modified in any way. They are simply passed along. The object
 * may contain any property/value pairs as long as the calling code is in
 * agreement with the loader callback function as to what those parameters mean.
110 * </ul> 111 * 112 * If this copy of ilib is pre-assembled and all the data is already available, 113 * or if the data was already previously loaded, then this constructor will call 114 * the onLoad callback immediately when the initialization is done. 115 * If the onLoad option is not given, this class will only attempt to load any 116 * missing data synchronously. 117 * 118 * @constructor 119 * @see {ilib.setLoaderCallback} for information about registering a loader callback instance 120 * @extends Charmap 121 * @param {Object=} options options which govern the construction of this instance 122 */ 123 var CharmapTable = function(options) { 124 var sync = true; 125 126 // console.log("CharmapTable: constructor with options: " + JSON.stringify(options)); 127 128 this.parent.call(this, options); 129 this.charsetName = "ISO-8859-15"; 130 131 if (options) { 132 if (typeof(options.charset) === "object") { 133 this.charset = options.charset; 134 this.charsetName = this.charset.getName(); 135 } else if (typeof(options.name) !== 'undefined') { 136 this.charsetName = options.name; 137 } 138 } else { 139 options = {sync: true}; 140 } 141 142 if (!this.charset) { 143 new Charset({ 144 name: this.charsetName, 145 sync: sync, 146 loadParams: options.loadParams, 147 onLoad: ilib.bind(this, function(cs) { 148 this.charset = cs; 149 this._init(options); 150 }) 151 }); 152 } else { 153 this._init(options); 154 } 155 }; 156 157 CharmapTable.prototype = new Charmap({noinstance: true}); 158 CharmapTable.prototype.parent = Charmap; 159 CharmapTable.prototype.constructor = CharmapTable; 160 161 /** 162 * Initialize the table charmap object 163 * @private 164 */ 165 CharmapTable.prototype._init = function(options) { 166 this._calcExpansionFactor(); 167 168 Utils.loadData({ 169 object: "Charmap", 170 locale: "-", 171 nonlocale: true, 172 name: "charmaps/" + this.charset.getName() + ".json", 173 sync: options.sync, 174 loadParams: options.loadParams, 175 callback: ilib.bind(this, 
function (mapping) { 176 var ret = this; 177 if (!mapping) { 178 if (options.sync) { 179 throw "No mapping found for " + this.charset.getName(); 180 } else { 181 ret = undefined; 182 } 183 } 184 185 /** @type {{from:Object,to:Object}} */ 186 this.map = mapping; 187 if (typeof(options.onLoad) === 'function') { 188 options.onLoad(ret); 189 } 190 }) 191 }); 192 }; 193 194 /** 195 * Walk a trie to find the value for the current position in the given array. 196 * @private 197 */ 198 CharmapTable.prototype._trieWalk = function(trie, array, start) { 199 function isValue(node) { 200 return (typeof(node) === 'string' || typeof(node) === 'number' || 201 (typeof(node) === 'object' && ilib.isArray(node))); 202 } 203 204 var lastLeaf = undefined, 205 i = start, 206 trienode = trie; 207 208 while (i < array.length) { 209 if (typeof(trienode.__leaf) !== 'undefined') { 210 lastLeaf = { 211 consumed: i - start + 1, 212 value: trienode.__leaf 213 }; 214 } 215 if (array[i] === 0) { 216 // null-terminator, so end the mapping. 217 return { 218 consumed: 1, 219 value: 0 220 }; 221 } else if (typeof(trienode[array[i]]) !== 'undefined') { 222 // we have a mapping 223 if (isValue(trienode[array[i]])) { 224 // it is a leaf node 225 return { 226 consumed: i - start + 1, 227 value: trienode[array[i]] 228 }; 229 } else { 230 // it is an intermediate node 231 trienode = trienode[array[i++]]; 232 } 233 } else { 234 // no mapping for this array element, so return the last known 235 // leaf. If none, this will return undefined. 236 return lastLeaf; 237 } 238 } 239 240 return undefined; 241 }; 242 243 /** 244 * Map a string to the native character set. This string may be 245 * given as an intrinsic Javascript string object or an IString 246 * object. 247 * 248 * @param {string|IString} string string to map to a different 249 * character set. 
250 * @return {Uint8Array} An array of bytes representing the string 251 * in the native character set 252 */ 253 CharmapTable.prototype.mapToNative = function(string) { 254 if (!string) { 255 return new Uint8Array(0); 256 } 257 258 var str = (string instanceof IString) ? string : new IString(string); 259 260 // use IString's iterator so that we take care of walking through 261 // the code points correctly, including the surrogate pairs 262 // var c, i = 0, it = str.charIterator(); 263 var ret = new Uint8Array(str.length * this.expansionFactor); 264 265 var i = 0, j = 0; 266 267 while (i < string.length) { 268 var result = this._trieWalk(this.map.from, string, i); 269 if (result) { 270 if (result.value) { 271 i += result.consumed; 272 j += this.writeNative(ret, j, result.value); 273 } else { 274 // null-termination 275 i = string.length; 276 this.writeNative(ret, j, [result.value]); 277 } 278 } else { 279 // The unicode char at "i" didn't have any mapping, so 280 // deal with the missing char 281 j += this.writeNativeString(ret, j, this.dealWithMissingChar(string[i++])); 282 } 283 } 284 285 return ret.subarray(0, j); 286 }; 287 288 /** 289 * Map a native string to the standard Javascript charset of UTF-16. 290 * This string may be given as an array of numbers where each number 291 * represents a code point in the "from" charset, or as a Uint8Array 292 * array of bytes representing the bytes of the string in order. 
293 * 294 * @param {Array.<number>|Uint8Array} bytes bytes to map to 295 * a Unicode string 296 * @return {string} A string in the standard Javascript charset UTF-16 297 */ 298 CharmapTable.prototype.mapToUnicode = function(bytes) { 299 var ret = ""; 300 var i = 0; 301 302 while (i < bytes.length) { 303 var result = this._trieWalk(this.map.to, bytes, i); 304 if (result) { 305 if (result.value) { 306 i += result.consumed; 307 if (typeof(result.value) === 'string') { 308 ret += result.value; 309 } else if (ilib.isArray(result.value)) { 310 for (var j = 0; j < result.value.length; j++) { 311 ret += result.value[j]; 312 } 313 } // else error in charmap file?? 314 } else { 315 // null-termination 316 i = bytes.length; 317 } 318 } else { 319 // The byte at "i" wasn't a lead byte, so start again at the 320 // next byte instead. This may synchronize the rest 321 // of the string. 322 ret += this.dealWithMissingChar(bytes[i++]); 323 } 324 } 325 326 return ret; 327 }; 328 329 Charmap._algorithms["CharmapTable"] = CharmapTable; 330 331 module.exports = CharmapTable;