/*
 * CharmapFactory.js - Factory class to create the right subclasses of a charmap for any
 * given character set.
 *
 * Copyright © 2015, 2018, JEDLSoft
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

var ilib = require("./ilib.js");
var JSUtils = require("./JSUtils.js");

var Charset = require("./Charset.js");
var Charmap = require("./Charmap.js");

/**
 * Build the relative module path ("./" + name + ".js") that is handed to
 * require() when a charmap class is loaded dynamically.
 *
 * NOTE(review): judging by the name, the path is assembled at run time so
 * that webpack does not try to resolve the require statically -- confirm
 * against the build configuration before changing this.
 *
 * @private
 * @param {string} x base name of the module file, without directory or extension
 * @return {string} the relative path of the module
 */
function circumventWebpackCharmap(x) {
    var prefixed = "./" + x;
    return prefixed + ".js";
}

/**
 * Factory method to create a new instance of a character set mapping (charmap)
 * subclass that is appropriate for the requested charset. Charmap instances map strings to
 * other character sets. The charsets can be of any type: single-byte, multi-byte,
 * shifting, etc. <p>
 *
 * All mappings are done to or from Unicode in the UTF-16 encoding, which is the base
 * character set and encoding used by Javascript itself. In order to convert
 * between two non-Unicode character sets, you must chain two charmap instances together
 * to first map to Unicode and then back to the second charset. <p>
 *
 * The options parameter controls which mapping is constructed and its behaviours. The
 * currently supported options are:
 *
 * <ul>
 * <li><i>name</i> - the name of the native charset to map to or from. This can be
 * given as an {@link Charset} instance or as a string that contains any commonly used name
 * for the character set, which is normalized to a standard IANA name.
 * If a name is not given, this class will default to the Western European character
 * set called ISO-8859-15.
 *
 * <li><i>missing</i> - specify what to do if a mapping is missing for a particular
 * character. For example, if you are mapping Unicode characters to a particular native
 * character set that does not support particular Unicode characters, the mapper will
 * follow the behaviour specified in this property. Valid values are:
 * <ul>
 * <li><i>skip</i> - skip any characters that do not exist in the target charset
 * <li><i>placeholder</i> - put a static placeholder character in the output string
 * wherever there is an unknown character in the input string. Use the <i>placeholder</i>
 * parameter to specify which character to use in this case
 * <li><i>escape</i> - use an escape sequence to represent the unknown character
 * </ul>
 * The default value for the missing property if not otherwise specified is "escape"
 * so that information is not lost.
 *
 * <li><i>placeholder</i> - specify the placeholder character to use when the
 * mapper cannot map a particular input character to the output string. If this
 * option is not specified, then the '?' (question mark) character is used where
 * possible.
 *
 * <li><i>escapeStyle</i> - what style of escape sequences should be used to
 * escape unknown characters in the input when mapping to native, and what
 * style of escape sequences should be parsed when mapping to Unicode. Valid
 * values are:
 * <ul>
 * <li><i>html</i> - Escape the characters as HTML entities. This would use
 * the standard HTML 5.0 (or later) entity names where possible, and numeric
 * entities in all other cases. Eg. an "e" with an acute accent would be
 * "&eacute;"
 * <li><i>js</i> - Use the Javascript escape style. Eg. an "e" with an acute
 * accent would be "\u00E9". This can also be specified as "c#" as
 * it uses a similar escape syntax.
 * <li><i>c</i> - Use the C/C++ escape style, which is similar to the
 * Javascript style, but uses an "x" in place of the "u". Eg. an "e" with an
 * acute accent would be "\x00E9". This can also be specified as "c++".
 * <li><i>java</i> - Use the Java escape style. This is very similar to
 * the Javascript style, but the backslash has to be escaped twice. Eg. an
 * "e" with an acute accent would be "\\u00E9". This can also be specified
 * as "ruby", as Ruby uses a similar escape syntax with double backslashes.
 * <li><i>perl</i> - Use the Perl escape style. Eg. an "e" with an acute
 * accent would be "\N{U+00E9}"
 * </ul>
 * The default if this style is not specified is "js" for Javascript.
 *
 * <li><i>onLoad</i> - a callback function to call when this object is fully
 * loaded. When the onLoad option is given, this class will attempt to
 * load any missing data using the ilib loader callback.
 * When the constructor is done (even if the data is already preassembled), the
 * onLoad function is called with the current instance as a parameter, so this
 * callback can be used with preassembled or dynamic loading or a mix of the two.
 *
 * <li><i>sync</i> - tell whether to load any missing data synchronously or
 * asynchronously. If this option is given as "false", then the "onLoad"
 * callback must be given, because the instance returned from this constructor will
 * not be usable for a while.
 *
 * <li><i>loadParams</i> - an object containing parameters to pass to the
 * loader callback function when data is missing. The parameters are not
 * interpreted or modified in any way. They are simply passed along. The object
 * may contain any property/value pairs as long as the calling code is in
 * agreement with the loader callback function as to what those parameters mean.
 * </ul>
 *
 * If this copy of ilib is pre-assembled and all the data is already available,
 * or if the data was already previously loaded, then this constructor will call
 * the onLoad callback immediately when the initialization is done.
 * If the onLoad option is not given, this class will only attempt to load any
 * missing data synchronously.
 *
 * @static
 * @param {Object=} options options controlling the construction of this instance, or
 * undefined to use the default options
 * @return {Charmap|undefined} an instance of a character set mapping class appropriate for
 * the requested charset, or undefined if no mapper could be found that supports the
 * requested charset
 */
var CharmapFactory = function(options) {
    // No options at all: continue with a synchronous default configuration.
    if (!options) {
        options = {sync: true};
    }

    var charsetName = options.name || "ISO-8859-15";

    // Only an explicit boolean overrides the synchronous default.
    var sync = (typeof(options.sync) === 'boolean') ? options.sync : true;

    // Assigned from inside the Charset onLoad handler below. If that handler
    // has not run by the time this function returns, the result is undefined.
    var instance;

    /**
     * Pick the mapper class for the loaded charset and instantiate it.
     * @param {Charset} charset the loaded charset; the original author notes
     * that its name has already been normalized at this point
     */
    function onCharsetLoaded(charset) {
        var name = charset.getName();
        var cons;

        // Nothing registered under this name yet and code can be loaded
        // dynamically: require the algorithmic class listed in _dynMap, or
        // the table-based class otherwise, and cache it under the name.
        if (!Charmap._algorithms[name] && ilib.isDynCode()) {
            var moduleName = CharmapFactory._dynMap[name] || "CharmapTable";
            cons = Charmap._algorithms[name] = require(circumventWebpackCharmap(moduleName));
        }

        // Otherwise use whatever is registered, falling back to the
        // "CharmapTable" entry.
        cons = cons || Charmap._algorithms[name] || Charmap._algorithms["CharmapTable"];

        // Hand the caller's options through to the subclass along with the
        // loaded charset, so the subclass can act on them if it needs to. The
        // subclass is expected to call the onLoad callback when it is done.
        // (options is always set by this point, so no extra default is needed.)
        instance = cons ? new cons(JSUtils.merge(options, {charset: charset})) : cons;
    }

    new Charset({
        name: charsetName,
        sync: sync,
        loadParams: options.loadParams,
        onLoad: onCharsetLoaded
    });

    return instance;
};


/**
 * Map standardized charset names to the names of the classes to load for them in the
 * dynamic code model. These classes implement algorithmic mappings instead of
 * table-based ones. Any charset name that is not listed here is loaded with the
 * "CharmapTable" class instead.
 * TODO: Need to figure out some way that this doesn't have to be updated by hand.
 * @private
 */
CharmapFactory._dynMap = {
    "UTF-8":      "UTF8",
    "UTF-16":     "UTF16LE",
    "UTF-16LE":   "UTF16LE",
    "UTF-16BE":   "UTF16BE",
    "US-ASCII":   "Charmap"
    /*
    not implemented yet
    "ISO-2022-JP": "ISO2022",
    "ISO-2022-JP-1": "ISO2022",
    "ISO-2022-JP-2": "ISO2022",
    "ISO-2022-JP-3": "ISO2022",
    "ISO-2022-JP-2004": "ISO2022",
    "ISO-2022-CN": "ISO2022",
    "ISO-2022-CN-EXT": "ISO2022",
    "ISO-2022-KR": "ISO2022"
    */
};

module.exports = CharmapFactory;