1 /*
  2  * CharmapFactory.js - Factory class to create the right subclasses of a charmap for any
 * given character set.
  4  *
  5  * Copyright © 2015, 2018, JEDLSoft
  6  *
  7  * Licensed under the Apache License, Version 2.0 (the "License");
  8  * you may not use this file except in compliance with the License.
  9  * You may obtain a copy of the License at
 10  *
 11  *     http://www.apache.org/licenses/LICENSE-2.0
 12  *
 13  * Unless required by applicable law or agreed to in writing, software
 14  * distributed under the License is distributed on an "AS IS" BASIS,
 15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 16  *
 17  * See the License for the specific language governing permissions and
 18  * limitations under the License.
 19  */
 20 
 21 var ilib = require("./ilib.js");
 22 var JSUtils = require("./JSUtils.js");
 23 
 24 var Charset = require("./Charset.js");
 25 var Charmap = require("./Charmap.js");
 26 
 27 function circumventWebpackCharmap(x) {
 28     return "./" + x + ".js";
 29 }
 30 
 31 /**
 32  * Factory method to create a new instance of a character set mapping (charmap)
 33  * subclass that is appropriate for the requested charset. Charmap instances map strings to
 34  * other character sets. The charsets can be of any type, single-byte, multi-byte,
 35  * shifting, etc. <p>
 36  *
 37  * All mappings are done to or from Unicode in the UTF-16 encoding, which is the base
 38  * character set and encoding used by Javascript itself. In order to convert
 39  * between two non-Unicode character sets, you must chain two charmap instances together
 40  * to first map to Unicode and then back to the second charset. <p>
 41  *
 42  * The options parameter controls which mapping is constructed and its behaviours. The
 43  * current list of supported options are:
 44  *
 45  * <ul>
 46  * <li><i>name</i> - the name of the native charset to map to or from. This can be
 47  * given as an {@link Charset} instance or as a string that contains any commonly used name
 48  * for the character set, which is normalized to a standard IANA name.
 49  * If a name is not given, this class will default to the Western European character
 50  * set called ISO-8859-15.
 51  *
 52  * <li><i>missing</i> - specify what to do if a mapping is missing for a particular
 53  * character. For example, if you are mapping Unicode characters to a particular native
 54  * character set that does not support particular Unicode characters, the mapper will
 55  * follow the behaviour specified in this property. Valid values are:
 56  * <ul>
 57  * <li><i>skip</i> - skip any characters that do not exist in the target charset
 58  * <li><i>placeholder</i> - put a static placeholder character in the output string
 59  * wherever there is an unknown character in the input string. Use the <i>placeholder</i>
 60  * parameter to specify which character to use in this case
 61  * <li><i>escape</i> - use an escape sequence to represent the unknown character
 62  * </ul>
 63  * The default value for the missing property if not otherwise specified is "escape"
 64  * so that information is not lost.
 65  *
 66  * <li><i>placeholder</i> - specify the placeholder character to use when the
 67  * mapper cannot map a particular input character to the output string. If this
 68  * option is not specified, then the '?' (question mark) character is used where
 69  * possible.
 70  *
 71  * <li><i>escapeStyle</i> - what style of escape sequences should be used to
 72  * escape unknown characters in the input when mapping to native, and what
 * style of escape sequences should be parsed when mapping to Unicode. Valid
 74  * values are:
 75  * <ul>
 76  * <li><i>html</i> - Escape the characters as HTML entities. This would use
 77  * the standard HTML 5.0 (or later) entity names where possible, and numeric
 78  * entities in all other cases. Eg. an "e" with an acute accent would be
 * "&amp;eacute;" (shown here escaped; the actual output is the entity &eacute;)
 80  * <li><i>js</i> - Use the Javascript escape style. Eg. an "e" with an acute
 81  * accent would be "\u00E9". This can also be specified as "c#" as
 82  * it uses a similar escape syntax.
 * <li><i>c</i> - Use the C/C++ escape style, which is similar to the
 84  * Javascript style, but uses an "x" in place of the "u". Eg. an "e" with an
 85  * acute accent would be "\x00E9". This can also be specified as "c++".
 * <li><i>java</i> - Use the Java escape style. This is very similar to the
 * Javascript style, but the backslash has to be escaped twice. Eg. an
 88  * "e" with an acute accent would be "\\u00E9". This can also be specified
 89  * as "ruby", as Ruby uses a similar escape syntax with double backslashes.
 90  * <li><i>perl</i> - Use the Perl escape style. Eg. an "e" with an acute
 91  * accent would be "\N{U+00E9}"
 92  * </ul>
 93  * The default if this style is not specified is "js" for Javascript.
 94  *
 95  * <li><i>onLoad</i> - a callback function to call when this object is fully
 96  * loaded. When the onLoad option is given, this class will attempt to
 97  * load any missing data using the ilib loader callback.
 98  * When the constructor is done (even if the data is already preassembled), the
 99  * onLoad function is called with the current instance as a parameter, so this
100  * callback can be used with preassembled or dynamic loading or a mix of the two.
101  *
102  * <li><i>sync</i> - tell whether to load any missing data synchronously or
103  * asynchronously. If this option is given as "false", then the "onLoad"
104  * callback must be given, because the instance returned from this constructor will
105  * not be usable for a while.
106  *
107  * <li><i>loadParams</i> - an object containing parameters to pass to the
108  * loader callback function when data is missing. The parameters are not
 * interpreted or modified in any way. They are simply passed along. The object
110  * may contain any property/value pairs as long as the calling code is in
111  * agreement with the loader callback function as to what those parameters mean.
112  * </ul>
113  *
114  * If this copy of ilib is pre-assembled and all the data is already available,
115  * or if the data was already previously loaded, then this constructor will call
116  * the onLoad callback immediately when the initialization is done.
117  * If the onLoad option is not given, this class will only attempt to load any
118  * missing data synchronously.
119  *
120  * @static
121  * @param {Object=} options options controlling the construction of this instance, or
122  * undefined to use the default options
123  * @return {Charmap|undefined} an instance of a character set mapping class appropriate for
124  * the requested charset, or undefined if no mapper could be found that supports the
125  * requested charset
126  */
127 var CharmapFactory = function(options) {
128     var charsetName = (options && options.name) || "ISO-8859-15";
129     var sync = true;
130 
131     // console.log("CharmapFactory: called with options: " + JSON.stringify(options));
132 
133     if (options) {
134         if (typeof(options.sync) === 'boolean') {
135             sync = options.sync;
136         }
137     } else {
138         options = {sync: true};
139     }
140 
141     var instance;
142 
143     new Charset({
144         name: charsetName,
145         sync: sync,
146         loadParams: options.loadParams,
147         onLoad: function (charset) {
148             // name will be normalized already
149             var cons, name = charset.getName();
150 
151             // console.log("CharmapFactory: normalized charset name: " + name);
152 
153             if (!Charmap._algorithms[name] && ilib.isDynCode()) {
154                 // console.log("CharmapFactory: isDynCode. Doing require");
155                 var entry = CharmapFactory._dynMap[name] || "CharmapTable";
156                 cons = Charmap._algorithms[name] = require(circumventWebpackCharmap(entry));
157             }
158 
159             if (!cons) {
160                 cons = Charmap._algorithms[name] || Charmap._algorithms["CharmapTable"];
161             }
162 
163             // console.log("CharmapFactory: cons is "); console.dir(cons);
164 
165             // Pass the same options through to the constructor so the subclass
166             // has the ability to do something with if it needs to. It should also call
167             // the onLoad callback when it is done.
168             instance = cons && new cons(JSUtils.merge(options || {}, {charset: charset}));
169         }
170     });
171 
172     return instance;
173 };
174 
175 
/**
 * Lookup table used in the dynamic code model. Each key is a standardized
 * charset name and each value is the name of the module to require for it.
 * The factory falls back to "CharmapTable" for any charset name that has no
 * entry here. These classes implement algorithmic mappings instead of
 * table-based ones.
 *
 * TODO: Need to figure out some way that this doesn't have to be updated by hand.
 *
 * Not implemented yet, all of which are intended to map to "ISO2022":
 * ISO-2022-JP, ISO-2022-JP-1, ISO-2022-JP-2, ISO-2022-JP-3, ISO-2022-JP-2004,
 * ISO-2022-CN, ISO-2022-CN-EXT, ISO-2022-KR
 *
 * @private
 */
CharmapFactory._dynMap = {
    "UTF-8": "UTF8",
    "UTF-16": "UTF16LE",
    "UTF-16LE": "UTF16LE",
    "UTF-16BE": "UTF16BE",
    "US-ASCII": "Charmap"
};
200 
201 module.exports = CharmapFactory;