1 /*
  2  * Charmap.js - A character set mapping class
  3  *
  4  * Copyright © 2014-2015, 2018, JEDLSoft
  5  *
  6  * Licensed under the Apache License, Version 2.0 (the "License");
  7  * you may not use this file except in compliance with the License.
  8  * You may obtain a copy of the License at
  9  *
 10  *     http://www.apache.org/licenses/LICENSE-2.0
 11  *
 12  * Unless required by applicable law or agreed to in writing, software
 13  * distributed under the License is distributed on an "AS IS" BASIS,
 14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15  *
 16  * See the License for the specific language governing permissions and
 17  * limitations under the License.
 18  */
 19 
 20 // !data charmaps charset/US-ASCII charset/ISO-10646-UCS-2 charset/ISO-8859-1 charset/ISO-8859-15 charmaps/ISO-8859-15 charmaps/ISO-8859-1 charset/ISO-8859-1
 21 
 22 var ilib = require("./ilib.js");
 23 var JSUtils = require("./JSUtils.js");
 24 var IString = require("./IString.js");
 25 
 26 /**
 27  * @class
 28  * Create a new default character set mapping instance. This class is the parent
 29  * class of all of the charmapping subclasses, and only implements basic US-ASCII
 30  * mapping. The subclasses implement all other charsets, some algorithmically, and
 31  * some in a table-based way. Use {@link CharmapFactory} to create the correct
 32  * subclass instance for the desired charmap.<p>
 33  *
 34  * All mappings are done to or from Unicode in the UTF-16 encoding, which is the base
 35  * character set and encoding used by Javascript itself. In order to convert
 36  * between two non-Unicode character sets, you must chain two charmap instances together
 37  * to first map to Unicode and then back to the second charset. <p>
 38  *
 39  * The options parameter controls which mapping is constructed and its behaviours. The
 40  * current list of supported options are:
 41  *
 42  * <ul>
 43  * <li><i>missing</i> - specify what to do if a mapping is missing for a particular
 44  * character. For example, if you are mapping Unicode characters to a particular native
 45  * character set that does not support particular Unicode characters, the mapper will
 46  * follow the behaviour specified in this property. Valid values are:
 47  * <ul>
 48  * <li><i>skip</i> - skip any characters that do not exist in the target charset
 49  * <li><i>placeholder</i> - put a static placeholder character in the output string
 50  * wherever there is an unknown character in the input string. Use the <i>placeholder</i>
 51  * parameter to specify which character to use in this case
 52  * <li><i>escape</i> - use an escape sequence to represent the unknown character
 53  * </ul>
 * The default value for the missing property if not otherwise specified is
 * "placeholder", which substitutes the placeholder character described below.
 56  *
 57  * <li><i>placeholder</i> - specify the placeholder character to use when the
 58  * mapper cannot map a particular input character to the output string. If this
 59  * option is not specified, then the '?' (question mark) character is used where
 60  * possible.
 61  *
 62  * <li><i>escapeStyle</i> - what style of escape sequences should be used to
 63  * escape unknown characters in the input when mapping to native, and what
 * style of escape sequences should be parsed when mapping to Unicode. Valid
 65  * values are:
 66  * <ul>
 67  * <li><i>html</i> - Escape the characters as HTML entities. This would use
 68  * the standard HTML 5.0 (or later) entity names where possible, and numeric
 69  * entities in all other cases. Eg. an "e" with an acute accent would be
 * "&amp;eacute;"
 71  * <li><i>js</i> - Use the Javascript escape style. Eg. an "e" with an acute
 72  * accent would be "\u00E9". This can also be specified as "c#" as
 73  * it uses a similar escape syntax.
 * <li><i>c</i> - Use the C/C++ escape style, which is similar to the
 75  * Javascript style, but uses an "x" in place of the "u". Eg. an "e" with an
 76  * acute accent would be "\x00E9". This can also be specified as "c++".
 * <li><i>java</i> - Use the Java escape style. This is very similar to
 * the Javascript style, but the backslash has to be escaped twice. Eg. an
 79  * "e" with an acute accent would be "\\u00E9". This can also be specified
 80  * as "ruby", as Ruby uses a similar escape syntax with double backslashes.
 81  * <li><i>perl</i> - Use the Perl escape style. Eg. an "e" with an acute
 82  * accent would be "\N{U+00E9}"
 83  * </ul>
 84  * The default if this style is not specified is "js" for Javascript.
 85  * </ul>
 86  *
 87  * If this copy of ilib is pre-assembled and all the data is already available,
 88  * or if the data was already previously loaded, then this constructor will call
 89  * the onLoad callback immediately when the initialization is done.
 90  * If the onLoad option is not given, this class will only attempt to load any
 91  * missing data synchronously.
 92  *
 93  * @constructor
 94  * @param {Object=} options options which govern the construction of this instance
 95  */
 96 var Charmap = function(options) {
 97     if (options && options.noinstance) {
 98         return;
 99     }
100 
101     this.missing = "placeholder";
102     this.placeholder = "?";
103     this.escapeStyle = "js";
104     this.expansionFactor = 1;
105 
106     if (options) {
107         if (typeof(options.placeholder) !== 'undefined') {
108             this.placeholder = options.placeholder;
109         }
110 
111         var escapes = {
112             "html": "html",
113             "js": "js",
114             "c#": "js",
115             "c": "c",
116             "c++": "c",
117             "java": "java",
118             "ruby": "java",
119             "perl": "perl"
120         };
121 
122         if (typeof(options.escapeStyle) !== 'undefined') {
123             if (typeof(escapes[options.escapeStyle]) !== 'undefined') {
124                 this.escapeStyle = escapes[options.escapeStyle];
125             }
126         }
127 
128         if (typeof(options.missing) !== 'undefined') {
129             if (options.missing === "skip" || options.missing === "placeholder" || options.missing === "escape") {
130                 this.missing = options.missing;
131             }
132         }
133     }
134 };
135 
136 /**
137  * A place for the algorithmic conversions to register themselves as
138  * they are defined.
139  *
140  * @static
141  * @private
142  */
143 Charmap._algorithms = {};
144 
145 Charmap.prototype = {
146     /**
147      * Return the standard name of this charmap. All charmaps map from
148      * Unicode to the native charset, so the name returned from this
149      * function corresponds to the native charset.
150      *
151      * @returns {string} the name of the locale's language in English
152      */
153     getName: function () {
154         return this.charset.getName();
155     },
156 
157     /**
158      * @private
159      */
160     writeNative: function (array, start, value) {
161         // console.log("Charmap.writeNative: start " + start + " adding " + JSON.stringify(value));
162         if (ilib.isArray(value)) {
163             for (var i = 0; i < value.length; i++) {
164                 array[start+i] = value[i];
165             }
166 
167             return value.length;
168         } else {
169             array[start] = value;
170             return 1;
171         }
172     },
173 
174     /**
175      * @private
176      */
177     writeNativeString: function (array, start, string) {
178         // console.log("Charmap.writeNativeString: start " + start + " adding " + JSON.stringify(string));
179         for (var i = 0; i < string.length; i++) {
180             array[start+i] = string.charCodeAt(i);
181         }
182         return string.length;
183     },
184 
185     /**
186      * @private
187      */
188     _calcExpansionFactor: function() {
189         var factor = 1;
190         factor = Math.max(factor, this.charset.getMaxCharWidth());
191         switch (this.missing) {
192         case "placeholder":
193             if (this.placeholder) {
194                 factor = Math.max(factor, this.placeholder.length);
195             }
196             break;
197         case "escape":
198             switch (this.escapeStyle) {
199             case "html":
200                 factor = Math.max(factor, 8); // &#xHHHH;
201                 break;
202             case "c":
203                 factor = Math.max(factor, 6); // \xHHHH
204                 break;
205             case "perl":
206                 factor = Math.max(factor, 10); // \N{U+HHHH}
207                 break;
208 
209             default:
210                 factor = Math.max(factor, 6); // \uHHHH
211                 break;
212             }
213             break;
214         default:
215             break;
216         }
217 
218         this.expansionFactor = factor;
219     },
220 
221     /**
222      * @private
223      */
224     dealWithMissingChar: function(c) {
225         var seq = "";
226 
227         switch (this.missing) {
228             case "skip":
229                 // do nothing
230                 break;
231 
232             case "escape":
233                 var num = (typeof(c) === 'string') ? c.charCodeAt(0) : c;
234                 var bigc = JSUtils.pad(num.toString(16), 4).toUpperCase();
235                 switch (this.escapeStyle) {
236                     case "html":
237                         seq = "&#x" + bigc + ";";
238                         break;
239                     case "c":
240                         seq = "\\x" + bigc;
241                         break;
242                     case "java":
243                         seq = "\\\\u" + bigc;
244                         break;
245                     case "perl":
246                         seq = "\\N{U+" + bigc + "}";
247                         break;
248 
249                     default:
250                     case "js":
251                         seq = "\\u" + bigc;
252                         break;
253                 }
254                 break;
255 
256             default:
257             case "placeholder":
258                 seq = this.placeholder;
259                 break;
260         }
261 
262         return seq;
263     },
264 
265     /**
266      * Map a string to the native character set. This string may be
267      * given as an intrinsic Javascript string object or an IString
268      * object.
269      *
270      * @param {string|IString} string string to map to a different
271      * character set.
272      * @return {Uint8Array} An array of bytes representing the string
273      * in the native character set
274      */
275     mapToNative: function(string) {
276         if (!string) {
277             return new Uint8Array(0);
278         }
279 
280         if (this.algorithm) {
281             return this.algorithm.mapToNative(string);
282         }
283 
284         // the default algorithm is plain old ASCII
285         var str = (string instanceof IString) ? string : new IString(string);
286 
287         // use IString's iterator so that we take care of walking through
288         // the code points correctly, including the surrogate pairs
289         var c, i = 0, it = str.iterator();
290         var ret = new Uint8Array(str.length * this.expansionFactor);
291 
292         while (it.hasNext() && i < ret.length) {
293             c = it.next();
294             if (c < 127) {
295                 ret[i++] = c;
296             } else {
297                 i += this.writeNativeString(ret, i, this.dealWithMissingChar(c));
298             }
299         }
300 
301         return ret;
302     },
303 
304     /**
305      * Map a native string to the standard Javascript charset of UTF-16.
306      * This string may be given as an array of numbers where each number
307      * represents a code point in the "from" charset, or as a Uint8Array
308      * array of bytes representing the bytes of the string in order.
309      *
310      * @param {Array.<number>|Uint8Array} bytes bytes to map to
311      * a Unicode string
312      * @return {string} A string in the standard Javascript charset UTF-16
313      */
314     mapToUnicode: function(bytes) {
315         var ret = "";
316         var c, i = 0;
317 
318         while (i < bytes.length) {
319             c = bytes[i];
320 
321             // the default algorithm is plain old ASCII
322             if (c < 128) {
323                 ret += String.fromCharCode(c);
324             } else {
325                 // The byte at "i" wasn't ASCII
326                 ret += this.dealWithMissingChar(bytes[i++]);
327             }
328         }
329 
330         return ret;
331     }
332 };
333 
334 module.exports = Charmap;
335