1 /*
  2  * CharmapTable.js - A character set mapping class that maps using trie table
  3  *
  4  * Copyright © 2014-2015, 2018, JEDLSoft
  5  *
  6  * Licensed under the Apache License, Version 2.0 (the "License");
  7  * you may not use this file except in compliance with the License.
  8  * You may obtain a copy of the License at
  9  *
 10  *     http://www.apache.org/licenses/LICENSE-2.0
 11  *
 12  * Unless required by applicable law or agreed to in writing, software
 13  * distributed under the License is distributed on an "AS IS" BASIS,
 14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15  *
 16  * See the License for the specific language governing permissions and
 17  * limitations under the License.
 18  */
 19 
 20 // !data charmaps/ISO-8859-15 charset/ISO-8859-15
 21 
 22 var ilib = require("../index.js");
 23 var Utils = require("./Utils.js");
 24 var Charset = require("./Charset.js");
 25 var Charmap = require("./Charmap.js");
 26 var IString = require("./IString.js");
 27 
 28 /**
 29  * @class
 * Create a new character set mapping instance based on a trie table. Charmap
 * instances map strings to
 32  * other character sets. The charsets can be of any type, single-byte, multi-byte,
 33  * shifting, etc. <p>
 34  *
 35  * All mappings are done to or from Unicode in the UTF-16 encoding, which is the base
 36  * character set and encoding used by Javascript itself. In order to convert
 37  * between two non-Unicode character sets, you must chain two charmap instances together
 38  * to first map to Unicode and then back to the second charset. <p>
 39  *
 40  * The options parameter controls which mapping is constructed and its behaviours. The
 41  * current list of supported options are:
 42  *
 43  * <ul>
 44  * <li><i>charset</i> - the name of the native charset to map to or from. This can be
 45  * given as an {@link Charset} instance or as a string that contains any commonly used name
 46  * for the character set, which is normalized to a standard IANA name.
 47  * If a name is not given, this class will default to the Western European character
 48  * set called ISO-8859-15.
 49  *
 50  * <li><i>missing</i> - specify what to do if a mapping is missing for a particular
 51  * character. For example, if you are mapping Unicode characters to a particular native
 52  * character set that does not support particular Unicode characters, the mapper will
 53  * follow the behaviour specified in this property. Valid values are:
 54  * <ul>
 55  * <li><i>skip</i> - skip any characters that do not exist in the target charset
 56  * <li><i>placeholder</i> - put a static placeholder character in the output string
 57  * wherever there is an unknown character in the input string. Use the <i>placeholder</i>
 58  * parameter to specify which character to use in this case
 59  * <li><i>escape</i> - use an escape sequence to represent the unknown character
 60  * </ul>
 61  * The default value for the missing property if not otherwise specified is "escape"
 62  * so that information is not lost.
 63  *
 64  * <li><i>placeholder</i> - specify the placeholder character to use when the
 65  * mapper cannot map a particular input character to the output string. If this
 66  * option is not specified, then the '?' (question mark) character is used where
 67  * possible.
 68  *
 69  * <li><i>escapeStyle</i> - what style of escape sequences should be used to
 70  * escape unknown characters in the input when mapping to native, and what
 * style of escape sequences should be parsed when mapping to Unicode. Valid
 72  * values are:
 73  * <ul>
 74  * <li><i>html</i> - Escape the characters as HTML entities. This would use
 75  * the standard HTML 5.0 (or later) entity names where possible, and numeric
 * entities in all other cases. Eg. an "e" with an acute accent would be
 * "&eacute;"
 78  * <li><i>js</i> - Use the Javascript escape style. Eg. an "e" with an acute
 79  * accent would be "\u00E9". This can also be specified as "c#" as
 * it uses a similar escape syntax.
 * <li><i>c</i> - Use the C/C++ escape style, which is similar to the
 82  * Javascript style, but uses an "x" in place of the "u". Eg. an "e" with an
 * acute accent would be "\x00E9". This can also be specified as "c++".
 * <li><i>java</i> - Use the Java escape style. This is very similar to
 * the Javascript style, but the backslash has to be escaped twice. Eg. an
 86  * "e" with an acute accent would be "\\u00E9". This can also be specified
 87  * as "ruby", as Ruby uses a similar escape syntax with double backslashes.
 88  * <li><i>perl</i> - Use the Perl escape style. Eg. an "e" with an acute
 89  * accent would be "\N{U+00E9}"
 90  * </ul>
 91  * The default if this style is not specified is "js" for Javascript.
 92  *
 93  * <li><i>onLoad</i> - a callback function to call when this object is fully
 94  * loaded. When the onLoad option is given, this class will attempt to
 95  * load any missing data using the ilib loader callback.
 96  * When the constructor is done (even if the data is already preassembled), the
 97  * onLoad function is called with the current instance as a parameter, so this
 98  * callback can be used with preassembled or dynamic loading or a mix of the two.
 99  *
100  * <li><i>sync</i> - tell whether to load any missing data synchronously or
101  * asynchronously. If this option is given as "false", then the "onLoad"
102  * callback must be given, because the instance returned from this constructor will
103  * not be usable for a while.
104  *
105  * <li><i>loadParams</i> - an object containing parameters to pass to the
106  * loader callback function when data is missing. The parameters are not
 * interpreted or modified in any way. They are simply passed along. The object
108  * may contain any property/value pairs as long as the calling code is in
109  * agreement with the loader callback function as to what those parameters mean.
110  * </ul>
111  *
112  * If this copy of ilib is pre-assembled and all the data is already available,
113  * or if the data was already previously loaded, then this constructor will call
114  * the onLoad callback immediately when the initialization is done.
115  * If the onLoad option is not given, this class will only attempt to load any
116  * missing data synchronously.
117  *
118  * @constructor
119  * @see {ilib.setLoaderCallback} for information about registering a loader callback instance
120  * @extends Charmap
121  * @param {Object=} options options which govern the construction of this instance
122  */
var CharmapTable = function(options) {
    // Data is loaded synchronously unless the caller explicitly passes a
    // boolean "sync" option.
    var sync = true;

    this.parent.call(this, options);

    // default charset when the caller does not name one
    this.charsetName = "ISO-8859-15";

    if (options) {
        if (typeof(options.sync) === 'boolean') {
            // honour the caller's choice when loading the Charset below
            sync = options.sync;
        }

        // typeof(null) is "object" too, so make sure there really is an
        // instance before calling methods on it
        if (options.charset && typeof(options.charset) === "object") {
            // a ready-made Charset instance
            this.charset = options.charset;
            this.charsetName = this.charset.getName();
        } else if (typeof(options.name) !== 'undefined') {
            this.charsetName = options.name;
        } else if (typeof(options.charset) === "string") {
            // the charset option may also be given as a plain name; an
            // explicit "name" option keeps precedence over it
            this.charsetName = options.charset;
        }
    } else {
        options = {sync: true};
    }

    if (!this.charset) {
        // Only a name is known so far: look up the Charset first and finish
        // the initialization once it is available.
        new Charset({
            name: this.charsetName,
            sync: sync,
            loadParams: options.loadParams,
            onLoad: ilib.bind(this, function(cs) {
                this.charset = cs;
                this._init(options);
            })
        });
    } else {
        this._init(options);
    }
};
156 
// Inherit from Charmap by using a Charmap instance as the prototype object.
// NOTE(review): the "noinstance" flag presumably tells Charmap not to do its
// normal construction work for this prototype-only object -- confirm in Charmap.js.
CharmapTable.prototype = new Charmap({noinstance: true});
// "parent" is what the CharmapTable constructor invokes through
// this.parent.call(this, options) to run the superclass constructor.
CharmapTable.prototype.parent = Charmap;
// The prototype object was replaced above, so point "constructor" back at
// this class.
CharmapTable.prototype.constructor = CharmapTable;
160 
/**
 * Finish the construction of this instance: calculate the expansion factor
 * and then load the mapping tables named after the charset from
 * "charmaps/&lt;charset name&gt;.json".
 *
 * When the tables arrive they are stored in this.map and the onLoad callback
 * from the options, if there is one, is called with this instance. When no
 * tables are found, a string exception is thrown if options.sync is truthy;
 * otherwise onLoad is called with undefined.
 *
 * @private
 * @param {Object} options the options given to the constructor. The sync,
 * loadParams and onLoad properties are used here.
 */
CharmapTable.prototype._init = function(options) {
    var mappingFile;

    // Receives the loaded tables, or a falsy value when nothing was found.
    // Runs with "this" bound to the CharmapTable instance.
    function handleMapping(mapping) {
        var found = !!mapping;

        if (!found && options.sync) {
            throw "No mapping found for " + this.charset.getName();
        }

        /** @type {{from:Object,to:Object}} */
        this.map = mapping;

        if (typeof(options.onLoad) === 'function') {
            // without a mapping the instance is not usable, so report undefined
            options.onLoad(found ? this : undefined);
        }
    }

    this._calcExpansionFactor();

    mappingFile = "charmaps/" + this.charset.getName() + ".json";

    Utils.loadData({
        object: "Charmap",
        locale: "-",
        nonlocale: true,
        name: mappingFile,
        sync: options.sync,
        loadParams: options.loadParams,
        callback: ilib.bind(this, handleMapping)
    });
};
193 
/**
 * Walk a trie to find the value for the sequence of elements that starts at
 * the given position in the given array.
 *
 * Each node of the trie is an object keyed by the next element of the input.
 * A string, number or array found under a key is the value of the sequence
 * that ends with that element; any other object is an intermediate node. An
 * intermediate node may carry a "__leaf" property holding the value of the
 * sequence that ends at that node. It is used as a fallback when the input
 * cannot be followed any deeper into the trie.
 *
 * @private
 * @param {Object} trie the root node of the trie to walk
 * @param {string|Array.<number>|Uint8Array} array the elements to look up
 * @param {number} start the index in the array where the walk starts
 * @return {{consumed:number,value:*}|undefined} the value that was found
 * and the number of array elements it accounts for, or undefined if no
 * mapping exists at the start position. An element that is exactly the
 * number 0 is reported as {consumed: 1, value: 0} so that the caller can
 * treat it as a null-terminator.
 */
CharmapTable.prototype._trieWalk = function(trie, array, start) {
    function isValue(node) {
        return (typeof(node) === 'string' || typeof(node) === 'number' ||
            (typeof(node) === 'object' && ilib.isArray(node)));
    }

    var lastLeaf,
        i = start,
        trienode = trie,
        next;

    while (i < array.length) {
        // trienode was reached after consuming (i - start) elements, so a
        // leaf stored on it accounts for exactly that many. A leaf on the
        // root is ignored because a zero-length match would never let the
        // caller make progress.
        if (i > start && typeof(trienode.__leaf) !== 'undefined') {
            lastLeaf = {
                consumed: i - start,
                value: trienode.__leaf
            };
        }

        if (array[i] === 0) {
            // null-terminator, so end the mapping.
            return {
                consumed: 1,
                value: 0
            };
        }

        next = trienode[array[i]];

        if (typeof(next) === 'undefined') {
            // no mapping for this array element, so return the last known
            // leaf. If none, this will return undefined.
            return lastLeaf;
        }

        if (isValue(next)) {
            // it is a leaf node, and the current element is part of the match
            return {
                consumed: i - start + 1,
                value: next
            };
        }

        // it is an intermediate node
        trienode = next;
        i++;
    }

    // The input ended in the middle of a sequence. The node we stopped on
    // may still have a value of its own; otherwise fall back to the last
    // leaf passed on the way down, if any.
    if (i > start && typeof(trienode.__leaf) !== 'undefined') {
        return {
            consumed: i - start,
            value: trienode.__leaf
        };
    }

    return lastLeaf;
};
242 
/**
 * Map a string to the native character set. This string may be
 * given as an intrinsic Javascript string object or an IString
 * object.
 *
 * The input is matched against the "from" table of the loaded mapping.
 * Characters that have no mapping are replaced by whatever
 * dealWithMissingChar() returns for them.
 *
 * @param {string|IString} string string to map to a different
 * character set.
 * @return {Uint8Array} An array of bytes representing the string
 * in the native character set. The array is empty if no string is given.
 */
CharmapTable.prototype.mapToNative = function(string) {
    if (!string) {
        return new Uint8Array(0);
    }

    // The walk below needs a length and numeric indexing. Only a real string
    // is certain to provide both, so unwrap IString instances (and any other
    // string-like object) through toString() before walking.
    var str = (typeof(string) === 'string') ? string : string.toString();

    // NOTE(review): the buffer is sized from the expansion factor only;
    // whether the replacement text for unmappable characters always fits
    // depends on the Charmap write helpers -- confirm in Charmap.js.
    var ret = new Uint8Array(str.length * this.expansionFactor);

    var i = 0, j = 0;

    while (i < str.length) {
        var result = this._trieWalk(this.map.from, str, i);
        if (result) {
            if (result.value) {
                i += result.consumed;
                j += this.writeNative(ret, j, result.value);
            } else {
                // null-termination: stop mapping. The terminator is written
                // but j is not advanced, so it is not part of the result.
                i = str.length;
                this.writeNative(ret, j, [result.value]);
            }
        } else {
            // The unicode char at "i" didn't have any mapping, so
            // deal with the missing char
            j += this.writeNativeString(ret, j, this.dealWithMissingChar(str[i++]));
        }
    }

    // trim the unused tail of the buffer
    return ret.subarray(0, j);
};
287 
/**
 * Map a native string to the standard Javascript charset of UTF-16.
 * This string may be given as an array of numbers where each number
 * represents a code point in the "from" charset, or as a Uint8Array
 * array of bytes representing the bytes of the string in order.
 *
 * The input is matched against the "to" table of the loaded mapping.
 * Mapping stops at the first null-terminator. Bytes that do not start a
 * known sequence are replaced by whatever dealWithMissingChar() returns
 * for them.
 *
 * @param {Array.<number>|Uint8Array} bytes bytes to map to
 * a Unicode string
 * @return {string} A string in the standard Javascript charset UTF-16.
 * The string is empty if no bytes are given.
 */
CharmapTable.prototype.mapToUnicode = function(bytes) {
    var ret = "";
    var i = 0;

    if (!bytes) {
        // nothing to map; mirrors how mapToNative treats a missing string
        return ret;
    }

    while (i < bytes.length) {
        var result = this._trieWalk(this.map.to, bytes, i);
        if (result) {
            if (result.value) {
                i += result.consumed;
                if (typeof(result.value) === 'string') {
                    ret += result.value;
                } else if (ilib.isArray(result.value)) {
                    // an array value is appended element by element
                    for (var j = 0; j < result.value.length; j++) {
                        ret += result.value[j];
                    }
                } // else error in charmap file??
            } else {
                // null-termination
                i = bytes.length;
            }
        } else {
            // The byte at "i" wasn't a lead byte, so start again at the
            // next byte instead. This may synchronize the rest
            // of the string.
            ret += this.dealWithMissingChar(bytes[i++]);
        }
    }

    return ret;
};
328 
// Add this class to Charmap's table of algorithms under the key "CharmapTable".
// NOTE(review): presumably this table is how a charmap implementation is looked
// up by name elsewhere in ilib -- confirm against Charmap.js and its factory.
Charmap._algorithms["CharmapTable"] = CharmapTable;

module.exports = CharmapTable;