1 /*
  2  * UTF8.js - Implement Unicode Transformation Format 8-bit mappings
  3  *
  4  * Copyright © 2014-2015, 2018, JEDLSoft
  5  *
  6  * Licensed under the Apache License, Version 2.0 (the "License");
  7  * you may not use this file except in compliance with the License.
  8  * You may obtain a copy of the License at
  9  *
 10  *     http://www.apache.org/licenses/LICENSE-2.0
 11  *
 12  * Unless required by applicable law or agreed to in writing, software
 13  * distributed under the License is distributed on an "AS IS" BASIS,
 14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15  *
 16  * See the License for the specific language governing permissions and
 17  * limitations under the License.
 18  */
 19 
 20 // !data charset/UTF-8
 21 
 22 var Charset = require("./Charset.js");
 23 var Charmap = require("./Charmap.js");
 24 var IString = require("./IString.js");
 25 
 26 /**
 27  * @class
 28  * Create a new UTF-8 mapping instance
 29  * @constructor
 30  * @extends Charmap
 31  */
 32 var UTF8 = function (options) {
 33     options = options || {sync: true};
 34     if (typeof(options.charset) === "object" && options.charset instanceof Charset) {
 35         this.charset = options.charset;
 36         this._init(options);
 37     } else {
 38         new Charset({
 39             name: "UTF-8",
 40             sync: options.sync,
 41             loadParams: options.loadParams,
 42             onLoad: ilib.bind(this, function(cs) {
 43                 this.charset = cs;
 44                 this._init(options);
 45             })
 46         });
 47     }
 48 };
 49 
// Inherit from Charmap by using a Charmap object as the prototype.
// NOTE(review): the noinstance flag presumably tells the Charmap constructor
// to skip its normal instance set-up -- confirm in Charmap.js.
UTF8.prototype = new Charmap({noinstance: true});
// keep a reference to the parent constructor on the prototype
UTF8.prototype.parent = Charmap;
// the replacement prototype object does not have UTF8 as its constructor
// property, so point it back at this class
UTF8.prototype.constructor = UTF8;
 53 
 54 /**
 55  * @private
 56  * Initialize the charmap instance
 57  */
 58 UTF8.prototype._init = function(options) {
 59     this._calcExpansionFactor();
 60 
 61     if (typeof(options.onLoad) === "function") {
 62         options.onLoad(this);
 63     }
 64 };
 65 
 66 UTF8.prototype.validate = function(bytes) {
 67     var i = 0;
 68     while (i < bytes.length) {
 69         if ((bytes[i] & 0x80) === 0) {
 70             i++;
 71         } else {
 72             var len;
 73             if ((bytes[i] & 0xC0) === 0xC0) {
 74                 len = 2;
 75             } else if ((bytes[i] & 0xE0) === 0xE0) {
 76                 len = 3;
 77             } else if ((bytes[i] & 0xF0) === 0xF0) {
 78                 len = 4;
 79             } else {
 80                 // invalid lead byte
 81                 return false;
 82             }
 83             if (i + len > bytes.length) {
 84                 // not enough trailing bytes
 85                 return false;
 86             }
 87             for (var j = 1; j < len; j++) {
 88                 // check each trailing byte to see if it has the correct form
 89                 if ((bytes[i+j] & 0x80) !== 0x80) {
 90                     return false;
 91                 }
 92             }
 93             i += len;
 94         }
 95     }
 96 
 97     return true;
 98 };
 99 
/**
 * Convert a sequence of UTF-8 encoded bytes into a Javascript string.
 *
 * When a global Buffer is available (nodejs), the conversion is delegated
 * to Buffer's native "utf8" decoder, which decodes the whole input.
 * Otherwise the bytes are decoded in pure Javascript. The pure Javascript
 * decoder stops at the first zero byte, treating it as a terminator, and
 * throws when it finds a malformed sequence.
 *
 * NOTE(review): the two paths differ for input that contains zero bytes or
 * malformed sequences. According to the nodejs documentation, the native
 * decoder substitutes U+FFFD for invalid data instead of throwing.
 *
 * @param {Array.<number>|Uint8Array} bytes the UTF-8 encoded bytes
 * @returns {string} the decoded string
 * @throws {string} the string "invalid utf-8 bytes" when the pure
 * Javascript decoder finds an invalid lead byte, a truncated sequence, or
 * a trailing byte that is not of the form 10xxxxxx
 */
UTF8.prototype.mapToUnicode = function (bytes) {
    if (typeof(Buffer) !== "undefined") {
        // nodejs can convert it quickly in native code
        return Buffer.from(bytes).toString("utf8");
    }

    // otherwise we have to implement it in pure JS
    var ret = "";
    var i = 0;
    var len = bytes.length;
    var lead;

    // True if there is a byte at the given index and it has the form
    // 10xxxxxx. Both top bits are tested so that a lead byte (11xxxxxx)
    // cannot be mistaken for a continuation byte.
    function isContinuation(index) {
        return index < len && (bytes[index] & 0xC0) === 0x80;
    }

    while (i < len) {
        lead = bytes[i];
        if (lead === 0) {
            // null-terminator
            break;
        } else if ((lead & 0x80) === 0) {
            // 1 byte char
            ret += String.fromCharCode(lead);
            i++;
        } else if ((lead & 0xE0) === 0xC0) {
            // 2 byte char
            if (!isContinuation(i + 1)) {
                throw "invalid utf-8 bytes";
            }
            // xxx xxyyyyyy
            ret += String.fromCharCode((lead & 0x1F) << 6 | (bytes[i+1] & 0x3F));
            i += 2;
        } else if ((lead & 0xF0) === 0xE0) {
            // 3 byte char
            if (!isContinuation(i + 1) || !isContinuation(i + 2)) {
                throw "invalid utf-8 bytes";
            }
            // xxxxyyyy yyzzzzzz
            ret += String.fromCharCode((lead & 0xF) << 12 | (bytes[i+1] & 0x3F) << 6 | (bytes[i+2] & 0x3F));
            i += 3;
        } else if ((lead & 0xF8) === 0xF0) {
            // 4 byte char
            if (!isContinuation(i + 1) || !isContinuation(i + 2) || !isContinuation(i + 3)) {
                throw "invalid utf-8 bytes";
            }
            // wwwxx xxxxyyyy yyzzzzzz
            // the result is above U+FFFF, so it cannot go through String.fromCharCode
            ret += IString.fromCodePoint((lead & 0x7) << 18 | (bytes[i+1] & 0x3F) << 12 | (bytes[i+2] & 0x3F) << 6 | (bytes[i+3] & 0x3F));
            i += 4;
        } else {
            // stray continuation byte or invalid lead byte
            throw "invalid utf-8 bytes";
        }
    }

    return ret;
};
147 
/**
 * Convert a string into a sequence of UTF-8 encoded bytes.
 *
 * When a global Buffer is available (nodejs), the conversion is delegated
 * to Buffer's native "utf8" encoder and the result contains exactly the
 * encoded bytes. Otherwise the string is encoded in pure Javascript into an
 * array that is allocated with room for 4 bytes per unit of the string's
 * length plus a null terminator. In that case the encoded bytes are
 * followed by a zero byte, and any unused space after that is also zero.
 *
 * @param {string|IString} str the string to encode
 * @returns {Uint8Array} the UTF-8 encoded bytes
 */
UTF8.prototype.mapToNative = function(str) {
    if (typeof(Buffer) !== "undefined") {
        // nodejs can convert it quickly in native code
        return new Uint8Array(Buffer.from(str, "utf8"));
    }

    // otherwise we have to implement it in pure JS
    var istr = (str instanceof IString) ? str : new IString(str);

    // step through the surrogate pairs as single code points by using
    // IString's iterator
    var it = istr.iterator();

    // multiply by 4 because the max size of a UTF-8 char is 4 bytes, so
    // this will at least get us enough room to encode everything. Add 1
    // for the null terminator
    var ret = new Uint8Array(istr.length * 4 + 1);
    var i = 0;
    var c;

    while (it.hasNext()) {
        c = it.next();
        if (c > 0xFFFF) {
            // astral planes char: 11110www 10xxxxxx 10yyyyyy 10zzzzzz
            // The lead byte carries 3 payload bits. Code points from
            // U+100000 up have bit 20 set, which a 2-bit mask would drop.
            ret[i++] = 0xF0 | ((c >> 18) & 0x7);
            ret[i++] = 0x80 | ((c >> 12) & 0x3F);
            ret[i++] = 0x80 | ((c >> 6) & 0x3F);
            ret[i++] = 0x80 | (c & 0x3F);
        } else if (c > 0x7FF) {
            // 1110xxxx 10yyyyyy 10zzzzzz
            ret[i++] = 0xE0 | ((c >> 12) & 0xF);
            ret[i++] = 0x80 | ((c >> 6) & 0x3F);
            ret[i++] = 0x80 | (c & 0x3F);
        } else if (c > 0x7F) {
            // 110xxxxx 10yyyyyy
            ret[i++] = 0xC0 | ((c >> 6) & 0x1F);
            ret[i++] = 0x80 | (c & 0x3F);
        } else {
            // 0xxxxxxx
            ret[i++] = (c & 0x7F);
        }
    }
    ret[i] = 0; // null-terminate it

    return ret;
};
200 
// Register this class under the name "UTF-8" in Charmap's table of
// algorithms.
// NOTE(review): presumably a factory looks up algorithmic charmaps by name
// in this table -- confirm against the code that reads Charmap._algorithms.
Charmap._algorithms["UTF-8"] = UTF8;

// the UTF8 constructor is the sole export of this module
module.exports = UTF8;