/*
 * UTF8.js - Implement Unicode Transformation Format 8-bit mappings
 *
 * Copyright © 2014-2015, 2018, JEDLSoft
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// !data charset/UTF-8

var Charset = require("./Charset.js");
var Charmap = require("./Charmap.js");
var IString = require("./IString.js");

/**
 * Test whether a byte has the form 10xxxxxx, which is the only legal form
 * for the second and subsequent bytes of a multi-byte UTF-8 sequence.
 *
 * @private
 * @param {number} b the byte value to test
 * @returns {boolean} true if the byte is a continuation (trail) byte
 */
function isTrailByte(b) {
    // both of the top two bits have to be examined: testing only the high
    // bit would also accept lead bytes (11xxxxxx) as trail bytes
    return (b & 0xC0) === 0x80;
}

/**
 * @class
 * Create a new UTF-8 mapping instance.
 *
 * If options.charset is already a Charset instance, it is used as-is and
 * the instance is initialized immediately. Otherwise a Charset named
 * "UTF-8" is created with the given sync and loadParams settings, and
 * initialization is completed from that Charset's onLoad callback.
 *
 * @constructor
 * @extends Charmap
 * @param {Object=} options options controlling the construction of this
 * instance. The properties read here are "charset", "sync", "loadParams"
 * and "onLoad". When no options are given, {sync: true} is used.
 */
var UTF8 = function (options) {
    options = options || {sync: true};
    if (typeof(options.charset) === "object" && options.charset instanceof Charset) {
        this.charset = options.charset;
        this._init(options);
    } else {
        // capture the instance in a closure rather than relying on a
        // binding helper from a module that this file never requires
        var self = this;
        new Charset({
            name: "UTF-8",
            sync: options.sync,
            loadParams: options.loadParams,
            onLoad: function(cs) {
                self.charset = cs;
                self._init(options);
            }
        });
    }
};

UTF8.prototype = new Charmap({noinstance: true});
UTF8.prototype.parent = Charmap;
UTF8.prototype.constructor = UTF8;

/**
 * @private
 * Initialize the charmap instance and then notify the caller, if the
 * caller gave an onLoad callback.
 *
 * @param {Object} options the options that were passed to the constructor
 */
UTF8.prototype._init = function(options) {
    this._calcExpansionFactor();

    if (typeof(options.onLoad) === "function") {
        options.onLoad(this);
    }
};

/**
 * Check whether the given bytes are structurally well-formed UTF-8.
 *
 * Each lead byte must announce a 1, 2, 3 or 4 byte sequence, the sequence
 * must fit entirely within the input, and every byte after the lead byte
 * must have the continuation form 10xxxxxx. This is a check of the byte
 * structure only: the decoded values are not examined, so overlong
 * encodings and encoded surrogates are not rejected.
 *
 * @param {Array.<number>|Uint8Array} bytes array-like of byte values to check
 * @returns {boolean} true if the bytes are structurally valid, false otherwise
 */
UTF8.prototype.validate = function(bytes) {
    var i = 0;
    while (i < bytes.length) {
        var lead = bytes[i];
        var len;

        // Each test masks one more bit than the marker it is looking for,
        // so that the bit following the run of 1s is required to be 0.
        // Without that, the 2-byte test would also match 3- and 4-byte
        // lead bytes.
        if ((lead & 0x80) === 0) {
            len = 1;    // 0xxxxxxx
        } else if ((lead & 0xE0) === 0xC0) {
            len = 2;    // 110xxxxx
        } else if ((lead & 0xF0) === 0xE0) {
            len = 3;    // 1110xxxx
        } else if ((lead & 0xF8) === 0xF0) {
            len = 4;    // 11110xxx
        } else {
            // invalid lead byte: a stray continuation byte or 11111xxx
            return false;
        }

        if (i + len > bytes.length) {
            // not enough trailing bytes
            return false;
        }

        for (var j = 1; j < len; j++) {
            // check each trailing byte to see if it has the correct form
            if (!isTrailByte(bytes[i+j])) {
                return false;
            }
        }

        i += len;
    }

    return true;
};

/**
 * Convert an array of UTF-8 bytes into a regular Javascript string.
 *
 * When a global Buffer is available (nodejs), the conversion is delegated
 * entirely to Buffer and the rest of this description does not apply.
 * Otherwise, the bytes are decoded in pure Javascript: decoding stops at
 * the first zero byte, which is treated as a null terminator, and
 * malformed input causes an exception.
 *
 * @param {Array.<number>|Uint8Array} bytes array-like of UTF-8 byte values
 * @returns {string} the decoded string
 * @throws {string} "invalid utf-8 bytes" if the pure Javascript decoder
 * finds an invalid lead byte, a truncated sequence, or a trailing byte
 * that does not have the form 10xxxxxx
 */
UTF8.prototype.mapToUnicode = function (bytes) {
    if (typeof(Buffer) !== "undefined") {
        // nodejs can convert it quickly in native code. Buffer.from is
        // the replacement for the deprecated Buffer constructor
        var b = Buffer.from(bytes);
        return b.toString("utf8");
    }
    // otherwise we have to implement it in pure JS
    var ret = "";
    var i = 0;
    while (i < bytes.length) {
        if (bytes[i] === 0) {
            // null-terminator
            i = bytes.length;
        } else if ((bytes[i] & 0x80) === 0) {
            // 1 byte char
            ret += String.fromCharCode(bytes[i++]);
        } else if ((bytes[i] & 0xE0) === 0xC0) {
            // 2 byte char
            if (i + 1 >= bytes.length || !isTrailByte(bytes[i+1])) {
                throw "invalid utf-8 bytes";
            }
            // xxx xxyyyyyy
            ret += String.fromCharCode((bytes[i] & 0x1F) << 6 | (bytes[i+1] & 0x3F));
            i += 2;
        } else if ((bytes[i] & 0xF0) === 0xE0) {
            // 3 byte char
            if (i + 2 >= bytes.length || !isTrailByte(bytes[i+1]) || !isTrailByte(bytes[i+2])) {
                throw "invalid utf-8 bytes";
            }
            // xxxxyyyy yyzzzzzz
            ret += String.fromCharCode((bytes[i] & 0xF) << 12 | (bytes[i+1] & 0x3F) << 6 | (bytes[i+2] & 0x3F));
            i += 3;
        } else if ((bytes[i] & 0xF8) === 0xF0) {
            // 4 byte char
            if (i + 3 >= bytes.length || !isTrailByte(bytes[i+1]) || !isTrailByte(bytes[i+2]) || !isTrailByte(bytes[i+3])) {
                throw "invalid utf-8 bytes";
            }
            // wwwxx xxxxyyyy yyzzzzzz
            // the result is above U+FFFF, so it cannot go through
            // String.fromCharCode, which only takes 16-bit code units
            ret += IString.fromCodePoint((bytes[i] & 0x7) << 18 | (bytes[i+1] & 0x3F) << 12 | (bytes[i+2] & 0x3F) << 6 | (bytes[i+3] & 0x3F));
            i += 4;
        } else {
            throw "invalid utf-8 bytes";
        }
    }

    return ret;
};

/**
 * Convert a string into an array of UTF-8 bytes.
 *
 * When a global Buffer is available (nodejs), the conversion is delegated
 * to Buffer and the result contains exactly the encoded bytes. Otherwise,
 * the string is encoded in pure Javascript into an array that is sized
 * for the worst case of 4 bytes per character plus a null terminator, so
 * the encoded bytes are followed by one or more zero bytes.
 *
 * @param {string|IString} str the string to encode
 * @returns {Uint8Array} the UTF-8 encoding of the string
 */
UTF8.prototype.mapToNative = function(str) {
    if (typeof(Buffer) !== "undefined") {
        // nodejs can convert it quickly in native code. Buffer.from is
        // the replacement for the deprecated Buffer constructor
        var b = Buffer.from(str, "utf8");
        return new Uint8Array(b);
    }
    // otherwise we have to implement it in pure JS
    var istr = (str instanceof IString) ? str : new IString(str);

    // step through the surrogate pairs as single code points by using
    // IString's iterator
    var it = istr.iterator();

    // multiply by 4 because the max size of a UTF-8 char is 4 bytes, so
    // this will at least get us enough room to encode everything. Add 1
    // for the null terminator
    var ret = new Uint8Array(istr.length * 4 + 1);
    var i = 0;

    while (it.hasNext()) {
        var c = it.next();
        if (c > 0x7F) {
            if (c > 0x7FF) {
                if (c > 0xFFFF) {
                    // astral planes char. The lead byte carries the top 3
                    // bits (18 through 20) of the code point, so the mask
                    // has to be 0x7. A 2-bit mask drops bit 20 and so
                    // corrupts everything from U+100000 to U+10FFFF.
                    ret[i] = 0xF0 | ((c >> 18) & 0x7);
                    ret[i+1] = 0x80 | ((c >> 12) & 0x3F);
                    ret[i+2] = 0x80 | ((c >> 6) & 0x3F);
                    ret[i+3] = 0x80 | (c & 0x3F);

                    i += 4;
                } else {
                    ret[i] = 0xE0 | ((c >> 12) & 0xF);
                    ret[i+1] = 0x80 | ((c >> 6) & 0x3F);
                    ret[i+2] = 0x80 | (c & 0x3F);

                    i += 3;
                }
            } else {
                ret[i] = 0xC0 | ((c >> 6) & 0x1F);
                ret[i+1] = 0x80 | (c & 0x3F);

                i += 2;
            }
        } else {
            ret[i++] = (c & 0x7F);
        }
    }
    ret[i] = 0; // null-terminate it

    return ret;
};

// register this class as the algorithmic implementation of the UTF-8 charmap
Charmap._algorithms["UTF-8"] = UTF8;

module.exports = UTF8;