// UTF8.js

/*
 * UTF8.js - Implement Unicode Transformation Format 8-bit mappings
 *
 * Copyright © 2014-2015, 2018, JEDLSoft
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// !data charset/UTF-8

var Charset = require("./Charset.js");
var Charmap = require("./Charmap.js");
var IString = require("./IString.js");

/**
 * @class
 * Create a new UTF-8 mapping instance.
 *
 * The options may contain any of the following properties:
 *
 * <ul>
 * <li><i>charset</i> - a Charset instance to use as-is. When given, no new
 * Charset is constructed and initialization happens immediately.
 * <li><i>sync</i> - passed through to the Charset constructor. When no
 * options are given at all, this defaults to true.
 * <li><i>loadParams</i> - passed through to the Charset constructor.
 * <li><i>onLoad</i> - a function that is called with this instance as its
 * only argument once initialization is complete.
 * </ul>
 *
 * @constructor
 * @extends Charmap
 * @param {Object=} options options controlling how this instance is built
 */
var UTF8 = function (options) {
    options = options || {sync: true};
    if (typeof(options.charset) === "object" && options.charset instanceof Charset) {
        this.charset = options.charset;
        this._init(options);
    } else {
        // Capture the instance in a closure for the Charset callback. This
        // avoids depending on a free "ilib" variable, which this file never
        // requires and which therefore may not exist at run time.
        var self = this;
        new Charset({
            name: "UTF-8",
            sync: options.sync,
            loadParams: options.loadParams,
            onLoad: function(cs) {
                self.charset = cs;
                self._init(options);
            }
        });
    }
};

// Wire up prototype-based inheritance from Charmap.
// NOTE(review): the {noinstance: true} option presumably tells the Charmap
// constructor to skip its normal per-instance setup when it is only being
// used to build a prototype object -- confirm against Charmap.js.
UTF8.prototype = new Charmap({noinstance: true});
// Keep a reference to the parent class on the prototype.
UTF8.prototype.parent = Charmap;
// Restore the constructor property, which was replaced along with the prototype.
UTF8.prototype.constructor = UTF8;

/**
 * Finish setting up this charmap instance. This recalculates the
 * expansion factor and then notifies the caller, if a callback was given.
 *
 * @private
 * @param {Object} options the options that were given to the constructor.
 * If it has an onLoad property that is a function, that function is called
 * with this instance as its only argument.
 */
UTF8.prototype._init = function(options) {
    // NOTE(review): _calcExpansionFactor is not defined in this file;
    // presumably it is inherited from Charmap -- confirm.
    this._calcExpansionFactor();

    var hasCallback = (typeof(options.onLoad) === "function");
    if (!hasCallback) {
        return;
    }

    options.onLoad(this);
};

/**
 * Check whether the given bytes form structurally valid UTF-8.
 *
 * Each sequence must start with a lead byte of the form 0xxxxxxx,
 * 110xxxxx, 1110xxxx or 11110xxx, and must be followed by the matching
 * number of trailing bytes of the form 10xxxxxx. Only the shape of the
 * sequences is checked: overlong encodings, encoded surrogates and code
 * points beyond U+10FFFF are not rejected here.
 *
 * @param {Array.<number>|Uint8Array} bytes the bytes to check
 * @return {boolean} true if every sequence is well-formed, and false
 * otherwise or when bytes is null or undefined. An empty array is valid.
 */
UTF8.prototype.validate = function(bytes) {
    if (typeof(bytes) === "undefined" || bytes === null) {
        return false;
    }

    var i = 0;
    while (i < bytes.length) {
        var lead = bytes[i];
        var len;

        // The mask has to include the first zero bit after the run of ones,
        // otherwise every multi-byte lead byte looks like a 2-byte lead.
        if ((lead & 0x80) === 0) {
            len = 1;
        } else if ((lead & 0xE0) === 0xC0) {
            len = 2;
        } else if ((lead & 0xF0) === 0xE0) {
            len = 3;
        } else if ((lead & 0xF8) === 0xF0) {
            len = 4;
        } else {
            // invalid lead byte (a stray trailing byte or 11111xxx)
            return false;
        }

        if (i + len > bytes.length) {
            // not enough trailing bytes
            return false;
        }

        for (var j = 1; j < len; j++) {
            // each trailing byte must have the form 10xxxxxx
            if ((bytes[i+j] & 0xC0) !== 0x80) {
                return false;
            }
        }

        i += len;
    }

    return true;
};

/**
 * Convert an array of UTF-8 encoded bytes into a Javascript string.
 *
 * When a global Buffer is available (as in nodejs), the conversion is
 * delegated to it and none of the handling described below applies.
 * Otherwise, the bytes are decoded in pure Javascript: decoding stops at the
 * first zero byte, which is treated as a null terminator, and malformed
 * input causes an exception.
 *
 * @param {Array.<number>|Uint8Array} bytes the UTF-8 bytes to decode
 * @return {string} the decoded string
 * @throws {string} "invalid utf-8 bytes" from the pure Javascript decoder
 * when a lead byte is invalid, or when a sequence is truncated or contains
 * a byte that is not a trailing byte of the form 10xxxxxx
 */
UTF8.prototype.mapToUnicode = function (bytes) {
    if (typeof(Buffer) !== "undefined") {
        // nodejs can convert it quickly in native code
        var b = Buffer.from(bytes);
        return b.toString("utf8");
    }

    // otherwise we have to implement it in pure JS

    // True if there is a byte at the given index and it has the trailing
    // byte form 10xxxxxx. Checking only the top bit would also accept lead
    // bytes (11xxxxxx) in trailing positions.
    function isTrailByte(index) {
        return index < bytes.length && (bytes[index] & 0xC0) === 0x80;
    }

    var ret = "";
    var i = 0;
    while (i < bytes.length) {
        if (bytes[i] === 0) {
            // null-terminator
            i = bytes.length;
        } else if ((bytes[i] & 0x80) === 0) {
            // 1 byte char
            ret += String.fromCharCode(bytes[i++]);
        } else if ((bytes[i] & 0xE0) === 0xC0) {
            // 2 byte char
            if (!isTrailByte(i+1)) {
                throw "invalid utf-8 bytes";
            }
            // xxx xxyyyyyy
            ret += String.fromCharCode((bytes[i] & 0x1F) << 6 | (bytes[i+1] & 0x3F));
            i += 2;
        } else if ((bytes[i] & 0xF0) === 0xE0) {
            // 3 byte char
            if (!isTrailByte(i+1) || !isTrailByte(i+2)) {
                throw "invalid utf-8 bytes";
            }
            // xxxxyyyy yyzzzzzz
            ret += String.fromCharCode((bytes[i] & 0xF) << 12 | (bytes[i+1] & 0x3F) << 6 | (bytes[i+2] & 0x3F));
            i += 3;
        } else if ((bytes[i] & 0xF8) === 0xF0) {
            // 4 byte char
            if (!isTrailByte(i+1) || !isTrailByte(i+2) || !isTrailByte(i+3)) {
                throw "invalid utf-8 bytes";
            }
            // wwwxx xxxxyyyy yyzzzzzz
            // the result is beyond the BMP, so it cannot go through fromCharCode
            ret += IString.fromCodePoint((bytes[i] & 0x7) << 18 | (bytes[i+1] & 0x3F) << 12 | (bytes[i+2] & 0x3F) << 6 | (bytes[i+3] & 0x3F));
            i += 4;
        } else {
            throw "invalid utf-8 bytes";
        }
    }

    return ret;
};

/**
 * Convert a string into an array of UTF-8 encoded bytes.
 *
 * When a global Buffer is available (as in nodejs), the conversion is
 * delegated to it and the result contains exactly the encoded bytes.
 * Otherwise, the string is encoded in pure Javascript into an array of
 * (string length * 4 + 1) bytes: the encoded bytes come first, followed by
 * a zero byte as a null terminator, and any remaining bytes are zero.
 *
 * @param {string|IString} str the string to encode
 * @return {Uint8Array} the UTF-8 bytes for the string
 */
UTF8.prototype.mapToNative = function(str) {
    if (typeof(Buffer) !== "undefined") {
        // nodejs can convert it quickly in native code
        var b = Buffer.from(str, "utf8");
        return new Uint8Array(b);
    }
    // otherwise we have to implement it in pure JS
    var istr = (str instanceof IString) ? str : new IString(str);

    // step through the surrogate pairs as single code points by using
    // IString's iterator
    var it = istr.iterator();

    // multiply by 4 because the max size of a UTF-8 char is 4 bytes, so
    // this will at least get us enough room to encode everything. Add 1
    // for the null terminator
    var ret = new Uint8Array(istr.length * 4 + 1);
    var i = 0;

    while (it.hasNext()) {
        var c = it.next();
        if (c > 0xFFFF) {
            // astral planes char: 11110www 10xxxxxx 10yyyyyy 10zzzzzz
            // The lead byte carries 3 payload bits (bits 18-20). Masking with
            // 0x3 would drop bit 20 and corrupt U+100000 through U+10FFFF.
            ret[i]   = 0xF0 | ((c >> 18) & 0x7);
            ret[i+1] = 0x80 | ((c >> 12) & 0x3F);
            ret[i+2] = 0x80 | ((c >> 6) & 0x3F);
            ret[i+3] = 0x80 | (c & 0x3F);

            i += 4;
        } else if (c > 0x7FF) {
            // 1110xxxx 10yyyyyy 10zzzzzz
            ret[i]   = 0xE0 | ((c >> 12) & 0xF);
            ret[i+1] = 0x80 | ((c >> 6) & 0x3F);
            ret[i+2] = 0x80 | (c & 0x3F);

            i += 3;
        } else if (c > 0x7F) {
            // 110xxxxx 10yyyyyy
            ret[i]   = 0xC0 | ((c >> 6) & 0x1F);
            ret[i+1] = 0x80 | (c & 0x3F);

            i += 2;
        } else {
            // 0xxxxxxx
            ret[i++] = (c & 0x7F);
        }
    }
    ret[i] = 0; // null-terminate it

    return ret;
};

// Register this class in Charmap's table of algorithmic mappings under the
// "UTF-8" key.
// NOTE(review): presumably Charmap consults this table to pick an
// implementation by charset name -- confirm against Charmap.js.
Charmap._algorithms["UTF-8"] = UTF8;

// The UTF8 constructor is the sole export of this module.
module.exports = UTF8;