/*
* GlyphString.js - ilib string subclass that allows you to access
* whole glyphs at a time
*
* Copyright © 2015-2018, 2023 JEDLSoft
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// !data ccc nfc ctype_m
var ilib = require("../index.js");
var Utils = require("./Utils.js");
var JSUtils = require("./JSUtils.js");
var IString = require("./IString.js");
var CType = require("./CType.js");
/**
* @class
* Create a new glyph string instance. This string inherits from
* the IString class, and adds methods that allow you to access
* whole glyphs at a time. <p>
*
* In Unicode, various accented characters can be created by using
* a base character and one or more combining characters following
* it. These appear on the screen to the user as a single glyph.
* For example, the Latin character "a" (U+0061) followed by the
* combining diaeresis character "¨" (U+0308) combine together to
* form the "a with diaeresis" glyph "ä", which looks like a single
* character on the screen.<p>
*
* The big problem with combining characters for web developers is
* that many CSS engines do not ellipsize text between glyphs. They
* only deal with single Unicode characters. So if a particular space
* only allows for 4 characters, the CSS engine will truncate a
* string at 4 Unicode characters and then add the ellipsis (...)
* character. What if the fourth Unicode character is the "a" and
* the fifth one is the diaeresis? Then a string like "xxxäxxx" that
* is ellipsized at 4 characters will appear as "xxxa..." on the
* screen instead of "xxxä...".<p>
*
* In the Latin script as it is commonly used, it is not so common
* to form accented characters using combining accents, so the above
* example is mostly for illustrative purposes. It is not unheard of
* however. The situation is much, much worse in scripts such as Thai and
* Devanagari that normally make very heavy use of combining characters.
* These scripts do so because Unicode does not include pre-composed
* versions of the accented characters like it does for Latin, so
* combining accents are the only way to create these accented and
* combined versions of the characters.<p>
*
* The solution to this problem is not to use the CSS property
* "text-overflow: ellipsis" in your web site, ever. Instead, use
* a glyph string to truncate text between glyphs dynamically,
* rather than truncating between Unicode characters using CSS.<p>
*
* Glyph strings are also useful for truncation, hyphenation, and
* line wrapping, as all of these should be done between glyphs instead
* of between characters.<p>
*
* The options parameter is optional, and may contain any combination
* of the following properties:<p>
*
* <ul>
* <li><i>onLoad</i> - a callback function to call when the locale data are
* fully loaded. When the onLoad option is given, this object will attempt to
* load any missing locale data using the ilib loader callback.
* When the constructor is done (even if the data is already preassembled), the
* onLoad function is called with the current instance as a parameter, so this
* callback can be used with preassembled or dynamic loading or a mix of the two.
*
* <li><i>sync</i> - tell whether to load any missing locale data synchronously or
* asynchronously. If this option is given as "false", then the "onLoad"
* callback must be given, as the instance returned from this constructor will
* not be usable for a while.
*
* <li><i>loadParams</i> - an object containing parameters to pass to the
* loader callback function when locale data is missing. The parameters are not
* interpreted or modified in any way. They are simply passed along. The object
* may contain any property/value pairs as long as the calling code is in
* agreement with the loader callback function as to what those parameters mean.
* </ul>
*
* @constructor
* @extends IString
* @param {string|IString=} str initialize this instance with this string
* @param {Object=} options options governing the way this instance works
*/
var GlyphString = function (str, options) {
    // The "noinstance" option asks for a bare object: skip initialization
    // and data loading entirely.
    if (options && options.noinstance) {
        return;
    }

    IString.call(this, str);

    // With no options object at all, load synchronously.
    // NOTE(review): when options is supplied without a "sync" property,
    // undefined is handed to the loaders as-is -- confirm that the loaders
    // treat that the way callers expect.
    options = options || {sync: true};

    var self = this;

    // Hand the finished instance to the caller's onLoad callback, if any.
    function notifyLoaded() {
        if (typeof(options.onLoad) === 'function') {
            options.onLoad(self);
        }
    }

    // Request one non-locale data file on behalf of this class and pass
    // whatever the loader produces to "callback".
    function loadTable(name, callback) {
        Utils.loadData({
            object: "GlyphString",
            locale: "-",
            name: name,
            nonlocale: true,
            sync: options.sync,
            loadParams: options.loadParams,
            callback: callback
        });
    }

    // Make sure the NFC composition table is present, then notify the caller.
    function ensureNfc() {
        if (ilib.data.norm.nfc && !JSUtils.isEmpty(ilib.data.norm.nfc)) {
            notifyLoaded();
            return;
        }
        loadTable("nfc/all.json", function (nfc) {
            ilib.data.norm.nfc = nfc;
            notifyLoaded();
        });
    }

    // Make sure the combining class table is present, then continue with the
    // NFC table. NFC is checked on both paths: a cached ccc table says nothing
    // about whether the NFC table has been loaded as well.
    function ensureCcc() {
        if (ilib.data.ccc && !JSUtils.isEmpty(ilib.data.ccc)) {
            ensureNfc();
            return;
        }
        loadTable("ccc.json", function (ccc) {
            ilib.data.ccc = ccc;
            ensureNfc();
        });
    }

    // Load order: ctype_m first, then ccc, then nfc, then onLoad.
    CType._load("ctype_m", options.sync, options.loadParams, ensureCcc);
};
// Wire up inheritance: the prototype is a blank IString instance that also
// records its parent class and points "constructor" back at GlyphString.
GlyphString.prototype = (function () {
    var proto = new IString(undefined);
    proto.parent = IString;
    proto.constructor = GlyphString;
    return proto;
}());
/**
 * Test whether a code point is a leading consonant (Choseong) Jamo,
 * that is, whether it lies in U+1100 through U+1112 inclusive.
 *
 * @private
 * @static
 * @param {number} n code point to check
 * @return {boolean} true when n is within the leading Jamo range,
 * false otherwise
 */
GlyphString._isJamoL = function (n) {
    var firstLead = 0x1100;
    var lastLead = 0x1112;
    return firstLead <= n && n <= lastLead;
};
/**
 * Test whether a code point is a vowel (Jungseong) Jamo, that is,
 * whether it lies in U+1161 through U+1175 inclusive.
 *
 * @private
 * @static
 * @param {number} n code point to check
 * @return {boolean} true when n is within the vowel Jamo range,
 * false otherwise
 */
GlyphString._isJamoV = function (n) {
    var firstVowel = 0x1161;
    var lastVowel = 0x1175;
    return firstVowel <= n && n <= lastVowel;
};
/**
 * Test whether a code point is a trailing consonant (Jongseong) Jamo,
 * that is, whether it lies in U+11A8 through U+11C2 inclusive.
 *
 * @private
 * @static
 * @param {number} n code point to check
 * @return {boolean} true when n is within the trailing Jamo range,
 * false otherwise
 */
GlyphString._isJamoT = function (n) {
    var firstTrail = 0x11A8;
    var lastTrail = 0x11C2;
    return firstTrail <= n && n <= lastTrail;
};
/**
 * Test whether a code point is a precomposed Hangul syllable made of a
 * leading consonant and a vowel only (an "LV" syllable, with no trailing
 * consonant).
 *
 * The precomposed block starts at U+AC00 and holds 19 * 21 * 28 syllables,
 * laid out so that every 28th one, counting from the start of the block,
 * has no trailing consonant.
 *
 * @private
 * @static
 * @param {number} n code point to check
 * @return {boolean} true when n is an LV syllable, false otherwise
 */
GlyphString._isJamoLV = function (n) {
    var trailsPerBlock = 28;
    var totalSyllables = 19 * 21 * trailsPerBlock;
    var offset = n - 0xAC00;

    // Only offsets inside the precomposed block can qualify.
    var insideBlock = offset >= 0 && offset < totalSyllables;

    // Within the block, a zero remainder means "no trailing consonant".
    return insideBlock && offset % trailsPerBlock === 0;
};
/**
 * Test whether a code point is any precomposed Hangul syllable, with or
 * without a trailing consonant, that is, whether it lies in U+AC00
 * through U+D7A3 inclusive.
 *
 * @private
 * @static
 * @param {number} n code point to check
 * @return {boolean} true when n is within the precomposed Hangul range,
 * false otherwise
 */
GlyphString._isHangul = function (n) {
    var firstSyllable = 0xAC00;
    var lastSyllable = 0xD7A3;
    return firstSyllable <= n && n <= lastSyllable;
};
/**
 * Arithmetically combine a leading consonant Jamo and a vowel Jamo into
 * the matching precomposed Hangul syllable. No range checking is done
 * here; callers are expected to pass code points that are already known
 * to be a leading Jamo and a vowel Jamo respectively.
 *
 * @private
 * @static
 * @param {number} lead the code point of the leading consonant Jamo
 * @param {number} trail the code point of the Jamo that follows the
 * lead, which for this function is the vowel
 * @return {string} the composed Hangul character
 */
GlyphString._composeJamoLV = function (lead, trail) {
    var syllableBase = 0xAC00;
    var vowelCount = 21;
    var trailCount = 28;

    var leadOffset = lead - 0x1100;
    var vowelOffset = trail - 0x1161;

    // Each lead owns vowelCount * trailCount consecutive syllables and each
    // vowel owns trailCount of those; the first of them has no trailing
    // consonant.
    var syllable = syllableBase + (leadOffset * vowelCount + vowelOffset) * trailCount;
    return IString.fromCodePoint(syllable);
};
/**
 * Arithmetically combine a precomposed lead+vowel Hangul syllable with a
 * trailing consonant Jamo, giving the precomposed syllable that carries
 * all three parts. No range checking is done here.
 *
 * @private
 * @static
 * @param {number} lead the code point of the lead+vowel Hangul syllable
 * @param {number} trail the code point of the trailing consonant Jamo
 * @return {string} the composed Hangul character
 */
GlyphString._composeJamoLVT = function (lead, trail) {
    // Offsets are measured from U+11A7, so the code point right after it
    // moves one syllable forward from the lead+vowel syllable.
    var trailBase = 0x11A7;
    var syllable = lead + (trail - trailBase);
    return IString.fromCodePoint(syllable);
};
/**
 * Try to combine two characters into a single composed character.
 * Korean is handled arithmetically: a lead+vowel syllable followed by a
 * trailing Jamo, or a leading Jamo followed by a vowel Jamo. Any other
 * pair is looked up in the nfc table, keyed by the two characters
 * concatenated. Only the first UTF-16 unit of each argument is examined
 * for the Korean checks.
 *
 * @private
 * @static
 * @param {string} lead leading character to compose
 * @param {string} trail the trailing character to compose
 * @return {string|undefined} the composed character, or undefined if the
 * nfc table has no entry for this pair (when the nfc table itself is
 * missing, its falsy value is returned as-is)
 */
GlyphString._compose = function (lead, trail) {
    var leadUnit = lead.charCodeAt(0);
    var trailUnit = trail.charCodeAt(0);

    // Hangul: LV syllable + T, checked before L + V.
    if (GlyphString._isJamoLV(leadUnit) && GlyphString._isJamoT(trailUnit)) {
        return GlyphString._composeJamoLVT(leadUnit, trailUnit);
    }
    if (GlyphString._isJamoL(leadUnit) && GlyphString._isJamoV(trailUnit)) {
        return GlyphString._composeJamoLV(leadUnit, trailUnit);
    }

    // Everything else: table lookup on the concatenated pair.
    var pair = lead + trail;
    var table = ilib.data.norm.nfc;
    return table && table[pair];
};
/**
 * Return an iterator that steps through the string one glyph at a time.
 * A glyph here is a base character together with every following
 * character that attaches to it, so that decomposed sequences come back
 * as the single unit a user would see on the screen.<p>
 *
 * Iteration is layered on top of the IString character iterator, which
 * supplies the individual characters. Starting from a base character
 * (a "starter": combining class 0 or absent from the ccc table), the
 * iterator keeps appending following characters while they are:
 *
 * <ul>
 * <li>non-spacing marks (Mn) or spacing combining marks (Mc) according to
 * the ctype_m data,
 * <li>characters with a non-zero combining class, or
 * <li>a further starter that composes with the current base character
 * (for example conjoining Korean Jamo), provided nothing blocked it.
 * </ul>
 *
 * If the combining class table is not available, no grouping is
 * attempted and the characters are returned exactly as the underlying
 * iterator produces them.<p>
 *
 * The iterator instance returned has three methods: hasNext(), which
 * returns true if there are more glyphs to iterate through; next(),
 * which returns the next glyph; and wasSpacingCombining(), which tells
 * whether the glyph most recently returned by next() contained a spacing
 * combining mark and is therefore wider than its base character alone.<p>
 *
 * @override
 * @return {Object} an iterator
 * that iterates through all the glyphs in the string
 */
GlyphString.prototype.charIterator = function() {
    var it = IString.prototype.charIterator.call(this);

    /**
     * @private
     */
    function _chiterator (istring) {
        this.index = 0;
        this.spacingCombining = false;

        this.hasNext = function () {
            // a starter read ahead by the previous next() call counts as pending
            return !!this.nextChar || it.hasNext();
        };

        this.next = function () {
            var cccTable = ilib.data.ccc,
                ch = this.nextChar || it.next(),
                prevCcc,
                nextCcc,
                composed = ch;

            this.nextChar = undefined;
            this.spacingCombining = false;

            // Without the combining class table there is nothing to group by.
            // Check before indexing into the table so that a missing table
            // yields plain characters instead of a TypeError.
            if (!cccTable) {
                return ch;
            }

            prevCcc = cccTable[ch];

            if (typeof(prevCcc) === 'undefined' || prevCcc === 0) {
                // found a starter... find all the non-starters until the next starter. Must include
                // the next starter because under some odd circumstances, two starters sometimes recompose
                // together to form another character
                var notdone = true;
                while (it.hasNext() && notdone) {
                    this.nextChar = it.next();
                    nextCcc = cccTable[this.nextChar];
                    var codePoint = IString.toCodePoint(this.nextChar, 0);

                    // Mn characters are Marks that are non-spacing. These do not take more room than an accent,
                    // so they should be considered part of the on-screen glyph, even if they are non-combining.
                    // Mc are marks that are spacing and combining, which means they are part of the glyph, but
                    // they cause the glyph to use up more space than just the base character alone.
                    var isMn = CType._inRange(codePoint, "Mn", ilib.data.ctype_m);
                    var isMc = CType._inRange(codePoint, "Mc", ilib.data.ctype_m);

                    if (isMn || isMc || (typeof(nextCcc) !== 'undefined' && nextCcc !== 0)) {
                        if (isMc) {
                            this.spacingCombining = true;
                        }
                        ch += this.nextChar;
                        this.nextChar = undefined;
                    } else {
                        // found the next starter. See if this can be composed with the previous starter
                        var testChar = GlyphString._compose(composed, this.nextChar);
                        // NOTE(review): prevCcc is undefined rather than 0 when the preceding
                        // character is absent from the ccc table, so this branch only fires for
                        // characters the table lists explicitly with class 0 -- confirm against
                        // the contents of the ccc data.
                        if (prevCcc === 0 && typeof(testChar) !== 'undefined') {
                            // not blocked and there is a mapping
                            composed = testChar;
                            ch += this.nextChar;
                            this.nextChar = undefined;
                        } else {
                            // finished iterating, leave this.nextChar for the next next() call
                            notdone = false;
                        }
                    }
                    prevCcc = nextCcc;
                }
            }
            return ch;
        };

        // Returns true if the last character returned by the "next" method included
        // spacing combining characters. If it does, then the character was wider than
        // just the base character alone, and the truncation code will not add it.
        this.wasSpacingCombining = function() {
            return this.spacingCombining;
        };
    };

    return new _chiterator(this);
};
/**
 * Cut the string down to at most the given number of whole glyphs and
 * return the result. Glyph boundaries come from this instance's
 * charIterator().
 *
 * @param {number} length the number of whole glyphs to keep in the string
 * @return {string} a string truncated to the requested number of glyphs
 */
GlyphString.prototype.truncate = function(length) {
    var glyphs = this.charIterator();
    var result = "";
    var taken = 0;

    // Everything except the final glyph is copied unconditionally.
    while (taken < length - 1 && glyphs.hasNext()) {
        result += glyphs.next();
        taken++;
    }

    // The final glyph is special. When it carries spacing combining marks it
    // is assumed to occupy more horizontal room than its base character, so
    // keeping it would overflow the requested length. Dropping only the
    // marks is not an option either, because the bare base character is a
    // different character; the whole glyph is left out instead.
    if (taken < length && glyphs.hasNext()) {
        var finalGlyph = glyphs.next();
        if (!glyphs.wasSpacingCombining()) {
            result += finalGlyph;
        }
    }

    return result;
};
/**
 * Truncate the string and append an ellipsis to show that there is more
 * to it. The ellipsis counts as the last glyph of the result, so the text
 * itself is truncated at length-1 glyphs (or at zero glyphs when length
 * is not greater than zero). The ellipsis is always appended.
 *
 * @param {number} length the number of whole glyphs to keep in the string
 * including the ellipsis
 * @return {string} a string truncated to the requested number of glyphs
 * with an ellipsis
 */
GlyphString.prototype.ellipsize = function(length) {
    // Reserve one glyph for the ellipsis itself.
    var keep = 0;
    if (length > 0) {
        keep = length - 1;
    }
    return this.truncate(keep) + "…";
};
module.exports = GlyphString;
Source