/*
 * GlyphString.js - ilib string subclass that allows you to access
 * whole glyphs at a time
 *
 * Copyright © 2015-2018, JEDLSoft
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// !data ccc nfc ctype_m

var ilib = require("./ilib.js");
var Utils = require("./Utils.js");
var JSUtils = require("./JSUtils.js");

var IString = require("./IString.js");
var CType = require("./CType.js");

/**
 * @class
 * Create a new glyph string instance. This string inherits from
 * the IString class, and adds methods that allow you to access
 * whole glyphs at a time. <p>
 *
 * In Unicode, various accented characters can be created by using
 * a base character and one or more combining characters following
 * it. These appear on the screen to the user as a single glyph.
 * For example, the Latin character "a" (U+0061) followed by the
 * combining diaeresis character "¨" (U+0308) combine together to
 * form the "a with diaeresis" glyph "ä", which looks like a single
 * character on the screen.<p>
 *
 * The big problem with combining characters for web developers is
 * that many CSS engines do not ellipsize text between glyphs. They
 * only deal with single Unicode characters. So if a particular space
 * only allows for 4 characters, the CSS engine will truncate a
 * string at 4 Unicode characters and then add the ellipsis (...)
 * character. What if the fourth Unicode character is the "a" and
 * the fifth one is the diaeresis? Then a string like "xxxäxxx" that
 * is ellipsized at 4 characters will appear as "xxxa..." on the
 * screen instead of "xxxä...".<p>
 *
 * In the Latin script as it is commonly used, it is not so common
 * to form accented characters using combining accents, so the above
 * example is mostly for illustrative purposes. It is not unheard of
 * however. The situation is much, much worse in scripts such as Thai and
 * Devanagari that normally make very heavy use of combining characters.
 * These scripts do so because Unicode does not include pre-composed
 * versions of the accented characters like it does for Latin, so
 * combining accents are the only way to create these accented and
 * combined versions of the characters.<p>
 *
 * The solution to this problem is not to use the CSS property
 * "text-overflow: ellipsis" in your web site, ever. Instead, use
 * a glyph string to truncate text between glyphs dynamically,
 * rather than truncating between Unicode characters using CSS.<p>
 *
 * Glyph strings are also useful for truncation, hyphenation, and
 * line wrapping, as all of these should be done between glyphs instead
 * of between characters.<p>
 *
 * The options parameter is optional, and may contain any combination
 * of the following properties:<p>
 *
 * <ul>
 * <li><i>onLoad</i> - a callback function to call when the locale data are
 * fully loaded. When the onLoad option is given, this object will attempt to
 * load any missing locale data using the ilib loader callback.
 * When the constructor is done (even if the data is already preassembled), the
 * onLoad function is called with the current instance as a parameter, so this
 * callback can be used with preassembled or dynamic loading or a mix of the two.
 *
 * <li><i>sync</i> - tell whether to load any missing locale data synchronously or
 * asynchronously. If this option is given as "false", then the "onLoad"
 * callback must be given, as the instance returned from this constructor will
 * not be usable for a while.
 *
 * <li><i>loadParams</i> - an object containing parameters to pass to the
 * loader callback function when locale data is missing. The parameters are not
 * interpreted or modified in any way. They are simply passed along. The object
 * may contain any property/value pairs as long as the calling code is in
 * agreement with the loader callback function as to what those parameters mean.
 * </ul>
 *
 * @constructor
 * @extends IString
 * @param {string|IString=} str initialize this instance with this string
 * @param {Object=} options options governing the way this instance works
 */
var GlyphString = function (str, options) {
    if (options && options.noinstance) {
        return;
    }

    IString.call(this, str);

    options = options || {sync: true};

    var self = this;

    // Tell the caller that all of the data this class needs is in place.
    function notifyLoaded() {
        if (typeof(options.onLoad) === 'function') {
            options.onLoad(self);
        }
    }

    // Request one non-locale data file through the ilib loader.
    function fetchData(name, callback) {
        Utils.loadData({
            object: "GlyphString",
            locale: "-",
            name: name,
            nonlocale: true,
            sync: options.sync,
            loadParams: options.loadParams,
            callback: callback
        });
    }

    // Make sure the NFC composition table is available, then finish up.
    function ensureNfc() {
        if (ilib.data.norm.nfc && !JSUtils.isEmpty(ilib.data.norm.nfc)) {
            notifyLoaded();
            return;
        }
        fetchData("nfc/all.json", function (nfc) {
            ilib.data.norm.nfc = nfc;
            notifyLoaded();
        });
    }

    // Make sure the canonical combining class table is available. The NFC
    // table is checked afterwards in either case, so that it still gets
    // loaded when the ccc table was already cached before this constructor
    // was called.
    function ensureCcc() {
        if (ilib.data.ccc && !JSUtils.isEmpty(ilib.data.ccc)) {
            ensureNfc();
            return;
        }
        fetchData("ccc.json", function (ccc) {
            ilib.data.ccc = ccc;
            ensureNfc();
        });
    }

    CType._load("ctype_m", options.sync, options.loadParams, ensureCcc);
};

GlyphString.prototype = new IString(undefined);
GlyphString.prototype.parent = IString;
GlyphString.prototype.constructor = GlyphString;

/**
 * Return true if the given character is a leading Jamo (Choseong) character.
 *
 * @private
 * @static
 * @param {number} n code point to check
 * @return {boolean} true if the character is a leading Jamo character,
 * false otherwise
 */
GlyphString._isJamoL = function (n) {
    return (n >= 0x1100 && n <= 0x1112);
};

/**
 * Return true if the given character is a vowel Jamo (Jungseong) character.
 *
 * @private
 * @static
 * @param {number} n code point to check
 * @return {boolean} true if the character is a vowel Jamo character,
 * false otherwise
 */
GlyphString._isJamoV = function (n) {
    return (n >= 0x1161 && n <= 0x1175);
};

/**
 * Return true if the given character is a trailing Jamo (Jongseong) character.
 *
 * @private
 * @static
 * @param {number} n code point to check
 * @return {boolean} true if the character is a trailing Jamo character,
 * false otherwise
 */
GlyphString._isJamoT = function (n) {
    return (n >= 0x11A8 && n <= 0x11C2);
};

/**
 * Return true if the given character is a LV Jamo character.
 * LV Jamo character is a precomposed Hangul character with LV sequence.
 *
 * @private
 * @static
 * @param {number} n code point to check
 * @return {boolean} true if the character is a LV Jamo character,
 * false otherwise
 */
GlyphString._isJamoLV = function (n) {
    var syllableBase = 0xAC00;
    var trailingJamoCount = 28;

    // Each leading/vowel pair owns a run of 28 consecutive syllables. The
    // first syllable of each run is the one without any trailing consonant.
    return GlyphString._isHangul(n) &&
        ((n - syllableBase) % trailingJamoCount === 0);
};

/**
 * Return true if the given character is a precomposed Hangul character.
 * The precomposed Hangul character may be a LV Jamo character or a LVT Jamo Character.
 *
 * @private
 * @static
 * @param {number} n code point to check
 * @return {boolean} true if the character is a precomposed Hangul character,
 * false otherwise
 */
GlyphString._isHangul = function (n) {
    return (n >= 0xAC00 && n <= 0xD7A3);
};

/**
 * Algorithmically compose an L and a V combining Jamo characters into
 * a precomposed Korean syllabic Hangul character. Both should already
 * be in the proper ranges for L and V characters.
 *
 * @private
 * @static
 * @param {number} lead the code point of the lead Jamo character to compose
 * @param {number} trail the code point of the vowel Jamo character to compose
 * @return {string} the composed Hangul character
 */
GlyphString._composeJamoLV = function (lead, trail) {
    var lindex = lead - 0x1100;
    var vindex = trail - 0x1161;
    return IString.fromCodePoint(0xAC00 + (lindex * 21 + vindex) * 28);
};

/**
 * Algorithmically compose a Hangul LV and a combining Jamo T character
 * into a precomposed Korean syllabic Hangul character.
 *
 * @private
 * @static
 * @param {number} lead the code point of the lead Hangul character to compose
 * @param {number} trail the code point of the trailing Jamo T character to compose
 * @return {string} the composed Hangul character
 */
GlyphString._composeJamoLVT = function (lead, trail) {
    return IString.fromCodePoint(lead + (trail - 0x11A7));
};

/**
 * Compose one character out of a leading character and a
 * trailing character. If the characters are Korean Jamo, they
 * will be composed algorithmically. If they are any other
 * characters, they will be looked up in the nfc tables.
 *
 * @private
 * @static
 * @param {string} lead leading character to compose
 * @param {string} trail the trailing character to compose
 * @return {string|undefined} the fully composed character, or undefined if
 * there is no composition for those two characters
 */
GlyphString._compose = function (lead, trail) {
    var first = lead.charCodeAt(0);
    var last = trail.charCodeAt(0);

    if (GlyphString._isJamoLV(first) && GlyphString._isJamoT(last)) {
        return GlyphString._composeJamoLVT(first, last);
    }
    if (GlyphString._isJamoL(first) && GlyphString._isJamoV(last)) {
        return GlyphString._composeJamoLV(first, last);
    }

    var nfc = ilib.data.norm.nfc;
    return (nfc && nfc[lead + trail]);
};

/**
 * Return an iterator that will step through all of the characters
 * in the string one at a time, taking care to step through decomposed
 * characters and through surrogate pairs in the UTF-16 encoding
 * as single characters. <p>
 *
 * The GlyphString class will return decomposed Unicode characters
 * as a single unit that a user might see on the screen as a single
 * glyph. If the
 * next character in the iteration is a base character and it is
 * followed by combining characters, the base and all its following
 * combining characters are returned as a single unit.<p>
 *
 * The standard Javascript String's charAt() method only
 * returns information about a particular 16-bit character in the
 * UTF-16 encoding scheme.
 * If the index is pointing to a low- or high-surrogate character,
 * it will return that surrogate character rather
 * than the surrogate pair which represents a character
 * in the supplementary planes.<p>
 *
 * The iterator instance returned has two methods, hasNext() which
 * returns true if the iterator has more characters to iterate through,
 * and next() which returns the next character.<p>
 *
 * @override
 * @return {Object} an iterator
 * that iterates through all the characters in the string
 */
GlyphString.prototype.charIterator = function() {
    var it = IString.prototype.charIterator.call(this);

    // Look up the canonical combining class of a character. Characters that
    // do not appear in the table are starters, which have a class of 0.
    function cccOf(table, ch) {
        var value = table[ch];
        return (typeof(value) === 'undefined') ? 0 : value;
    }

    /**
     * @constructor
     */
    function _chiterator () {
        this.index = 0;
        this.spacingCombining = false;

        this.hasNext = function () {
            return !!this.nextChar || it.hasNext();
        };

        this.next = function () {
            var cccTable = ilib.data.ccc;
            // a starter left over from the previous call goes first
            var ch = this.nextChar || it.next();
            var composed = ch;
            var prevCcc, nextCcc, codePoint, isMn, isMc, testChar;

            this.nextChar = undefined;
            this.spacingCombining = false;

            if (!cccTable) {
                // without the combining class data, glyph boundaries cannot
                // be found, so hand back one character at a time
                return ch;
            }

            prevCcc = cccOf(cccTable, ch);
            if (prevCcc !== 0) {
                // a combining character with no base is returned by itself
                return ch;
            }

            // found a starter... find all the non-starters until the next starter. Must include
            // the next starter because under some odd circumstances, two starters sometimes recompose
            // together to form another character
            var notdone = true;
            while (it.hasNext() && notdone) {
                this.nextChar = it.next();
                nextCcc = cccOf(cccTable, this.nextChar);
                codePoint = IString.toCodePoint(this.nextChar, 0);
                // Mn characters are Marks that are non-spacing. These do not take more room than an accent, so they should be
                // considered part of the on-screen glyph, even if they are non-combining. Mc are marks that are spacing
                // and combining, which means they are part of the glyph, but they cause the glyph to use up more space than
                // just the base character alone.
                isMn = CType._inRange(codePoint, "Mn", ilib.data.ctype_m);
                isMc = CType._inRange(codePoint, "Mc", ilib.data.ctype_m);
                if (isMn || isMc || nextCcc !== 0) {
                    if (isMc) {
                        this.spacingCombining = true;
                    }
                    ch += this.nextChar;
                    this.nextChar = undefined;
                } else {
                    // found the next starter. See if this can be composed with the previous starter
                    testChar = GlyphString._compose(composed, this.nextChar);
                    if (prevCcc === 0 && typeof(testChar) !== 'undefined') {
                        // not blocked and there is a mapping
                        composed = testChar;
                        ch += this.nextChar;
                        this.nextChar = undefined;
                    } else {
                        // finished iterating, leave this.nextChar for the next next() call
                        notdone = false;
                    }
                }
                prevCcc = nextCcc;
            }
            return ch;
        };

        // Returns true if the last character returned by the "next" method included
        // spacing combining characters. If it does, then the character was wider than
        // just the base character alone, and the truncation code will not add it.
        this.wasSpacingCombining = function() {
            return this.spacingCombining;
        };
    }

    return new _chiterator();
};

/**
 * Truncate the current string at the given number of whole glyphs and return
 * the resulting string.
 *
 * @param {number} length the number of whole glyphs to keep in the string
 * @return {string} a string truncated to the requested number of glyphs
 */
GlyphString.prototype.truncate = function(length) {
    var it = this.charIterator();
    var tr = "";
    var i;

    for (i = 0; i < length-1 && it.hasNext(); i++) {
        tr += it.next();
    }

    /*
     * handle the last character separately. If it contains spacing combining
     * accents, then we must assume that it uses up more horizontal space on
     * the screen than just the base character by itself, and therefore this
     * method will not truncate enough characters to fit in the given length.
     * In this case, we have to chop off not only the combining characters,
     * but also the base character as well because the base without the
     * combining accents is considered a different character.
     */
    if (i < length && it.hasNext()) {
        var c = it.next();
        if (!it.wasSpacingCombining()) {
            tr += c;
        }
    }
    return tr;
};

/**
 * Truncate the current string at the given number of glyphs and add an ellipsis
 * to indicate that there is more to the string. The ellipsis forms the last character
 * in the string, so the string is actually truncated at length-1 glyphs.
 *
 * @param {number} length the number of whole glyphs to keep in the string
 * including the ellipsis
 * @return {string} a string truncated to the requested number of glyphs
 * with an ellipsis
 */
GlyphString.prototype.ellipsize = function(length) {
    return this.truncate(length > 0 ? length-1 : 0) + "…";
};

module.exports = GlyphString;