1 /*
  2  * GlyphString.js - ilib string subclass that allows you to access
  3  * whole glyphs at a time
  4  *
  5  * Copyright © 2015-2018, JEDLSoft
  6  *
  7  * Licensed under the Apache License, Version 2.0 (the "License");
  8  * you may not use this file except in compliance with the License.
  9  * You may obtain a copy of the License at
 10  *
 11  *     http://www.apache.org/licenses/LICENSE-2.0
 12  *
 13  * Unless required by applicable law or agreed to in writing, software
 14  * distributed under the License is distributed on an "AS IS" BASIS,
 15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 16  *
 17  * See the License for the specific language governing permissions and
 18  * limitations under the License.
 19  */
 20 
 21 // !data ccc nfc ctype_m
 22 
 23 var ilib = require("./ilib.js");
 24 var Utils = require("./Utils.js");
 25 var JSUtils = require("./JSUtils.js");
 26 
 27 var IString = require("./IString.js");
 28 var CType = require("./CType.js");
 29 
 30 /**
 31  * @class
 32  * Create a new glyph string instance. This string inherits from
 33  * the IString class, and adds methods that allow you to access
 34  * whole glyphs at a time. <p>
 35  *
 36  * In Unicode, various accented characters can be created by using
 37  * a base character and one or more combining characters following
 38  * it. These appear on the screen to the user as a single glyph.
 39  * For example, the Latin character "a" (U+0061) followed by the
 40  * combining diaeresis character "¨" (U+0308) combine together to
 41  * form the "a with diaeresis" glyph "ä", which looks like a single
 42  * character on the screen.<p>
 43  *
 44  * The big problem with combining characters for web developers is
 45  * that many CSS engines do not ellipsize text between glyphs. They
 46  * only deal with single Unicode characters. So if a particular space
 47  * only allows for 4 characters, the CSS engine will truncate a
 48  * string at 4 Unicode characters and then add the ellipsis (...)
 49  * character. What if the fourth Unicode character is the "a" and
 50  * the fifth one is the diaeresis? Then a string like "xxxäxxx" that
 51  * is ellipsized at 4 characters will appear as "xxxa..." on the
 52  * screen instead of "xxxä...".<p>
 53  *
 54  * In the Latin script as it is commonly used, it is not so common
 55  * to form accented characters using combining accents, so the above
 56  * example is mostly for illustrative purposes. It is not unheard of
 57  * however. The situation is much, much worse in scripts such as Thai and
 58  * Devanagari that normally make very heavy use of combining characters.
 59  * These scripts do so because Unicode does not include pre-composed
 60  * versions of the accented characters like it does for Latin, so
 61  * combining accents are the only way to create these accented and
 62  * combined versions of the characters.<p>
 63  *
 64  * The solution to this problem is not to use the CSS property
 65  * "text-overflow: ellipsis" in your web site, ever. Instead, use
 66  * a glyph string to truncate text between glyphs dynamically,
 67  * rather than truncating between Unicode characters using CSS.<p>
 68  *
 69  * Glyph strings are also useful for truncation, hyphenation, and
 70  * line wrapping, as all of these should be done between glyphs instead
 71  * of between characters.<p>
 72  *
 73  * The options parameter is optional, and may contain any combination
 74  * of the following properties:<p>
 75  *
 76  * <ul>
 77  * <li><i>onLoad</i> - a callback function to call when the locale data are
 78  * fully loaded. When the onLoad option is given, this object will attempt to
 79  * load any missing locale data using the ilib loader callback.
 80  * When the constructor is done (even if the data is already preassembled), the
 81  * onLoad function is called with the current instance as a parameter, so this
 82  * callback can be used with preassembled or dynamic loading or a mix of the two.
 83  *
 84  * <li><i>sync</i> - tell whether to load any missing locale data synchronously or
 85  * asynchronously. If this option is given as "false", then the "onLoad"
 86  * callback must be given, as the instance returned from this constructor will
 87  * not be usable for a while.
 88  *
 89  * <li><i>loadParams</i> - an object containing parameters to pass to the
 90  * loader callback function when locale data is missing. The parameters are not
 91  * interpreted or modified in any way. They are simply passed along. The object
 92  * may contain any property/value pairs as long as the calling code is in
 93  * agreement with the loader callback function as to what those parameters mean.
 94  * </ul>
 95  *
 96  * @constructor
 97  * @extends IString
 98  * @param {string|IString=} str initialize this instance with this string
 99  * @param {Object=} options options governing the way this instance works
100  */
var GlyphString = function (str, options) {
    // A "noinstance" request yields a bare object: the string is not
    // initialized and no data is loaded.
    if (options && options.noinstance) {
        return;
    }

    IString.call(this, str);

    // When the caller gives no options at all, load synchronously.
    var settings = options || {sync: true};

    // Hand the finished instance to the caller's onLoad callback, if any.
    function notifyLoaded(instance) {
        if (typeof(settings.onLoad) === 'function') {
            settings.onLoad(instance);
        }
    }

    // The data is fetched as a chain: ctype_m first, then the combining
    // class table (ccc), then the nfc composition table. Each step only
    // starts once the previous one has called back.
    CType._load("ctype_m", settings.sync, settings.loadParams, ilib.bind(this, function () {
        if (ilib.data.ccc && !JSUtils.isEmpty(ilib.data.ccc)) {
            // combining class data is already present, so nothing is loaded
            notifyLoaded(this);
            return;
        }

        Utils.loadData({
            object: "GlyphString",
            locale: "-",
            name: "ccc.json",
            nonlocale: true,
            sync: settings.sync,
            loadParams: settings.loadParams,
            callback: ilib.bind(this, function (cccData) {
                ilib.data.ccc = cccData;

                if (ilib.data.norm.nfc && !JSUtils.isEmpty(ilib.data.norm.nfc)) {
                    // composition data is already present
                    notifyLoaded(this);
                    return;
                }

                Utils.loadData({
                    object: "GlyphString",
                    locale: "-",
                    name: "nfc/all.json",
                    nonlocale: true,
                    sync: settings.sync,
                    loadParams: settings.loadParams,
                    callback: ilib.bind(this, function (nfcData) {
                        ilib.data.norm.nfc = nfcData;
                        notifyLoaded(this);
                    })
                });
            })
        });
    }));
};

// Inherit from IString through an instance created with no string, and
// record the parent class and the proper constructor on the prototype.
GlyphString.prototype = new IString(undefined);
GlyphString.prototype.parent = IString;
GlyphString.prototype.constructor = GlyphString;
154 
/**
 * Test whether a code point is a leading Jamo (Choseong) character,
 * that is, whether it lies in the range U+1100 through U+1112 inclusive.
 *
 * @private
 * @static
 * @param {number} n code point to check
 * @return {boolean} true if the code point is within the leading Jamo
 * range, false otherwise
 */
GlyphString._isJamoL = function (n) {
    var firstLeading = 0x1100;
    var lastLeading = 0x1112;
    return firstLeading <= n && n <= lastLeading;
};
167 
/**
 * Test whether a code point is a vowel Jamo (Jungseong) character,
 * that is, whether it lies in the range U+1161 through U+1175 inclusive.
 *
 * @private
 * @static
 * @param {number} n code point to check
 * @return {boolean} true if the code point is within the vowel Jamo
 * range, false otherwise
 */
GlyphString._isJamoV = function (n) {
    var firstVowel = 0x1161;
    var lastVowel = 0x1175;
    return firstVowel <= n && n <= lastVowel;
};
180 
/**
 * Test whether a code point is a trailing Jamo (Jongseong) character,
 * that is, whether it lies in the range U+11A8 through U+11C2 inclusive.
 *
 * @private
 * @static
 * @param {number} n code point to check
 * @return {boolean} true if the code point is within the trailing Jamo
 * range, false otherwise
 */
GlyphString._isJamoT = function (n) {
    var firstTrailing = 0x11A8;
    var lastTrailing = 0x11C2;
    return firstTrailing <= n && n <= lastTrailing;
};
193 
/**
 * Test whether a code point is a precomposed Hangul syllable made of a
 * leading consonant and a vowel only (an "LV" syllable), with no
 * trailing consonant.
 *
 * @private
 * @static
 * @param {number} n code point to check
 * @return {boolean} true if the code point is an LV syllable,
 * false otherwise
 */
GlyphString._isJamoLV = function (n) {
    // The syllable block starts at U+AC00 and holds 19 * 21 * 28 syllables,
    // laid out so that each run of 28 begins with the LV form.
    var trailingForms = 28;
    var totalSyllables = 19 * 21 * trailingForms;
    var offset = n - 0xAC00;

    return offset >= 0 && offset < totalSyllables && offset % trailingForms === 0;
};
220 
/**
 * Test whether a code point is a precomposed Hangul syllable, that is,
 * whether it lies in the range U+AC00 through U+D7A3 inclusive. Both
 * LV and LVT syllables fall in this range.
 *
 * @private
 * @static
 * @param {number} n code point to check
 * @return {boolean} true if the code point is a precomposed Hangul
 * syllable, false otherwise
 */
GlyphString._isHangul = function (n) {
    var firstSyllable = 0xAC00;
    var lastSyllable = 0xD7A3;
    return firstSyllable <= n && n <= lastSyllable;
};
234 
/**
 * Algorithmically combine a leading (L) Jamo and a vowel (V) Jamo into
 * the corresponding precomposed Hangul LV syllable. No range checking
 * is done here; both code points should already be in the proper
 * ranges for L and V characters.
 *
 * @private
 * @static
 * @param {number} lead the code point of the leading Jamo character
 * @param {number} trail the code point of the vowel Jamo character that
 * follows the lead
 * @return {string} the composed Hangul character
 */
GlyphString._composeJamoLV = function (lead, trail) {
    // Each leading consonant owns 21 vowel slots, and each LV pair owns
    // 28 consecutive syllables, the first of which is the bare LV form.
    var leadOffset = (lead - 0x1100) * 21;
    var vowelOffset = trail - 0x1161;
    return IString.fromCodePoint(0xAC00 + (leadOffset + vowelOffset) * 28);
};
251 
/**
 * Algorithmically combine a precomposed Hangul LV syllable and a
 * trailing (T) Jamo into the corresponding precomposed LVT syllable.
 * No range checking is done here.
 *
 * @private
 * @static
 * @param {number} lead the code point of the Hangul LV syllable
 * @param {number} trail the code point of the trailing Jamo T character
 * @return {string} the composed Hangul character
 */
GlyphString._composeJamoLVT = function (lead, trail) {
    // The offset is taken from U+11A7, one before the first trailing Jamo
    // accepted elsewhere in this file (U+11A8), so that Jamo maps to lead + 1.
    var trailingIndex = trail - 0x11A7;
    return IString.fromCodePoint(lead + trailingIndex);
};
265 
/**
 * Try to combine a leading character and a trailing character into a
 * single composed character. Korean Jamo sequences (LV syllable + T, or
 * L + V) are composed algorithmically. Every other pair is looked up in
 * the nfc table under the key formed by concatenating the two strings.
 *
 * Only the first UTF-16 code unit of each argument is examined when
 * testing for Jamo.
 *
 * @private
 * @static
 * @param {string} lead leading character to compose
 * @param {string} trail the trailing character to compose
 * @return {string|undefined} the fully composed character, or undefined
 * if the nfc table has no entry for the pair or is not available
 */
GlyphString._compose = function (lead, trail) {
    var leadCode = lead.charCodeAt(0);
    var trailCode = trail.charCodeAt(0);

    // LV syllable followed by a trailing consonant
    if (GlyphString._isJamoLV(leadCode) && GlyphString._isJamoT(trailCode)) {
        return GlyphString._composeJamoLVT(leadCode, trailCode);
    }

    // leading consonant followed by a vowel
    if (GlyphString._isJamoL(leadCode) && GlyphString._isJamoV(trailCode)) {
        return GlyphString._composeJamoLV(leadCode, trailCode);
    }

    // anything else is composed by table lookup
    var pair = lead + trail;
    var table = ilib.data.norm.nfc;
    return (table && table[pair]);
};
291 
/**
 * Return an iterator that will step through all of the characters
 * in the string one at a time, taking care to step through decomposed
 * characters and through surrogate pairs in the UTF-16 encoding
 * as single characters. <p>
 *
 * The GlyphString class will return decomposed Unicode characters
 * as a single unit that a user might see on the screen as a single
 * glyph. If the next character in the iteration is a base character
 * and it is followed by combining characters, the base and all its
 * following combining characters are returned as a single unit.<p>
 *
 * The standard Javascript String's charAt() method only
 * returns information about a particular 16-bit character in the
 * UTF-16 encoding scheme. If the index is pointing to a low- or
 * high-surrogate character, it will return that surrogate character
 * rather than the surrogate pair which represents a character
 * in the supplementary planes.<p>
 *
 * The iterator instance returned has three methods: hasNext(), which
 * returns true if the iterator has more characters to iterate through,
 * next(), which returns the next glyph, and wasSpacingCombining(),
 * which tells whether the glyph most recently returned by next()
 * included a spacing combining mark.<p>
 *
 * If the combining class table (ilib.data.ccc) is not available, glyphs
 * cannot be assembled, and the iterator returns the characters of the
 * underlying IString iterator one at a time.
 *
 * @override
 * @return {Object} an iterator
 * that iterates through all the characters in the string
 */
GlyphString.prototype.charIterator = function() {
    var it = IString.prototype.charIterator.call(this);

    /**
     * @constructor
     */
    function _chiterator (istring) {
        this.index = 0;
        this.spacingCombining = false;
        this.hasNext = function () {
            // a starter read ahead by the previous next() call still counts
            return !!this.nextChar || it.hasNext();
        };
        this.next = function () {
            var ccc = ilib.data.ccc,
                ch = this.nextChar || it.next(),
                prevCcc,
                nextCcc,
                composed = ch;

            this.nextChar = undefined;
            this.spacingCombining = false;

            if (!ccc) {
                // Without the combining class table nothing can be looked up,
                // so check for it before indexing into it and fall back to
                // one character per step.
                return ch;
            }

            prevCcc = ccc[ch];

            if (typeof(prevCcc) === 'undefined' || prevCcc === 0) {
                // found a starter... find all the non-starters until the next starter. Must include
                // the next starter because under some odd circumstances, two starters sometimes recompose
                // together to form another character
                var notdone = true;
                while (it.hasNext() && notdone) {
                    this.nextChar = it.next();
                    nextCcc = ccc[this.nextChar];
                    var codePoint = IString.toCodePoint(this.nextChar, 0);
                    // Mn characters are Marks that are non-spacing. These do not take more room than an accent, so they should be
                    // considered part of the on-screen glyph, even if they are non-combining. Mc are marks that are spacing
                    // and combining, which means they are part of the glyph, but they cause the glyph to use up more space than
                    // just the base character alone.
                    var isMn = CType._inRange(codePoint, "Mn", ilib.data.ctype_m);
                    var isMc = CType._inRange(codePoint, "Mc", ilib.data.ctype_m);
                    if (isMn || isMc || (typeof(nextCcc) !== 'undefined' && nextCcc !== 0)) {
                        if (isMc) {
                            this.spacingCombining = true;
                        }
                        ch += this.nextChar;
                        this.nextChar = undefined;
                    } else {
                        // found the next starter. See if this can be composed with the previous starter.
                        // Composition is only accepted when the preceding character has an explicit
                        // combining class of 0 in the table.
                        var testChar = GlyphString._compose(composed, this.nextChar);
                        if (prevCcc === 0 && typeof(testChar) !== 'undefined') {
                            // not blocked and there is a mapping
                            composed = testChar;
                            ch += this.nextChar;
                            this.nextChar = undefined;
                        } else {
                            // finished iterating, leave this.nextChar for the next next() call
                            notdone = false;
                        }
                    }
                    prevCcc = nextCcc;
                }
            }
            return ch;
        };
        // Returns true if the last character returned by the "next" method included
        // spacing combining characters. If it does, then the character was wider than
        // just the base character alone, and the truncation code will not add it.
        this.wasSpacingCombining = function() {
            return this.spacingCombining;
        };
    }
    return new _chiterator(this);
};
391 
/**
 * Cut the current string down to at most the given number of whole
 * glyphs and return the result. Glyphs are taken from this instance's
 * charIterator(), so a base character and its combining marks are kept
 * or dropped together.
 *
 * @param {number} length the number of whole glyphs to keep in the string
 * @return {string} a string truncated to the requested number of glyphs
 */
GlyphString.prototype.truncate = function(length) {
    var glyphs = this.charIterator();
    var result = "";
    var taken = 0;

    // every glyph except the final one is copied unconditionally
    while (taken < length - 1 && glyphs.hasNext()) {
        result += glyphs.next();
        taken++;
    }

    if (!(taken < length && glyphs.hasNext())) {
        return result;
    }

    /*
     * The final glyph needs special care. When it includes spacing
     * combining marks, it is assumed to take up more horizontal room on
     * the screen than its base character alone, so keeping it could make
     * the text wider than the requested length. The marks cannot simply
     * be removed either, because the bare base is considered a different
     * character. In that case the whole glyph is left out.
     */
    var finalGlyph = glyphs.next();
    if (!glyphs.wasSpacingCombining()) {
        result += finalGlyph;
    }
    return result;
};
423 
/**
 * Truncate the current string at the given number of glyphs and append an
 * ellipsis to indicate that there is more to the string. The ellipsis
 * counts as the last glyph, so the text itself is truncated at length-1
 * glyphs. When length is not greater than zero, the text is truncated to
 * zero glyphs and the ellipsis is still appended.
 *
 * @param {number} length the number of whole glyphs to keep in the string
 * including the ellipsis
 * @return {string} a string truncated to the requested number of glyphs
 * with an ellipsis
 */
GlyphString.prototype.ellipsize = function(length) {
    // reserve one glyph for the ellipsis itself
    var keep = 0;
    if (length > 0) {
        keep = length - 1;
    }
    return this.truncate(keep) + "…";
};
437 
438 module.exports = GlyphString;
439