1 /*
  2  * CType.js - Character type definitions
  3  *
  4  * Copyright © 2012-2015, 2018, JEDLSoft
  5  *
  6  * Licensed under the Apache License, Version 2.0 (the "License");
  7  * you may not use this file except in compliance with the License.
  8  * You may obtain a copy of the License at
  9  *
 10  *     http://www.apache.org/licenses/LICENSE-2.0
 11  *
 12  * Unless required by applicable law or agreed to in writing, software
 13  * distributed under the License is distributed on an "AS IS" BASIS,
 14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15  *
 16  * See the License for the specific language governing permissions and
 17  * limitations under the License.
 18  */
 19 
 20 // !data ctype
 21 
 22 var ilib = require("../index.js");
 23 var SearchUtils = require("./SearchUtils.js");
 24 var Utils = require("./Utils.js");
 25 var IString = require("./IString.js");
 26 
/**
 * Provides a set of static routines that return information about characters.
 * These routines emulate the C-library ctype functions. The characters must be
 * encoded in utf-16, as no other charsets are currently supported. Only the first
 * character of the given string is tested.
 *
 * This file attaches withinRange() and the protected helpers _inRange(),
 * _init() and _load() to this object, and exports the object as the module.
 * @namespace
 */
var CType = {};
 35 
 36 
 37 /**
 38  * Actual implementation for withinRange. Searches the given object for ranges.
 39  * The range names are taken from the Unicode range names in
 40  * http://www.unicode.org/Public/UNIDATA/extracted/DerivedGeneralCategory.txt
 41  *
 42  * <ul>
 43  * <li>Cn - Unassigned
 44  * <li>Lu - Uppercase_Letter
 45  * <li>Ll - Lowercase_Letter
 46  * <li>Lt - Titlecase_Letter
 47  * <li>Lm - Modifier_Letter
 48  * <li>Lo - Other_Letter
 49  * <li>Mn - Nonspacing_Mark
 50  * <li>Me - Enclosing_Mark
 51  * <li>Mc - Spacing_Mark
 52  * <li>Nd - Decimal_Number
 53  * <li>Nl - Letter_Number
 54  * <li>No - Other_Number
 55  * <li>Zs - Space_Separator
 56  * <li>Zl - Line_Separator
 57  * <li>Zp - Paragraph_Separator
 58  * <li>Cc - Control
 59  * <li>Cf - Format
 60  * <li>Co - Private_Use
 61  * <li>Cs - Surrogate
 62  * <li>Pd - Dash_Punctuation
 63  * <li>Ps - Open_Punctuation
 64  * <li>Pe - Close_Punctuation
 65  * <li>Pc - Connector_Punctuation
 66  * <li>Po - Other_Punctuation
 67  * <li>Sm - Math_Symbol
 68  * <li>Sc - Currency_Symbol
 69  * <li>Sk - Modifier_Symbol
 70  * <li>So - Other_Symbol
 71  * <li>Pi - Initial_Punctuation
 72  * <li>Pf - Final_Punctuation
 73  * </ul>
 74  *
 75  * @protected
 76  * @param {number} num code point of the character to examine
 77  * @param {string} rangeName the name of the range to check
 78  * @param {Object} obj object containing the character range data
 79  * @return {boolean} true if the first character is within the named
 80  * range
 81  */
 82 CType._inRange = function(num, rangeName, obj) {
 83     var range;
 84     if (num < 0 || !rangeName || !obj) {
 85         return false;
 86     }
 87 
 88     range = obj[rangeName];
 89     if (!range) {
 90         return false;
 91     }
 92 
 93     var compare = function(singlerange, target) {
 94         if (singlerange.length === 1) {
 95             return singlerange[0] - target;
 96         } else {
 97             return target < singlerange[0] ? singlerange[0] - target :
 98                 (target > singlerange[1] ? singlerange[1] - target : 0);
 99         }
100     };
101     var result = SearchUtils.bsearch(num, range, compare);
102     return result < range.length && compare(range[result], num) === 0;
103 };
104 
105 /**
106  * Return whether or not the first character is within the named range
107  * of Unicode characters. The valid list of range names are taken from
108  * the Unicode 6.0 spec. Characters in all ranges of Unicode are supported,
109  * including those supported in Javascript via UTF-16. Currently, this method
110  * supports the following range names:
111  *
112  * <ul>
113  * <li><i>ascii</i> - basic ASCII
114  * <li><i>latin</i> - Latin, Latin Extended Additional, Latin-1 supplement, Latin Extended-C, Latin Extended-D, Latin Extended-E
115  * <li><i>armenian</i>
116  * <li><i>greek</i> - Greek, Greek Extended
117  * <li><i>cyrillic</i> - Cyrillic, Cyrillic Extended-A, Cyrillic Extended-B, Cyrillic Extended-C, Cyrillic Supplement
118  * <li><i>georgian</i> - Georgian, Georgian Supplement
119  * <li><i>glagolitic</i> - Glagolitic, Glagolitic Supplement
120  * <li><i>gothic</i>
121  * <li><i>ogham</i>
122  * <li><i>oldpersian</i>
123  * <li><i>runic</i>
124  * <li><i>ipa</i> - IPA, Phonetic Extensions, Phonetic Extensions Supplement
125  * <li><i>phonetic</i>
126  * <li><i>modifiertone</i> - Modifier Tone Letters
127  * <li><i>spacing</i>
128  * <li><i>diacritics</i>
129  * <li><i>halfmarks</i> - Combining Half Marks
130  * <li><i>small</i> - Small Form Variants
131  * <li><i>bamum</i> - Bamum, Bamum Supplement
132  * <li><i>ethiopic</i> - Ethiopic, Ethiopic Extended, Ethiopic Extended-A
133  * <li><i>nko</i>
134  * <li><i>osmanya</i>
135  * <li><i>tifinagh</i>
136  * <li><i>val</i>
137  * <li><i>arabic</i> - Arabic, Arabic Supplement, Arabic Presentation Forms-A,
138  * Arabic Presentation Forms-B, Arabic Mathematical Alphabetic Symbols
139  * <li><i>carlan</i>
140  * <li><i>hebrew</i>
141  * <li><i>mandaic</i>
142  * <li><i>samaritan</i>
143  * <li><i>syriac</i>
144  * <li><i>mongolian</i>
145  * <li><i>phagspa</i>
146  * <li><i>tibetan</i>
147  * <li><i>bengali</i>
148  * <li><i>devanagari</i> - Devanagari, Devanagari Extended
149  * <li><i>gujarati</i>
150  * <li><i>gurmukhi</i>
151  * <li><i>kannada</i>
152  * <li><i>lepcha</i>
153  * <li><i>limbu</i>
154  * <li><i>malayalam</i>
155  * <li><i>meetaimayek</i>
156  * <li><i>olchiki</i>
157  * <li><i>oriya</i>
158  * <li><i>saurashtra</i>
159  * <li><i>sinhala</i>
160  * <li><i>sylotinagri</i> - Syloti Nagri
161  * <li><i>tangut</i>
162  * <li><i>tamil</i>
163  * <li><i>telugu</i>
164  * <li><i>thaana</i>
165  * <li><i>vedic</i>
166  * <li><i>batak</i>
167  * <li><i>balinese</i>
168  * <li><i>buginese</i>
169  * <li><i>cham</i>
170  * <li><i>javanese</i>
171  * <li><i>kayahli</i>
172  * <li><i>khmer</i>
173  * <li><i>lao</i>
174  * <li><i>myanmar</i> - Myanmar, Myanmar Extended-A, Myanmar Extended-B
175  * <li><i>newtailue</i>
176  * <li><i>rejang</i>
177  * <li><i>sundanese</i> - Sundanese, Sundanese Supplement
178  * <li><i>taile</i>
179  * <li><i>taitham</i>
180  * <li><i>taiviet</i>
181  * <li><i>thai</i>
182  * <li><i>buhld</i>
183  * <li><i>hanunoo</i>
184  * <li><i>tagalog</i>
185  * <li><i>tagbanwa</i>
186  * <li><i>bopomofo</i> - Bopomofo, Bopomofo Extended
187  * <li><i>cjk</i> - the CJK unified ideographs (Han), CJK Unified Ideographs
188  *  Extension A, CJK Unified Ideographs Extension B, CJK Unified Ideographs
189  *  Extension C, CJK Unified Ideographs Extension D, Ideographic Description
190  *  Characters (=isIdeo())
191  * <li><i>cjkcompatibility</i> - CJK Compatibility, CJK Compatibility
192  * Ideographs, CJK Compatibility Forms, CJK Compatibility Ideographs Supplement
193  * <li><i>cjkradicals</i> - the CJK radicals, KangXi radicals
194  * <li><i>hangul</i> - Hangul Jamo, Hangul Syllables, Hangul Jamo Extended-A,
195  * Hangul Jamo Extended-B, Hangul Compatibility Jamo
196  * <li><i>cjkpunct</i> - CJK symbols and punctuation
197  * <li><i>cjkstrokes</i> - CJK strokes
198  * <li><i>hiragana</i>
199  * <li><i>katakana</i> - Katakana, Katakana Phonetic Extensions, Kana Supplement
200  * <li><i>kanbun</i>
201  * <li><i>lisu</i>
202  * <li><i>yi</i> - Yi Syllables, Yi Radicals
203  * <li><i>cherokee</i>
204  * <li><i>canadian</i> - Unified Canadian Aboriginal Syllabics, Unified Canadian
205  * Aboriginal Syllabics Extended
206  * <li><i>presentation</i> - Alphabetic presentation forms
207  * <li><i>vertical</i> - Vertical Forms
208  * <li><i>width</i> - Halfwidth and Fullwidth Forms
209  * <li><i>punctuation</i> - General punctuation, Supplemental Punctuation
210  * <li><i>box</i> - Box Drawing
211  * <li><i>block</i> - Block Elements
212  * <li><i>letterlike</i> - Letterlike symbols
213  * <li><i>mathematical</i> - Mathematical alphanumeric symbols, Miscellaneous
214  * Mathematical Symbols-A, Miscellaneous Mathematical Symbols-B
215  * <li><i>enclosedalpha</i> - Enclosed alphanumerics, Enclosed Alphanumeric Supplement
216  * <li><i>enclosedcjk</i> - Enclosed CJK letters and months, Enclosed Ideographic Supplement
217  * <li><i>cjkcompatibility</i> - CJK compatibility
218  * <li><i>apl</i> - APL symbols
219  * <li><i>controlpictures</i> - Control pictures
220  * <li><i>misc</i> - Miscellaneous technical
221  * <li><i>ocr</i> - Optical character recognition (OCR)
222  * <li><i>combining</i> - Combining Diacritical Marks, Combining Diacritical Marks
223  * for Symbols, Combining Diacritical Marks Supplement, Combining Diacritical Marks Extended
224  * <li><i>digits</i> - ASCII digits (=isDigit())
225  * <li><i>indicnumber</i> - Common Indic Number Forms
226  * <li><i>numbers</i> - Number forms
227  * <li><i>supersub</i> - Superscripts and Subscripts
228  * <li><i>arrows</i> - Arrows, Miscellaneous Symbols and Arrows, Supplemental Arrows-A,
229  * Supplemental Arrows-B, Supplemental Arrows-C
230  * <li><i>operators</i> - Mathematical operators, supplemental
231  * mathematical operators
232  * <li><i>geometric</i> - Geometric shapes, Geometric shapes extended
233  * <li><i>ancient</i> - Ancient symbols
234  * <li><i>braille</i> - Braille patterns
235  * <li><i>currency</i> - Currency symbols
236  * <li><i>dingbats</i>
237  * <li><i>gamesymbols</i>
238  * <li><i>yijing</i> - Yijing Hexagram Symbols
239  * <li><i>specials</i>
240  * <li><i>variations</i> - Variation Selectors, Variation Selectors Supplement
241  * <li><i>privateuse</i> - Private Use Area, Supplementary Private Use Area-A,
242  * Supplementary Private Use Area-B
243  * <li><i>supplementarya</i> - Supplementary private use area-A
244  * <li><i>supplementaryb</i> - Supplementary private use area-B
245  * <li><i>highsurrogates</i> - High Surrogates, High Private Use Surrogates
246  * <li><i>lowsurrogates</i>
247  * <li><i>reserved</i>
248  * <li><i>noncharacters</i>
249  * <li><i>copticnumber</i> - coptic epact numbers
250  * <li><i>oldpermic</i> - old permic
251  * <li><i>albanian</i> - albanian
252  * <li><i>lineara</i> - linear a
253  * <li><i>meroitic</i> - meroitic cursive
254  * <li><i>oldnortharabian</i> - old north arabian
 * <li><i>oldhungarian</i> - old hungarian
256  * <li><i>sorasompeng</i> - sora sompeng
257  * <li><i>warangciti</i> - warang citi
258  * <li><i>paucinhau</i> - pau cin hau
259  * <li><i>bassavah</i> - bassa vah
260  * <li><i>pahawhhmong</i> - pahawh hmong
261  * <li><i>shorthandformat</i> - shorthand format controls
262  * <li><i>suttonsignwriting</i> - sutton signwriting
263  * <li><i>pictographs</i> - miscellaneous symbols and pictographs, supplemental symbols and pictographs
264  * <li><i>ornamentaldingbats</i> - ornamental dingbats
265  * </ul><p>
266  *
267  *
268  * @protected
269  * @param {string|IString|number} ch character or code point to examine
270  * @param {string} rangeName the name of the range to check
271  * @return {boolean} true if the first character is within the named
272  * range
273  */
CType.withinRange = function(ch, rangeName) {
    // Without a range name there is nothing to look up.
    if (!rangeName) {
        return false;
    }

    // Reduce the argument to a numeric code point. Only the first character
    // of a string or IString is considered.
    var num;
    switch (typeof(ch)) {
        case 'number':
            // already a code point
            num = ch;
            break;
        case 'string':
            num = IString.toCodePoint(ch, 0);
            break;
        case 'undefined':
            return false;
        default:
            // typeof null is 'object', so null arrives here, as do booleans
            // and plain objects. Anything that cannot produce a code point is
            // treated like a missing character instead of raising a TypeError.
            if (!ch || typeof(ch._toCodePoint) !== 'function') {
                return false;
            }
            num = ch._toCodePoint(0);
            break;
    }

    // Range names are matched case-insensitively by lower-casing the caller's
    // name before it is used as the key into the ctype data.
    return CType._inRange(num, rangeName.toLowerCase(), ilib.data.ctype);
};
295 
/**
 * Make sure the "ctype" data set is loaded. This simply delegates to
 * CType._load with the data set name fixed to "ctype"; the remaining
 * arguments are passed through unchanged.
 *
 * @protected
 * @param {boolean} sync passed through to CType._load
 * @param {Object|undefined} loadParams passed through to CType._load
 * @param {function(*)|undefined} onLoad passed through to CType._load, which
 * calls it with the loaded data
 */
CType._init = function(sync, loadParams, onLoad) {
    var dataName = "ctype";
    CType._load(dataName, sync, loadParams, onLoad);
};
305 
/**
 * Make the named data set available under ilib.data[name] and then hand it to
 * the caller's callback.
 *
 * When ilib.data[name] already holds a truthy value, onLoad is called with it
 * right away. Otherwise Utils.loadData is asked for the file "<name>.json"
 * ("CType.json" when name is empty); its callback stores whatever it receives
 * in ilib.data[name] and then calls onLoad with that stored value.
 *
 * @protected
 * @param {string} name name of the data set, used both as the key in ilib.data
 * and as the base name of the json file to request
 * @param {boolean} sync passed through to Utils.loadData
 * @param {Object|undefined} loadParams passed through to Utils.loadData
 * @param {function(*)|undefined} onLoad called with ilib.data[name] once the
 * data is available; ignored when it is not a function
 */
CType._load = function (name, sync, loadParams, onLoad) {
    // Single place that reports the data back to the caller, if they asked.
    var notify = function() {
        if (typeof(onLoad) === 'function') {
            onLoad(ilib.data[name]);
        }
    };

    // Already loaded: nothing to fetch.
    if (ilib.data[name]) {
        notify();
        return;
    }

    Utils.loadData({
        object: "CType",
        name: name ? name + ".json" : "CType.json",
        locale: "-",
        nonlocale: true,
        sync: sync,
        loadParams: loadParams,
        callback: ilib.bind(this, function(loaded) {
            // Store the result first so that the callback sees the stored copy.
            ilib.data[name] = loaded;
            notify();
        })
    });
};
336 
337 module.exports = CType;
338