/*
 * CType.js - Character type definitions
 *
 * Copyright © 2012-2015, 2018, JEDLSoft
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// !data ctype

var ilib = require("../index.js");
var SearchUtils = require("./SearchUtils.js");
var Utils = require("./Utils.js");
var IString = require("./IString.js");

/**
 * Provides a set of static routines that return information about characters.
 * These routines emulate the C-library ctype functions. The characters must be
 * encoded in utf-16, as no other charsets are currently supported. Only the first
 * character of the given string is tested.
 * @namespace
 */
var CType = {};


/**
 * Actual implementation for withinRange. Searches the given object for ranges.
 * The range names are taken from the Unicode range names in
 * http://www.unicode.org/Public/UNIDATA/extracted/DerivedGeneralCategory.txt
 *
 * <ul>
 * <li>Cn - Unassigned
 * <li>Lu - Uppercase_Letter
 * <li>Ll - Lowercase_Letter
 * <li>Lt - Titlecase_Letter
 * <li>Lm - Modifier_Letter
 * <li>Lo - Other_Letter
 * <li>Mn - Nonspacing_Mark
 * <li>Me - Enclosing_Mark
 * <li>Mc - Spacing_Mark
 * <li>Nd - Decimal_Number
 * <li>Nl - Letter_Number
 * <li>No - Other_Number
 * <li>Zs - Space_Separator
 * <li>Zl - Line_Separator
 * <li>Zp - Paragraph_Separator
 * <li>Cc - Control
 * <li>Cf - Format
 * <li>Co - Private_Use
 * <li>Cs - Surrogate
 * <li>Pd - Dash_Punctuation
 * <li>Ps - Open_Punctuation
 * <li>Pe - Close_Punctuation
 * <li>Pc - Connector_Punctuation
 * <li>Po - Other_Punctuation
 * <li>Sm - Math_Symbol
 * <li>Sc - Currency_Symbol
 * <li>Sk - Modifier_Symbol
 * <li>So - Other_Symbol
 * <li>Pi - Initial_Punctuation
 * <li>Pf - Final_Punctuation
 * </ul>
 *
 * Each entry of a named range is an array holding either a single code
 * point or an inclusive [low, high] pair. Invalid input (a code point that
 * is negative or not a number, a missing range name, a missing data object,
 * or an unknown range name) yields false rather than an exception.
 *
 * @protected
 * @param {number} num code point of the character to examine
 * @param {string} rangeName the name of the range to check
 * @param {Object} obj object containing the character range data
 * @return {boolean} true if the first character is within the named
 * range
 */
CType._inRange = function(num, rangeName, obj) {
    var range;
    // "!(num >= 0)" rejects negative values and also NaN/undefined. NaN must
    // be rejected here because every ordering comparison against it is false,
    // which would make the comparator below report a match for any
    // [low, high] entry.
    if (!(num >= 0) || !rangeName || !obj) {
        return false;
    }

    range = obj[rangeName];
    if (!range) {
        return false;
    }

    // Comparator for the binary search: positive when the entry lies above
    // the target, negative when it lies below, 0 when the target is equal to
    // (single-element entry) or contained in (two-element entry) the entry.
    var compare = function(singlerange, target) {
        if (singlerange.length === 1) {
            return singlerange[0] - target;
        } else {
            return target < singlerange[0] ? singlerange[0] - target :
                (target > singlerange[1] ? singlerange[1] - target : 0);
        }
    };
    var result = SearchUtils.bsearch(num, range, compare);
    // The returned index is only a candidate position, so bounds-check it on
    // both sides and re-verify that the entry found really contains the
    // code point.
    return result >= 0 && result < range.length && compare(range[result], num) === 0;
};

/**
 * Return whether or not the first character is within the named range
 * of Unicode characters. The valid list of range names are taken from
 * the Unicode 6.0 spec. Characters in all ranges of Unicode are supported,
 * including those supported in Javascript via UTF-16. Currently, this method
 * supports the following range names:
 *
 * <ul>
 * <li><i>ascii</i> - basic ASCII
 * <li><i>latin</i> - Latin, Latin Extended Additional, Latin-1 supplement, Latin Extended-C, Latin Extended-D, Latin Extended-E
 * <li><i>armenian</i>
 * <li><i>greek</i> - Greek, Greek Extended
 * <li><i>cyrillic</i> - Cyrillic, Cyrillic Extended-A, Cyrillic Extended-B, Cyrillic Extended-C, Cyrillic Supplement
 * <li><i>georgian</i> - Georgian, Georgian Supplement
 * <li><i>glagolitic</i> - Glagolitic, Glagolitic Supplement
 * <li><i>gothic</i>
 * <li><i>ogham</i>
 * <li><i>oldpersian</i>
 * <li><i>runic</i>
 * <li><i>ipa</i> - IPA, Phonetic Extensions, Phonetic Extensions Supplement
 * <li><i>phonetic</i>
 * <li><i>modifiertone</i> - Modifier Tone Letters
 * <li><i>spacing</i>
 * <li><i>diacritics</i>
 * <li><i>halfmarks</i> - Combining Half Marks
 * <li><i>small</i> - Small Form Variants
 * <li><i>bamum</i> - Bamum, Bamum Supplement
 * <li><i>ethiopic</i> - Ethiopic, Ethiopic Extended, Ethiopic Extended-A
 * <li><i>nko</i>
 * <li><i>osmanya</i>
 * <li><i>tifinagh</i>
 * <li><i>val</i>
 * <li><i>arabic</i> - Arabic, Arabic Supplement, Arabic Presentation Forms-A,
 * Arabic Presentation Forms-B, Arabic Mathematical Alphabetic Symbols
 * <li><i>carlan</i>
 * <li><i>hebrew</i>
 * <li><i>mandaic</i>
 * <li><i>samaritan</i>
 * <li><i>syriac</i>
 * <li><i>mongolian</i>
 * <li><i>phagspa</i>
 * <li><i>tibetan</i>
 * <li><i>bengali</i>
 * <li><i>devanagari</i> - Devanagari, Devanagari Extended
 * <li><i>gujarati</i>
 * <li><i>gurmukhi</i>
 * <li><i>kannada</i>
 * <li><i>lepcha</i>
 * <li><i>limbu</i>
 * <li><i>malayalam</i>
 * <li><i>meetaimayek</i>
 * <li><i>olchiki</i>
 * <li><i>oriya</i>
 * <li><i>saurashtra</i>
 * <li><i>sinhala</i>
 * <li><i>sylotinagri</i> - Syloti Nagri
 * <li><i>tangut</i>
 * <li><i>tamil</i>
 * <li><i>telugu</i>
 * <li><i>thaana</i>
 * <li><i>vedic</i>
 * <li><i>batak</i>
 * <li><i>balinese</i>
 * <li><i>buginese</i>
 * <li><i>cham</i>
 * <li><i>javanese</i>
 * <li><i>kayahli</i>
 * <li><i>khmer</i>
 * <li><i>lao</i>
 * <li><i>myanmar</i> - Myanmar, Myanmar Extended-A, Myanmar Extended-B
 * <li><i>newtailue</i>
 * <li><i>rejang</i>
 * <li><i>sundanese</i> - Sundanese, Sundanese Supplement
 * <li><i>taile</i>
 * <li><i>taitham</i>
 * <li><i>taiviet</i>
 * <li><i>thai</i>
 * <li><i>buhld</i>
 * <li><i>hanunoo</i>
 * <li><i>tagalog</i>
 * <li><i>tagbanwa</i>
 * <li><i>bopomofo</i> - Bopomofo, Bopomofo Extended
 * <li><i>cjk</i> - the CJK unified ideographs (Han), CJK Unified Ideographs
 *  Extension A, CJK Unified Ideographs Extension B, CJK Unified Ideographs
 *  Extension C, CJK Unified Ideographs Extension D, Ideographic Description
 *  Characters (=isIdeo())
 * <li><i>cjkcompatibility</i> - CJK Compatibility, CJK Compatibility
 * Ideographs, CJK Compatibility Forms, CJK Compatibility Ideographs Supplement
 * <li><i>cjkradicals</i> - the CJK radicals, KangXi radicals
 * <li><i>hangul</i> - Hangul Jamo, Hangul Syllables, Hangul Jamo Extended-A,
 * Hangul Jamo Extended-B, Hangul Compatibility Jamo
 * <li><i>cjkpunct</i> - CJK symbols and punctuation
 * <li><i>cjkstrokes</i> - CJK strokes
 * <li><i>hiragana</i>
 * <li><i>katakana</i> - Katakana, Katakana Phonetic Extensions, Kana Supplement
 * <li><i>kanbun</i>
 * <li><i>lisu</i>
 * <li><i>yi</i> - Yi Syllables, Yi Radicals
 * <li><i>cherokee</i>
 * <li><i>canadian</i> - Unified Canadian Aboriginal Syllabics, Unified Canadian
 * Aboriginal Syllabics Extended
 * <li><i>presentation</i> - Alphabetic presentation forms
 * <li><i>vertical</i> - Vertical Forms
 * <li><i>width</i> - Halfwidth and Fullwidth Forms
 * <li><i>punctuation</i> - General punctuation, Supplemental Punctuation
 * <li><i>box</i> - Box Drawing
 * <li><i>block</i> - Block Elements
 * <li><i>letterlike</i> - Letterlike symbols
 * <li><i>mathematical</i> - Mathematical alphanumeric symbols, Miscellaneous
 * Mathematical Symbols-A, Miscellaneous Mathematical Symbols-B
 * <li><i>enclosedalpha</i> - Enclosed alphanumerics, Enclosed Alphanumeric Supplement
 * <li><i>enclosedcjk</i> - Enclosed CJK letters and months, Enclosed Ideographic Supplement
 * <li><i>apl</i> - APL symbols
 * <li><i>controlpictures</i> - Control pictures
 * <li><i>misc</i> - Miscellaneous technical
 * <li><i>ocr</i> - Optical character recognition (OCR)
 * <li><i>combining</i> - Combining Diacritical Marks, Combining Diacritical Marks
 * for Symbols, Combining Diacritical Marks Supplement, Combining Diacritical Marks Extended
 * <li><i>digits</i> - ASCII digits (=isDigit())
 * <li><i>indicnumber</i> - Common Indic Number Forms
 * <li><i>numbers</i> - Number forms
 * <li><i>supersub</i> - Superscripts and Subscripts
 * <li><i>arrows</i> - Arrows, Miscellaneous Symbols and Arrows, Supplemental Arrows-A,
 * Supplemental Arrows-B, Supplemental Arrows-C
 * <li><i>operators</i> - Mathematical operators, supplemental
 * mathematical operators
 * <li><i>geometric</i> - Geometric shapes, Geometric shapes extended
 * <li><i>ancient</i> - Ancient symbols
 * <li><i>braille</i> - Braille patterns
 * <li><i>currency</i> - Currency symbols
 * <li><i>dingbats</i>
 * <li><i>gamesymbols</i>
 * <li><i>yijing</i> - Yijing Hexagram Symbols
 * <li><i>specials</i>
 * <li><i>variations</i> - Variation Selectors, Variation Selectors Supplement
 * <li><i>privateuse</i> - Private Use Area, Supplementary Private Use Area-A,
 * Supplementary Private Use Area-B
 * <li><i>supplementarya</i> - Supplementary private use area-A
 * <li><i>supplementaryb</i> - Supplementary private use area-B
 * <li><i>highsurrogates</i> - High Surrogates, High Private Use Surrogates
 * <li><i>lowsurrogates</i>
 * <li><i>reserved</i>
 * <li><i>noncharacters</i>
 * <li><i>copticnumber</i> - coptic epact numbers
 * <li><i>oldpermic</i> - old permic
 * <li><i>albanian</i> - albanian
 * <li><i>lineara</i> - linear a
 * <li><i>meroitic</i> - meroitic cursive
 * <li><i>oldnortharabian</i> - old north arabian
 * <li><i>oldhungarian</i> - old hungarian
 * <li><i>sorasompeng</i> - sora sompeng
 * <li><i>warangciti</i> - warang citi
 * <li><i>paucinhau</i> - pau cin hau
 * <li><i>bassavah</i> - bassa vah
 * <li><i>pahawhhmong</i> - pahawh hmong
 * <li><i>shorthandformat</i> - shorthand format controls
 * <li><i>suttonsignwriting</i> - sutton signwriting
 * <li><i>pictographs</i> - miscellaneous symbols and pictographs, supplemental symbols and pictographs
 * <li><i>ornamentaldingbats</i> - ornamental dingbats
 * </ul><p>
 *
 * The range name is lower-cased before it is looked up in the loaded
 * ctype data (<code>ilib.data.ctype</code>). A missing range name, an
 * undefined or null character, or a value that is neither a number, a
 * string, nor an object with a <code>_toCodePoint</code> method yields false.
 *
 * @protected
 * @param {string|IString|number} ch character or code point to examine
 * @param {string} rangeName the name of the range to check
 * @return {boolean} true if the first character is within the named
 * range
 */
CType.withinRange = function(ch, rangeName) {
    if (!rangeName) {
        return false;
    }
    var num;
    switch (typeof(ch)) {
        case 'number':
            // already a code point
            num = ch;
            break;
        case 'string':
            num = IString.toCodePoint(ch, 0);
            break;
        case 'undefined':
            return false;
        default:
            // typeof null is 'object', and arbitrary objects or booleans do
            // not carry a _toCodePoint method; treat all of those as "not in
            // range" instead of throwing a TypeError.
            if (!ch || typeof(ch._toCodePoint) !== 'function') {
                return false;
            }
            num = ch._toCodePoint(0);
            break;
    }

    return CType._inRange(num, rangeName.toLowerCase(), ilib.data.ctype);
};

/**
 * Load the "ctype" character range data by delegating to CType._load.
 *
 * @protected
 * @param {boolean} sync passed through to the loader as its sync flag
 * @param {Object|undefined} loadParams passed through to the loader
 * @param {function(*)|undefined} onLoad called with the loaded data
 */
CType._init = function(sync, loadParams, onLoad) {
    CType._load("ctype", sync, loadParams, onLoad);
};

/**
 * Make sure the named data set is available in <code>ilib.data</code>.
 * If <code>ilib.data[name]</code> is already set, onLoad is called with it
 * immediately. Otherwise the file <code>name + ".json"</code> is requested
 * through Utils.loadData, the result is stored in
 * <code>ilib.data[name]</code>, and onLoad is then called with it.
 *
 * @protected
 * @param {string} name name of the data set, also used as the key in ilib.data
 * @param {boolean} sync passed through to Utils.loadData as its sync option
 * @param {Object|undefined} loadParams passed through to Utils.loadData
 * @param {function(*)|undefined} onLoad called with the data once it is available
 */
CType._load = function (name, sync, loadParams, onLoad) {
    if (!ilib.data[name]) {
        var loadName = name ? name + ".json" : "CType.json";
        Utils.loadData({
            object: "CType",
            name: loadName,
            locale: "-",
            nonlocale: true,
            sync: sync,
            loadParams: loadParams,
            callback: ilib.bind(this, function(ct) {
                // cache the result so that later calls skip the load
                ilib.data[name] = ct;
                if (onLoad && typeof(onLoad) === 'function') {
                    onLoad(ilib.data[name]);
                }
            })
        });
    } else {
        if (onLoad && typeof(onLoad) === 'function') {
            onLoad(ilib.data[name]);
        }
    }
};

module.exports = CType;