1;;; characters.el --- set syntax and category for multibyte characters 2 3;; Copyright (C) 1997, 2000-2021 Free Software Foundation, Inc. 4;; Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 5;; 2005, 2006, 2007, 2008, 2009, 2010, 2011 6;; National Institute of Advanced Industrial Science and Technology (AIST) 7;; Registration Number H14PRO021 8;; Copyright (C) 2003 9;; National Institute of Advanced Industrial Science and Technology (AIST) 10;; Registration Number H13PRO009 11 12;; Keywords: multibyte character, character set, syntax, category 13 14;; This file is part of GNU Emacs. 15 16;; GNU Emacs is free software: you can redistribute it and/or modify 17;; it under the terms of the GNU General Public License as published by 18;; the Free Software Foundation, either version 3 of the License, or 19;; (at your option) any later version. 20 21;; GNU Emacs is distributed in the hope that it will be useful, 22;; but WITHOUT ANY WARRANTY; without even the implied warranty of 23;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 24;; GNU General Public License for more details. 25 26;; You should have received a copy of the GNU General Public License 27;; along with GNU Emacs. If not, see <https://www.gnu.org/licenses/>. 28 29;;; Commentary: 30 31;;; Code: 32 33;;; Predefined categories. 34 35;; For each character set. 36 37(define-category ?a "ASCII 38ASCII graphic characters 32-126 (ISO646 IRV:1983[4/0])") 39(define-category ?l "Latin") 40(define-category ?t "Thai") 41(define-category ?g "Greek") 42(define-category ?b "Arabic") 43(define-category ?w "Hebrew") 44(define-category ?y "Cyrillic") 45(define-category ?k "Katakana 46Japanese katakana") 47(define-category ?r "Roman 48Japanese roman") 49(define-category ?c "Chinese") 50(define-category ?j "Japanese") 51(define-category ?h "Korean") 52(define-category ?e "Ethiopic 53Ethiopic (Ge'ez)") 54(define-category ?v "Viet 55Vietnamese") 56(define-category ?i "Indian") 57(define-category ?o "Lao") 58(define-category ?q "Tibetan") 59 60;; For each group (row) of 2-byte character sets. 61 62(define-category ?A "2-byte alnum 63Alpha-numeric characters of 2-byte character sets") 64(define-category ?C "2-byte han 65Chinese (Han) characters of 2-byte character sets") 66(define-category ?G "2-byte Greek 67Greek characters of 2-byte character sets") 68(define-category ?H "2-byte Hiragana 69Japanese Hiragana characters of 2-byte character sets") 70(define-category ?K "2-byte Katakana 71Japanese Katakana characters of 2-byte character sets") 72(define-category ?N "2-byte Korean 73Korean Hangul characters of 2-byte character sets") 74(define-category ?Y "2-byte Cyrillic 75Cyrillic characters of 2-byte character sets") 76(define-category ?I "Indian Glyphs") 77 78;; For phonetic classifications. 79 80(define-category ?0 "consonant") 81(define-category ?1 "base vowel 82Base (independent) vowel") 83(define-category ?2 "upper diacritic 84Upper diacritical mark (including upper vowel)") 85(define-category ?3 "lower diacritic 86Lower diacritical mark (including lower vowel)") 87(define-category ?4 "combining tone 88Combining tone mark") 89(define-category ?5 "symbol") 90(define-category ?6 "digit") 91(define-category ?7 "vowel diacritic 92Vowel-modifying diacritical mark") 93(define-category ?8 "vowel-signs") 94(define-category ?9 "semivowel lower") 95 96;; For filling. 97(define-category ?| "line breakable 98While filling, we can break a line at this character.") 99 100;; For indentation calculation. 101(define-category ?\s 102 "space for indent 103This character counts as a space for indentation purposes.") 104 105;; Keep the following for `kinsoku' processing. See comments in 106;; kinsoku.el. 107(define-category ?> "Not at bol 108A character which can't be placed at beginning of line.") 109(define-category ?< "Not at eol 110A character which can't be placed at end of line.") 111 112;; Base and Combining 113(define-category ?. "Base 114Base characters (Unicode General Category L,N,P,S,Zs)") 115(define-category ?^ "Combining 116Combining diacritic or mark (Unicode General Category M)") 117 118;; bidi types 119(define-category ?R "Right-to-left (strong) 120Characters with \"strong\" right-to-left directionality, i.e. 121with R, AL, RLE, or RLO Unicode bidi character type.") 122 123(define-category ?L "Left-to-right (strong) 124Characters with \"strong\" left-to-right directionality, i.e. 125with L, LRE, or LRO Unicode bidi character type.") 126 127 128;;; Setting syntax and category. 129 130;; ASCII 131 132;; All ASCII characters have the category `a' (ASCII) and `l' (Latin). 133(modify-category-entry '(32 . 127) ?a) 134(modify-category-entry '(32 . 127) ?l) 135 136;; Deal with the CJK charsets first. Since the syntax of blocks is 137;; defined per charset, and the charsets may contain e.g. Latin 138;; characters, we end up with the wrong syntax definitions if we're 139;; not careful. 140 141;; Chinese characters (Unicode) 142(modify-category-entry '(#x2E80 . #x312F) ?|) 143(modify-category-entry '(#x3190 . #x33FF) ?|) 144(modify-category-entry '(#x3400 . #x4DB5) ?C) 145(modify-category-entry '(#x4E00 . #x9FD5) ?C) 146(modify-category-entry '(#x3400 . #x9FD5) ?c) 147(modify-category-entry '(#x3400 . #x9FD5) ?|) 148(modify-category-entry '(#xF900 . #xFAFF) ?C) 149(modify-category-entry '(#xF900 . #xFAFF) ?c) 150(modify-category-entry '(#xF900 . #xFAFF) ?|) 151(modify-category-entry '(#x1B170 . #x1B2FF) ?c) 152(modify-category-entry '(#x20000 . #x2FFFF) ?|) 153(modify-category-entry '(#x20000 . #x2FFFF) ?C) 154(modify-category-entry '(#x20000 . #x2FFFF) ?c) 155 156 157;; Chinese character set (GB2312) 158 159(map-charset-chars #'modify-syntax-entry 'chinese-gb2312 "_" #x2121 #x217E) 160(map-charset-chars #'modify-syntax-entry 'chinese-gb2312 "_" #x2221 #x227E) 161(map-charset-chars #'modify-syntax-entry 'chinese-gb2312 "_" #x2921 #x297E) 162 163(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?c) 164(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?A #x2330 #x2339) 165(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?A #x2341 #x235A) 166(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?A #x2361 #x237A) 167(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?H #x2421 #x247E) 168(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?K #x2521 #x257E) 169(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?G #x2621 #x267E) 170(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?Y #x2721 #x277E) 171(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?C #x3021 #x7E7E) 172 173;; Chinese character set (BIG5) 174 175(map-charset-chars #'modify-category-entry 'big5 ?c) 176(map-charset-chars #'modify-category-entry 'big5 ?C #xA259 #xA261) 177(map-charset-chars #'modify-category-entry 'big5 ?C #xA440 #xC67E) 178(map-charset-chars #'modify-category-entry 'big5 ?C #xC940 #xF9DC) 179 180;; Chinese character set (CNS11643) 181 182(dolist (c '(chinese-cns11643-1 chinese-cns11643-2 chinese-cns11643-3 183 chinese-cns11643-4 chinese-cns11643-5 chinese-cns11643-6 184 chinese-cns11643-7)) 185 (map-charset-chars #'modify-category-entry c ?c) 186 (if (eq c 'chinese-cns11643-1) 187 (map-charset-chars #'modify-category-entry c ?C #x4421 #x7E7E) 188 (map-charset-chars #'modify-category-entry c ?C))) 189 190;; Japanese character set (JISX0201, JISX0208, JISX0212, JISX0213) 191 192(map-charset-chars #'modify-category-entry 'katakana-jisx0201 ?k) 193 194(map-charset-chars #'modify-category-entry 'latin-jisx0201 ?r) 195 196(dolist (l '(katakana-jisx0201 japanese-jisx0208 japanese-jisx0212 197 japanese-jisx0213-1 japanese-jisx0213-2 198 japanese-jisx0213.2004-1 199 cp932-2-byte)) 200 (map-charset-chars #'modify-category-entry l ?j)) 201 202;; Fullwidth characters 203(modify-category-entry '(#xff01 . #xff60) ?\|) 204 205;; Unicode equivalents of JISX0201-kana 206(let ((range '(#xff61 . #xff9f))) 207 (modify-category-entry range ?k) 208 (modify-category-entry range ?j) 209 (modify-category-entry range ?\|)) 210 211;; Katakana block 212(modify-category-entry '(#x3099 . #x309C) ?K) 213(modify-category-entry '(#x30A0 . #x30FF) ?K) 214(modify-category-entry '(#x31F0 . #x31FF) ?K) 215(modify-category-entry '(#x30A0 . #x30FA) ?\|) 216(modify-category-entry #x30FF ?\|) 217 218;; Hiragana block 219(modify-category-entry '(#x3040 . #x309F) ?H) 220(modify-category-entry '(#x3040 . #x3096) ?\|) 221(modify-category-entry #x309F ?\|) 222(modify-category-entry #x30A0 ?H) 223(modify-category-entry #x30FC ?H) 224 225(modify-category-entry '(#x1B000 . #x1B1FF) ?j) 226 227 228;; JISX0208 229(map-charset-chars #'modify-syntax-entry 'japanese-jisx0208 "_" #x2121 #x227E) 230(map-charset-chars #'modify-syntax-entry 'japanese-jisx0208 "_" #x2821 #x287E) 231(let ((chars '(?ー ?゛ ?゜ ?ヽ ?ヾ ?ゝ ?ゞ ?〃 ?仝 ?々 ?〆 ?〇))) 232 (dolist (elt chars) 233 (modify-syntax-entry elt "w"))) 234 235(map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?A #x2321 #x237E) 236(map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?H #x2421 #x247E) 237(map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?K #x2521 #x257E) 238(map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?G #x2621 #x267E) 239(map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?Y #x2721 #x277E) 240(map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?C #x3021 #x7E7E) 241(let ((chars '(?仝 ?々 ?〆 ?〇))) 242 (while chars 243 (modify-category-entry (car chars) ?C) 244 (setq chars (cdr chars)))) 245 246;; JISX0212 247 248(map-charset-chars #'modify-syntax-entry 'japanese-jisx0212 "_" #x2121 #x237E) 249 250;; JISX0201-Kana 251 252(let ((chars '(?。 ?、 ?・))) 253 (while chars 254 (modify-syntax-entry (car chars) ".") 255 (setq chars (cdr chars)))) 256 257(modify-syntax-entry ?\「 "(」") 258(modify-syntax-entry ?\」 "(「") 259 260;; Korean character set (KSC5601) 261 262(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?h) 263 264(map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_" #x2121 #x227E) 265(map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_" #x2621 #x277E) 266(map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_" #x2830 #x287E) 267(map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_" #x2930 #x297E) 268(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?A #x2330 #x2339) 269(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?A #x2341 #x235A) 270(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?A #x2361 #x237A) 271(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?G #x2521 #x257E) 272(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?H #x2A21 #x2A7E) 273(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?K #x2B21 #x2B7E) 274(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?Y #x2C21 #x2C7E) 275 276;; These are in more than one charset. 277(let ((parens (concat "〈〉《》「」『』【】〔〕〖〗〘〙〚〛" 278 "︵︶︷︸︹︺︻︼︽︾︿﹀﹁﹂﹃﹄" 279 "()[]{}")) 280 open close) 281 (dotimes (i (/ (length parens) 2)) 282 (setq open (aref parens (* i 2)) 283 close (aref parens (1+ (* i 2)))) 284 (modify-syntax-entry open (format "(%c" close)) 285 (modify-syntax-entry close (format ")%c" open)))) 286 287;; Arabic character set 288 289(let ((charsets '(arabic-iso8859-6 290 arabic-digit 291 arabic-1-column 292 arabic-2-column))) 293 (while charsets 294 (map-charset-chars #'modify-category-entry (car charsets) ?b) 295 (setq charsets (cdr charsets)))) 296(modify-category-entry '(#x600 . #x6ff) ?b) 297(modify-category-entry '(#x8a0 . #x8ff) ?b) 298(modify-category-entry '(#xfb50 . #xfdff) ?b) 299(modify-category-entry '(#xfe70 . #xfefe) ?b) 300 301;; Cyrillic character set (ISO-8859-5) 302 303(modify-syntax-entry ?№ ".") 304 305;; Ethiopic character set 306 307(modify-category-entry '(#x1200 . #x1399) ?e) 308(modify-category-entry '(#x2d80 . #x2dde) ?e) 309(let ((chars '(?፡ ?። ?፣ ?፤ ?፥ ?፦ ?፧ ?፨))) 310 (while chars 311 (modify-syntax-entry (car chars) ".") 312 (setq chars (cdr chars)))) 313(map-charset-chars #'modify-category-entry 'ethiopic ?e) 314 315;; Hebrew character set (ISO-8859-8) 316 317(modify-syntax-entry #x5be ".") ; MAQAF 318(modify-syntax-entry #x5c0 ".") ; PASEQ 319(modify-syntax-entry #x5c3 ".") ; SOF PASUQ 320(modify-syntax-entry #x5f3 ".") ; GERESH 321(modify-syntax-entry #x5f4 ".") ; GERSHAYIM 322 323;; Indian character set (IS 13194 and other Emacs original Indian charsets) 324 325(modify-category-entry '(#x901 . #x970) ?i) 326(map-charset-chars #'modify-category-entry 'indian-is13194 ?i) 327(map-charset-chars #'modify-category-entry 'indian-2-column ?i) 328 329;; Lao character set 330 331(modify-category-entry '(#xe80 . #xeff) ?o) 332(map-charset-chars #'modify-category-entry 'lao ?o) 333 334(let ((deflist '(("ກ-ຮ" "w" ?0) ; consonant 335 ("ະາຳຽເ-ໄ" "w" ?1) ; vowel base 336 ("ັິ-ືົໍ" "w" ?2) ; vowel upper 337 ("ຸູ" "w" ?3) ; vowel lower 338 ("່-໋" "w" ?4) ; tone mark 339 ("ຼຽ" "w" ?9) ; semivowel lower 340 ("໐-໙" "w" ?6) ; digit 341 ("ຯໆ" "_" ?5) ; symbol 342 )) 343 elm chars len syntax category to ch i) 344 (while deflist 345 (setq elm (car deflist)) 346 (setq chars (car elm) 347 len (length chars) 348 syntax (nth 1 elm) 349 category (nth 2 elm) 350 i 0) 351 (while (< i len) 352 (if (= (aref chars i) ?-) 353 (setq i (1+ i) 354 to (aref chars i)) 355 (setq ch (aref chars i) 356 to ch)) 357 (while (<= ch to) 358 (unless (string-equal syntax "w") 359 (modify-syntax-entry ch syntax)) 360 (modify-category-entry ch category) 361 (setq ch (1+ ch))) 362 (setq i (1+ i))) 363 (setq deflist (cdr deflist)))) 364 365;; Thai character set (TIS620) 366 367(modify-category-entry '(#xe00 . #xe7f) ?t) 368(map-charset-chars #'modify-category-entry 'thai-tis620 ?t) 369 370(let ((deflist '(;; chars syntax category 371 ("ก-รลว-ฮ" "w" ?0) ; consonant 372 ("ฤฦะาำเ-ๅ" "w" ?1) ; vowel base 373 ("ัิ-ื็๎" "w" ?2) ; vowel upper 374 ("ุ-ฺ" "w" ?3) ; vowel lower 375 ("่-ํ" "w" ?4) ; tone mark 376 ("๐-๙" "w" ?6) ; digit 377 ("ฯๆ฿๏๚๛" "_" ?5) ; symbol 378 )) 379 elm chars len syntax category to ch i) 380 (while deflist 381 (setq elm (car deflist)) 382 (setq chars (car elm) 383 len (length chars) 384 syntax (nth 1 elm) 385 category (nth 2 elm) 386 i 0) 387 (while (< i len) 388 (if (= (aref chars i) ?-) 389 (setq i (1+ i) 390 to (aref chars i)) 391 (setq ch (aref chars i) 392 to ch)) 393 (while (<= ch to) 394 (unless (string-equal syntax "w") 395 (modify-syntax-entry ch syntax)) 396 (modify-category-entry ch category) 397 (setq ch (1+ ch))) 398 (setq i (1+ i))) 399 (setq deflist (cdr deflist)))) 400 401;; Tibetan character set 402 403(modify-category-entry '(#xf00 . #xfff) ?q) 404(map-charset-chars #'modify-category-entry 'tibetan ?q) 405(map-charset-chars #'modify-category-entry 'tibetan-1-column ?q) 406 407(let ((deflist '(;; chars syntax category 408 ("ཀ-ཀྵཪ" "w" ?0) ; consonant 409 ("ྐ-ྐྵྺྻྼ" "w" ?0) ; 410 ("ིེཻོཽྀ" "w" ?2) ; upper vowel 411 ("ཾྂྃ྆྇ྈྉྊྋ" "w" ?2) ; upper modifier 412 ("྄ཱུ༙༵༷" "w" ?3) ; lower vowel/modifier 413 ("" "w" ?3) ; invisible vowel a 414 ("༠-༩༪-༳" "w" ?6) ; digit 415 ("་།-༒༔ཿ" "." ?|) ; line-break char 416 ("་།༏༐༑༔ཿ" "." ?|) ; 417 ("༈་།-༒༔ཿ༽༴" "." ?>) ; prohibition 418 ("་།༏༐༑༔ཿ" "." ?>) ; 419 ("ༀ-༊༼࿁࿂྅" "." ?<) ; prohibition 420 ("༓༕-༘༚-༟༶༸-༻༾༿྾྿-࿏" "." ?q) ; others 421 )) 422 elm chars len syntax category to ch i) 423 (while deflist 424 (setq elm (car deflist)) 425 (setq chars (car elm) 426 len (length chars) 427 syntax (nth 1 elm) 428 category (nth 2 elm) 429 i 0) 430 (while (< i len) 431 (if (= (aref chars i) ?-) 432 (setq i (1+ i) 433 to (aref chars i)) 434 (setq ch (aref chars i) 435 to ch)) 436 (while (<= ch to) 437 (unless (string-equal syntax "w") 438 (modify-syntax-entry ch syntax)) 439 (modify-category-entry ch category) 440 (setq ch (1+ ch))) 441 (setq i (1+ i))) 442 (setq deflist (cdr deflist)))) 443 444;; Vietnamese character set 445 446;; To make a word with Latin characters 447(map-charset-chars #'modify-category-entry 'vietnamese-viscii-lower ?l) 448(map-charset-chars #'modify-category-entry 'vietnamese-viscii-lower ?v) 449 450(map-charset-chars #'modify-category-entry 'vietnamese-viscii-upper ?l) 451(map-charset-chars #'modify-category-entry 'vietnamese-viscii-upper ?v) 452 453(let ((tbl (standard-case-table)) 454 (i 32)) 455 (while (< i 128) 456 (let* ((char (decode-char 'vietnamese-viscii-upper i)) 457 (charl (decode-char 'vietnamese-viscii-lower i)) 458 (uc (encode-char char 'ucs)) 459 (lc (encode-char charl 'ucs))) 460 (set-case-syntax-pair char (decode-char 'vietnamese-viscii-lower i) 461 tbl) 462 (if uc (modify-category-entry uc ?v)) 463 (if lc (modify-category-entry lc ?v))) 464 (setq i (1+ i)))) 465 466;; Tai Viet 467(let ((deflist '(;; chars syntax category 468 ((?ꪀ. ?ꪯ) "w" ?0) ; consonant 469 ("ꪱꪵꪶ" "w" ?1) ; vowel base 470 ((?ꪹ . ?ꪽ) "w" ?1) ; vowel base 471 ("ꪰꪲꪳꪷꪸꪾ" "w" ?2) ; vowel upper 472 ("ꪴ" "w" ?3) ; vowel lower 473 ("ꫀꫂ" "w" ?1) ; non-combining tone-mark 474 ("꪿꫁" "w" ?4) ; combining tone-mark 475 ((?ꫛ . ?꫟) "_" ?5) ; symbol 476 ))) 477 (dolist (elm deflist) 478 (let ((chars (car elm)) 479 (syntax (nth 1 elm)) 480 (category (nth 2 elm))) 481 (if (consp chars) 482 (progn 483 (modify-syntax-entry chars syntax) 484 (modify-category-entry chars category)) 485 (mapc #'(lambda (x) 486 (modify-syntax-entry x syntax) 487 (modify-category-entry x category)) 488 chars))))) 489 490;; Bidi categories 491 492;; If bootstrapping without generated uni-*.el files, table not defined. 493(let ((table (unicode-property-table-internal 'bidi-class))) 494 (when table 495 (map-char-table (lambda (key val) 496 (cond 497 ((memq val '(R AL RLO RLE)) 498 (modify-category-entry key ?R)) 499 ((memq val '(L LRE LRO)) 500 (modify-category-entry key ?L)))) 501 table))) 502 503;; Load uni-mirrored.el and uni-brackets.el if available, so that they 504;; get dumped into Emacs. This allows starting Emacs with 505;; force-load-messages in ~/.emacs, and avoid infinite recursion in 506;; bidi_initialize, which needs to load uni-mirrored.el and 507;; uni-brackets.el in order to display the "Loading" messages. 508(unicode-property-table-internal 'mirroring) 509(unicode-property-table-internal 'bracket-type) 510 511;; Latin 512 513(modify-category-entry '(#x80 . #x024F) ?l) 514 515(let ((tbl (standard-case-table)) c) 516 517 ;; Latin-1 518 519 ;; Fixme: Some of the non-word syntaxes here perhaps should be 520 ;; reviewed. (Note that the following all implicitly have word 521 ;; syntax: ¢£¤¥¨ª¯²³´¶¸¹º.) There should be a well-defined way of 522 ;; relating Unicode categories to Emacs syntax codes. 523 524 ;; NBSP isn't semantically interchangeable with other whitespace chars, 525 ;; so it's more like punctuation. 526 (set-case-syntax ? "." tbl) 527 (set-case-syntax ?¡ "." tbl) 528 (set-case-syntax ?¦ "_" tbl) 529 (set-case-syntax ?§ "." tbl) 530 (set-case-syntax ?© "_" tbl) 531 ;; French wants 532 ;; (set-case-syntax-delims ?« ?» tbl) 533 ;; And German wants 534 ;; (set-case-syntax-delims ?» ?« tbl) 535 ;; So let's stay neutral and let users set these up if/when they want to. 536 (set-case-syntax ?« "." tbl) 537 (set-case-syntax ?» "." tbl) 538 (set-case-syntax ?¬ "_" tbl) 539 (set-case-syntax ? "_" tbl) 540 (set-case-syntax ?® "_" tbl) 541 (set-case-syntax ?° "_" tbl) 542 (set-case-syntax ?± "_" tbl) 543 (set-case-syntax ?µ "_" tbl) 544 (set-case-syntax ?· "_" tbl) 545 (set-case-syntax ?¼ "_" tbl) 546 (set-case-syntax ?½ "_" tbl) 547 (set-case-syntax ?¾ "_" tbl) 548 (set-case-syntax ?¿ "." tbl) 549 (set-case-syntax ?× "_" tbl) 550 (set-case-syntax ?ß "w" tbl) 551 (set-case-syntax ?÷ "_" tbl) 552 ;; See below for ÿ. 553 554 ;; Latin Extended-A, Latin Extended-B 555 (setq c #x0100) 556 (while (<= c #x02B8) 557 (modify-category-entry c ?l) 558 (setq c (1+ c))) 559 560 ;; Latin Extended Additional 561 (modify-category-entry '(#x1e00 . #x1ef9) ?l) 562 563 ;; Latin Extended-C 564 (setq c #x2C60) 565 (while (<= c #x2C7F) 566 (modify-category-entry c ?l) 567 (setq c (1+ c))) 568 569 ;; Latin Extended-D 570 (setq c #xA720) 571 (while (<= c #xA7FF) 572 (modify-category-entry c ?l) 573 (setq c (1+ c))) 574 575 ;; Latin Extended-E 576 (setq c #xAB30) 577 (while (<= c #xAB64) 578 (modify-category-entry c ?l) 579 (setq c (1+ c))) 580 581 ;; Greek 582 (modify-category-entry '(#x0370 . #x03ff) ?g) 583 584 ;; Armenian 585 (setq c #x531) 586 587 ;; Greek Extended 588 (modify-category-entry '(#x1f00 . #x1fff) ?g) 589 590 ;; cyrillic 591 (modify-category-entry '(#x0400 . #x04FF) ?y) 592 (modify-category-entry '(#xA640 . #xA69F) ?y) 593 594 ;; Georgian 595 (setq c #x10A0) 596 597 ;; Cyrillic Extended-C 598 (modify-category-entry '(#x1C80 . #x1C8F) ?y) 599 600 ;; general punctuation 601 (setq c #x2000) 602 (while (<= c #x200b) 603 (set-case-syntax c " " tbl) 604 (setq c (1+ c))) 605 (while (<= c #x200F) 606 (set-case-syntax c "." tbl) 607 (setq c (1+ c))) 608 ;; Fixme: These aren't all right: 609 (setq c #x2010) 610 (while (<= c #x2016) 611 (set-case-syntax c "_" tbl) 612 (setq c (1+ c))) 613 ;; Punctuation syntax for quotation marks (like `) 614 (while (<= c #x201f) 615 (set-case-syntax c "." tbl) 616 (setq c (1+ c))) 617 ;; Fixme: These aren't all right: 618 (while (<= c #x2027) 619 (set-case-syntax c "_" tbl) 620 (setq c (1+ c))) 621 (while (<= c #x206F) 622 (set-case-syntax c "." tbl) 623 (setq c (1+ c))) 624 625 ;; Fixme: The following blocks might be better as symbol rather than 626 ;; punctuation. 627 ;; Arrows 628 (setq c #x2190) 629 (while (<= c #x21FF) 630 (set-case-syntax c "." tbl) 631 (setq c (1+ c))) 632 ;; Mathematical Operators 633 (while (<= c #x22FF) 634 (set-case-syntax c "." tbl) 635 (setq c (1+ c))) 636 ;; Miscellaneous Technical 637 (while (<= c #x23FF) 638 (set-case-syntax c "." tbl) 639 (setq c (1+ c))) 640 ;; Control Pictures 641 (while (<= c #x243F) 642 (set-case-syntax c "_" tbl) 643 (setq c (1+ c))) 644 645 ;; Circled Latin 646 (setq c #x24B6) 647 (while (<= c #x24CF) 648 (modify-category-entry c ?l) 649 (modify-category-entry (+ c 26) ?l) 650 (setq c (1+ c))) 651 652 ;; Supplemental Mathematical Operators 653 (setq c #x2A00) 654 (while (<= c #x2AFF) 655 (set-case-syntax c "." tbl) 656 (setq c (1+ c))) 657 658 ;; Miscellaneous Symbols and Arrows 659 (setq c #x2B00) 660 (while (<= c #x2BFF) 661 (set-case-syntax c "." tbl) 662 (setq c (1+ c))) 663 664 ;; Coptic 665 ;; There's no Coptic category. However, Coptic letters that are 666 ;; part of the Greek block above get the Greek category, and those 667 ;; in this block are derived from Greek letters, so let's be 668 ;; consistent about their category. 669 (modify-category-entry '(#x2C80 . #x2CFF) ?g) 670 671 ;; Supplemental Punctuation 672 (setq c #x2E00) 673 (while (<= c #x2E7F) 674 (set-case-syntax c "." tbl) 675 (setq c (1+ c))) 676 677 ;; Symbols for Legacy Computing 678 (setq c #x1FB00) 679 (while (<= c #x1FBFF) 680 (set-case-syntax c "." tbl) 681 (setq c (1+ c))) 682 683 ;; Fullwidth Latin 684 (setq c #xff21) 685 (while (<= c #xff3a) 686 (modify-category-entry c ?l) 687 (modify-category-entry (+ c #x20) ?l) 688 (setq c (1+ c))) 689 690 ;; Combining diacritics 691 (modify-category-entry '(#x300 . #x362) ?^) 692 ;; Combining marks 693 (modify-category-entry '(#x20d0 . #x20ff) ?^) 694 695 (let ((gc (unicode-property-table-internal 'general-category)) 696 (syn-table (standard-syntax-table))) 697 ;; In early bootstrapping Unicode tables are not available so we need to 698 ;; skip this step in those cases. 699 (when gc 700 ;; Set all Letter, uppercase; Letter, lowercase and Letter, 701 ;; titlecase syntax to word. 702 (map-char-table 703 (lambda (ch cat) 704 (when (memq cat '(Lu Ll Lt)) 705 (modify-syntax-entry ch "w " syn-table))) 706 gc) 707 ;; Ⅰ through Ⅻ had word syntax in the past so set it here as well. 708 ;; The general category of those characters is Number, Letter. 709 (modify-syntax-entry '(#x2160 . #x216b) "w " syn-table) 710 711 ;; ⓐ through ⓩ are symbols, other according to Unicode but Emacs set 712 ;; their syntax to word in the past so keep backwards compatibility. 713 (modify-syntax-entry '(#x24D0 . #x24E9) "w " syn-table) 714 715 ;; Set downcase and upcase from Unicode properties 716 717 ;; In some languages, such as Turkish, U+0049 LATIN CAPITAL LETTER I and 718 ;; U+0131 LATIN SMALL LETTER DOTLESS I make a case pair, and so do U+0130 719 ;; LATIN CAPITAL LETTER I WITH DOT ABOVE and U+0069 LATIN SMALL LETTER I. 720 721 ;; We used to set up half of those correspondence unconditionally, but 722 ;; that makes searches slow. So now we don't set up either half of these 723 ;; correspondences by default. 724 725 ;; (set-downcase-syntax ?İ ?i tbl) 726 ;; (set-upcase-syntax ?I ?ı tbl) 727 728 (let ((map-unicode-property 729 (lambda (property func) 730 (map-char-table 731 (lambda (ch cased) 732 ;; ASCII characters skipped due to reasons outlined above. As 733 ;; of Unicode 9.0, this exception affects the following: 734 ;; lc(U+0130 İ) = i 735 ;; uc(U+0131 ı) = I 736 ;; uc(U+017F ſ) = S 737 ;; uc(U+212A K) = k 738 (when (> cased 127) 739 (let ((end (if (consp ch) (cdr ch) ch))) 740 (setq ch (max 128 (if (consp ch) (car ch) ch))) 741 (while (<= ch end) 742 (funcall func ch cased) 743 (setq ch (1+ ch)))))) 744 (unicode-property-table-internal property)))) 745 (down tbl) 746 (up (case-table-get-table tbl 'up))) 747 748 ;; This works on an assumption that if toUpper(x) != x then toLower(x) 749 ;; == x (and the opposite for toLower/toUpper). This doesn’t hold for 750 ;; title case characters but those incorrect mappings will be 751 ;; overwritten later. 752 (funcall map-unicode-property 'uppercase 753 (lambda (lc uc) (aset down lc lc) (aset up uc uc))) 754 (funcall map-unicode-property 'lowercase 755 (lambda (uc lc) (aset down lc lc) (aset up uc uc))) 756 757 ;; Now deal with the actual mapping. This will correctly assign casing 758 ;; for title-case characters. 759 (funcall map-unicode-property 'uppercase 760 (lambda (lc uc) (aset up lc uc) (aset up uc uc))) 761 (funcall map-unicode-property 'lowercase 762 (lambda (uc lc) (aset down uc lc) (aset down lc lc)))))) 763 764 ;; Clear out the extra slots so that they will be recomputed from the main 765 ;; (downcase) table and upcase table. Since we’re side-stepping the usual 766 ;; set-case-syntax-* functions, we need to do it explicitly. 767 (set-char-table-extra-slot tbl 1 nil) 768 (set-char-table-extra-slot tbl 2 nil) 769 770 ;; Fixme: syntax for symbols &c 771 ) 772 773(let ((pairs 774 '("⁅⁆" ; U+2045 U+2046 775 "⁽⁾" ; U+207D U+207E 776 "₍₎" ; U+208D U+208E 777 "〈〉" ; U+2329 U+232A 778 "⎴⎵" ; U+23B4 U+23B5 779 "❨❩" ; U+2768 U+2769 780 "❪❫" ; U+276A U+276B 781 "❬❭" ; U+276C U+276D 782 "❰❱" ; U+2770 U+2771 783 "❲❳" ; U+2772 U+2773 784 "❴❵" ; U+2774 U+2775 785 "⟦⟧" ; U+27E6 U+27E7 786 "⟨⟩" ; U+27E8 U+27E9 787 "⟪⟫" ; U+27EA U+27EB 788 "⦃⦄" ; U+2983 U+2984 789 "⦅⦆" ; U+2985 U+2986 790 "⦇⦈" ; U+2987 U+2988 791 "⦉⦊" ; U+2989 U+298A 792 "⦋⦌" ; U+298B U+298C 793 "⦍⦎" ; U+298D U+298E 794 "⦏⦐" ; U+298F U+2990 795 "⦑⦒" ; U+2991 U+2992 796 "⦓⦔" ; U+2993 U+2994 797 "⦕⦖" ; U+2995 U+2996 798 "⦗⦘" ; U+2997 U+2998 799 "⧼⧽" ; U+29FC U+29FD 800 "〈〉" ; U+3008 U+3009 801 "《》" ; U+300A U+300B 802 "「」" ; U+300C U+300D 803 "『』" ; U+300E U+300F 804 "【】" ; U+3010 U+3011 805 "〔〕" ; U+3014 U+3015 806 "〖〗" ; U+3016 U+3017 807 "〘〙" ; U+3018 U+3019 808 "〚〛" ; U+301A U+301B 809 "﴾﴿" ; U+FD3E U+FD3F 810 "︵︶" ; U+FE35 U+FE36 811 "︷︸" ; U+FE37 U+FE38 812 "︹︺" ; U+FE39 U+FE3A 813 "︻︼" ; U+FE3B U+FE3C 814 "︽︾" ; U+FE3D U+FE3E 815 "︿﹀" ; U+FE3F U+FE40 816 "﹁﹂" ; U+FE41 U+FE42 817 "﹃﹄" ; U+FE43 U+FE44 818 "﹙﹚" ; U+FE59 U+FE5A 819 "﹛﹜" ; U+FE5B U+FE5C 820 "﹝﹞" ; U+FE5D U+FE5E 821 "()" ; U+FF08 U+FF09 822 "[]" ; U+FF3B U+FF3D 823 "{}" ; U+FF5B U+FF5D 824 "⦅⦆" ; U+FF5F U+FF60 825 "「」" ; U+FF62 U+FF63 826 ))) 827 (dolist (elt pairs) 828 (modify-syntax-entry (aref elt 0) (string ?\( (aref elt 1))) 829 (modify-syntax-entry (aref elt 1) (string ?\) (aref elt 0))))) 830 831 832;; For each character set, put the information of the most proper 833;; coding system to encode it by `preferred-coding-system' property. 834 835;; Fixme: should this be junked? 836(let ((l '((latin-iso8859-1 . iso-latin-1) 837 (latin-iso8859-2 . iso-latin-2) 838 (latin-iso8859-3 . iso-latin-3) 839 (latin-iso8859-4 . iso-latin-4) 840 (thai-tis620 . thai-tis620) 841 (greek-iso8859-7 . greek-iso-8bit) 842 (arabic-iso8859-6 . iso-2022-7bit) 843 (hebrew-iso8859-8 . hebrew-iso-8bit) 844 (katakana-jisx0201 . japanese-shift-jis) 845 (latin-jisx0201 . japanese-shift-jis) 846 (cyrillic-iso8859-5 . cyrillic-iso-8bit) 847 (latin-iso8859-9 . iso-latin-5) 848 (japanese-jisx0208-1978 . iso-2022-jp) 849 (chinese-gb2312 . chinese-iso-8bit) 850 (chinese-gbk . chinese-gbk) 851 (gb18030-2-byte . chinese-gb18030) 852 (gb18030-4-byte-bmp . chinese-gb18030) 853 (gb18030-4-byte-smp . chinese-gb18030) 854 (gb18030-4-byte-ext-1 . chinese-gb18030) 855 (gb18030-4-byte-ext-2 . chinese-gb18030) 856 (japanese-jisx0208 . iso-2022-jp) 857 (korean-ksc5601 . iso-2022-kr) 858 (japanese-jisx0212 . iso-2022-jp) 859 (chinese-big5-1 . chinese-big5) 860 (chinese-big5-2 . chinese-big5) 861 (chinese-sisheng . iso-2022-7bit) 862 (ipa . iso-2022-7bit) 863 (vietnamese-viscii-lower . vietnamese-viscii) 864 (vietnamese-viscii-upper . vietnamese-viscii) 865 (arabic-digit . iso-2022-7bit) 866 (arabic-1-column . iso-2022-7bit) 867 (lao . lao) 868 (arabic-2-column . iso-2022-7bit) 869 (indian-is13194 . devanagari) 870 (indian-glyph . devanagari) 871 (tibetan-1-column . tibetan) 872 (ethiopic . iso-2022-7bit) 873 (chinese-cns11643-1 . iso-2022-cn) 874 (chinese-cns11643-2 . iso-2022-cn) 875 (chinese-cns11643-3 . iso-2022-cn) 876 (chinese-cns11643-4 . iso-2022-cn) 877 (chinese-cns11643-5 . iso-2022-cn) 878 (chinese-cns11643-6 . iso-2022-cn) 879 (chinese-cns11643-7 . iso-2022-cn) 880 (indian-2-column . devanagari) 881 (tibetan . tibetan) 882 (latin-iso8859-14 . iso-latin-8) 883 (latin-iso8859-15 . iso-latin-9)))) 884 (while l 885 (put-charset-property (car (car l)) 'preferred-coding-system (cdr (car l))) 886 (setq l (cdr l)))) 887 888 889;; Setup auto-fill-chars for charsets that should invoke auto-filling. 890;; SPACE and NEWLINE are already set. 891 892(set-char-table-range auto-fill-chars '(#x3041 . #x30FF) t) 893(set-char-table-range auto-fill-chars '(#x3400 . #x4DB5) t) 894(set-char-table-range auto-fill-chars '(#x4e00 . #x9fbb) t) 895(set-char-table-range auto-fill-chars '(#xF900 . #xFAFF) t) 896(set-char-table-range auto-fill-chars '(#xFF00 . #xFF9F) t) 897(set-char-table-range auto-fill-chars '(#x20000 . #x2FFFF) t) 898 899 900;;; Setting char-width-table. The default is 1. 901 902;; 0: non-spacing, enclosing combining, formatting, Hangul Jamo medial 903;; and final characters. 904(let ((l '((#x0300 . #x036F) 905 (#x0483 . #x0489) 906 (#x0591 . #x05BD) 907 (#x05BF . #x05BF) 908 (#x05C1 . #x05C2) 909 (#x05C4 . #x05C5) 910 (#x05C7 . #x05C7) 911 (#x0600 . #x0605) 912 (#x0610 . #x061C) 913 (#x064B . #x065F) 914 (#x0670 . #x0670) 915 (#x06D6 . #x06E4) 916 (#x06E7 . #x06E8) 917 (#x06EA . #x06ED) 918 (#x070F . #x070F) 919 (#x0711 . #x0711) 920 (#x0730 . #x074A) 921 (#x07A6 . #x07B0) 922 (#x07EB . #x07F3) 923 (#x0816 . #x0823) 924 (#x0825 . #x082D) 925 (#x0859 . #x085B) 926 (#x08D4 . #x0902) 927 (#x093A . #x093A) 928 (#x093C . #x093C) 929 (#x0941 . #x0948) 930 (#x094D . #x094D) 931 (#x0951 . #x0957) 932 (#x0962 . #x0963) 933 (#x0981 . #x0981) 934 (#x09BC . #x09BC) 935 (#x09C1 . #x09C4) 936 (#x09CD . #x09CD) 937 (#x09E2 . #x09E3) 938 (#x0A01 . #x0A02) 939 (#x0A3C . #x0A3C) 940 (#x0A41 . #x0A4D) 941 (#x0A41 . #x0A42) 942 (#x0A47 . #x0A48) 943 (#x0A4B . #x0A4D) 944 (#x0A51 . #x0A51) 945 (#x0A70 . #x0A71) 946 (#x0A75 . #x0A75) 947 (#x0A81 . #x0A82) 948 (#x0ABC . #x0ABC) 949 (#x0AC1 . #x0AC8) 950 (#x0ACD . #x0ACD) 951 (#x0AE2 . #x0AE3) 952 (#x0B01 . #x0B01) 953 (#x0B3C . #x0B3C) 954 (#x0B3F . #x0B3F) 955 (#x0B41 . #x0B44) 956 (#x0B4D . #x0B56) 957 (#x0B62 . #x0B63) 958 (#x0B82 . #x0B82) 959 (#x0BC0 . #x0BC0) 960 (#x0BCD . #x0BCD) 961 (#x0C00 . #x0C00) 962 (#x0C3E . #x0C40) 963 (#x0C46 . #x0C56) 964 (#x0C62 . #x0C63) 965 (#x0C81 . #x0C81) 966 (#x0CBC . #x0CBC) 967 (#x0CCC . #x0CCD) 968 (#x0CE2 . #x0CE3) 969 (#x0D01 . #x0D01) 970 (#x0D41 . #x0D44) 971 (#x0D4D . #x0D4D) 972 (#x0D62 . #x0D63) 973 (#x0D81 . #x0D81) 974 (#x0DCA . #x0DCA) 975 (#x0DD2 . #x0DD6) 976 (#x0E31 . #x0E31) 977 (#x0E34 . #x0E3A) 978 (#x0E47 . #x0E4E) 979 (#x0EB1 . #x0EB1) 980 (#x0EB4 . #x0EBC) 981 (#x0EC8 . #x0ECD) 982 (#x0F18 . #x0F19) 983 (#x0F35 . #x0F35) 984 (#x0F37 . #x0F37) 985 (#x0F39 . #x0F39) 986 (#x0F71 . #x0F7E) 987 (#x0F80 . #x0F84) 988 (#x0F86 . #x0F87) 989 (#x0F8D . #x0FBC) 990 (#x0FC6 . #x0FC6) 991 (#x102D . #x1030) 992 (#x1032 . #x1037) 993 (#x1039 . #x103A) 994 (#x103D . #x103E) 995 (#x1058 . #x1059) 996 (#x105E . #x1060) 997 (#x1071 . #x1074) 998 (#x1082 . #x1082) 999 (#x1085 . #x1086) 1000 (#x108D . #x108D) 1001 (#x109D . #x109D) 1002 (#x1160 . #x11FF) 1003 (#x135D . #x135F) 1004 (#x1712 . #x1714) 1005 (#x1732 . #x1734) 1006 (#x1752 . #x1753) 1007 (#x1772 . #x1773) 1008 (#x17B4 . #x17B5) 1009 (#x17B7 . #x17BD) 1010 (#x17C6 . #x17C6) 1011 (#x17C9 . #x17D3) 1012 (#x17DD . #x17DD) 1013 (#x180B . #x180E) 1014 (#x18A9 . #x18A9) 1015 (#x1885 . #x1886) 1016 (#x18A9 . #x18A9) 1017 (#x1920 . #x1922) 1018 (#x1927 . #x1928) 1019 (#x1932 . #x1932) 1020 (#x1939 . #x193B) 1021 (#x1A17 . #x1A18) 1022 (#x1A1B . #x1A1B) 1023 (#x1A56 . #x1A56) 1024 (#x1A58 . #x1A5E) 1025 (#x1A60 . #x1A60) 1026 (#x1A62 . #x1A62) 1027 (#x1A65 . #x1A6C) 1028 (#x1A73 . #x1A7C) 1029 (#x1A7F . #x1A7F) 1030 (#x1AB0 . #x1AC0) 1031 (#x1B00 . #x1B03) 1032 (#x1B34 . #x1B34) 1033 (#x1B36 . #x1B3A) 1034 (#x1B3C . #x1B3C) 1035 (#x1B42 . #x1B42) 1036 (#x1B6B . #x1B73) 1037 (#x1B80 . #x1B81) 1038 (#x1BA2 . #x1BA5) 1039 (#x1BA8 . #x1BA9) 1040 (#x1BAB . #x1BAD) 1041 (#x1BE6 . #x1BE6) 1042 (#x1BE8 . #x1BE9) 1043 (#x1BED . #x1BED) 1044 (#x1BEF . #x1BF1) 1045 (#x1C2C . #x1C33) 1046 (#x1C36 . #x1C37) 1047 (#x1CD0 . #x1CD2) 1048 (#x1CD4 . #x1CE0) 1049 (#x1CE2 . #x1CE8) 1050 (#x1CED . #x1CED) 1051 (#x1CF4 . #x1CF4) 1052 (#x1CF8 . #x1CF9) 1053 (#x1DC0 . #x1DFF) 1054 (#x200B . #x200F) 1055 (#x202A . #x202E) 1056 (#x2060 . #x206F) 1057 (#x20D0 . #x20F0) 1058 (#x2CEF . #x2CF1) 1059 (#x2D7F . #x2D7F) 1060 (#x2DE0 . #x2DFF) 1061 (#xA66F . #xA672) 1062 (#xA674 . #xA69F) 1063 (#xA6F0 . #xA6F1) 1064 (#xA802 . #xA802) 1065 (#xA806 . #xA806) 1066 (#xA80B . #xA80B) 1067 (#xA825 . #xA826) 1068 (#xA82C . #xA82C) 1069 (#xA8C4 . #xA8C5) 1070 (#xA8E0 . #xA8F1) 1071 (#xA926 . #xA92D) 1072 (#xA947 . #xA951) 1073 (#xA980 . #xA9B3) 1074 (#xA9B6 . #xA9B9) 1075 (#xA9BC . #xA9BC) 1076 (#xA9E5 . #xA9E5) 1077 (#xAA29 . #xAA2E) 1078 (#xAA31 . #xAA32) 1079 (#xAA35 . #xAA36) 1080 (#xAA43 . #xAA43) 1081 (#xAA4C . #xAA4C) 1082 (#xAA7C . #xAA7C) 1083 (#xAAB0 . #xAAB0) 1084 (#xAAB2 . #xAAB4) 1085 (#xAAB7 . #xAAB8) 1086 (#xAABE . #xAABF) 1087 (#xAAC1 . #xAAC1) 1088 (#xAAEC . #xAAED) 1089 (#xAAF6 . #xAAF6) 1090 (#xABE5 . #xABE5) 1091 (#xABE8 . #xABE8) 1092 (#xABED . #xABED) 1093 (#xD7B0 . #xD7FB) 1094 (#xFB1E . #xFB1E) 1095 (#xFE00 . #xFE0F) 1096 (#xFE20 . #xFE2F) 1097 (#xFEFF . #xFEFF) 1098 (#xFFF9 . #xFFFB) 1099 (#x101FD . #x101FD) 1100 (#x102E0 . #x102E0) 1101 (#x10376 . #x1037A) 1102 (#x10A01 . #x10A0F) 1103 (#x10A38 . #x10A3F) 1104 (#x10AE5 . #x10AE6) 1105 (#x10EAB . #x10EAC) 1106 (#x11001 . #x11001) 1107 (#x11038 . #x11046) 1108 (#x1107F . #x11081) 1109 (#x110B3 . #x110B6) 1110 (#x110B9 . #x110BA) 1111 (#x110BD . #x110BD) 1112 (#x11100 . #x11102) 1113 (#x11127 . #x1112B) 1114 (#x1112D . #x11134) 1115 (#x11173 . #x11173) 1116 (#x11180 . #x11181) 1117 (#x111B6 . #x111BE) 1118 (#x111CA . #x111CC) 1119 (#x111CF . #x111CF) 1120 (#x1122F . #x11231) 1121 (#x11234 . #x11234) 1122 (#x11236 . #x11237) 1123 (#x1123E . #x1123E) 1124 (#x112DF . #x112DF) 1125 (#x112E3 . #x112EA) 1126 (#x11300 . #x11301) 1127 (#x1133C . #x1133C) 1128 (#x11340 . #x11340) 1129 (#x11366 . #x1136C) 1130 (#x11370 . #x11374) 1131 (#x11438 . #x1143F) 1132 (#x11442 . #x11444) 1133 (#x11446 . #x11446) 1134 (#x114B3 . #x114B8) 1135 (#x114BA . #x114C0) 1136 (#x114C2 . #x114C3) 1137 (#x115B2 . #x115B5) 1138 (#x115BC . #x115BD) 1139 (#x115BF . #x115C0) 1140 (#x115DC . #x115DD) 1141 (#x11633 . #x1163A) 1142 (#x1163D . #x1163D) 1143 (#x1163F . #x11640) 1144 (#x116AB . #x116AB) 1145 (#x116AD . #x116AD) 1146 (#x116B0 . #x116B5) 1147 (#x116B7 . #x116B7) 1148 (#x1171D . #x1171F) 1149 (#x11722 . #x11725) 1150 (#x11727 . #x1172B) 1151 (#x1193B . #x1193C) 1152 (#x1193E . #x1193E) 1153 (#x11943 . #x11943) 1154 (#x11C30 . #x11C36) 1155 (#x11C38 . #x11C3D) 1156 (#x11C92 . #x11CA7) 1157 (#x11CAA . #x11CB0) 1158 (#x11CB2 . #x11CB3) 1159 (#x11CB5 . #x11CB6) 1160 (#x16AF0 . #x16AF4) 1161 (#x16B30 . #x16B36) 1162 (#x16F8F . #x16F92) 1163 (#x16FE4 . #x16FE4) 1164 (#x1BC9D . #x1BC9E) 1165 (#x1BCA0 . #x1BCA3) 1166 (#x1D167 . #x1D169) 1167 (#x1D173 . #x1D182) 1168 (#x1D185 . #x1D18B) 1169 (#x1D1AA . #x1D1AD) 1170 (#x1D242 . #x1D244) 1171 (#x1DA00 . #x1DA36) 1172 (#x1DA3B . #x1DA6C) 1173 (#x1DA75 . #x1DA75) 1174 (#x1DA84 . #x1DA84) 1175 (#x1DA9B . #x1DA9F) 1176 (#x1DAA1 . #x1DAAF) 1177 (#x1E000 . #x1E006) 1178 (#x1E008 . #x1E018) 1179 (#x1E01B . #x1E021) 1180 (#x1E023 . #x1E024) 1181 (#x1E026 . #x1E02A) 1182 (#x1E8D0 . #x1E8D6) 1183 (#x1E944 . #x1E94A) 1184 (#xE0001 . #xE01EF)))) 1185 (dolist (elt l) 1186 (set-char-table-range char-width-table elt 0))) 1187 1188;; 2: East Asian Wide and Full-width characters. 1189(let ((l '((#x1100 . #x115F) 1190 (#x231A . #x231B) 1191 (#x2329 . #x232A) 1192 (#x23E9 . #x23EC) 1193 (#x23F0 . #x23F0) 1194 (#x23F3 . #x23F3) 1195 (#x25FD . #x25FE) 1196 (#x2614 . #x2615) 1197 (#x2648 . #x2653) 1198 (#x267F . #x267F) 1199 (#x2693 . #x2693) 1200 (#x26A1 . #x26A1) 1201 (#x26AA . #x26AB) 1202 (#x26BD . #x26BE) 1203 (#x26C4 . #x26C5) 1204 (#x26CE . #x26CE) 1205 (#x26D4 . #x26D4) 1206 (#x26EA . #x26EA) 1207 (#x26F2 . #x26F3) 1208 (#x26F5 . #x26F5) 1209 (#x26FA . #x26FA) 1210 (#x26FD . #x26FD) 1211 (#x2705 . #x2705) 1212 (#x270A . #x270B) 1213 (#x2728 . #x2728) 1214 (#x274C . #x274C) 1215 (#x274E . #x274E) 1216 (#x2753 . #x2755) 1217 (#x2757 . #x2757) 1218 (#x2795 . #x2797) 1219 (#x27B0 . #x27B0) 1220 (#x27BF . #x27BF) 1221 (#x2B1B . #x2B1C) 1222 (#x2B50 . #x2B50) 1223 (#x2B55 . #x2B55) 1224 (#x2E80 . #x303E) 1225 (#x3040 . #x3247) 1226 (#x3250 . #x4DBF) 1227 (#x4E00 . #x9FFF) 1228 (#xA490 . #xA4C6) 1229 (#xA960 . #xA97F) 1230 (#xAC00 . #xD7A3) 1231 (#xF900 . #xFAFF) 1232 (#xFE10 . #xFE19) 1233 (#xFE30 . #xFE6F) 1234 (#xFF01 . #xFF60) 1235 (#xFFE0 . #xFFE6) 1236 (#x16FE0 . #x16FE4) 1237 (#x16FF0 . #x16FF1) 1238 (#x17000 . #x187F7) 1239 (#x18800 . #x18AFF) 1240 (#x18B00 . #x18CD5) 1241 (#x1B000 . #x1B152) 1242 (#x1B164 . #x1B167) 1243 (#x1B170 . #x1B2FB) 1244 (#x1F004 . #x1F004) 1245 (#x1F0CF . #x1F0CF) 1246 (#x1F18E . #x1F18E) 1247 (#x1F191 . #x1F19A) 1248 (#x1F1AD . #x1F1AD) 1249 (#x1F200 . #x1F320) 1250 (#x1F32D . #x1F335) 1251 (#x1F337 . #x1F37C) 1252 (#x1F37E . #x1F393) 1253 (#x1F3A0 . #x1F3CA) 1254 (#x1F3CF . #x1F3D3) 1255 (#x1F3E0 . #x1F3F0) 1256 (#x1F3F4 . #x1F3F4) 1257 (#x1F3F8 . #x1F3FA) 1258 (#x1F3FB . #x1F3FF) 1259 (#x1F400 . #x1F43E) 1260 (#x1F440 . #x1F440) 1261 (#x1F442 . #x1F4FC) 1262 (#x1F4FF . #x1F53D) 1263 (#x1F54B . #x1F54E) 1264 (#x1F550 . #x1F567) 1265 (#x1F57A . #x1F57A) 1266 (#x1F595 . #x1F596) 1267 (#x1F5A4 . #x1F5A4) 1268 (#x1F5FB . #x1F5FF) 1269 (#x1F600 . #x1F64F) 1270 (#x1F680 . #x1F6C5) 1271 (#x1F6CC . #x1F6CC) 1272 (#x1F6D0 . #x1F6D2) 1273 (#x1F6D5 . #x1F6D7) 1274 (#x1F6EB . #x1F6EC) 1275 (#x1F6F4 . #x1F6FC) 1276 (#x1F7E0 . #x1F7EB) 1277 (#x1F90C . #x1F93A) 1278 (#x1F93C . #x1F945) 1279 (#x1F947 . #x1F978) 1280 (#x1F97A . #x1F9CB) 1281 (#x1F9A5 . #x1F9AA) 1282 (#x1F9AE . #x1F9CA) 1283 (#x1F9CD . #x1F9FF) 1284 (#x1FA00 . #x1FA53) 1285 (#x1FA60 . #x1FA6D) 1286 (#x1FA70 . #x1FA74) 1287 (#x1FA78 . #x1FA7A) 1288 (#x1FA80 . #x1FA86) 1289 (#x1FA90 . #x1FAA8) 1290 (#x1FAB0 . #x1FAB6) 1291 (#x1FAC0 . #x1FAC2) 1292 (#x1FAD0 . #x1FAD6) 1293 (#x1FB00 . #x1FB92) 1294 (#x20000 . #x2FFFF) 1295 (#x30000 . #x3FFFF)))) 1296 (dolist (elt l) 1297 (set-char-table-range char-width-table elt 2))) 1298 1299;; Other double width 1300;;(map-charset-chars 1301;; (lambda (range ignore) (set-char-table-range char-width-table range 2)) 1302;; 'ethiopic) 1303;; (map-charset-chars 1304;; (lambda (range ignore) (set-char-table-range char-width-table range 2)) 1305;; 'tibetan) 1306(map-charset-chars 1307 (lambda (range _ignore) (set-char-table-range char-width-table range 2)) 1308 'indian-2-column) 1309(map-charset-chars 1310 (lambda (range _ignore) (set-char-table-range char-width-table range 2)) 1311 'arabic-2-column) 1312 1313;; Internal use only. 1314;; Alist of locale symbol vs charsets. In a language environment 1315;; corresponding to the locale, width of characters in the charsets is 1316;; set to 2. Each element has the form: 1317;; (LOCALE TABLE (CHARSET (FROM-CODE . TO-CODE) ...) ...) 1318;; LOCALE: locale symbol 1319;; TABLE: char-table used for char-width-table, initially nil. 1320;; CHARSET: character set 1321;; FROM-CODE, TO-CODE: range of code-points in CHARSET 1322 1323(defvar cjk-char-width-table-list 1324 '((ja_JP nil (japanese-jisx0208 (#x2121 . #x287E)) 1325 (cp932-2-byte (#x8140 . #x879F))) 1326 (zh_CN nil (chinese-gb2312 (#x2121 . #x297E))) 1327 (zh_HK nil (big5-hkscs (#xA140 . #xA3FE) (#xC6A0 . #xC8FE))) 1328 (zh_TW nil (big5 (#xA140 . #xA3FE)) 1329 (chinese-cns11643-1 (#x2121 . #x427E))) 1330 (ko_KR nil (korean-ksc5601 (#x2121 . #x2C7E))))) 1331 1332;; Internal use only. 1333;; Setup char-width-table appropriate for a language environment 1334;; corresponding to LOCALE-NAME (symbol). 1335 1336(defun use-cjk-char-width-table (locale-name) 1337 (while (char-table-parent char-width-table) 1338 (setq char-width-table (char-table-parent char-width-table))) 1339 (let ((slot (assq locale-name cjk-char-width-table-list))) 1340 (or slot (error "Unknown locale for CJK language environment: %s" 1341 locale-name)) 1342 (unless (nth 1 slot) 1343 (let ((table (make-char-table nil))) 1344 (dolist (charset-info (nthcdr 2 slot)) 1345 (let ((charset (car charset-info))) 1346 (dolist (code-range (cdr charset-info)) 1347 (map-charset-chars #'(lambda (range _arg) 1348 (set-char-table-range table range 2)) 1349 charset nil 1350 (car code-range) (cdr code-range))))) 1351 (optimize-char-table table) 1352 (set-char-table-parent table char-width-table) 1353 (setcar (cdr slot) table))) 1354 (setq char-width-table (nth 1 slot)))) 1355 1356(defun use-default-char-width-table () 1357 "Internal use only. 1358Setup char-width-table appropriate for non-CJK language environment." 1359 (while (char-table-parent char-width-table) 1360 (setq char-width-table (char-table-parent char-width-table)))) 1361 1362(optimize-char-table (standard-case-table)) 1363(optimize-char-table (standard-syntax-table)) 1364 1365 1366;; Setting char-script-table. 1367(if dump-mode 1368 ;; While dumping, we can't use require, and international is not 1369 ;; in load-path. 1370 (load "international/charscript") 1371 (require 'charscript)) 1372 1373(map-charset-chars 1374 #'(lambda (range _ignore) 1375 (set-char-table-range char-script-table range 'tibetan)) 1376 'tibetan) 1377 1378 1379;;; Setting unicode-category-table. 1380 1381(when (setq unicode-category-table 1382 (unicode-property-table-internal 'general-category)) 1383 (map-char-table #'(lambda (key val) 1384 (if val 1385 (cond ((or (and (/= (aref (symbol-name val) 0) ?M) 1386 (/= (aref (symbol-name val) 0) ?C)) 1387 (eq val 'Zs)) 1388 (modify-category-entry key ?.)) 1389 ((eq val 'Mn) 1390 (modify-category-entry key ?^))))) 1391 unicode-category-table)) 1392 1393(optimize-char-table (standard-category-table)) 1394 1395 1396;; Display of glyphless characters. 1397 1398(defvar char-acronym-table 1399 (make-char-table 'char-acronym-table nil) 1400 "Char table of acronyms for non-graphic characters.") 1401 1402(let ((c0-acronyms '("NUL" "SOH" "STX" "ETX" "EOT" "ENQ" "ACK" "BEL" 1403 "BS" nil nil "VT" "FF" "CR" "SO" "SI" 1404 "DLE" "DC1" "DC2" "DC3" "DC4" "NAK" "SYN" "ETB" 1405 "CAN" "EM" "SUB" "ESC" "FC" "GS" "RS" "US"))) 1406 (dotimes (i 32) 1407 (aset char-acronym-table i (car c0-acronyms)) 1408 (setq c0-acronyms (cdr c0-acronyms)))) 1409 1410(let ((c1-acronyms '("PAD" "HOP" "BPH" "NBH" "IND" "NEL" "SSA" "ESA" 1411 "HTS" "HTJ" "VTS" "PLD" "PLU" "R1" "SS2" "SS1" 1412 "DCS" "PU1" "PU2" "STS" "CCH" "MW" "SPA" "EPA" 1413 "SOS" "SGCI" "SC1" "CSI" "ST" "OSC" "PM" "APC"))) 1414 (dotimes (i 32) 1415 (aset char-acronym-table (+ #x0080 i) (car c1-acronyms)) 1416 (setq c1-acronyms (cdr c1-acronyms)))) 1417 1418(aset char-acronym-table #x17B4 "KIVAQ") ; KHMER VOWEL INHERENT AQ 1419(aset char-acronym-table #x17B5 "KIVAA") ; KHMER VOWEL INHERENT AA 1420(aset char-acronym-table #x200B "ZWSP") ; ZERO WIDTH SPACE 1421(aset char-acronym-table #x200C "ZWNJ") ; ZERO WIDTH NON-JOINER 1422(aset char-acronym-table #x200D "ZWJ") ; ZERO WIDTH JOINER 1423(aset char-acronym-table #x200E "LRM") ; LEFT-TO-RIGHT MARK 1424(aset char-acronym-table #x200F "RLM") ; RIGHT-TO-LEFT MARK 1425(aset char-acronym-table #x202A "LRE") ; LEFT-TO-RIGHT EMBEDDING 1426(aset char-acronym-table #x202B "RLE") ; RIGHT-TO-LEFT EMBEDDING 1427(aset char-acronym-table #x202C "PDF") ; POP DIRECTIONAL FORMATTING 1428(aset char-acronym-table #x202D "LRO") ; LEFT-TO-RIGHT OVERRIDE 1429(aset char-acronym-table #x202E "RLO") ; RIGHT-TO-LEFT OVERRIDE 1430(aset char-acronym-table #x2060 "WJ") ; WORD JOINER 1431(aset char-acronym-table #x206A "ISS") ; INHIBIT SYMMETRIC SWAPPING 1432(aset char-acronym-table #x206B "ASS") ; ACTIVATE SYMMETRIC SWAPPING 1433(aset char-acronym-table #x206C "IAFS") ; INHIBIT ARABIC FORM SHAPING 1434(aset char-acronym-table #x206D "AAFS") ; ACTIVATE ARABIC FORM SHAPING 1435(aset char-acronym-table #x206E "NADS") ; NATIONAL DIGIT SHAPES 1436(aset char-acronym-table #x206F "NODS") ; NOMINAL DIGIT SHAPES 1437(aset char-acronym-table #xFEFF "ZWNBSP") ; ZERO WIDTH NO-BREAK SPACE 1438(aset char-acronym-table #xFFF9 "IAA") ; INTERLINEAR ANNOTATION ANCHOR 1439(aset char-acronym-table #xFFFA "IAS") ; INTERLINEAR ANNOTATION SEPARATOR 1440(aset char-acronym-table #xFFFB "IAT") ; INTERLINEAR ANNOTATION TERMINATOR 1441(aset char-acronym-table #x1D173 "BEGBM") ; MUSICAL SYMBOL BEGIN BEAM 1442(aset char-acronym-table #x1D174 "ENDBM") ; MUSICAL SYMBOL END BEAM 1443(aset char-acronym-table #x1D175 "BEGTIE") ; MUSICAL SYMBOL BEGIN TIE 1444(aset char-acronym-table #x1D176 "END") ; MUSICAL SYMBOL END TIE 1445(aset char-acronym-table #x1D177 "BEGSLR") ; MUSICAL SYMBOL BEGIN SLUR 1446(aset char-acronym-table #x1D178 "ENDSLR") ; MUSICAL SYMBOL END SLUR 1447(aset char-acronym-table #x1D179 "BEGPHR") ; MUSICAL SYMBOL BEGIN PHRASE 1448(aset char-acronym-table #x1D17A "ENDPHR") ; MUSICAL SYMBOL END PHRASE 1449(aset char-acronym-table #xE0001 "|->TAG") ; LANGUAGE TAG 1450(aset char-acronym-table #xE0020 "SP TAG") ; TAG SPACE 1451(dotimes (i 94) 1452 (aset char-acronym-table (+ #xE0021 i) (format " %c TAG" (+ 33 i)))) 1453(aset char-acronym-table #xE007F "->|TAG") ; CANCEL TAG 1454 1455(defun update-glyphless-char-display (&optional variable value) 1456 "Make the setting of `glyphless-char-display-control' take effect. 1457This function updates the char-table `glyphless-char-display', 1458and is intended to be used in the `:set' attribute of the 1459option `glyphless-char-display'." 1460 (when value 1461 (set-default variable value)) 1462 (dolist (elt value) 1463 (let ((target (car elt)) 1464 (method (cdr elt))) 1465 (or (memq method '(zero-width thin-space empty-box acronym hex-code)) 1466 (error "Invalid glyphless character display method: %s" method)) 1467 (cond ((eq target 'c0-control) 1468 (glyphless-set-char-table-range glyphless-char-display 1469 #x00 #x1F method) 1470 ;; Users will not expect their newlines and TABs be 1471 ;; displayed as anything but themselves, so exempt those 1472 ;; two characters from c0-control. 1473 (set-char-table-range glyphless-char-display #x9 nil) 1474 (set-char-table-range glyphless-char-display #xa nil)) 1475 ((eq target 'c1-control) 1476 (glyphless-set-char-table-range glyphless-char-display 1477 #x80 #x9F method)) 1478 ((eq target 'format-control) 1479 (when unicode-category-table 1480 (map-char-table 1481 #'(lambda (char category) 1482 (if (eq category 'Cf) 1483 (let ((this-method method) 1484 from to) 1485 (if (consp char) 1486 (setq from (car char) to (cdr char)) 1487 (setq from char to char)) 1488 (while (<= from to) 1489 (when (/= from #xAD) 1490 (if (eq method 'acronym) 1491 (setq this-method 1492 (aref char-acronym-table from))) 1493 (set-char-table-range glyphless-char-display 1494 from this-method)) 1495 (setq from (1+ from)))))) 1496 unicode-category-table))) 1497 ((eq target 'no-font) 1498 (set-char-table-extra-slot glyphless-char-display 0 method)) 1499 (t 1500 (error "Invalid glyphless character group: %s" target)))))) 1501 1502(defun glyphless-set-char-table-range (chartable from to method) 1503 (if (eq method 'acronym) 1504 (let ((i from)) 1505 (while (<= i to) 1506 (set-char-table-range chartable i (aref char-acronym-table i)) 1507 (setq i (1+ i)))) 1508 (set-char-table-range chartable (cons from to) method))) 1509 1510;;; Control of displaying glyphless characters. 1511(defcustom glyphless-char-display-control 1512 '((format-control . thin-space) 1513 (no-font . hex-code)) 1514 "List of directives to control display of glyphless characters. 1515 1516Each element has the form (GROUP . METHOD), where GROUP is a 1517symbol specifying the character group, and METHOD is a symbol 1518specifying the method of displaying characters belonging to that 1519group. 1520 1521GROUP must be one of these symbols: 1522 `c0-control': U+0000..U+001F, but excluding newline and TAB. 1523 `c1-control': U+0080..U+009F. 1524 `format-control': Characters of Unicode General Category `Cf', 1525 such as U+200C (ZWNJ), U+200E (LRM), but 1526 excluding characters that have graphic images, 1527 such as U+00AD (SHY). 1528 `no-font': characters for which no suitable font is found. 1529 For character terminals, characters that cannot 1530 be encoded by `terminal-coding-system'. 1531 1532METHOD must be one of these symbols: 1533 `zero-width': don't display. 1534 `thin-space': display a thin (1-pixel width) space. On character 1535 terminals, display as 1-character space. 1536 `empty-box': display an empty box. 1537 `acronym': display an acronym of the character in a box. The 1538 acronym is taken from `char-acronym-table', which see. 1539 `hex-code': display the hexadecimal character code in a box. 1540 1541Do not set its value directly from Lisp; the value takes effect 1542only via a custom `:set' 1543function (`update-glyphless-char-display'), which updates 1544`glyphless-char-display'." 1545 :version "24.1" 1546 :type '(alist :key-type (symbol :tag "Character Group") 1547 :value-type (symbol :tag "Display Method")) 1548 :options '((c0-control 1549 (choice (const :tag "Don't display" zero-width) 1550 (const :tag "Display as thin space" thin-space) 1551 (const :tag "Display as empty box" empty-box) 1552 (const :tag "Display acronym" acronym) 1553 (const :tag "Display hex code in a box" hex-code))) 1554 (c1-control 1555 (choice (const :tag "Don't display" zero-width) 1556 (const :tag "Display as thin space" thin-space) 1557 (const :tag "Display as empty box" empty-box) 1558 (const :tag "Display acronym" acronym) 1559 (const :tag "Display hex code in a box" hex-code))) 1560 (format-control 1561 (choice (const :tag "Don't display" zero-width) 1562 (const :tag "Display as thin space" thin-space) 1563 (const :tag "Display as empty box" empty-box) 1564 (const :tag "Display acronym" acronym) 1565 (const :tag "Display hex code in a box" hex-code))) 1566 (no-font 1567 (choice (const :tag "Don't display" zero-width) 1568 (const :tag "Display as thin space" thin-space) 1569 (const :tag "Display as empty box" empty-box) 1570 (const :tag "Display acronym" acronym) 1571 (const :tag "Display hex code in a box" hex-code)))) 1572 :set 'update-glyphless-char-display 1573 :group 'display) 1574 1575 1576;;; Setting word boundary. 1577 1578(setq word-combining-categories 1579 '((nil . ?^) 1580 (?^ . nil) 1581 (?C . ?H) 1582 (?C . ?K))) 1583 1584(setq word-separating-categories ; (2-byte character sets) 1585 '((?H . ?K) ; Hiragana - Katakana 1586 )) 1587 1588;; Local Variables: 1589;; coding: utf-8 1590;; End: 1591 1592;;; characters.el ends here 1593