1#!/usr/local/bin/python3.8 2# vim:fileencoding=utf-8 3# License: GPL v3 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net> 4 5import os 6import re 7import sys 8from collections import defaultdict 9from contextlib import contextmanager 10from datetime import date 11from functools import partial 12from html.entities import html5 13from itertools import groupby 14from operator import itemgetter 15from typing import ( 16 Callable, DefaultDict, Dict, FrozenSet, Generator, Iterable, List, 17 Optional, Set, Tuple, Union 18) 19from urllib.request import urlopen 20 21os.chdir(os.path.dirname(os.path.abspath(__file__))) 22 23non_characters = frozenset(range(0xfffe, 0x10ffff, 0x10000)) 24non_characters |= frozenset(range(0xffff, 0x10ffff + 1, 0x10000)) 25non_characters |= frozenset(range(0xfdd0, 0xfdf0)) 26if len(non_characters) != 66: 27 raise SystemExit('non_characters table incorrect') 28emoji_skin_tone_modifiers = frozenset(range(0x1f3fb, 0x1F3FF + 1)) 29 30 31def get_data(fname: str, folder: str = 'UCD') -> Iterable[str]: 32 url = f'https://www.unicode.org/Public/{folder}/latest/{fname}' 33 bn = os.path.basename(url) 34 local = os.path.join('/tmp', bn) 35 if os.path.exists(local): 36 with open(local, 'rb') as f: 37 data = f.read() 38 else: 39 data = urlopen(url).read() 40 with open(local, 'wb') as f: 41 f.write(data) 42 for line in data.decode('utf-8').splitlines(): 43 line = line.strip() 44 if line and not line.startswith('#'): 45 yield line 46 47 48# Map of class names to set of codepoints in class 49class_maps: Dict[str, Set[int]] = {} 50all_symbols: Set[int] = set() 51name_map: Dict[int, str] = {} 52word_search_map: DefaultDict[str, Set[int]] = defaultdict(set) 53zwj = 0x200d 54flag_codepoints = frozenset(range(0x1F1E6, 0x1F1E6 + 26)) 55marks = set(emoji_skin_tone_modifiers) | {zwj} | flag_codepoints 56not_assigned = set(range(0, sys.maxunicode)) 57 58 59def parse_ucd() -> None: 60 61 def add_word(w: str, c: int) -> None: 62 if c <= 32 or c == 127 or 128 <= c <= 159: 63 return 64 if len(w) > 1: 65 word_search_map[w.lower()].add(c) 66 67 first: Optional[int] = None 68 for word, c in html5.items(): 69 if len(c) == 1: 70 add_word(word.rstrip(';'), ord(c)) 71 word_search_map['nnbsp'].add(0x202f) 72 for line in get_data('ucd/UnicodeData.txt'): 73 parts = [x.strip() for x in line.split(';')] 74 codepoint = int(parts[0], 16) 75 name = parts[1] or parts[10] 76 if name == '<control>': 77 name = parts[10] 78 if name: 79 name_map[codepoint] = name 80 for word in name.lower().split(): 81 add_word(word, codepoint) 82 category = parts[2] 83 s = class_maps.setdefault(category, set()) 84 desc = parts[1] 85 codepoints: Union[Tuple[int, ...], Iterable[int]] = (codepoint,) 86 if first is None: 87 if desc.endswith(', First>'): 88 first = codepoint 89 continue 90 else: 91 codepoints = range(first, codepoint + 1) 92 first = None 93 for codepoint in codepoints: 94 s.add(codepoint) 95 not_assigned.discard(codepoint) 96 if category.startswith('M'): 97 marks.add(codepoint) 98 elif category.startswith('S'): 99 all_symbols.add(codepoint) 100 101 with open('nerd-fonts-glyphs.txt') as f: 102 for line in f: 103 line = line.strip() 104 if not line or line.startswith('#'): 105 continue 106 code, category, name = line.split(' ', 2) 107 codepoint = int(code, 16) 108 if name and codepoint not in name_map: 109 name_map[codepoint] = name.upper() 110 for word in name.lower().split(): 111 add_word(word, codepoint) 112 113 # Some common synonyms 114 word_search_map['bee'] |= word_search_map['honeybee'] 115 word_search_map['lambda'] |= word_search_map['lamda'] 116 word_search_map['lamda'] |= word_search_map['lambda'] 117 word_search_map['diamond'] |= word_search_map['gem'] 118 119 120def parse_range_spec(spec: str) -> Set[int]: 121 spec = spec.strip() 122 if '..' in spec: 123 chars_ = tuple(map(lambda x: int(x, 16), filter(None, spec.split('.')))) 124 chars = set(range(chars_[0], chars_[1] + 1)) 125 else: 126 chars = {int(spec, 16)} 127 return chars 128 129 130def split_two(line: str) -> Tuple[Set[int], str]: 131 spec, rest = line.split(';', 1) 132 spec, rest = spec.strip(), rest.strip().split(' ', 1)[0].strip() 133 return parse_range_spec(spec), rest 134 135 136all_emoji: Set[int] = set() 137emoji_presentation_bases: Set[int] = set() 138narrow_emoji: Set[int] = set() 139wide_emoji: Set[int] = set() 140flags: Dict[int, List[int]] = {} 141 142 143def parse_basic_emoji(spec: str) -> None: 144 parts = list(filter(None, spec.split())) 145 has_emoji_presentation = len(parts) < 2 146 chars = parse_range_spec(parts[0]) 147 all_emoji.update(chars) 148 emoji_presentation_bases.update(chars) 149 (wide_emoji if has_emoji_presentation else narrow_emoji).update(chars) 150 151 152def parse_keycap_sequence(spec: str) -> None: 153 base, fe0f, cc = list(filter(None, spec.split())) 154 chars = parse_range_spec(base) 155 all_emoji.update(chars) 156 emoji_presentation_bases.update(chars) 157 narrow_emoji.update(chars) 158 159 160def parse_flag_emoji_sequence(spec: str) -> None: 161 a, b = list(filter(None, spec.split())) 162 left, right = int(a, 16), int(b, 16) 163 chars = {left, right} 164 all_emoji.update(chars) 165 wide_emoji.update(chars) 166 emoji_presentation_bases.update(chars) 167 flags.setdefault(left, []).append(right) 168 169 170def parse_emoji_tag_sequence(spec: str) -> None: 171 a = int(spec.split()[0], 16) 172 all_emoji.add(a) 173 wide_emoji.add(a) 174 emoji_presentation_bases.add(a) 175 176 177def parse_emoji_modifier_sequence(spec: str) -> None: 178 a, b = list(filter(None, spec.split())) 179 char, mod = int(a, 16), int(b, 16) 180 mod 181 all_emoji.add(char) 182 wide_emoji.add(char) 183 emoji_presentation_bases.add(char) 184 185 186def parse_emoji() -> None: 187 for line in get_data('emoji-sequences.txt', 'emoji'): 188 parts = [x.strip() for x in line.split(';')] 189 if len(parts) < 2: 190 continue 191 data, etype = parts[:2] 192 if etype == 'Basic_Emoji': 193 parse_basic_emoji(data) 194 elif etype == 'Emoji_Keycap_Sequence': 195 parse_keycap_sequence(data) 196 elif etype == 'RGI_Emoji_Flag_Sequence': 197 parse_flag_emoji_sequence(data) 198 elif etype == 'RGI_Emoji_Tag_Sequence': 199 parse_emoji_tag_sequence(data) 200 elif etype == 'RGI_Emoji_Modifier_Sequence': 201 parse_emoji_modifier_sequence(data) 202 203 204doublewidth: Set[int] = set() 205ambiguous: Set[int] = set() 206 207 208def parse_eaw() -> None: 209 global doublewidth, ambiguous 210 seen: Set[int] = set() 211 for line in get_data('ucd/EastAsianWidth.txt'): 212 chars, eaw = split_two(line) 213 if eaw == 'A': 214 ambiguous |= chars 215 seen |= chars 216 elif eaw == 'W' or eaw == 'F': 217 doublewidth |= chars 218 seen |= chars 219 doublewidth |= set(range(0x3400, 0x4DBF + 1)) - seen 220 doublewidth |= set(range(0x4E00, 0x9FFF + 1)) - seen 221 doublewidth |= set(range(0xF900, 0xFAFF + 1)) - seen 222 doublewidth |= set(range(0x20000, 0x2FFFD + 1)) - seen 223 doublewidth |= set(range(0x30000, 0x3FFFD + 1)) - seen 224 225 226def get_ranges(items: List[int]) -> Generator[Union[int, Tuple[int, int]], None, None]: 227 items.sort() 228 for k, g in groupby(enumerate(items), lambda m: m[0]-m[1]): 229 group = tuple(map(itemgetter(1), g)) 230 a, b = group[0], group[-1] 231 if a == b: 232 yield a 233 else: 234 yield a, b 235 236 237def write_case(spec: Union[Tuple, int], p: Callable) -> None: 238 if isinstance(spec, tuple): 239 p('\t\tcase 0x{:x} ... 0x{:x}:'.format(*spec)) 240 else: 241 p('\t\tcase 0x{:x}:'.format(spec)) 242 243 244@contextmanager 245def create_header(path: str, include_data_types: bool = True) -> Generator[Callable, None, None]: 246 with open(path, 'w') as f: 247 p = partial(print, file=f) 248 p('// unicode data, built from the unicode standard on:', date.today()) 249 p('// see gen-wcwidth.py') 250 if path.endswith('.h'): 251 p('#pragma once') 252 if include_data_types: 253 p('#include "data-types.h"\n') 254 p('START_ALLOW_CASE_RANGE') 255 p() 256 yield p 257 p() 258 if include_data_types: 259 p('END_ALLOW_CASE_RANGE') 260 261 262def gen_emoji() -> None: 263 with create_header('kitty/emoji.h') as p: 264 p('static inline bool\nis_emoji(char_type code) {') 265 p('\tswitch(code) {') 266 for spec in get_ranges(list(all_emoji)): 267 write_case(spec, p) 268 p('\t\t\treturn true;') 269 p('\t\tdefault: return false;') 270 p('\t}') 271 p('\treturn false;\n}') 272 273 p('static inline bool\nis_symbol(char_type code) {') 274 p('\tswitch(code) {') 275 for spec in get_ranges(list(all_symbols)): 276 write_case(spec, p) 277 p('\t\t\treturn true;') 278 p('\t\tdefault: return false;') 279 p('\t}') 280 p('\treturn false;\n}') 281 282 283def category_test( 284 name: str, 285 p: Callable, 286 classes: Iterable[str], 287 comment: str, 288 use_static: bool = False, 289 extra_chars: Union[FrozenSet[int], Set[int]] = frozenset(), 290 exclude: Union[Set[int], FrozenSet[int]] = frozenset(), 291 least_check_return: Optional[str] = None, 292 ascii_range: Optional[str] = None 293) -> None: 294 static = 'static inline ' if use_static else '' 295 chars: Set[int] = set() 296 for c in classes: 297 chars |= class_maps[c] 298 chars |= extra_chars 299 chars -= exclude 300 p(f'{static}bool\n{name}(char_type code) {{') 301 p(f'\t// {comment} ({len(chars)} codepoints)' + ' {{' '{') 302 if least_check_return is not None: 303 least = min(chars) 304 p(f'\tif (LIKELY(code < {least})) return {least_check_return};') 305 if ascii_range is not None: 306 p(f'\tif (LIKELY(0x20 <= code && code <= 0x7e)) return {ascii_range};') 307 p('\tswitch(code) {') 308 for spec in get_ranges(list(chars)): 309 write_case(spec, p) 310 p('\t\t\treturn true;') 311 p('\t} // }}}\n') 312 p('\treturn false;\n}\n') 313 314 315def codepoint_to_mark_map(p: Callable, mark_map: List[int]) -> Dict[int, int]: 316 p('\tswitch(c) { // {{{') 317 rmap = {c: m for m, c in enumerate(mark_map)} 318 for spec in get_ranges(mark_map): 319 if isinstance(spec, tuple): 320 s = rmap[spec[0]] 321 cases = ' '.join(f'case {i}:' for i in range(spec[0], spec[1]+1)) 322 p(f'\t\t{cases} return {s} + c - {spec[0]};') 323 else: 324 p(f'\t\tcase {spec}: return {rmap[spec]};') 325 p('default: return 0;') 326 p('\t} // }}}') 327 return rmap 328 329 330def classes_to_regex(classes: Iterable[str], exclude: str = '') -> Iterable[str]: 331 chars: Set[int] = set() 332 for c in classes: 333 chars |= class_maps[c] 334 for x in map(ord, exclude): 335 chars.discard(x) 336 337 def as_string(codepoint: int) -> str: 338 if codepoint < 256: 339 return r'\x{:02x}'.format(codepoint) 340 if codepoint <= 0xffff: 341 return r'\u{:04x}'.format(codepoint) 342 return r'\U{:08x}'.format(codepoint) 343 344 for spec in get_ranges(list(chars)): 345 if isinstance(spec, tuple): 346 yield '{}-{}'.format(*map(as_string, (spec[0], spec[1]))) 347 else: 348 yield as_string(spec) 349 350 351def gen_ucd() -> None: 352 cz = {c for c in class_maps if c[0] in 'CZ'} 353 with create_header('kitty/unicode-data.c') as p: 354 p('#include "unicode-data.h"') 355 category_test( 356 'is_combining_char', p, 357 {c for c in class_maps if c.startswith('M')}, 358 'M category (marks)', 359 # See https://github.com/harfbuzz/harfbuzz/issues/169 360 extra_chars=emoji_skin_tone_modifiers | {zwj}, 361 least_check_return='false' 362 ) 363 category_test( 364 'is_ignored_char', p, 'Cc Cf Cs'.split(), 365 'Control characters and non-characters', 366 extra_chars=non_characters, exclude={zwj}, 367 ascii_range='false' 368 ) 369 category_test('is_word_char', p, {c for c in class_maps if c[0] in 'LN'}, 'L and N categories') 370 category_test('is_CZ_category', p, cz, 'C and Z categories') 371 category_test('is_P_category', p, {c for c in class_maps if c[0] == 'P'}, 'P category (punctuation)') 372 mark_map = [0] + list(sorted(marks)) 373 p('char_type codepoint_for_mark(combining_type m) {') 374 p(f'\tstatic char_type map[{len(mark_map)}] =', '{', ', '.join(map(str, mark_map)), '}; // {{{ mapping }}}') 375 p('\tif (m < arraysz(map)) return map[m];') 376 p('\treturn 0;') 377 p('}\n') 378 p('combining_type mark_for_codepoint(char_type c) {') 379 rmap = codepoint_to_mark_map(p, mark_map) 380 p('}\n') 381 with open('kitty/unicode-data.h') as f: 382 unicode_data = f.read() 383 m = re.search(r'^#define VS15 (\d+)', unicode_data, re.M) 384 if m is not None: 385 expected = int(m.group(1)) 386 if rmap[0xfe0e] != expected: 387 raise ValueError('The mark for 0xfe0e has changed, you have to update VS15 to {} and VS16 to {} in unicode-data.h'.format( 388 rmap[0xfe0e], rmap[0xfe0f] 389 )) 390 with open('kittens/hints/url_regex.py', 'w') as f: 391 f.write('# generated by gen-wcwidth.py, do not edit\n\n') 392 f.write("url_delimiters = '{}' # noqa".format(''.join(classes_to_regex(cz, exclude='\n\r')))) 393 394 395def gen_names() -> None: 396 with create_header('kittens/unicode_input/names.h') as p: 397 mark_to_cp = list(sorted(name_map)) 398 cp_to_mark = {cp: m for m, cp in enumerate(mark_to_cp)} 399 # Mapping of mark to codepoint name 400 p(f'static const char* name_map[{len(mark_to_cp)}] = {{' ' // {{{') 401 for cp in mark_to_cp: 402 w = name_map[cp].replace('"', '\\"') 403 p(f'\t"{w}",') 404 p("}; // }}}\n") 405 406 # Mapping of mark to codepoint 407 p(f'static const char_type mark_to_cp[{len(mark_to_cp)}] = {{' ' // {{{') 408 p(', '.join(map(str, mark_to_cp))) 409 p('}; // }}}\n') 410 411 # Function to get mark number for codepoint 412 p('static char_type mark_for_codepoint(char_type c) {') 413 codepoint_to_mark_map(p, mark_to_cp) 414 p('}\n') 415 p('static inline const char* name_for_codepoint(char_type cp) {') 416 p('\tchar_type m = mark_for_codepoint(cp); if (m == 0) return NULL;') 417 p('\treturn name_map[m];') 418 p('}\n') 419 420 # Array of all words 421 word_map = tuple(sorted(word_search_map)) 422 word_rmap = {w: i for i, w in enumerate(word_map)} 423 p(f'static const char* all_words_map[{len(word_map)}] = {{' ' // {{{') 424 cwords = (w.replace('"', '\\"') for w in word_map) 425 p(', '.join(f'"{w}"' for w in cwords)) 426 p('}; // }}}\n') 427 428 # Array of sets of marks for each word 429 word_to_marks = {word_rmap[w]: frozenset(map(cp_to_mark.__getitem__, cps)) for w, cps in word_search_map.items()} 430 all_mark_groups = frozenset(word_to_marks.values()) 431 array = [0] 432 mg_to_offset = {} 433 for mg in all_mark_groups: 434 mg_to_offset[mg] = len(array) 435 array.append(len(mg)) 436 array.extend(sorted(mg)) 437 p(f'static const char_type mark_groups[{len(array)}] = {{' ' // {{{') 438 p(', '.join(map(str, array))) 439 p('}; // }}}\n') 440 offsets_array = [] 441 for wi, w in enumerate(word_map): 442 mg = word_to_marks[wi] 443 offsets_array.append(mg_to_offset[mg]) 444 p(f'static const char_type mark_to_offset[{len(offsets_array)}] = {{' ' // {{{') 445 p(', '.join(map(str, offsets_array))) 446 p('}; // }}}\n') 447 448 # The trie 449 p('typedef struct { uint32_t children_offset; uint32_t match_offset; } word_trie;\n') 450 all_trie_nodes: List['TrieNode'] = [] # noqa 451 452 class TrieNode: 453 454 def __init__(self) -> None: 455 self.match_offset = 0 456 self.children_offset = 0 457 self.children: Dict[int, int] = {} 458 459 def add_letter(self, letter: int) -> int: 460 if letter not in self.children: 461 self.children[letter] = len(all_trie_nodes) 462 all_trie_nodes.append(TrieNode()) 463 return self.children[letter] 464 465 def __str__(self) -> str: 466 return f'{{ .children_offset={self.children_offset}, .match_offset={self.match_offset} }}' 467 468 root = TrieNode() 469 all_trie_nodes.append(root) 470 471 def add_word(word_idx: int, word: str) -> None: 472 parent = root 473 for letter in map(ord, word): 474 idx = parent.add_letter(letter) 475 parent = all_trie_nodes[idx] 476 parent.match_offset = offsets_array[word_idx] 477 478 for i, word in enumerate(word_map): 479 add_word(i, word) 480 children_array = [0] 481 for node in all_trie_nodes: 482 if node.children: 483 node.children_offset = len(children_array) 484 children_array.append(len(node.children)) 485 for letter, child_offset in node.children.items(): 486 children_array.append((child_offset << 8) | (letter & 0xff)) 487 488 p(f'static const word_trie all_trie_nodes[{len(all_trie_nodes)}] = {{' ' // {{{') 489 p(',\n'.join(map(str, all_trie_nodes))) 490 p('\n}; // }}}\n') 491 p(f'static const uint32_t children_array[{len(children_array)}] = {{' ' // {{{') 492 p(', '.join(map(str, children_array))) 493 p('}; // }}}\n') 494 495 496def gen_wcwidth() -> None: 497 seen: Set[int] = set() 498 499 def add(p: Callable, comment: str, chars_: Union[Set[int], FrozenSet[int]], ret: int) -> None: 500 chars = chars_ - seen 501 seen.update(chars) 502 p(f'\t\t// {comment} ({len(chars)} codepoints)' + ' {{' '{') 503 for spec in get_ranges(list(chars)): 504 write_case(spec, p) 505 p(f'\t\t\treturn {ret};') 506 p('\t\t// }}}\n') 507 508 with create_header('kitty/wcwidth-std.h') as p: 509 p('static inline int\nwcwidth_std(int32_t code) {') 510 p('\tif (LIKELY(0x20 <= code && code <= 0x7e)) return 1;') 511 p('\tswitch(code) {') 512 513 non_printing = class_maps['Cc'] | class_maps['Cf'] | class_maps['Cs'] 514 add(p, 'Flags', flag_codepoints, 2) 515 add(p, 'Marks', marks | {0}, 0) 516 add(p, 'Non-printing characters', non_printing, -1) 517 add(p, 'Private use', class_maps['Co'], -3) 518 add(p, 'Text Presentation', narrow_emoji, 1) 519 add(p, 'East Asian ambiguous width', ambiguous, -2) 520 add(p, 'East Asian double width', doublewidth, 2) 521 add(p, 'Emoji Presentation', wide_emoji, 2) 522 523 add(p, 'Not assigned in the unicode character database', not_assigned, -4) 524 525 p('\t\tdefault: return 1;') 526 p('\t}') 527 p('\treturn 1;\n}') 528 529 p('static inline bool\nis_emoji_presentation_base(uint32_t code) {') 530 p('\tswitch(code) {') 531 for spec in get_ranges(list(emoji_presentation_bases)): 532 write_case(spec, p) 533 p('\t\t\treturn true;') 534 p('\t\tdefault: return false;') 535 p('\t}') 536 p('\treturn 1;\n}') 537 538 539parse_ucd() 540parse_emoji() 541parse_eaw() 542gen_ucd() 543gen_wcwidth() 544gen_emoji() 545gen_names() 546