1#!/usr/local/bin/python3.8
2# vim:fileencoding=utf-8
3# License: GPL v3 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>
4
5import os
6import re
7import sys
8from collections import defaultdict
9from contextlib import contextmanager
10from datetime import date
11from functools import partial
12from html.entities import html5
13from itertools import groupby
14from operator import itemgetter
15from typing import (
16    Callable, DefaultDict, Dict, FrozenSet, Generator, Iterable, List,
17    Optional, Set, Tuple, Union
18)
19from urllib.request import urlopen
20
# Work from the directory containing this script so that relative data and
# output paths (kitty/, kittens/, nerd-fonts-glyphs.txt) resolve correctly.
os.chdir(os.path.dirname(os.path.abspath(__file__)))

# The 66 Unicode noncharacters: the last two codepoints of every plane
# (U+FFFE/U+FFFF, U+1FFFE/U+1FFFF, ...) plus the U+FDD0..U+FDEF block.
non_characters = frozenset(range(0xfffe, 0x10ffff, 0x10000))
non_characters |= frozenset(range(0xffff, 0x10ffff + 1, 0x10000))
non_characters |= frozenset(range(0xfdd0, 0xfdf0))
if len(non_characters) != 66:
    raise SystemExit('non_characters table incorrect')
# Fitzpatrick skin tone modifiers U+1F3FB..U+1F3FF
emoji_skin_tone_modifiers = frozenset(range(0x1f3fb, 0x1F3FF + 1))
29
30
def get_data(fname: str, folder: str = 'UCD') -> Iterable[str]:
    """Fetch a Unicode data file, caching the raw bytes in /tmp, and yield
    its stripped, non-empty, non-comment lines."""
    url = f'https://www.unicode.org/Public/{folder}/latest/{fname}'
    cache_path = os.path.join('/tmp', os.path.basename(url))
    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as f:
            raw = f.read()
    else:
        raw = urlopen(url).read()
        with open(cache_path, 'wb') as f:
            f.write(raw)
    for raw_line in raw.decode('utf-8').splitlines():
        stripped = raw_line.strip()
        if stripped and not stripped.startswith('#'):
            yield stripped
46
47
# Map of class names to set of codepoints in class
class_maps: Dict[str, Set[int]] = {}
# All codepoints in the symbol (S*) general categories
all_symbols: Set[int] = set()
# Codepoint -> its (upper-case) character name
name_map: Dict[int, str] = {}
# Lower-cased search word -> codepoints whose names contain that word
word_search_map: DefaultDict[str, Set[int]] = defaultdict(set)
zwj = 0x200d  # ZERO WIDTH JOINER
# Regional indicator symbols A-Z; pairs of these form flag emoji
flag_codepoints = frozenset(range(0x1F1E6, 0x1F1E6 + 26))
# Codepoints treated as combining marks; parse_ucd() adds the M* categories
marks = set(emoji_skin_tone_modifiers) | {zwj} | flag_codepoints
# Starts as every codepoint; parse_ucd() discards the assigned ones
not_assigned = set(range(0, sys.maxunicode))
58
def parse_ucd() -> None:
    """Populate name_map, word_search_map, class_maps, marks, all_symbols
    and not_assigned from UnicodeData.txt, the HTML5 entity table and the
    nerd-fonts glyph list."""

    def add_word(w: str, c: int) -> None:
        # Index codepoint c under search word w; skip control characters
        # and one-letter words (too noisy to be useful for searching).
        if c <= 32 or c == 127 or 128 <= c <= 159:
            return
        if len(w) > 1:
            word_search_map[w.lower()].add(c)

    first: Optional[int] = None
    # HTML5 entity names (e.g. "amp;") that expand to a single character
    # become search words for that character.
    for word, c in html5.items():
        if len(c) == 1:
            add_word(word.rstrip(';'), ord(c))
    word_search_map['nnbsp'].add(0x202f)
    for line in get_data('ucd/UnicodeData.txt'):
        parts = [x.strip() for x in line.split(';')]
        codepoint = int(parts[0], 16)
        # Field 1 is the character name; field 10 is the Unicode 1.0 name,
        # used as a fallback (e.g. for control characters).
        name = parts[1] or parts[10]
        if name == '<control>':
            name = parts[10]
        if name:
            name_map[codepoint] = name
            for word in name.lower().split():
                add_word(word, codepoint)
        category = parts[2]
        s = class_maps.setdefault(category, set())
        desc = parts[1]
        codepoints: Union[Tuple[int, ...], Iterable[int]] = (codepoint,)
        # Large blocks are encoded as a "<..., First>" line followed by a
        # "<..., Last>" line; expand the pair into the full range.
        if first is None:
            if desc.endswith(', First>'):
                first = codepoint
                continue
        else:
            codepoints = range(first, codepoint + 1)
            first = None
        for codepoint in codepoints:
            s.add(codepoint)
            not_assigned.discard(codepoint)
            if category.startswith('M'):
                # Mark categories (Mn, Mc, Me) are combining characters
                marks.add(codepoint)
            elif category.startswith('S'):
                # Symbol categories (Sm, Sc, Sk, So)
                all_symbols.add(codepoint)

    # Nerd-fonts glyphs live in the Private Use Area and have no UCD names;
    # give them names and search words of their own.
    with open('nerd-fonts-glyphs.txt') as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            code, category, name = line.split(' ', 2)
            codepoint = int(code, 16)
            if name and codepoint not in name_map:
                name_map[codepoint] = name.upper()
                for word in name.lower().split():
                    add_word(word, codepoint)

    # Some common synonyms
    word_search_map['bee'] |= word_search_map['honeybee']
    word_search_map['lambda'] |= word_search_map['lamda']
    word_search_map['lamda'] |= word_search_map['lambda']
    word_search_map['diamond'] |= word_search_map['gem']
118
119
def parse_range_spec(spec: str) -> Set[int]:
    """Expand a UCD range spec such as '0600..0605', or a lone codepoint
    such as '06DD', into the set of codepoints it covers."""
    spec = spec.strip()
    if '..' not in spec:
        return {int(spec, 16)}
    bounds = [int(part, 16) for part in spec.split('.') if part]
    return set(range(bounds[0], bounds[1] + 1))
128
129
def split_two(line: str) -> Tuple[Set[int], str]:
    """Split a UCD line 'range ; property ...' into the expanded codepoint
    set and the first token of the property field."""
    range_part, properties = line.split(';', 1)
    first_token = properties.strip().split(' ', 1)[0].strip()
    return parse_range_spec(range_part.strip()), first_token
134
135
# Accumulated by the parse_*_emoji*/sequence helpers below
all_emoji: Set[int] = set()  # every codepoint that participates in an emoji sequence
emoji_presentation_bases: Set[int] = set()  # codepoints that can begin an emoji presentation sequence
narrow_emoji: Set[int] = set()  # emoji with default text (one cell) presentation
wide_emoji: Set[int] = set()  # emoji with default emoji (two cell) presentation
flags: Dict[int, List[int]] = {}  # first regional indicator -> valid second indicators
141
142
def parse_basic_emoji(spec: str) -> None:
    """Record a Basic_Emoji entry: a bare codepoint/range defaults to emoji
    (wide) presentation; a trailing FE0F token means text (narrow)."""
    tokens = spec.split()
    chars = parse_range_spec(tokens[0])
    all_emoji.update(chars)
    emoji_presentation_bases.update(chars)
    if len(tokens) < 2:
        wide_emoji.update(chars)
    else:
        narrow_emoji.update(chars)
150
151
def parse_keycap_sequence(spec: str) -> None:
    """Record an Emoji_Keycap_Sequence (base + FE0F + U+20E3); only the base
    character matters for width purposes, and keycaps are narrow."""
    base_spec, _fe0f, _combining_enclosing = spec.split()
    chars = parse_range_spec(base_spec)
    all_emoji.update(chars)
    emoji_presentation_bases.update(chars)
    narrow_emoji.update(chars)
158
159
def parse_flag_emoji_sequence(spec: str) -> None:
    """Record an RGI_Emoji_Flag_Sequence: a pair of regional indicators
    forming a national flag. Both halves are wide emoji."""
    first_spec, second_spec = spec.split()
    first, second = int(first_spec, 16), int(second_spec, 16)
    pair = {first, second}
    all_emoji.update(pair)
    wide_emoji.update(pair)
    emoji_presentation_bases.update(pair)
    flags.setdefault(first, []).append(second)
168
169
def parse_emoji_tag_sequence(spec: str) -> None:
    """Record an RGI_Emoji_Tag_Sequence; only the leading base codepoint is
    relevant, and it is a wide emoji."""
    base = int(spec.split()[0], 16)
    all_emoji.add(base)
    wide_emoji.add(base)
    emoji_presentation_bases.add(base)
175
176
def parse_emoji_modifier_sequence(spec: str) -> None:
    """Record an RGI_Emoji_Modifier_Sequence (base char + skin tone modifier).

    Only the base character is recorded here; the modifiers themselves are
    already covered by emoji_skin_tone_modifiers.
    """
    # The original code bound the modifier and then had a bare `mod`
    # expression statement (dead code, presumably to placate a linter);
    # use an underscore name instead.
    base_spec, _modifier = spec.split()
    char = int(base_spec, 16)
    all_emoji.add(char)
    wide_emoji.add(char)
    emoji_presentation_bases.add(char)
184
185
def parse_emoji() -> None:
    """Parse emoji-sequences.txt, dispatching each line on its sequence
    type; lines with unknown types are skipped."""
    handlers = {
        'Basic_Emoji': parse_basic_emoji,
        'Emoji_Keycap_Sequence': parse_keycap_sequence,
        'RGI_Emoji_Flag_Sequence': parse_flag_emoji_sequence,
        'RGI_Emoji_Tag_Sequence': parse_emoji_tag_sequence,
        'RGI_Emoji_Modifier_Sequence': parse_emoji_modifier_sequence,
    }
    for line in get_data('emoji-sequences.txt', 'emoji'):
        fields = [x.strip() for x in line.split(';')]
        if len(fields) < 2:
            continue
        handler = handlers.get(fields[1])
        if handler is not None:
            handler(fields[0])
202
203
# East Asian width classes, filled in by parse_eaw()
doublewidth: Set[int] = set()  # Wide (W) and Fullwidth (F) codepoints
ambiguous: Set[int] = set()  # Ambiguous (A) codepoints
206
207
def parse_eaw() -> None:
    """Populate doublewidth and ambiguous from EastAsianWidth.txt, applying
    the default-wide CJK ranges for codepoints the file does not list."""
    global doublewidth, ambiguous
    seen: Set[int] = set()
    for line in get_data('ucd/EastAsianWidth.txt'):
        chars, eaw = split_two(line)
        if eaw == 'A':
            ambiguous |= chars
            seen |= chars
        elif eaw in ('W', 'F'):
            doublewidth |= chars
            seen |= chars
    # Unlisted codepoints in these blocks default to wide
    default_wide = (
        (0x3400, 0x4DBF),    # CJK Unified Ideographs Extension A
        (0x4E00, 0x9FFF),    # CJK Unified Ideographs
        (0xF900, 0xFAFF),    # CJK Compatibility Ideographs
        (0x20000, 0x2FFFD),  # Plane 2
        (0x30000, 0x3FFFD),  # Plane 3
    )
    for start, end in default_wide:
        doublewidth |= set(range(start, end + 1)) - seen
224
225
def get_ranges(items: List[int]) -> Generator[Union[int, Tuple[int, int]], None, None]:
    """Yield the values of items (sorted in place) compressed into runs: a
    bare int for an isolated value, an inclusive (start, end) pair for a
    run of consecutive values."""
    items.sort()
    # Consecutive values share a constant index-minus-value, so grouping on
    # that difference collects each maximal run.
    for _, run in groupby(enumerate(items), lambda pair: pair[0] - pair[1]):
        values = [value for _, value in run]
        if values[0] == values[-1]:
            yield values[0]
        else:
            yield values[0], values[-1]
235
236
def write_case(spec: Union[Tuple, int], p: Callable) -> None:
    """Emit a C case label for one codepoint, or a case range for an
    inclusive (start, end) pair (GCC/clang case-range extension)."""
    if isinstance(spec, tuple):
        low, high = spec
        p(f'\t\tcase 0x{low:x} ... 0x{high:x}:')
    else:
        p(f'\t\tcase 0x{spec:x}:')
242
243
@contextmanager
def create_header(path: str, include_data_types: bool = True) -> Generator[Callable, None, None]:
    """Context manager for a generated C file: opens path, writes the
    standard preamble, yields a print-to-file callable, then writes the
    postamble on exit."""
    with open(path, 'w') as f:
        emit = partial(print, file=f)
        emit('// unicode data, built from the unicode standard on:', date.today())
        emit('// see gen-wcwidth.py')
        if path.endswith('.h'):
            emit('#pragma once')
        if include_data_types:
            emit('#include "data-types.h"\n')
            emit('START_ALLOW_CASE_RANGE')
        emit()
        yield emit
        emit()
        if include_data_types:
            emit('END_ALLOW_CASE_RANGE')
260
261
def gen_emoji() -> None:
    """Generate kitty/emoji.h containing the is_emoji() and is_symbol()
    codepoint predicates."""
    def emit_predicate(p: Callable, fname: str, chars: Set[int]) -> None:
        # Emit a switch-based bool function returning true for chars
        p(f'static inline bool\n{fname}(char_type code) {{')
        p('\tswitch(code) {')
        for spec in get_ranges(list(chars)):
            write_case(spec, p)
            p('\t\t\treturn true;')
        p('\t\tdefault: return false;')
        p('\t}')
        p('\treturn false;\n}')

    with create_header('kitty/emoji.h') as p:
        emit_predicate(p, 'is_emoji', all_emoji)
        emit_predicate(p, 'is_symbol', all_symbols)
281
282
def category_test(
    name: str,
    p: Callable,
    classes: Iterable[str],
    comment: str,
    use_static: bool = False,
    extra_chars: Union[FrozenSet[int], Set[int]] = frozenset(),
    exclude: Union[Set[int], FrozenSet[int]] = frozenset(),
    least_check_return: Optional[str] = None,
    ascii_range: Optional[str] = None
) -> None:
    """Emit a C predicate `name` that is true for the codepoints in the
    given general categories, plus extra_chars, minus exclude.

    least_check_return: if given, short-circuit codepoints below the
    smallest member with this return value.
    ascii_range: if given, short-circuit printable ASCII with this value.
    """
    members: Set[int] = set()
    for cls in classes:
        members |= class_maps[cls]
    members = (members | extra_chars) - exclude
    qualifier = 'static inline ' if use_static else ''
    p(f'{qualifier}bool\n{name}(char_type code) {{')
    p(f'\t// {comment} ({len(members)} codepoints)' + ' {{' '{')
    if least_check_return is not None:
        smallest = min(members)
        p(f'\tif (LIKELY(code < {smallest})) return {least_check_return};')
    if ascii_range is not None:
        p(f'\tif (LIKELY(0x20 <= code && code <= 0x7e)) return {ascii_range};')
    p('\tswitch(code) {')
    for spec in get_ranges(list(members)):
        write_case(spec, p)
        p('\t\t\treturn true;')
    p('\t} // }}}\n')
    p('\treturn false;\n}\n')
313
314
def codepoint_to_mark_map(p: Callable, mark_map: List[int]) -> Dict[int, int]:
    """Emit the body of a C switch mapping codepoint -> mark index and
    return that mapping as a dict (unlisted codepoints map to 0)."""
    p('\tswitch(c) { // {{{')
    reverse = {cp: idx for idx, cp in enumerate(mark_map)}
    for spec in get_ranges(mark_map):
        if isinstance(spec, tuple):
            start, end = spec
            # A run of codepoints maps to a run of marks; emit all the
            # labels on one line with an arithmetic return.
            labels = ' '.join(f'case {c}:' for c in range(start, end + 1))
            p(f'\t\t{labels} return {reverse[start]} + c - {start};')
        else:
            p(f'\t\tcase {spec}: return {reverse[spec]};')
    p('default: return 0;')
    p('\t} // }}}')
    return reverse
328
329
def classes_to_regex(classes: Iterable[str], exclude: str = '') -> Iterable[str]:
    """Yield regex character-class fragments (escapes and escape ranges)
    covering every codepoint in the given general categories, minus the
    characters in exclude."""
    members: Set[int] = set()
    for cls in classes:
        members |= class_maps[cls]
    members -= set(map(ord, exclude))

    def escaped(cp: int) -> str:
        # Use the shortest escape form able to express the codepoint
        if cp < 256:
            return r'\x{:02x}'.format(cp)
        if cp <= 0xffff:
            return r'\u{:04x}'.format(cp)
        return r'\U{:08x}'.format(cp)

    for spec in get_ranges(list(members)):
        if isinstance(spec, tuple):
            yield f'{escaped(spec[0])}-{escaped(spec[1])}'
        else:
            yield escaped(spec)
349
350
def gen_ucd() -> None:
    """Generate kitty/unicode-data.c (category predicates plus the
    mark <-> codepoint maps) and the url_regex data for the hints kitten.

    Raises SystemExit if the VS15 #define cannot be found in
    kitty/unicode-data.h, and ValueError if its value is stale.
    """
    cz = {c for c in class_maps if c[0] in 'CZ'}
    with create_header('kitty/unicode-data.c') as p:
        p('#include "unicode-data.h"')
        category_test(
                'is_combining_char', p,
                {c for c in class_maps if c.startswith('M')},
                'M category (marks)',
                # See https://github.com/harfbuzz/harfbuzz/issues/169
                extra_chars=emoji_skin_tone_modifiers | {zwj},
                least_check_return='false'
        )
        category_test(
            'is_ignored_char', p, 'Cc Cf Cs'.split(),
            'Control characters and non-characters',
            extra_chars=non_characters, exclude={zwj},
            ascii_range='false'
        )
        category_test('is_word_char', p, {c for c in class_maps if c[0] in 'LN'}, 'L and N categories')
        category_test('is_CZ_category', p, cz, 'C and Z categories')
        category_test('is_P_category', p, {c for c in class_maps if c[0] == 'P'}, 'P category (punctuation)')
        # Index 0 of the mark map is reserved to mean "no mark"
        mark_map = [0] + list(sorted(marks))
        p('char_type codepoint_for_mark(combining_type m) {')
        p(f'\tstatic char_type map[{len(mark_map)}] =', '{', ', '.join(map(str, mark_map)), '}; // {{{ mapping }}}')
        p('\tif (m < arraysz(map)) return map[m];')
        p('\treturn 0;')
        p('}\n')
        p('combining_type mark_for_codepoint(char_type c) {')
        rmap = codepoint_to_mark_map(p, mark_map)
        p('}\n')
        # Sanity check: the hard-coded VS15/VS16 mark numbers in
        # unicode-data.h must agree with the freshly generated mapping.
        with open('kitty/unicode-data.h') as f:
            unicode_data = f.read()
        m = re.search(r'^#define VS15 (\d+)', unicode_data, re.M)
        if m is None:
            # Previously a failed match left `expected` unbound, causing a
            # confusing NameError below; fail with a clear message instead.
            raise SystemExit('Could not find the VS15 #define in kitty/unicode-data.h')
        expected = int(m.group(1))
        if rmap[0xfe0e] != expected:
            raise ValueError('The mark for 0xfe0e has changed, you have to update VS15 to {} and VS16 to {} in unicode-data.h'.format(
                rmap[0xfe0e], rmap[0xfe0f]
            ))
    with open('kittens/hints/url_regex.py', 'w') as f:
        f.write('# generated by gen-wcwidth.py, do not edit\n\n')
        f.write("url_delimiters = '{}'  # noqa".format(''.join(classes_to_regex(cz, exclude='\n\r'))))
393
394
def gen_names() -> None:
    """Generate kittens/unicode_input/names.h: codepoint names, the word
    search index, and a byte trie over all search words."""
    with create_header('kittens/unicode_input/names.h') as p:
        # A "mark" is the index of a codepoint in sorted codepoint order
        mark_to_cp = list(sorted(name_map))
        cp_to_mark = {cp: m for m, cp in enumerate(mark_to_cp)}
        # Mapping of mark to codepoint name
        p(f'static const char* name_map[{len(mark_to_cp)}] = {{' ' // {{{')
        for cp in mark_to_cp:
            w = name_map[cp].replace('"', '\\"')
            p(f'\t"{w}",')
        p("}; // }}}\n")

        # Mapping of mark to codepoint
        p(f'static const char_type mark_to_cp[{len(mark_to_cp)}] = {{' ' // {{{')
        p(', '.join(map(str, mark_to_cp)))
        p('}; // }}}\n')

        # Function to get mark number for codepoint
        p('static char_type mark_for_codepoint(char_type c) {')
        codepoint_to_mark_map(p, mark_to_cp)
        p('}\n')
        p('static inline const char* name_for_codepoint(char_type cp) {')
        p('\tchar_type m = mark_for_codepoint(cp); if (m == 0) return NULL;')
        p('\treturn name_map[m];')
        p('}\n')

        # Array of all words
        word_map = tuple(sorted(word_search_map))
        word_rmap = {w: i for i, w in enumerate(word_map)}
        p(f'static const char* all_words_map[{len(word_map)}] = {{' ' // {{{')
        cwords = (w.replace('"', '\\"') for w in word_map)
        p(', '.join(f'"{w}"' for w in cwords))
        p('}; // }}}\n')

        # Array of sets of marks for each word
        word_to_marks = {word_rmap[w]: frozenset(map(cp_to_mark.__getitem__, cps)) for w, cps in word_search_map.items()}
        all_mark_groups = frozenset(word_to_marks.values())
        # mark_groups is a flat arena: for each distinct group, its length
        # followed by its sorted marks; offset 0 is a "no match" sentinel.
        array = [0]
        mg_to_offset = {}
        for mg in all_mark_groups:
            mg_to_offset[mg] = len(array)
            array.append(len(mg))
            array.extend(sorted(mg))
        p(f'static const char_type mark_groups[{len(array)}] = {{' ' // {{{')
        p(', '.join(map(str, array)))
        p('}; // }}}\n')
        # For each word (by index), the offset of its group in mark_groups
        offsets_array = []
        for wi, w in enumerate(word_map):
            mg = word_to_marks[wi]
            offsets_array.append(mg_to_offset[mg])
        p(f'static const char_type mark_to_offset[{len(offsets_array)}] = {{' ' // {{{')
        p(', '.join(map(str, offsets_array)))
        p('}; // }}}\n')

        # The trie
        p('typedef struct { uint32_t children_offset; uint32_t match_offset; } word_trie;\n')
        all_trie_nodes: List['TrieNode'] = []  # noqa

        class TrieNode:
            # One node of the search-word trie; children maps a character
            # ordinal to the index of the child node in all_trie_nodes.

            def __init__(self) -> None:
                self.match_offset = 0
                self.children_offset = 0
                self.children: Dict[int, int] = {}

            def add_letter(self, letter: int) -> int:
                # Return the child index for letter, creating the child
                # node on first use.
                if letter not in self.children:
                    self.children[letter] = len(all_trie_nodes)
                    all_trie_nodes.append(TrieNode())
                return self.children[letter]

            def __str__(self) -> str:
                # Rendered directly as a C struct initializer
                return f'{{ .children_offset={self.children_offset}, .match_offset={self.match_offset} }}'

        root = TrieNode()
        all_trie_nodes.append(root)

        def add_word(word_idx: int, word: str) -> None:
            # Insert word into the trie; the terminal node records the
            # word's mark-group offset (0 only for non-terminal nodes).
            parent = root
            for letter in map(ord, word):
                idx = parent.add_letter(letter)
                parent = all_trie_nodes[idx]
            parent.match_offset = offsets_array[word_idx]

        for i, word in enumerate(word_map):
            add_word(i, word)
        # children_array is a flat arena: for each node that has children,
        # a count followed by (child_index << 8 | low byte of letter).
        children_array = [0]
        for node in all_trie_nodes:
            if node.children:
                node.children_offset = len(children_array)
                children_array.append(len(node.children))
                for letter, child_offset in node.children.items():
                    children_array.append((child_offset << 8) | (letter & 0xff))

        p(f'static const word_trie all_trie_nodes[{len(all_trie_nodes)}] = {{' ' // {{{')
        p(',\n'.join(map(str, all_trie_nodes)))
        p('\n}; // }}}\n')
        p(f'static const uint32_t children_array[{len(children_array)}] = {{' ' // {{{')
        p(', '.join(map(str, children_array)))
        p('}; // }}}\n')
494
495
def gen_wcwidth() -> None:
    """Generate kitty/wcwidth-std.h with wcwidth_std() and
    is_emoji_presentation_base()."""
    seen: Set[int] = set()

    def add(p: Callable, comment: str, chars_: Union[Set[int], FrozenSet[int]], ret: int) -> None:
        # Emit case labels returning ret for chars_, excluding codepoints
        # already claimed by an earlier add() call: first category wins.
        chars = chars_ - seen
        seen.update(chars)
        p(f'\t\t// {comment} ({len(chars)} codepoints)' + ' {{' '{')
        for spec in get_ranges(list(chars)):
            write_case(spec, p)
            p(f'\t\t\treturn {ret};')
        p('\t\t// }}}\n')

    with create_header('kitty/wcwidth-std.h') as p:
        p('static inline int\nwcwidth_std(int32_t code) {')
        p('\tif (LIKELY(0x20 <= code && code <= 0x7e)) return 1;')
        p('\tswitch(code) {')

        non_printing = class_maps['Cc'] | class_maps['Cf'] | class_maps['Cs']
        # NOTE: the order of these calls determines category precedence via
        # `seen`; negative returns are sentinels interpreted by the C caller.
        add(p, 'Flags', flag_codepoints, 2)
        add(p, 'Marks', marks | {0}, 0)
        add(p, 'Non-printing characters', non_printing, -1)
        add(p, 'Private use', class_maps['Co'], -3)
        add(p, 'Text Presentation', narrow_emoji, 1)
        add(p, 'East Asian ambiguous width', ambiguous, -2)
        add(p, 'East Asian double width', doublewidth, 2)
        add(p, 'Emoji Presentation', wide_emoji, 2)

        add(p, 'Not assigned in the unicode character database', not_assigned, -4)

        p('\t\tdefault: return 1;')
        p('\t}')
        p('\treturn 1;\n}')

        p('static inline bool\nis_emoji_presentation_base(uint32_t code) {')
        p('\tswitch(code) {')
        for spec in get_ranges(list(emoji_presentation_bases)):
            write_case(spec, p)
            p('\t\t\treturn true;')
        p('\t\tdefault: return false;')
        p('\t}')
        p('\treturn 1;\n}')
537
538
# Build everything: the parse_* passes fill the module-level tables from
# the Unicode data files, then the gen_* passes write the generated sources.
parse_ucd()
parse_emoji()
parse_eaw()
gen_ucd()
gen_wcwidth()
gen_emoji()
gen_names()
546