1#!/usr/bin/env python3 2 3import argparse 4 5from dataclasses import dataclass 6 7parser = argparse.ArgumentParser() 8parser.add_argument('input', type=argparse.FileType('r')) 9parser.add_argument('output', type=argparse.FileType('w')) 10opts = parser.parse_args() 11 12 13@dataclass 14class Emoji: 15 codepoint: int 16 count: int = 1 17 emoji_presentation: bool = False 18 modifier: bool = False 19 modifier_base: bool = False 20 component: bool = False 21 pictographic: bool = False 22 23 24emojis = {} 25 26 27for line in opts.input: 28 line = line.rstrip() 29 if not line: 30 continue 31 if line[0] == '#': 32 continue 33 34 codepoint_or_range, props_and_trash = line.split(';', maxsplit=1) 35 props = props_and_trash.split('#', maxsplit=1)[0].strip() 36 37 codepoint_range = tuple( 38 map(lambda s: int(s.strip(), 16), codepoint_or_range.split('..'))) 39 assert len(codepoint_range) in [1, 2] 40 41 if len(codepoint_range) == 1: 42 codepoint_range = codepoint_range[0], codepoint_range[0] 43 44 for cp in range(codepoint_range[0], codepoint_range[1] + 1): 45 assert cp < (1 << 24), f'codepoint is outside range: 0x{cp:x}' 46 47 if props == 'Emoji': 48 for cp in range(codepoint_range[0], codepoint_range[1] + 1): 49 assert cp not in emojis 50 emojis[cp] = Emoji(codepoint=cp) 51 52 elif props == 'Emoji_Presentation': 53 for cp in range(codepoint_range[0], codepoint_range[1] + 1): 54 try: 55 emojis[cp].emoji_presentation = True 56 except KeyError: 57 pass 58 59 elif props == 'Emoji_Modifier': 60 for cp in range(codepoint_range[0], codepoint_range[1] + 1): 61 try: 62 emojis[cp].modifier = True 63 except KeyError: 64 pass 65 66 elif props == 'Emoji_Modifier_Base': 67 for cp in range(codepoint_range[0], codepoint_range[1] + 1): 68 try: 69 emojis[cp].modifier_base = True 70 except KeyError: 71 pass 72 73 elif props == 'Emoji_Component': 74 for cp in range(codepoint_range[0], codepoint_range[1] + 1): 75 try: 76 emojis[cp].component = True 77 except KeyError: 78 pass 79 80 elif props == 'Extended_Pictographic': 81 for cp in range(codepoint_range[0], codepoint_range[1] + 1): 82 try: 83 emojis[cp].pictographic = True 84 except KeyError: 85 pass 86 87# for emoji in emojis.values(): 88# print(f'{emoji.codepoint:x} ({chr(emoji.codepoint)}): ' 89# f'presentation={"emoji" if emoji.emoji_presentation else "text"}, ' 90# f'component={emoji.component}, ' 91# f'pictographic={emoji.pictographic}') 92# print(f'{len(emojis)} codepoints') 93# sys.exit(0) 94 95# Compact the list, by creating ranges of consecutive codepoints with 96# identical properties 97 98compacted = [] 99last = None 100for emoji in sorted(emojis.values(), key=lambda x: x.codepoint): 101 if last is None: 102 last = emoji 103 compacted.append(last) 104 continue 105 106 if emoji.codepoint == last.codepoint + last.count and \ 107 emoji.emoji_presentation == last.emoji_presentation and \ 108 emoji.modifier == last.modifier and \ 109 emoji.modifier_base == last.modifier_base and \ 110 emoji.component == last.component and \ 111 emoji.pictographic == last.pictographic: 112 last.count += 1 113 else: 114 last = emoji 115 compacted.append(last) 116 117print(f'compacted from {len(emojis)} to {len(compacted)} entries') 118 119opts.output.write('#pragma once\n') 120opts.output.write('#include <stdint.h>\n') 121opts.output.write('#include <stdbool.h>\n') 122opts.output.write('\n') 123opts.output.write('struct emoji {\n') 124opts.output.write(' bool emoji_presentation:1;\n') 125opts.output.write(' bool modifier:1;\n') 126opts.output.write(' bool modifier_base:1;\n') 127opts.output.write(' bool component:1;\n') 128opts.output.write(' bool pictographic:1;\n') 129opts.output.write(' uint32_t cp:24;\n') 130opts.output.write(' uint8_t count;\n') 131opts.output.write('} __attribute__((packed));\n') 132opts.output.write('_Static_assert(sizeof(struct emoji) == 5, "unexpected struct size");\n') 133opts.output.write('\n') 134opts.output.write('#if defined(FCFT_HAVE_HARFBUZZ)\n') 135opts.output.write('\n') 136opts.output.write(f'static const struct emoji emojis[{len(compacted)}] = {{\n') 137 138for emoji in compacted: 139 assert emoji.count < 256 140 opts.output.write(' {\n') 141 opts.output.write(f' .emoji_presentation = {"true" if emoji.emoji_presentation else "false"},\n') 142 opts.output.write(f' .modifier = {"true" if emoji.modifier else "false"},\n') 143 opts.output.write(f' .modifier_base = {"true" if emoji.modifier_base else "false"},\n') 144 opts.output.write(f' .component = {"true" if emoji.component else "false"},\n') 145 opts.output.write(f' .pictographic = {"true" if emoji.pictographic else "false"},\n') 146 opts.output.write(f' .cp = 0x{emoji.codepoint:05x},\n') 147 opts.output.write(f' .count = {emoji.count},\n') 148 opts.output.write(' },\n') 149 150opts.output.write('};\n') 151 152opts.output.write('#else /* !FCFT_HAVE_HARFBUZZ */\n') 153opts.output.write('static const struct emoji emojis[0];\n') 154opts.output.write('#endif /* !FCFT_HAVE_HARFBUZZ */\n') 155