"""Generate bsdconv Unicode tables (width, CCC, NFD/NFKD/NFC, case maps,
block filters) from Unicode Character Database files previously fetched
into tmp/.  Outputs go to modules/inter/, modules/filter/ and tmp/.
"""
import sys
import re
import os


def bsdconv01(dt):
    """Convert a hex code point (e.g. "00C0") to bsdconv's 01-prefixed token.

    Leading zeros are stripped and the hex digits are re-padded to an even
    count so the byte sequence after the "01" (Unicode) type marker stays
    byte-aligned.
    """
    dt = dt.strip().lstrip("0").upper()
    if len(dt) & 1:
        # Odd number of hex digits: pad one zero after the type marker.
        return "010" + dt
    else:
        return "01" + dt


# Output files for the generated tables.
f_ambiguous = open("modules/inter/_AMBIGUOUS.h", "w")
f_width = open("modules/inter/_WIDTH.h", "w")
f_ccc = open("modules/inter/_NF-CCC.h", "w")
f_nfd = open("modules/inter/_NFD.txt", "w")
f_nfkd = open("modules/inter/_NFKD.txt", "w")
f_nfc = open("modules/inter/_NFC-MAP.txt", "w")
f_upper = open("modules/inter/UPPER.txt", "w")
f_lower = open("modules/inter/LOWER.txt", "w")
f_casefold = open("modules/inter/CASEFOLD.txt", "w")
f_cjkvar = open("tmp/cjkvar.txt", "w")

# State for merging consecutive code points with equal canonical combining
# class into one interval of the generated C table.
ccc_start = -1
ccc_end = -1
ccc_value = 0
t_ccc = {}      # bsdconv token -> canonical combining class (nonzero only)
m_nfd = {}      # bsdconv token -> fully expanded canonical decomposition
m_nfd_raw = {}  # bsdconv token -> single-level canonical decomposition
m_nfkd = {}     # bsdconv token -> compatibility decomposition

# tmp/map.txt maps each UCD file name to the URL it was fetched from;
# the URLs are embedded into the generated files for provenance.
m_url = {}
f_map = open("tmp/map.txt")
for l in f_map:
    l = l.strip().split("\t")
    if len(l) == 2:
        m_url[l[0]] = l[1]
f_map.close()

f_ambiguous.write("/* Generated from {url}*/\n".format(url=m_url["EastAsianWidth.txt"]))
f_width.write("/* Generated from {url}*/\n".format(url=m_url["EastAsianWidth.txt"]))
f_ccc.write("/* Generated from {url}*/\n".format(url=m_url["UnicodeData.txt"]))
for f in [f_nfc, f_nfd, f_nfkd, f_upper, f_lower]:
    f.write("Source: {url}\n".format(url=m_url["UnicodeData.txt"]))

f_upper.write("Source: {url}\n".format(url=m_url["SpecialCasing.txt"]))
f_lower.write("Source: {url}\n".format(url=m_url["SpecialCasing.txt"]))
f_casefold.write("Source: {url}\n".format(url=m_url["CaseFolding.txt"]))

f_ccc.write("""
struct ccc_interval {
	int beg;
	int end;
	int ccc;
};

static const struct ccc_interval ccc_table[] = {
""")


def lookup(l, m):
    """Recursively expand every element of l through mapping m.

    Elements present as keys in m are replaced by the (recursively
    expanded) mapped sequence; others are kept as-is.
    """
    ret = []
    for e in l:
        if e in m:
            ret.extend(lookup(m[e], m))
        else:
            ret.append(e)
    return ret


def nf_order(l):
    """Sort runs of combining marks in l by canonical combining class.

    Mutates and returns l.  A "run" is a maximal subsequence of tokens
    that have a nonzero CCC (i.e. appear in t_ccc); each multi-element
    run is stably sorted by CCC, per Unicode canonical ordering.
    """
    runs = []
    # Use None (not 0) as the "no run in progress" sentinel so that a run
    # starting at index 0 (e.g. a decomposition made only of combining
    # marks, like U+0344 -> 0308 0301) is detected correctly.
    b = None
    e = None
    for i in range(len(l)):
        if l[i] in t_ccc:
            if b is None:
                b = i
            e = i
        else:
            if b is not None and b != e:
                runs.append((b, e))
            b = None
            e = None
    if b is not None and b != e:
        runs.append((b, e))
    for b, e in runs:
        # sorted() is stable, preserving the relative order of marks with
        # equal CCC as canonical ordering requires.
        l[b:e + 1] = sorted(l[b:e + 1], key=lambda x: t_ccc[x])
    return l


def in_range(s, rs):
    """Return True if bsdconv token s falls in any (beg, end) token range in rs.

    Tokens carry a 2-character "01" prefix; s[2:] is the hex code point.
    """
    s = int(s[2:], 16)
    for r0, r1 in rs:
        r0 = int(r0[2:], 16)
        r1 = int(r1[2:], 16)
        if r0 <= s <= r1:
            return True
    return False


# Field layout documented at:
# ftp://ftp.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.html
# a[0]=code, a[1]=name, a[3]=ccc, a[5]=decomposition, a[12]=upper, a[13]=lower
l_nfd = []
l_nfkd = []
ud = open("tmp/UnicodeData.txt")
for l in ud:
    if not l.strip():
        continue
    a = l.split(";")
    cp = bsdconv01(a[0])
    code_point = int(a[0], 16)
    # CJK ideographs whose decomposition is a single code point are
    # variant forms; collect them for the CJK-variant table.
    if "CJK" in a[1] and a[5] and " " not in a[5]:
        f_cjkvar.write("{}\t{}\n".format(a[0], a[5]))
    if a[3] != "0":
        ccc = int(a[3])
        if ccc:
            t_ccc[cp] = ccc
            if ccc == ccc_value and code_point == ccc_end + 1:
                # Extend the current run of equal-CCC code points.
                ccc_end = code_point
            else:
                # Flush the previous interval (if any) and start a new one.
                if ccc_value != 0:
                    f_ccc.write("{0x%x, 0x%x, %d},\n" % (ccc_start, ccc_end, ccc_value))
                ccc_start = code_point
                ccc_end = code_point
                ccc_value = ccc
    if a[5]:
        dt = a[5].split(" ")
        compat = False
        if dt[0][0] == "<":
            # "<tag> XXXX YYYY" marks a compatibility decomposition.
            tag = dt[0][1:-1]
            dt = dt[1:]
            compat = True
        dt = [bsdconv01(x) for x in dt]
        if compat:
            l_nfkd.append((cp, tag))
            m_nfkd[cp] = dt
        else:
            # Canonical decompositions feed both NFKD and NFD.
            l_nfkd.append((cp, "canonical"))
            m_nfkd[cp] = dt
            l_nfd.append(cp)
            m_nfd[cp] = dt
            m_nfd_raw[cp] = dt
    if a[12]:
        f_upper.write("{f}\t{t}\n".format(f=cp, t=bsdconv01(a[12])))
    if a[13]:
        f_lower.write("{f}\t{t}\n".format(f=cp, t=bsdconv01(a[13])))
ud.close()

# Flush the final CCC interval (guarded: skip if no nonzero CCC was seen).
if ccc_value != 0:
    f_ccc.write("{0x%x, 0x%x, %d},\n" % (ccc_start, ccc_end, ccc_value))
f_ccc.write("};\n")
f_ccc.close()
f_cjkvar.close()

# SpecialCasing.txt: code; lower; title; upper; (condition;)? # comment
# Only unconditional mappings are used.
sc = open("tmp/SpecialCasing.txt")
for l in sc:
    l = l.strip()
    if not l:
        continue
    if l[0] in "#":
        continue
    # split("#", 1): the trailing comment may itself contain '#'.
    d, c = l.split("#", 1)
    d = d.split(";")
    code = ",".join([bsdconv01(x) for x in d[0].strip().split(" ")])
    lower = ",".join([bsdconv01(x) for x in d[1].strip().split(" ")])
    title = ",".join([bsdconv01(x) for x in d[2].strip().split(" ")])
    upper = ",".join([bsdconv01(x) for x in d[3].strip().split(" ")])
    cond = d[4].strip()
    if cond == "":
        if code != upper:
            f_upper.write("{f}\t{t}\n".format(f=code, t=upper))
        if code != lower:
            f_lower.write("{f}\t{t}\n".format(f=code, t=lower))
sc.close()

f_upper.close()
f_lower.close()

# Full_Composition_Exclusion ranges: these decompositions must not be
# reversed when building the NFC composition map.
l_fce = []
dnp = open("tmp/DerivedNormalizationProps.txt")
for l in dnp:
    l = l.strip()
    if not l:
        continue
    if l[0] in "#":
        continue
    a = l.split(";")
    if not a[1].strip().startswith("Full_Composition_Exclusion"):
        continue
    r = a[0].strip().split("..")
    if len(r) == 1:
        # Single code point: treat as a one-element range.
        r.append(r[0])
    l_fce.append((bsdconv01(r[0]), bsdconv01(r[1])))
dnp.close()

# NFD: fully expand canonical decompositions, then canonically order marks.
for cp in l_nfd:
    d = nf_order(lookup(m_nfd[cp], m_nfd))
    m_nfd[cp] = d
    f_nfd.write("{f}\t{t}\n".format(f=cp, t=",".join(d)))
f_nfd.close()

# NFKD: same expansion over the compatibility mapping; keep the tag as comment.
for cp, tag in l_nfkd:
    d = nf_order(lookup(m_nfkd[cp], m_nfkd))
    m_nfkd[cp] = d
    f_nfkd.write("{f}\t{t}\t#{c}\n".format(f=cp, t=",".join(d), c=tag))
f_nfkd.close()

# NFC map: single-level canonical decomposition -> composed character,
# excluding composition-excluded code points.
for cp in l_nfd:
    if in_range(cp, l_fce):
        continue
    l = m_nfd_raw[cp]
    f_nfc.write("{f}\t{t}\n".format(f=",".join(l), t=cp))
f_nfc.close()

# CaseFolding.txt: code; status; mapping; # comment
cf = open("tmp/CaseFolding.txt")
for l in cf:
    l = l.strip()
    if not l:
        continue
    if l[0] in "#":
        continue
    csm, name = l.split("#", 1)
    code, status, mapping, null = csm.split(";")
    code = bsdconv01(code)
    status = status.strip()
    mapping = ",".join([bsdconv01(x) for x in mapping.strip().split(" ")])
    f_casefold.write("{f}\t{t}\n".format(f=code, t=mapping))
cf.close()
f_casefold.close()


# Blocks: map Unicode block names (regex) to one or more filter categories.

blocks = {
    "^.*Arabian$": "ARABIC",
    "^Arabic.*$": "ARABIC",
    "^Armenian$": "ARMENIAN",
    "^.*Arrows.*$": "ARROWS",
    "^Bopomofo.*$": "CJK",
    "^Braille.*$": "BRAILLE",
    "^Cherokee$": "CHEROKEE",
    "^.*CJK.*$": "CJK",
    "^Cuneiform.*$": "CUNEIFORM",
    "^Currency.*$": "CURRENCY",
    "^Cyrillic.*$": "CYRILLIC",
    "^Devanagari.*$": "DEVANAGARI",
    "^Egyptian.*$": "EGYPTIAN",
    "^Emoticons$": "EMOTICON",
    "^Ethiopic.*$": "ETHIOPIC",
    "^Georgian.*$": "GEORGIAN",
    "^.*Greek.*$": "GREEK",
    "^Hangul.*$": ["HANGUL", "CJK"],
    "^Hebrew$": "HEBREW",
    "^Hiragana$": ["HIRAGANA", "CJK"],
    "^Ideographic Description Characters*": "CJK",
    "^IPA.*$": ["IPA", "PHONETIC"],
    "^Javanese$": "JAVANESE",
    "^Katakana.*$": ["KATAKANA", "CJK"],
    "^Kana .*$": "CJK",
    "^Kanbun.*$": "CJK",
    "^Kangxi Radicals$": "CJK",
    "^Kannada$": "KANNADA",
    "^Khmer.*$": "KHMER",
    "^Lao$": "LAO",
    "^.*Latin.*$": "LATIN",
    "^Miao$": "MIAO",
    "^Mahjong.*$": "MAHJONG",
    "^Malayalam$": "MALAYALAM",
    "^.*Mathematical.*$": "MATH",
    "^Mongolian$": "MONGOLIAN",
    "^.*Musical.*$": "MUSIC",
    "^Myanmar.*$": "MYANMAR",
    "^Phonetic.*$": "PHONETIC",
    "^.*Private Use Area.*$": "PUA",
    "^.*Punctuation.*$": "PUNCTUATION",
    "^Samaritan$": "SAMARITAN",
    "^Sinhala.*$": "SINHALA",
    "^Sundanese.*$": "SUNDANESE",
    "^Syriac$": "SYRIAC",
    "^Tagalog$": "TAGALOG",
    "^Tai Xuan Jing.*$": "CJK",
    "^Tamil$": "TAMIL",
    "^Telugu$": "TELUGU",
    "^Thai$": "THAI",
    "^Tibetan$": "TIBETAN",
    "^Tifinagh$": "TIFINAGH",
    "^Yi .*$": ["YI", "CJK"],
    "^Yijing.*$": "CJK",
}

# Generate one modules/filter/<CATEGORY>.c per category, each listing the
# code-point ranges of every block mapped to that category.
m = {}
blk = open("tmp/Blocks.txt")
for l in blk:
    l = l.strip()
    if l == "" or l[0] == "#":
        continue
    r, d = l.split(";")
    d = d.strip()
    cl = []
    for pt in blocks:
        if re.match(pt, d):
            c = blocks[pt]
            if isinstance(c, list):
                cl.extend(c)
            else:
                cl.append(c)
    print(r, d, cl)
    for c in cl:
        if c not in m:
            # First range for this category: open the file, emit the header.
            m[c] = open(os.path.join("modules/filter", c + ".c"), "w")
            m[c].write("/*\n"
                " * Generated from: " + m_url["Blocks.txt"] + "\n"
                " */\n"
                "\n"
                "#include \"../../src/bsdconv.h\"\n"
                "\n"
                "static const struct uint32_range ranges[] = {\n"
            )
        b, e = r.split("..")
        m[c].write("\t{{ 0x{beg}, 0x{end} }}, // {desc}\n".format(beg=b, end=e, desc=d))
blk.close()

for c in m:
    m[c].write("};\n"
        "#include \"unicode_range.c\"\n")
    m[c].close()

f_ambiguous.write("""
struct interval {
	int first;
	int last;
};

static const struct interval ambiguous[] = {
""")

f_width.write("""
struct width_interval {
	int beg;
	int end;
	int width;
};

static const struct width_interval width_table[] = {
""")

# EastAsianWidth.txt: range;property # category name...
# A=ambiguous; N/Na/H collapse to HALF, F/W to FULL for the width table.
propmap = {"A": "AMBI", "F": "FULL", "H": "HALF", "N": "HALF", "Na": "HALF", "W": "FULL"}
ambi_beg = None
ambi_end = None
width_beg = None
width_end = None
width_prop = None
eaw = open("tmp/EastAsianWidth.txt")
for l in eaw:
    l = l.strip()
    if not l:
        continue
    if l.startswith("#"):
        continue
    # split("#", 1): the trailing comment may itself contain '#'.
    l, desc = l.split("#", 1)
    a = l.strip().split(";")
    desc = desc[1:3]  # two-letter general category from the comment
    w = a[1]
    r = a[0].split("..")
    b = r[0]
    if len(r) == 1:
        e = b
    else:
        e = r[1]

    # Merge consecutive Ambiguous ranges into one interval.
    if w == "A":
        if ambi_beg is None:
            ambi_beg = b
            ambi_end = e
        elif int(ambi_end, 16) + 1 == int(b, 16):
            ambi_end = e
        else:
            f_ambiguous.write("{{ 0x{beg}, 0x{end} }},\n".format(beg=ambi_beg, end=ambi_end))
            ambi_beg = b
            ambi_end = e

    # Control characters (Cc) are excluded from the width table.
    if desc != "Cc":
        p = propmap[w]
        if width_prop is None:
            width_prop = p
            width_beg = b
            width_end = e
        elif p == width_prop:
            if int(width_end, 16) + 1 == int(b, 16):
                width_end = e
            else:
                f_width.write("{{ 0x{beg}, 0x{end}, {prop} }},\n".format(beg=width_beg, end=width_end, prop=width_prop))
                width_beg = b
                width_end = e
        else:
            f_width.write("{{ 0x{beg}, 0x{end}, {prop} }},\n".format(beg=width_beg, end=width_end, prop=width_prop))
            width_prop = p
            width_beg = b
            width_end = e
eaw.close()

# Flush trailing intervals (guarded: skip if the input yielded no entries).
if ambi_beg is not None:
    f_ambiguous.write("{{ 0x{beg}, 0x{end} }},\n".format(beg=ambi_beg, end=ambi_end))
f_ambiguous.write("};\n")
f_ambiguous.close()

if width_prop is not None:
    f_width.write("{{ 0x{beg}, 0x{end}, {prop} }},\n".format(beg=width_beg, end=width_end, prop=width_prop))
f_width.write("};\n")
f_width.close()