1#!/usr/bin/env python 2# -*- coding: utf-8 -*- 3# 4# compose-parse.py, version 1.5 5# 6# multifunction script that helps manage the compose sequence table in GTK+ (gtk/gtkimcontextsimple.c) 7# the script produces statistics and information about the whole process, run with --help for more. 8# 9# You may need to switch your python installation to utf-8, if you get 'ascii' codec errors. 10# 11# Complain to Simos Xenitellis (simos@gnome.org, http://simos.info/blog) for this craft. 12 13from re import findall, match, split, sub 14from string import atoi 15from unicodedata import normalize 16from urllib import urlretrieve 17from os.path import isfile, getsize 18from copy import copy 19 20import sys 21import getopt 22 23# We grab files off the web, left and right. 24URL_COMPOSE = 'http://cgit.freedesktop.org/xorg/lib/libX11/plain/nls/en_US.UTF-8/Compose.pre' 25URL_KEYSYMSTXT = "http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt" 26URL_GDKKEYSYMSH = "http://git.gnome.org/browse/gtk%2B/plain/gdk/gdkkeysyms.h" 27URL_UNICODEDATATXT = 'http://www.unicode.org/Public/6.0.0/ucd/UnicodeData.txt' 28FILENAME_COMPOSE_SUPPLEMENTARY = 'gtk-compose-lookaside.txt' 29 30# We currently support keysyms of size 2; once upstream xorg gets sorted, 31# we might produce some tables with size 2 and some with size 4. 32SIZEOFINT = 2 33 34# Current max compose sequence length; in case it gets increased. 35WIDTHOFCOMPOSETABLE = 5 36 37keysymdatabase = {} 38keysymunicodedatabase = {} 39unicodedatabase = {} 40 41headerfile_start = """/* GTK - The GIMP Tool Kit 42 * Copyright (C) 2007, 2008 GNOME Foundation 43 * 44 * This library is free software; you can redistribute it and/or 45 * modify it under the terms of the GNU Lesser General Public 46 * License as published by the Free Software Foundation; either 47 * version 2 of the License, or (at your option) any later version. 
48 * 49 * This library is distributed in the hope that it will be useful, 50 * but WITHOUT ANY WARRANTY; without even the implied warranty of 51 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 52 * Lesser General Public License for more details. 53 * 54 * You should have received a copy of the GNU Lesser General Public 55 * License along with this library. If not, see <http://www.gnu.org/licenses/>. 56 */ 57 58/* 59 * File auto-generated from script found at http://bugzilla.gnome.org/show_bug.cgi?id=321896 60 * using the input files 61 * Input : http://cgit.freedesktop.org/xorg/lib/libX11/plain/nls/en_US.UTF-8/Compose.pre 62 * Input : http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt 63 * Input : http://www.unicode.org/Public/UNIDATA/UnicodeData.txt 64 * 65 * This table is optimised for space and requires special handling to access the content. 66 * This table is used solely by http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimple.c 67 * 68 * The resulting file is placed at http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimpleseqs.h 69 * This file is described in bug report http://bugzilla.gnome.org/show_bug.cgi?id=321896 70 */ 71 72/* 73 * Modified by the GTK+ Team and others 2007, 2008. See the AUTHORS 74 * file for a list of people on the GTK+ Team. See the ChangeLog 75 * files for a list of changes. These files are distributed with 76 * GTK+ at ftp://ftp.gtk.org/pub/gtk/. 77 */ 78 79#ifndef __GTK_IM_CONTEXT_SIMPLE_SEQS_H__ 80#define __GTK_IM_CONTEXT_SIMPLE_SEQS_H__ 81 82/* === These are the original comments of the file; we keep for historical purposes === 83 * 84 * The following table was generated from the X compose tables include with 85 * XFree86 4.0 using a set of Perl scripts. Contact Owen Taylor <otaylor@redhat.com> 86 * to obtain the relevant perl scripts. 
87 * 88 * The following compose letter letter sequences confliced 89 * Dstroke/dstroke and ETH/eth; resolved to Dstroke (Croation, Vietnamese, Lappish), over 90 * ETH (Icelandic, Faroese, old English, IPA) [ D- -D d- -d ] 91 * Amacron/amacron and ordfeminine; resolved to ordfeminine [ _A A_ a_ _a ] 92 * Amacron/amacron and Atilde/atilde; resolved to atilde [ -A A- a- -a ] 93 * Omacron/Omacron and masculine; resolved to masculine [ _O O_ o_ _o ] 94 * Omacron/omacron and Otilde/atilde; resolved to otilde [ -O O- o- -o ] 95 * 96 * [ Amacron and Omacron are in Latin-4 (Baltic). ordfeminine and masculine are used for 97 * spanish. atilde and otilde are used at least for Portuguese ] 98 * 99 * at and Aring; resolved to Aring [ AA ] 100 * guillemotleft and caron; resolved to guillemotleft [ << ] 101 * ogonek and cedilla; resolved to cedilla [ ,, ] 102 * 103 * This probably should be resolved by first checking an additional set of compose tables 104 * that depend on the locale or selected input method. 
105 */ 106 107static const guint16 gtk_compose_seqs_compact[] = {""" 108 109headerfile_end = """}; 110 111#endif /* __GTK_IM_CONTEXT_SIMPLE_SEQS_H__ */ 112""" 113 114declare_compose_sequence_32bit_first = """}; 115 116static const guint16 gtk_compose_seqs_compact_32bit_first[] = {""" 117 118declare_compose_sequence_32bit_second = """}; 119 120static const guint32 gtk_compose_seqs_compact_32bit_second[] = {""" 121 122def stringtohex(str): return atoi(str, 16) 123 124def factorial(n): 125 if n <= 1: 126 return 1 127 else: 128 return n * factorial(n-1) 129 130def uniq(*args) : 131 """ Performs a uniq operation on a list or lists """ 132 theInputList = [] 133 for theList in args: 134 theInputList += theList 135 theFinalList = [] 136 for elem in theInputList: 137 if elem not in theFinalList: 138 theFinalList.append(elem) 139 return theFinalList 140 141 142 143def all_permutations(seq): 144 """ Borrowed from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/252178 """ 145 """ Produces all permutations of the items of a list """ 146 if len(seq) <=1: 147 yield seq 148 else: 149 for perm in all_permutations(seq[1:]): 150 for i in range(len(perm)+1): 151 #nb str[0:1] works in both string and list contexts 152 yield perm[:i] + seq[0:1] + perm[i:] 153 154def usage(): 155 print """compose-parse available parameters: 156 -h, --help this craft 157 -s, --statistics show overall statistics (both algorithmic, non-algorithmic) 158 -a, --algorithmic show sequences saved with algorithmic optimisation 159 -g, --gtk show entries that go to GTK+ 160 -u, --unicodedatatxt show compose sequences derived from UnicodeData.txt (from unicode.org) 161 -v, --verbose show verbose output 162 -p, --plane1 show plane1 compose sequences 163 -n, --numeric when used with --gtk, create file with numeric values only 164 -e, --gtk-expanded when used with --gtk, create file that repeats first column; not usable in GTK+ 165 166 Default is to show statistics. 
167 """ 168 169try: 170 opts, args = getopt.getopt(sys.argv[1:], "pvgashune", ["help", "algorithmic", "statistics", "unicodedatatxt", 171 "stats", "gtk", "verbose", "plane1", "numeric", "gtk-expanded"]) 172except: 173 usage() 174 sys.exit(2) 175 176opt_statistics = False 177opt_algorithmic = False 178opt_gtk = False 179opt_unicodedatatxt = False 180opt_verbose = False 181opt_plane1 = False 182opt_numeric = False 183opt_gtkexpanded = False 184 185for o, a in opts: 186 if o in ("-h", "--help"): 187 usage() 188 sys.exit() 189 if o in ("-s", "--statistics"): 190 opt_statistics = True 191 if o in ("-a", "--algorithmic"): 192 opt_algorithmic = True 193 if o in ("-g", "--gtk"): 194 opt_gtk = True 195 if o in ("-u", "--unicodedatatxt"): 196 opt_unicodedatatxt = True 197 if o in ("-v", "--verbose"): 198 opt_verbose = True 199 if o in ("-p", "--plane1"): 200 opt_plane1 = True 201 if o in ("-n", "--numeric"): 202 opt_numeric = True 203 if o in ("-e", "--gtk-expanded"): 204 opt_gtkexpanded = True 205 206if not opt_algorithmic and not opt_gtk and not opt_unicodedatatxt: 207 opt_statistics = True 208 209def download_hook(blocks_transferred, block_size, file_size): 210 """ A download hook to provide some feedback when downloading """ 211 if blocks_transferred == 0: 212 if file_size > 0: 213 if opt_verbose: 214 print "Downloading", file_size, "bytes: ", 215 else: 216 if opt_verbose: 217 print "Downloading: ", 218 sys.stdout.write('#') 219 sys.stdout.flush() 220 221 222def download_file(url): 223 """ Downloads a file provided a URL. Returns the filename. """ 224 """ Borks on failure """ 225 localfilename = url.split('/')[-1] 226 if not isfile(localfilename) or getsize(localfilename) <= 0: 227 if opt_verbose: 228 print "Downloading ", url, "..." 
        try:
            urlretrieve(url, localfilename, download_hook)
        except IOError, (errno, strerror):
            print "I/O error(%s): %s" % (errno, strerror)
            sys.exit(-1)
        except:
            print "Unexpected error: ", sys.exc_info()[0]
            sys.exit(-1)
        print " done."
    else:
        if opt_verbose:
            print "Using cached file for ", url
    return localfilename

def process_gdkkeysymsh():
    """ Opens the gdkkeysyms.h file from GTK+/gdk/gdkkeysyms.h """
    """ Fills up keysymdb with contents """
    filename_gdkkeysymsh = download_file(URL_GDKKEYSYMSH)
    try:
        gdkkeysymsh = open(filename_gdkkeysymsh, 'r')
    except IOError, (errno, strerror):
        print "I/O error(%s): %s" % (errno, strerror)
        sys.exit(-1)
    except:
        print "Unexpected error: ", sys.exc_info()[0]
        sys.exit(-1)

    """ Parse the gdkkeysyms.h file and place contents in keysymdb """
    # Only '#define GDK_KEY_<name> 0x<hex>' lines are of interest; everything
    # else (comments, guards) is skipped. Malformed matching lines are fatal.
    linenum_gdkkeysymsh = 0
    keysymdb = {}
    for line in gdkkeysymsh.readlines():
        linenum_gdkkeysymsh += 1
        line = line.strip()
        if line == "" or not match('^#define GDK_KEY_', line):
            continue
        components = split('\s+', line)
        if len(components) < 3:
            print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
            % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
            print "Was expecting 3 items in the line"
            sys.exit(-1)
        if not match('^GDK_KEY_', components[1]):
            print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
            % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
            print "Was expecting a keysym starting with GDK_KEY_"
            sys.exit(-1)
        if match('^0x[0-9a-fA-F]+$', components[2]):
            unival = long(components[2][2:], 16)
            if unival == 0:
                continue
            # Strip the 'GDK_KEY_' prefix (8 chars) for the db key.
            keysymdb[components[1][8:]] = unival
        else:
            print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
            % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
            print "Was expecting a hexadecimal number at the end of the line"
            sys.exit(-1)
    gdkkeysymsh.close()

    """ Patch up the keysymdb with some of our own stuff """

    """ This is for a missing keysym from the currently upstream file """
    ###keysymdb['dead_stroke'] = 0x338

    """ This is for a missing keysym from the currently upstream file """
    ###keysymdb['dead_belowring'] = 0x323
    ###keysymdb['dead_belowmacron'] = 0x331
    ###keysymdb['dead_belowcircumflex'] = 0x32d
    ###keysymdb['dead_belowtilde'] = 0x330
    ###keysymdb['dead_belowbreve'] = 0x32e
    ###keysymdb['dead_belowdiaeresis'] = 0x324

    """ This is^Wwas preferential treatment for Greek """
    # keysymdb['dead_tilde'] = 0x342
    """ This is^was preferential treatment for Greek """
    #keysymdb['combining_tilde'] = 0x342

    """ Fixing VoidSymbol """
    keysymdb['VoidSymbol'] = 0xFFFF

    return keysymdb

def process_keysymstxt():
    """ Grabs and opens the keysyms.txt file that Markus Kuhn maintains """
    """ This file keeps a record between keysyms <-> unicode chars """
    filename_keysymstxt = download_file(URL_KEYSYMSTXT)
    try:
        keysymstxt = open(filename_keysymstxt, 'r')
    except IOError, (errno, strerror):
        print "I/O error(%s): %s" % (errno, strerror)
        sys.exit(-1)
    except:
        print "Unexpected error: ", sys.exc_info()[0]
        sys.exit(-1)

    """ Parse the keysyms.txt file and place content in keysymdb """
    linenum_keysymstxt = 0
    keysymdb = {}
    for line in keysymstxt.readlines():
        linenum_keysymstxt += 1
        line = line.strip()
        if line == "" or match('^#', line):
            continue
        components = split('\s+', line)
        if len(components) < 5:
            print "Invalid line %(linenum)d in %(filename)s: %(line)s'"\
            % {'linenum': linenum_keysymstxt, 'filename': filename_keysymstxt, 'line': line}
            print "Was expecting 5 items in the line"
            sys.exit(-1)
        # Column 1 is the Unicode value ('U0041'), column 4 the keysym name.
        if match('^U[0-9a-fA-F]+$', components[1]):
            unival = long(components[1][1:], 16)
            if unival == 0:
                continue
            keysymdb[components[4]] = unival
    keysymstxt.close()

    """ Patch up the keysymdb with some of our own stuff """
    """ This is for a missing keysym from the currently upstream file """
    keysymdb['dead_belowring'] = 0x323
    keysymdb['dead_belowmacron'] = 0x331
    keysymdb['dead_belowcircumflex'] = 0x32d
    keysymdb['dead_belowtilde'] = 0x330
    keysymdb['dead_belowbreve'] = 0x32e
    keysymdb['dead_belowdiaeresis'] = 0x324

    """ This is preferential treatment for Greek """
    """ => we get more savings if used for Greek """
    # keysymdb['dead_tilde'] = 0x342
    """ This is preferential treatment for Greek """
    # keysymdb['combining_tilde'] = 0x342

    """ This is for a missing keysym from Markus Kuhn's db """
    keysymdb['dead_stroke'] = 0x338
    """ This is for a missing keysym from Markus Kuhn's db """
    keysymdb['Oslash'] = 0x0d8
    """ This is for a missing keysym from Markus Kuhn's db """
    keysymdb['Ssharp'] = 0x1e9e

    """ This is for a missing (recently added) keysym """
    keysymdb['dead_psili'] = 0x313
    """ This is for a missing (recently added) keysym """
    keysymdb['dead_dasia'] = 0x314

    """ Allows to import Multi_key sequences """
    keysymdb['Multi_key'] = 0xff20

    keysymdb['zerosubscript'] = 0x2080
    keysymdb['onesubscript'] = 0x2081
    keysymdb['twosubscript'] = 0x2082
    keysymdb['threesubscript'] = 0x2083
    keysymdb['foursubscript'] = 0x2084
    keysymdb['fivesubscript'] = 0x2085
    keysymdb['sixsubscript'] = 0x2086
    keysymdb['sevensubscript'] = 0x2087
    keysymdb['eightsubscript'] = 0x2088
    keysymdb['ninesubscript'] = 0x2089
    keysymdb['dead_doublegrave'] = 0x030F
    keysymdb['dead_invertedbreve'] = 0x0311
    keysymdb['dead_belowcomma'] = 0xfe6e
    keysymdb['dead_currency'] = 0xfe6f
    keysymdb['dead_greek'] = 0xfe8c

    return keysymdb

def keysymvalue(keysym, file = "n/a", linenum = 0):
393 """ Extracts a value from the keysym """ 394 """ Find the value of keysym, using the data from keysyms """ 395 """ Use file and linenum to when reporting errors """ 396 if keysym == "": 397 return 0 398 if keysymdatabase.has_key(keysym): 399 return keysymdatabase[keysym] 400 elif keysym[0] == 'U' and match('[0-9a-fA-F]+$', keysym[1:]): 401 return atoi(keysym[1:], 16) 402 elif keysym[:2] == '0x' and match('[0-9a-fA-F]+$', keysym[2:]): 403 return atoi(keysym[2:], 16) 404 else: 405 print 'keysymvalue: UNKNOWN{%(keysym)s}' % { "keysym": keysym } 406 #return -1 407 sys.exit(-1) 408 409def keysymunicodevalue(keysym, file = "n/a", linenum = 0): 410 """ Extracts a value from the keysym """ 411 """ Find the value of keysym, using the data from keysyms """ 412 """ Use file and linenum to when reporting errors """ 413 if keysym == "": 414 return 0 415 if keysymunicodedatabase.has_key(keysym): 416 return keysymunicodedatabase[keysym] 417 elif keysym[0] == 'U' and match('[0-9a-fA-F]+$', keysym[1:]): 418 return atoi(keysym[1:], 16) 419 elif keysym[:2] == '0x' and match('[0-9a-fA-F]+$', keysym[2:]): 420 return atoi(keysym[2:], 16) 421 else: 422 print 'keysymunicodevalue: UNKNOWN{%(keysym)s}' % { "keysym": keysym } 423 sys.exit(-1) 424 425def rename_combining(seq): 426 filtered_sequence = [] 427 for ks in seq: 428 if findall('^combining_', ks): 429 ks = sub('^combining_', 'dead_', ks) 430 if ks == 'dead_double_grave': 431 ks = 'dead_doublegrave' 432 if ks == 'dead_inverted_breve': 433 ks = 'dead_invertedbreve' 434 filtered_sequence.append(ks) 435 return filtered_sequence 436 437 438keysymunicodedatabase = process_keysymstxt() 439keysymdatabase = process_gdkkeysymsh() 440 441""" Grab and open the compose file from upstream """ 442filename_compose = download_file(URL_COMPOSE) 443try: 444 composefile = open(filename_compose, 'r') 445except IOError, (errno, strerror): 446 print "I/O error(%s): %s" % (errno, strerror) 447 sys.exit(-1) 448except: 449 print "Unexpected error: ", 
    sys.exit(-1)

""" Look if there is a lookaside (supplementary) compose file in the current
    directory, and if so, open, then merge with upstream Compose file.
"""
xorg_compose_sequences_raw = []
for seq in composefile.readlines():
    xorg_compose_sequences_raw.append(seq)

try:
    composefile_lookaside = open(FILENAME_COMPOSE_SUPPLEMENTARY, 'r')
    for seq in composefile_lookaside.readlines():
        xorg_compose_sequences_raw.append(seq)
except IOError, (errno, strerror):
    # The lookaside file is optional; a missing file is only reported verbosely.
    if opt_verbose:
        print "I/O error(%s): %s" % (errno, strerror)
        print "Did not find lookaside compose file. Continuing..."
except:
    print "Unexpected error: ", sys.exc_info()[0]
    sys.exit(-1)

""" Parse the compose file in xorg_compose_sequences"""
xorg_compose_sequences = []
xorg_compose_sequences_32bit = []
xorg_compose_sequences_algorithmic = []
linenum_compose = 0
comment_nest_depth = 0
for line in xorg_compose_sequences_raw:
    linenum_compose += 1
    line = line.strip()
    if match("^XCOMM", line) or match("^#", line):
        continue

    # Strip C comments that open and close on the same line.
    line = sub(r"\/\*([^\*]*|[\*][^/])\*\/", "", line)

    # Track multi-line /* ... */ comments via a nesting counter.
    comment_start = line.find("/*")

    if comment_start >= 0:
        if comment_nest_depth == 0:
            line = line[:comment_start]
        else:
            line = ""

        comment_nest_depth += 1
    else:
        comment_end = line.find("*/")

        if comment_end >= 0:
            comment_nest_depth -= 1

            if comment_nest_depth < 0:
                print "Invalid comment %(linenum_compose)d in %(filename)s: \
Closing '*/' without opening '/*'" % { "linenum_compose": linenum_compose, "filename": filename_compose }
                exit(-1)

            if comment_nest_depth > 0:
                line = ""
            else:
                line = line[comment_end + 2:]

    # NOTE(review): 'is ""' relies on string interning; '==' would be safer.
    if line is "":
        continue

    #line = line[:-1]
    components = split(':', line, 1)
    if len(components) != 2:
        print "Invalid line %(linenum_compose)d in %(filename)s: No sequence\
/value pair found" % { "linenum_compose": linenum_compose, "filename": filename_compose }
        exit(-1)
    (seq, val ) = split(':', line, 1)
    seq = seq.strip()
    val = val.strip()
    raw_sequence = findall('\w+', seq)
    values = split('\s+', val)
    unichar_temp = split('"', values[0])
    unichar_utf8 = unichar_temp[1]
    # The line of "U17ff" in Compose.pre includes a unichar only
    codepointstr = values[1] if len(values) > 1 else ''
    codepoint = 0
    codepoints = []
    enable_32bit = False
    # Normalise a leading 'Uxxxx' keysym to '0x....' notation.
    if raw_sequence[0][0] == 'U' and match('[0-9a-fA-F]+$', raw_sequence[0][1:]):
        raw_sequence[0] = '0x' + raw_sequence[0][1:]
    if match('^U[0-9a-fA-F]+$', codepointstr):
        codepoint = long(codepointstr[1:], 16)
        # Values beyond the BMP go to the 32-bit tables.
        if codepoint > 0xFFFF:
            codepoints.append(codepoint)
            codepoint = 0
            enable_32bit = True
    elif keysymunicodedatabase.has_key(codepointstr):
        #if keysymdatabase[codepointstr] != keysymunicodedatabase[codepointstr]:
        #print "DIFFERENCE: 0x%(a)X 0x%(b)X" % { "a": keysymdatabase[codepointstr], "b": keysymunicodedatabase[codepointstr]},
        #print raw_sequence, codepointstr
        codepoint = keysymunicodedatabase[codepointstr]
    else:
        # No codepoint column: decode the quoted UTF-8 literal instead.
        unichars = unicode(unichar_utf8, 'utf-8')
        if len(unichars) > 1 or ord(unichars[0]) > 0xFFFF:
            enable_32bit = True
            for unichar in unichars:
                codepoints.append(ord(unichar))
        else:
            codepoint = ord(unichars[0]);
    sequence = rename_combining(raw_sequence)
    # Reject sequences containing keysyms outside the 16-bit range.
    reject_this = False
    for i in sequence:
        if keysymvalue(i) > 0xFFFF:
            reject_this = True
            if opt_plane1:
                print sequence
            break
        if keysymvalue(i) < 0:
            reject_this = True
            break
    if reject_this:
        continue
    # Skip Greek combining-diacritic sequences (handled specially elsewhere).
    if "U0342" in sequence or \
        "U0313" in sequence or \
        "U0314" in sequence or \
        "0x0313" in sequence or \
        "0x0342" in sequence or \
        "0x0314" in sequence:
        continue
    if "Multi_key" not in sequence:
        if True:
            # Try to express the sequence algorithmically: if NFC-normalising
            # the base char plus the (permuted) combining marks yields a single
            # character, the sequence need not be stored in the table.
            original_sequence = copy(sequence)
            stats_sequence = copy(sequence)
            base = sequence.pop()
            basechar = keysymvalue(base, filename_compose, linenum_compose)

            if basechar < 0xFFFF:
                counter = 1
                unisequence = []
                not_normalised = True
                skipping_this = False
                for i in range(0, len(sequence)):
                    """ If the sequence has dead_tilde and is for Greek, we don't do algorithmically
                        because of lack of dead_perispomeni (i.e. conflict)
                    """
                    bc = basechar
                    """if sequence[-1] == "dead_tilde" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
                        skipping_this = True
                        break
                    if sequence[-1] == "dead_horn" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
                        skipping_this = True
                        break
                    if sequence[-1] == "dead_ogonek" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
                        skipping_this = True
                        break
                    if sequence[-1] == "dead_psili":
                        sequence[i] = "dead_horn"
                    if sequence[-1] == "dead_dasia":
                        sequence[-1] = "dead_ogonek"
                    """
                    unisequence.append(unichr(keysymunicodevalue(sequence.pop(), filename_compose, linenum_compose)))

                if skipping_this:
                    unisequence = []
                for perm in all_permutations(unisequence):
                    # print counter, original_sequence, unichr(basechar) + "".join(perm)
                    # print counter, map(unichr, perm)
                    normalized = normalize('NFC', unichr(basechar) + "".join(perm))
                    if len(normalized) == 1:
                        # print 'Base: %(base)s [%(basechar)s], produces [%(unichar)s] (0x%(codepoint)04X)' \
                        # % { "base": base, "basechar": unichr(basechar), "unichar": unichar, "codepoint": codepoint },
                        # print "Normalized: [%(normalized)s] SUCCESS %(c)d" % { "normalized": normalized, "c": counter }
                        stats_sequence_data = map(keysymunicodevalue, stats_sequence)
                        stats_sequence_data.append(normalized)
                        xorg_compose_sequences_algorithmic.append(stats_sequence_data)
                        not_normalised = False
                        break;
                    counter += 1
                if not_normalised:
                    # Not expressible algorithmically: keep it in the tables,
                    # appending the value(s) and the value length.
                    if enable_32bit:
                        for cp in codepoints:
                            original_sequence.append(cp)
                        length = len(codepoints)
                        original_sequence.append(length)
                        xorg_compose_sequences_32bit.append(original_sequence)
                    else:
                        original_sequence.append(codepoint)
                        original_sequence.append(1)
                        xorg_compose_sequences.append(original_sequence)

            else:
                print "Error in base char !?!"
                exit(-2)
    else:
        # Multi_key sequences always go straight into the tables.
        if enable_32bit:
            for cp in codepoints:
                sequence.append(cp)
            length = len(codepoints)
            sequence.append(length)
            xorg_compose_sequences_32bit.append(sequence)
        else:
            sequence.append(codepoint)
            sequence.append(1)
            xorg_compose_sequences.append(sequence)

def sequence_cmp(x, y):
    """ Sort comparator: first keysym, then length, then remaining keysyms
        (resolved through gdkkeysyms.h). """
    # Last element is the value length; strip it and the value to get the
    # number of keysyms.
    length_x = len(x) - x[-1] - 1
    length_y = len(y) - y[-1] - 1
    if keysymvalue(x[0]) > keysymvalue(y[0]):
        return 1
    elif keysymvalue(x[0]) < keysymvalue(y[0]):
        return -1
    elif length_x > length_y:
        return 1
    elif length_x < length_y:
        return -1
    elif keysymvalue(x[1]) > keysymvalue(y[1]):
        return 1
    elif keysymvalue(x[1]) < keysymvalue(y[1]):
        return -1
    for i in range(2, length_x):
        if keysymvalue(x[i]) > keysymvalue(y[i]):
            return 1
        elif keysymvalue(x[i]) < keysymvalue(y[i]):
            return -1
    return 0

def sequence_unicode_cmp(x, y):
    """ Same ordering as sequence_cmp() but resolved through keysyms.txt. """
    length_x = len(x) - x[-1] - 1
    length_y = len(y) - y[-1] - 1
    if keysymunicodevalue(x[0]) > keysymunicodevalue(y[0]):
        return 1
    elif keysymunicodevalue(x[0]) < keysymunicodevalue(y[0]):
        return -1
    elif length_x > length_y:
        return 1
    elif length_x < length_y:
        return -1
    elif keysymunicodevalue(x[1]) > keysymunicodevalue(y[1]):
        return 1
    elif keysymunicodevalue(x[1]) < keysymunicodevalue(y[1]):
        return -1
    for i in range(2, length_x):
        if keysymunicodevalue(x[i]) > keysymunicodevalue(y[i]):
            return 1
        elif keysymunicodevalue(x[i]) < keysymunicodevalue(y[i]):
            return -1
    return 0

def sequence_algorithmic_cmp(x,
y): 693 if len(x) < len(y): 694 return -1 695 elif len(x) > len(y): 696 return 1 697 else: 698 for i in range(len(x)): 699 if x[i] < y[i]: 700 return -1 701 elif x[i] > y[i]: 702 return 1 703 return 0 704 705 706xorg_compose_sequences.sort(sequence_cmp) 707xorg_compose_sequences_32bit.sort(sequence_cmp) 708 709def num_of_keysyms(seq): 710 value_length = seq[-1] 711 return len(seq) - value_length - 1 712 713 714def check_max_width_of_compose_table(compose_sequences): 715 for sequence in xorg_compose_sequences: 716 num = num_of_keysyms(sequence) 717 global WIDTHOFCOMPOSETABLE 718 if num > WIDTHOFCOMPOSETABLE: 719 print("Extend the sequence length: %s" % num) 720 WIDTHOFCOMPOSETABLE = num 721 722def make_compose_sequences_unique(compose_sequences): 723 xorg_compose_sequences_uniqued = [] 724 first_time = True 725 item = None 726 for next_item in compose_sequences: 727 if first_time: 728 first_time = False 729 item = next_item 730 xorg_compose_sequences_uniqued.append(next_item) 731 if sequence_unicode_cmp(item, next_item) != 0: 732 xorg_compose_sequences_uniqued.append(next_item) 733 item = next_item 734 735 return copy(xorg_compose_sequences_uniqued) 736 737check_max_width_of_compose_table(xorg_compose_sequences) 738check_max_width_of_compose_table(xorg_compose_sequences_32bit) 739 740xorg_compose_sequences = make_compose_sequences_unique(xorg_compose_sequences) 741xorg_compose_sequences_32bit = make_compose_sequences_unique(xorg_compose_sequences_32bit) 742 743counter_multikey = 0 744for item in xorg_compose_sequences + xorg_compose_sequences_32bit: 745 length = item[-1] 746 if findall('Multi_key', "".join(item[:len(item) - length - 1])) != []: 747 counter_multikey += 1 748 749xorg_compose_sequences_algorithmic.sort(sequence_algorithmic_cmp) 750xorg_compose_sequences_algorithmic_uniqued = uniq(xorg_compose_sequences_algorithmic) 751 752firstitem = "" 753num_first_keysyms = 0 754num_first_keysyms_32bit = 0 755zeroes = 0 756num_entries = 0 757num_algorithmic_greek = 0 
# Count distinct first keysyms and padding zeroes for the 16-bit table.
for sequence in xorg_compose_sequences:
    if keysymvalue(firstitem) != keysymvalue(sequence[0]):
        firstitem = sequence[0]
        num_first_keysyms += 1
    # max length of sequences + length of unichar(== 1) - length of the
    # current sequence + common offset (== 1)
    zeroes += WIDTHOFCOMPOSETABLE - num_of_keysyms(sequence)
    num_entries += 1

firstitem = ""
# Same count of distinct first keysyms, for the 32-bit table.
for sequence in xorg_compose_sequences_32bit:
    if keysymvalue(firstitem) != keysymvalue(sequence[0]):
        firstitem = sequence[0]
        num_first_keysyms_32bit += 1

# Count algorithmic sequences whose result falls in the Greek blocks.
for sequence in xorg_compose_sequences_algorithmic_uniqued:
    ch = ord(sequence[-1:][0])
    if ch >= 0x370 and ch <= 0x3ff or ch >= 0x1f00 and ch <= 0x1fff:
        num_algorithmic_greek += 1


if opt_algorithmic:
    # Dump each algorithmically-expressible sequence with its result char.
    for sequence in xorg_compose_sequences_algorithmic_uniqued:
        letter = "".join(sequence[-1:])
        print '0x%(cp)04X, %(uni)s, seq: [ <0x%(base)04X>,' % { 'cp': ord(unicode(letter)), 'uni': letter.encode('utf-8'), 'base': sequence[-2] },
        for elem in sequence[:-2]:
            print "<0x%(keysym)04X>," % { 'keysym': elem },
        """ Yeah, verified...
            We just want to keep the output similar to -u, so we can compare/sort easily """
        print "], recomposed as", letter.encode('utf-8'), "verified"

def convert_UnotationToHex(arg):
    """ Converts a 'Uxxxx' string to '0xxxxx'; other values pass through. """
    if isinstance(arg, str):
        if match('^U[0-9A-F][0-9A-F][0-9A-F][0-9A-F]$', arg):
            return sub('^U', '0x', arg)
    return arg

def addprefix_GDK(arg):
    """ Renders a keysym for C output: hex values verbatim, 'Uxxxx' resolved
        back to an IBUS_KEY_* name when one exists, names prefixed. """
    if match('^0x', arg):
        return '%(arg)s, ' % { 'arg': arg }
    elif match('^U[0-9A-F][0-9A-F][0-9A-F][0-9A-F]$', arg.upper()):
        keysym = ''
        # Reverse lookup: find a keysym name with this Unicode value.
        for k, c in keysymunicodedatabase.items():
            if c == keysymvalue(arg):
                keysym = k
                break
        if keysym != '':
            return 'IBUS_KEY_%(arg)s, ' % { 'arg': keysym }
        else:
            return '0x%(arg)04X, ' % { 'arg': keysymvalue(arg) }
    else:
        return 'IBUS_KEY_%(arg)s, ' % { 'arg': arg }

def make_compose_table(compose_sequences,
                       compose_table,
                       ct_second_part,
                       start_offset,
                       is_32bit):
    """ Builds the compact first-keysym index (compose_table) and the packed
        sequence rows (ct_second_part) from the sorted compose_sequences. """
    first_keysym = ""
    sequence = []
    we_finished = False
    counter = 0

    # Pass 1: one index row per distinct first keysym, counting how many
    # sequences of each length start with it.
    sequence_iterator = iter(compose_sequences)
    sequence = sequence_iterator.next()
    while True:
        first_keysym = sequence[0] # Set the first keysym
        compose_table_sequence = [first_keysym] + \
            map(lambda x: 0, range(WIDTHOFCOMPOSETABLE))
        compose_table.append(compose_table_sequence)
        while sequence[0] == first_keysym:
            compose_table[counter][num_of_keysyms(sequence)-1] += 1
            try:
                sequence = sequence_iterator.next()
            except StopIteration:
                we_finished = True
                break
        if we_finished:
            break
        counter += 1

    # Pass 2: convert the per-length counts into offsets into the flat array.
    ct_index = start_offset
    for line_num in range(len(compose_table)):
        for i in range(WIDTHOFCOMPOSETABLE):
            occurrences = compose_table[line_num][i+1]
            compose_table[line_num][i+1] = ct_index
            # If not 32bit, i + 1 is the next index in for loop
            # and i + 2 is the next index + unichar size (== 1)
            # If 32bit, i + 1 is the next index in for loop
            # and i + 3 is the next index + unichar index size
            # (== 1) + unichar length size (== 1)
            if is_32bit:
                ct_index += occurrences * (i+3)
            else:
                ct_index += occurrences * (i+2)

    for sequence in compose_sequences:
        ct_second_part.append(map(convert_UnotationToHex, sequence))


def print_compose_table_keysyms(compose_table):
    """ Prints the first-keysym index rows in the selected output format. """
    for i in compose_table:
        if opt_gtkexpanded:
            print "0x%(ks)04X," % { "ks": keysymvalue(i[0]) },
            print '%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i[1:])) }
        elif not match('^0x', i[0]):
            print 'IBUS_KEY_%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i)) }
        else:
            print '%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i)) }


def print_compose_table_values(ct_second_part, is_32bit, is_second):
    """ Prints the packed sequence rows; for 32-bit tables the keysym part
        (is_second=False) and the value part (is_second=True) are emitted
        in two separate passes. """
    i = 0
    for s in ct_second_part:
        length = s[-1]
        seq_length = len(s) - length - 1
        if opt_numeric:
            if not is_32bit or not is_second:
                # Skip the first keysym: it lives in the index table.
                for ks in s[:seq_length][1:]:
                    print '0x%(seq)04X,' % { 'seq': keysymvalue(ks) },
            if is_32bit and not is_second:
                print '%(i)d, %(l)d,' % { 'i':i, 'l':length }
            if not is_32bit or is_second:
                for v in range(seq_length, seq_length + length):
                    print '0x%(cp)04X,' % { 'cp':s[v] },
                print ''
        elif opt_gtkexpanded:
            if not is_32bit or not is_second:
                print '%(seq)s' % { 'seq': "".join(map(addprefix_GDK, s[:seq_length][1:])) },
            if is_32bit and not is_second:
                print '%(i)d, %(l)d,' % { 'i':i, 'l':length }
            if not is_32bit or is_second:
                for v in range(seq_length, seq_length + length):
                    print '0x%(cp)04X,' % { 'cp':s[v] },
                print ''
        else:
            if not is_32bit or not is_second:
                print '%(seq)s' % { 'seq': "".join(map(addprefix_GDK, s[:seq_length][1:])) },
            if is_32bit and not is_second:
                print '%(i)d, %(l)d,' % { 'i':i, 'l':length }
            if not is_32bit or is_second:
                for v in range(seq_length, seq_length + length):
                    print '0x%(cp)04X,' % { 'cp':s[v] },
                print ''
        i += length

compose_gtk(): 905 compose_table = [] 906 compose_table_32bit = [] 907 ct_second_part = [] 908 ct_second_part_32bit = [] 909 start_offset = num_first_keysyms * (WIDTHOFCOMPOSETABLE+1) 910 911 make_compose_table(xorg_compose_sequences, 912 compose_table, 913 ct_second_part, 914 start_offset, 915 False) 916 print headerfile_start 917 print_compose_table_keysyms(compose_table) 918 print_compose_table_values(ct_second_part, False, False) 919 if len(xorg_compose_sequences_32bit) == 0: 920 print headerfile_end 921 return 922 print declare_compose_sequence_32bit_first 923 start_offset = num_first_keysyms_32bit * (WIDTHOFCOMPOSETABLE+1) 924 make_compose_table(xorg_compose_sequences_32bit, 925 compose_table_32bit, 926 ct_second_part_32bit, 927 start_offset, 928 True) 929 print_compose_table_keysyms(compose_table_32bit) 930 print_compose_table_values(ct_second_part_32bit, True, False) 931 print declare_compose_sequence_32bit_second 932 print_compose_table_values(ct_second_part_32bit, True, True) 933 print headerfile_end 934 935 936if opt_gtk: 937 compose_gtk() 938 939 940def redecompose(codepoint): 941 (name, decomposition, combiningclass) = unicodedatabase[codepoint] 942 if decomposition[0] == '' or decomposition[0] == '0': 943 return [codepoint] 944 if match('<\w+>', decomposition[0]): 945 numdecomposition = map(stringtohex, decomposition[1:]) 946 return map(redecompose, numdecomposition) 947 numdecomposition = map(stringtohex, decomposition) 948 return map(redecompose, numdecomposition) 949 950def process_unicodedata_file(verbose = False): 951 """ Grab from wget http://www.unicode.org/Public/UNIDATA/UnicodeData.txt """ 952 filename_unicodedatatxt = download_file(URL_UNICODEDATATXT) 953 try: 954 unicodedatatxt = open(filename_unicodedatatxt, 'r') 955 except IOError, (errno, strerror): 956 print "I/O error(%s): %s" % (errno, strerror) 957 sys.exit(-1) 958 except: 959 print "Unexpected error: ", sys.exc_info()[0] 960 sys.exit(-1) 961 for line in unicodedatatxt.readlines(): 
        # NOTE(review): line[0] == "" can never be true — indexing a string
        # always yields a one-character string (and an empty line would raise
        # IndexError).  Only the '#' comment-skip test is effective here.
        if line[0] == "" or line[0] == '#':
            continue
        # Strip the trailing newline.
        line = line[:-1]
        uniproperties = split(';', line)
        codepoint = stringtohex(uniproperties[0])
        """ We don't do Plane 1 or CJK blocks. The latter require reading additional files. """
        if codepoint > 0xFFFF or (codepoint >= 0x4E00 and codepoint <= 0x9FFF) or (codepoint >= 0xF900 and codepoint <= 0xFAFF):
            continue
        name = uniproperties[1]
        # NOTE(review): category is assigned but never used in this chunk.
        category = uniproperties[2]
        combiningclass = uniproperties[3]
        # Field 5 of UnicodeData.txt is the character decomposition mapping,
        # stored as a whitespace-split list.
        decomposition = uniproperties[5]
        unicodedatabase[codepoint] = [name, split('\s+', decomposition), combiningclass]

    counter_combinations = 0
    counter_combinations_greek = 0
    counter_entries = 0
    counter_entries_greek = 0

    for item in unicodedatabase.keys():
        (name, decomposition, combiningclass) = unicodedatabase[item]
        if decomposition[0] == '':
            continue
            # NOTE(review): unreachable — dead code after 'continue'.
            print name, "is empty"
        elif match('<\w+>', decomposition[0]):
            continue
            # NOTE(review): unreachable — dead code after 'continue'.
            print name, "has weird", decomposition[0]
        else:
            sequence = map(stringtohex, decomposition)
            chrsequence = map(unichr, sequence)
            normalized = normalize('NFC', "".join(chrsequence))

            """ print name, sequence, "Combining: ", "".join(chrsequence), normalized, len(normalized), """
            # Flatten the (up to two levels of) nested lists returned by
            # redecompose into a single list of codepoints.
            decomposedsequence = []
            for subseq in map(redecompose, sequence):
                for seqitem in subseq:
                    if isinstance(seqitem, list):
                        for i in seqitem:
                            if isinstance(i, list):
                                for j in i:
                                    decomposedsequence.append(j)
                            else:
                                decomposedsequence.append(i)
                    else:
                        decomposedsequence.append(seqitem)
            recomposedchar = normalize('NFC', "".join(map(unichr, decomposedsequence)))
            # Count only sequences that recompose back to a single character
            # from more than one codepoint (i.e. real compose candidates).
            if len(recomposedchar) == 1 and len(decomposedsequence) > 1:
                counter_entries += 1
                counter_combinations += factorial(len(decomposedsequence)-1)
                ch = item
                # Greek and Greek Extended blocks get separate statistics.
                if ch >= 0x370 and ch <= 0x3ff or ch >= 0x1f00 and ch <= 0x1fff:
                    counter_entries_greek += 1
                    counter_combinations_greek += factorial(len(decomposedsequence)-1)
                if verbose:
                    print "0x%(cp)04X, %(uni)c, seq:" % { 'cp':item, 'uni':unichr(item) },
                    print "[",
                    for elem in decomposedsequence:
                        print '<0x%(hex)04X>,' % { 'hex': elem },
                    print "], recomposed as", recomposedchar,
                    if unichr(item) == recomposedchar:
                        print "verified"

    # Summary statistics are printed only in non-verbose mode.
    if verbose == False:
        print "Unicode statistics from UnicodeData.txt"
        print "Number of entries that can be algorithmically produced :", counter_entries
        print " of which are for Greek :", counter_entries_greek
        print "Number of compose sequence combinations requiring :", counter_combinations
        print " of which are for Greek :", counter_combinations_greek
        print "Note: We do not include partial compositions, "
        print "thus the slight discrepancy in the figures"
        print

# Command-line driven: dump per-character detail when --unicodedatatxt given.
if opt_unicodedatatxt:
    process_unicodedata_file(True)

# Command-line driven: print table-size statistics when --statistics given.
if opt_statistics:
    print
    print "Total number of compose sequences (from file) :", len(xorg_compose_sequences) + len(xorg_compose_sequences_algorithmic)
    print " of which can be expressed algorithmically :", len(xorg_compose_sequences_algorithmic)
    print " of which cannot be expressed algorithmically :", len(xorg_compose_sequences)
    print " of which have Multi_key :", counter_multikey
    print
    print "Algorithmic (stats for Xorg Compose file)"
    print "Number of sequences off due to algo from file (len(array)) :", len(xorg_compose_sequences_algorithmic)
    print "Number of sequences off due to algo (uniq(sort(array))) :", len(xorg_compose_sequences_algorithmic_uniqued)
    print " of which are for Greek :", num_algorithmic_greek
    print
    process_unicodedata_file()
    print "Not algorithmic (stats from Xorg Compose file)"
    print "Number of sequences :", len(xorg_compose_sequences)
    print "Flat array looks like :", len(xorg_compose_sequences), "rows of 6 integers (2 bytes per int, or 12 bytes per row)"
    # Continuation of the --statistics report: sizes assume SIZEOFINT == 2
    # bytes per entry and 6 integers per flat-array row.
    print "Flat array would have taken up (in bytes) :", num_entries * 2 * 6, "bytes from the GTK+ library"
    print "Number of items in flat array :", len(xorg_compose_sequences) * 6
    # NOTE(review): Python 2 '/' on ints is floor division, so this per-cent
    # figure is truncated to a whole number.
    print " of which are zeroes :", zeroes, "or ", (100 * zeroes) / (len(xorg_compose_sequences) * 6), " per cent"
    print "Number of different first items :", num_first_keysyms
    print "Number of different first items 32bit :", num_first_keysyms_32bit
    print "Number of max keysym sequences :", WIDTHOFCOMPOSETABLE
    print "Number of max bytes (if using flat array) :", num_entries * 2 * 6
    # Savings: zero slots dropped, minus the overhead of the first-keysym
    # index rows (5 extra slots per distinct first keysym).
    print "Number of savings :", zeroes * 2 - num_first_keysyms * 2 * 5
    print
    print "Memory needs if both algorithmic+optimised table in latest Xorg compose file"
    print " :", num_entries * 2 * 6 - zeroes * 2 + num_first_keysyms * 2 * 5
    print
    # 691 sequences of 12 ints each: the size of the hand-maintained table
    # in the pre-existing gtkimcontextsimple.c implementation.
    print "Existing (old) implementation in GTK+"
    print "Number of sequences in old gtkimcontextsimple.c :", 691
    print "The existing (old) implementation in GTK+ takes up :", 691 * 2 * 12, "bytes"