1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3#
4# compose-parse.py, version 1.5
5#
6# multifunction script that helps manage the compose sequence table in GTK+ (gtk/gtkimcontextsimple.c)
7# the script produces statistics and information about the whole process, run with --help for more.
8#
9# You may need to switch your python installation to utf-8, if you get 'ascii' codec errors.
10#
11# Complain to Simos Xenitellis (simos@gnome.org, http://simos.info/blog) for this craft.
12
13from re			import findall, match, split, sub
14from string		import atoi
15from unicodedata	import normalize
16from urllib 		import urlretrieve
17from os.path		import isfile, getsize
18from copy 		import copy
19
20import sys
21import getopt
22
23# We grab files off the web, left and right.
24URL_COMPOSE = 'http://cgit.freedesktop.org/xorg/lib/libX11/plain/nls/en_US.UTF-8/Compose.pre'
25URL_KEYSYMSTXT = "http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt"
26URL_GDKKEYSYMSH = "http://git.gnome.org/browse/gtk%2B/plain/gdk/gdkkeysyms.h"
27URL_UNICODEDATATXT = 'http://www.unicode.org/Public/6.0.0/ucd/UnicodeData.txt'
28FILENAME_COMPOSE_SUPPLEMENTARY = 'gtk-compose-lookaside.txt'
29
30# We currently support keysyms of size 2; once upstream xorg gets sorted,
31# we might produce some tables with size 2 and some with size 4.
32SIZEOFINT = 2
33
34# Current max compose sequence length; in case it gets increased.
35WIDTHOFCOMPOSETABLE = 5
36
37keysymdatabase = {}
38keysymunicodedatabase = {}
39unicodedatabase = {}
40
41headerfile_start = """/* GTK - The GIMP Tool Kit
42 * Copyright (C) 2007, 2008 GNOME Foundation
43 *
44 * This library is free software; you can redistribute it and/or
45 * modify it under the terms of the GNU Lesser General Public
46 * License as published by the Free Software Foundation; either
47 * version 2 of the License, or (at your option) any later version.
48 *
49 * This library is distributed in the hope that it will be useful,
50 * but WITHOUT ANY WARRANTY; without even the implied warranty of
51 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
52 * Lesser General Public License for more details.
53 *
54 * You should have received a copy of the GNU Lesser General Public
55 * License along with this library. If not, see <http://www.gnu.org/licenses/>.
56 */
57
58/*
59 * File auto-generated from script found at http://bugzilla.gnome.org/show_bug.cgi?id=321896
60 * using the input files
61 *  Input   : http://cgit.freedesktop.org/xorg/lib/libX11/plain/nls/en_US.UTF-8/Compose.pre
62 *  Input   : http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt
63 *  Input   : http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
64 *
65 * This table is optimised for space and requires special handling to access the content.
66 * This table is used solely by http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimple.c
67 *
68 * The resulting file is placed at http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimpleseqs.h
69 * This file is described in bug report http://bugzilla.gnome.org/show_bug.cgi?id=321896
70 */
71
72/*
73 * Modified by the GTK+ Team and others 2007, 2008.  See the AUTHORS
74 * file for a list of people on the GTK+ Team.  See the ChangeLog
75 * files for a list of changes.  These files are distributed with
76 * GTK+ at ftp://ftp.gtk.org/pub/gtk/.
77 */
78
79#ifndef __GTK_IM_CONTEXT_SIMPLE_SEQS_H__
80#define __GTK_IM_CONTEXT_SIMPLE_SEQS_H__
81
82/* === These are the original comments of the file; we keep for historical purposes ===
83 *
84 * The following table was generated from the X compose tables include with
85 * XFree86 4.0 using a set of Perl scripts. Contact Owen Taylor <otaylor@redhat.com>
86 * to obtain the relevant perl scripts.
87 *
88 * The following compose letter letter sequences confliced
89 *   Dstroke/dstroke and ETH/eth; resolved to Dstroke (Croation, Vietnamese, Lappish), over
90 *                                ETH (Icelandic, Faroese, old English, IPA)  [ D- -D d- -d ]
91 *   Amacron/amacron and ordfeminine; resolved to ordfeminine                 [ _A A_ a_ _a ]
92 *   Amacron/amacron and Atilde/atilde; resolved to atilde                    [ -A A- a- -a ]
93 *   Omacron/Omacron and masculine; resolved to masculine                     [ _O O_ o_ _o ]
94 *   Omacron/omacron and Otilde/atilde; resolved to otilde                    [ -O O- o- -o ]
95 *
96 * [ Amacron and Omacron are in Latin-4 (Baltic). ordfeminine and masculine are used for
97 *   spanish. atilde and otilde are used at least for Portuguese ]
98 *
99 *   at and Aring; resolved to Aring                                          [ AA ]
100 *   guillemotleft and caron; resolved to guillemotleft                       [ << ]
101 *   ogonek and cedilla; resolved to cedilla                                  [ ,, ]
102 *
103 * This probably should be resolved by first checking an additional set of compose tables
104 * that depend on the locale or selected input method.
105 */
106
107static const guint16 gtk_compose_seqs_compact[] = {"""
108
109headerfile_end = """};
110
111#endif /* __GTK_IM_CONTEXT_SIMPLE_SEQS_H__ */
112"""
113
114declare_compose_sequence_32bit_first = """};
115
116static const guint16 gtk_compose_seqs_compact_32bit_first[] = {"""
117
118declare_compose_sequence_32bit_second = """};
119
120static const guint32 gtk_compose_seqs_compact_32bit_second[] = {"""
121
122def stringtohex(str): return atoi(str, 16)
123
124def factorial(n):
125	if n <= 1:
126		return 1
127	else:
128		return n * factorial(n-1)
129
130def uniq(*args) :
131	""" Performs a uniq operation on a list or lists """
132    	theInputList = []
133    	for theList in args:
134    	   theInputList += theList
135    	theFinalList = []
136    	for elem in theInputList:
137		if elem not in theFinalList:
138          		theFinalList.append(elem)
139    	return theFinalList
140
141
142
143def all_permutations(seq):
144	""" Borrowed from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/252178 """
145	""" Produces all permutations of the items of a list """
146    	if len(seq) <=1:
147    	    yield seq
148    	else:
149    	    for perm in all_permutations(seq[1:]):
150    	        for i in range(len(perm)+1):
151    	            #nb str[0:1] works in both string and list contexts
152        	        yield perm[:i] + seq[0:1] + perm[i:]
153
154def usage():
155	print """compose-parse available parameters:
156	-h, --help		this craft
157	-s, --statistics	show overall statistics (both algorithmic, non-algorithmic)
158	-a, --algorithmic	show sequences saved with algorithmic optimisation
159	-g, --gtk		show entries that go to GTK+
160	-u, --unicodedatatxt	show compose sequences derived from UnicodeData.txt (from unicode.org)
161	-v, --verbose		show verbose output
162        -p, --plane1		show plane1 compose sequences
163	-n, --numeric		when used with --gtk, create file with numeric values only
164	-e, --gtk-expanded	when used with --gtk, create file that repeats first column; not usable in GTK+
165
166	Default is to show statistics.
167	"""
168
169try:
170	opts, args = getopt.getopt(sys.argv[1:], "pvgashune", ["help", "algorithmic", "statistics", "unicodedatatxt",
171		"stats", "gtk", "verbose", "plane1", "numeric", "gtk-expanded"])
172except:
173	usage()
174	sys.exit(2)
175
176opt_statistics = False
177opt_algorithmic = False
178opt_gtk = False
179opt_unicodedatatxt = False
180opt_verbose = False
181opt_plane1 = False
182opt_numeric = False
183opt_gtkexpanded = False
184
185for o, a in opts:
186	if o in ("-h", "--help"):
187		usage()
188		sys.exit()
189	if o in ("-s", "--statistics"):
190		opt_statistics = True
191	if o in ("-a", "--algorithmic"):
192		opt_algorithmic = True
193	if o in ("-g", "--gtk"):
194		opt_gtk = True
195	if o in ("-u", "--unicodedatatxt"):
196		opt_unicodedatatxt = True
197	if o in ("-v", "--verbose"):
198		opt_verbose = True
199	if o in ("-p", "--plane1"):
200		opt_plane1 = True
201	if o in ("-n", "--numeric"):
202		opt_numeric = True
203	if o in ("-e", "--gtk-expanded"):
204		opt_gtkexpanded = True
205
206if not opt_algorithmic and not opt_gtk and not opt_unicodedatatxt:
207	opt_statistics = True
208
209def download_hook(blocks_transferred, block_size, file_size):
210	""" A download hook to provide some feedback when downloading """
211	if blocks_transferred == 0:
212		if file_size > 0:
213			if opt_verbose:
214				print "Downloading", file_size, "bytes: ",
215		else:
216			if opt_verbose:
217				print "Downloading: ",
218	sys.stdout.write('#')
219	sys.stdout.flush()
220
221
222def download_file(url):
223	""" Downloads a file provided a URL. Returns the filename. """
224	""" Borks on failure """
225	localfilename = url.split('/')[-1]
226        if not isfile(localfilename) or getsize(localfilename) <= 0:
227		if opt_verbose:
228			print "Downloading ", url, "..."
229		try:
230			urlretrieve(url, localfilename, download_hook)
231		except IOError, (errno, strerror):
232			print "I/O error(%s): %s" % (errno, strerror)
233			sys.exit(-1)
234		except:
235			print "Unexpected error: ", sys.exc_info()[0]
236			sys.exit(-1)
237		print " done."
238        else:
239		if opt_verbose:
240                	print "Using cached file for ", url
241	return localfilename
242
243def process_gdkkeysymsh():
244	""" Opens the gdkkeysyms.h file from GTK+/gdk/gdkkeysyms.h """
245	""" Fills up keysymdb with contents """
246	filename_gdkkeysymsh = download_file(URL_GDKKEYSYMSH)
247	try:
248		gdkkeysymsh = open(filename_gdkkeysymsh, 'r')
249	except IOError, (errno, strerror):
250		print "I/O error(%s): %s" % (errno, strerror)
251		sys.exit(-1)
252	except:
253		print "Unexpected error: ", sys.exc_info()[0]
254		sys.exit(-1)
255
256	""" Parse the gdkkeysyms.h file and place contents in  keysymdb """
257	linenum_gdkkeysymsh = 0
258	keysymdb = {}
259	for line in gdkkeysymsh.readlines():
260		linenum_gdkkeysymsh += 1
261		line = line.strip()
262		if line == "" or not match('^#define GDK_KEY_', line):
263			continue
264		components = split('\s+', line)
265		if len(components) < 3:
266			print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
267			% {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
268			print "Was expecting 3 items in the line"
269			sys.exit(-1)
270		if not match('^GDK_KEY_', components[1]):
271			print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
272			% {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
273			print "Was expecting a keysym starting with GDK_KEY_"
274			sys.exit(-1)
275		if match('^0x[0-9a-fA-F]+$', components[2]):
276			unival = long(components[2][2:], 16)
277			if unival == 0:
278				continue
279			keysymdb[components[1][8:]] = unival
280		else:
281			print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
282			% {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
283			print "Was expecting a hexadecimal number at the end of the line"
284			sys.exit(-1)
285	gdkkeysymsh.close()
286
287	""" Patch up the keysymdb with some of our own stuff """
288
289	""" This is for a missing keysym from the currently upstream file """
290	###keysymdb['dead_stroke'] = 0x338
291
292	""" This is for a missing keysym from the currently upstream file """
293	###keysymdb['dead_belowring'] = 0x323
294	###keysymdb['dead_belowmacron'] = 0x331
295	###keysymdb['dead_belowcircumflex'] = 0x32d
296	###keysymdb['dead_belowtilde'] = 0x330
297	###keysymdb['dead_belowbreve'] = 0x32e
298	###keysymdb['dead_belowdiaeresis'] = 0x324
299
300	""" This is^Wwas preferential treatment for Greek """
301	# keysymdb['dead_tilde'] = 0x342
302	""" This is^was preferential treatment for Greek """
303	#keysymdb['combining_tilde'] = 0x342
304
305	""" Fixing VoidSymbol """
306	keysymdb['VoidSymbol'] = 0xFFFF
307
308	return keysymdb
309
310def process_keysymstxt():
311	""" Grabs and opens the keysyms.txt file that Markus Kuhn maintains """
312	""" This file keeps a record between keysyms <-> unicode chars """
313	filename_keysymstxt = download_file(URL_KEYSYMSTXT)
314	try:
315		keysymstxt = open(filename_keysymstxt, 'r')
316	except IOError, (errno, strerror):
317		print "I/O error(%s): %s" % (errno, strerror)
318		sys.exit(-1)
319	except:
320		print "Unexpected error: ", sys.exc_info()[0]
321		sys.exit(-1)
322
323	""" Parse the keysyms.txt file and place content in  keysymdb """
324	linenum_keysymstxt = 0
325	keysymdb = {}
326	for line in keysymstxt.readlines():
327		linenum_keysymstxt += 1
328		line = line.strip()
329		if line == "" or match('^#', line):
330			continue
331		components = split('\s+', line)
332		if len(components) < 5:
333			print "Invalid line %(linenum)d in %(filename)s: %(line)s'"\
334			% {'linenum': linenum_keysymstxt, 'filename': filename_keysymstxt, 'line': line}
335			print "Was expecting 5 items in the line"
336			sys.exit(-1)
337		if match('^U[0-9a-fA-F]+$', components[1]):
338			unival = long(components[1][1:], 16)
339		if unival == 0:
340			continue
341		keysymdb[components[4]] = unival
342	keysymstxt.close()
343
344	""" Patch up the keysymdb with some of our own stuff """
345	""" This is for a missing keysym from the currently upstream file """
346	keysymdb['dead_belowring'] = 0x323
347	keysymdb['dead_belowmacron'] = 0x331
348	keysymdb['dead_belowcircumflex'] = 0x32d
349	keysymdb['dead_belowtilde'] = 0x330
350	keysymdb['dead_belowbreve'] = 0x32e
351	keysymdb['dead_belowdiaeresis'] = 0x324
352
353	""" This is preferential treatment for Greek """
354	""" => we get more savings if used for Greek """
355	# keysymdb['dead_tilde'] = 0x342
356	""" This is preferential treatment for Greek """
357	# keysymdb['combining_tilde'] = 0x342
358
359	""" This is for a missing keysym from Markus Kuhn's db """
360	keysymdb['dead_stroke'] = 0x338
361	""" This is for a missing keysym from Markus Kuhn's db """
362	keysymdb['Oslash'] = 0x0d8
363	""" This is for a missing keysym from Markus Kuhn's db """
364	keysymdb['Ssharp'] = 0x1e9e
365
366	""" This is for a missing (recently added) keysym """
367	keysymdb['dead_psili'] = 0x313
368	""" This is for a missing (recently added) keysym """
369	keysymdb['dead_dasia'] = 0x314
370
371	""" Allows to import Multi_key sequences """
372	keysymdb['Multi_key'] = 0xff20
373
374        keysymdb['zerosubscript'] = 0x2080
375        keysymdb['onesubscript'] = 0x2081
376        keysymdb['twosubscript'] = 0x2082
377        keysymdb['threesubscript'] = 0x2083
378        keysymdb['foursubscript'] = 0x2084
379        keysymdb['fivesubscript'] = 0x2085
380        keysymdb['sixsubscript'] = 0x2086
381        keysymdb['sevensubscript'] = 0x2087
382        keysymdb['eightsubscript'] = 0x2088
383        keysymdb['ninesubscript'] = 0x2089
384        keysymdb['dead_doublegrave'] = 0x030F
385        keysymdb['dead_invertedbreve'] = 0x0311
386        keysymdb['dead_belowcomma'] = 0xfe6e
387        keysymdb['dead_currency'] = 0xfe6f
388        keysymdb['dead_greek'] = 0xfe8c
389
390	return keysymdb
391
392def keysymvalue(keysym, file = "n/a", linenum = 0):
393	""" Extracts a value from the keysym """
394	""" Find the value of keysym, using the data from keysyms """
395	""" Use file and linenum to when reporting errors """
396	if keysym == "":
397		return 0
398       	if keysymdatabase.has_key(keysym):
399               	return keysymdatabase[keysym]
400       	elif keysym[0] == 'U' and match('[0-9a-fA-F]+$', keysym[1:]):
401               	return atoi(keysym[1:], 16)
402       	elif keysym[:2] == '0x' and match('[0-9a-fA-F]+$', keysym[2:]):
403		return atoi(keysym[2:], 16)
404	else:
405        	print 'keysymvalue: UNKNOWN{%(keysym)s}' % { "keysym": keysym }
406               	#return -1
407		sys.exit(-1)
408
409def keysymunicodevalue(keysym, file = "n/a", linenum = 0):
410	""" Extracts a value from the keysym """
411	""" Find the value of keysym, using the data from keysyms """
412	""" Use file and linenum to when reporting errors """
413	if keysym == "":
414		return 0
415       	if keysymunicodedatabase.has_key(keysym):
416               	return keysymunicodedatabase[keysym]
417       	elif keysym[0] == 'U' and match('[0-9a-fA-F]+$', keysym[1:]):
418               	return atoi(keysym[1:], 16)
419       	elif keysym[:2] == '0x' and match('[0-9a-fA-F]+$', keysym[2:]):
420		return atoi(keysym[2:], 16)
421	else:
422        	print 'keysymunicodevalue: UNKNOWN{%(keysym)s}' % { "keysym": keysym }
423               	sys.exit(-1)
424
425def rename_combining(seq):
426	filtered_sequence = []
427	for ks in seq:
428		if findall('^combining_', ks):
429			ks = sub('^combining_', 'dead_', ks)
430                if ks == 'dead_double_grave':
431                        ks = 'dead_doublegrave'
432                if ks == 'dead_inverted_breve':
433                        ks = 'dead_invertedbreve'
434		filtered_sequence.append(ks)
435	return filtered_sequence
436
437
438keysymunicodedatabase = process_keysymstxt()
439keysymdatabase = process_gdkkeysymsh()
440
441""" Grab and open the compose file from upstream """
442filename_compose = download_file(URL_COMPOSE)
443try:
444	composefile = open(filename_compose, 'r')
445except IOError, (errno, strerror):
446	print "I/O error(%s): %s" % (errno, strerror)
447	sys.exit(-1)
448except:
449	print "Unexpected error: ", sys.exc_info()[0]
450	sys.exit(-1)
451
452""" Look if there is a lookaside (supplementary) compose file in the current
453    directory, and if so, open, then merge with upstream Compose file.
454"""
455xorg_compose_sequences_raw = []
456for seq in composefile.readlines():
457        xorg_compose_sequences_raw.append(seq)
458
459try:
460        composefile_lookaside = open(FILENAME_COMPOSE_SUPPLEMENTARY, 'r')
461        for seq in composefile_lookaside.readlines():
462                xorg_compose_sequences_raw.append(seq)
463except IOError, (errno, strerror):
464        if opt_verbose:
465                print "I/O error(%s): %s" % (errno, strerror)
466                print "Did not find lookaside compose file. Continuing..."
467except:
468        print "Unexpected error: ", sys.exc_info()[0]
469        sys.exit(-1)
470
471""" Parse the compose file in  xorg_compose_sequences"""
472xorg_compose_sequences = []
473xorg_compose_sequences_32bit = []
474xorg_compose_sequences_algorithmic = []
475linenum_compose = 0
476comment_nest_depth = 0
477for line in xorg_compose_sequences_raw:
478	linenum_compose += 1
479	line = line.strip()
480	if match("^XCOMM", line) or match("^#", line):
481		continue
482
483	line = sub(r"\/\*([^\*]*|[\*][^/])\*\/", "", line)
484
485	comment_start = line.find("/*")
486
487	if comment_start >= 0:
488		if comment_nest_depth == 0:
489			line = line[:comment_start]
490		else:
491			line = ""
492
493		comment_nest_depth += 1
494	else:
495		comment_end = line.find("*/")
496
497		if comment_end >= 0:
498			comment_nest_depth -= 1
499
500		if comment_nest_depth < 0:
501			print "Invalid comment %(linenum_compose)d in %(filename)s: \
502			Closing '*/' without opening '/*'" % { "linenum_compose": linenum_compose, "filename": filename_compose }
503			exit(-1)
504
505		if comment_nest_depth > 0:
506			line = ""
507		else:
508			line = line[comment_end + 2:]
509
510	if line is "":
511		continue
512
513	#line = line[:-1]
514	components = split(':', line, 1)
515	if len(components) != 2:
516		print "Invalid line %(linenum_compose)d in %(filename)s: No sequence\
517		/value pair found" % { "linenum_compose": linenum_compose, "filename": filename_compose }
518		exit(-1)
519	(seq, val ) = split(':', line, 1)
520	seq = seq.strip()
521	val = val.strip()
522	raw_sequence = findall('\w+', seq)
523	values = split('\s+', val)
524	unichar_temp = split('"', values[0])
525	unichar_utf8 = unichar_temp[1]
526	# The line of "U17ff" in Compose.pre includes a unichar only
527	codepointstr = values[1] if len(values) > 1 else ''
528	codepoint = 0
529	codepoints = []
530	enable_32bit = False
531	if raw_sequence[0][0] == 'U' and match('[0-9a-fA-F]+$', raw_sequence[0][1:]):
532		raw_sequence[0] = '0x' + raw_sequence[0][1:]
533	if  match('^U[0-9a-fA-F]+$', codepointstr):
534		codepoint = long(codepointstr[1:], 16)
535		if codepoint > 0xFFFF:
536			codepoints.append(codepoint)
537			codepoint = 0
538			enable_32bit = True
539	elif keysymunicodedatabase.has_key(codepointstr):
540		#if keysymdatabase[codepointstr] != keysymunicodedatabase[codepointstr]:
541			#print "DIFFERENCE: 0x%(a)X 0x%(b)X" % { "a": keysymdatabase[codepointstr], "b": keysymunicodedatabase[codepointstr]},
542			#print raw_sequence, codepointstr
543		codepoint = keysymunicodedatabase[codepointstr]
544	else:
545		unichars = unicode(unichar_utf8, 'utf-8')
546		if len(unichars) > 1 or ord(unichars[0]) > 0xFFFF:
547			enable_32bit = True
548			for unichar in unichars:
549				codepoints.append(ord(unichar))
550		else:
551			codepoint = ord(unichars[0]);
552	sequence = rename_combining(raw_sequence)
553	reject_this = False
554	for i in sequence:
555		if keysymvalue(i) > 0xFFFF:
556			reject_this = True
557			if opt_plane1:
558				print sequence
559			break
560		if keysymvalue(i) < 0:
561			reject_this = True
562			break
563	if reject_this:
564		continue
565	if "U0342" in sequence or \
566		"U0313" in sequence or \
567		"U0314" in sequence or \
568		"0x0313" in sequence or \
569		"0x0342" in sequence or \
570		"0x0314" in sequence:
571		continue
572	if "Multi_key" not in sequence:
573		if True:
574			original_sequence = copy(sequence)
575			stats_sequence = copy(sequence)
576			base = sequence.pop()
577			basechar = keysymvalue(base, filename_compose, linenum_compose)
578
579			if basechar < 0xFFFF:
580				counter = 1
581				unisequence = []
582				not_normalised = True
583				skipping_this = False
584				for i in range(0, len(sequence)):
585					""" If the sequence has dead_tilde and is for Greek, we don't do algorithmically
586					    because of lack of dead_perispomeni (i.e. conflict)
587					"""
588					bc = basechar
589					"""if sequence[-1] == "dead_tilde" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
590						skipping_this = True
591						break
592					if sequence[-1] == "dead_horn" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
593						skipping_this = True
594						break
595					if sequence[-1] == "dead_ogonek" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
596						skipping_this = True
597						break
598					if sequence[-1] == "dead_psili":
599						sequence[i] = "dead_horn"
600					if sequence[-1] == "dead_dasia":
601						sequence[-1] = "dead_ogonek"
602					"""
603					unisequence.append(unichr(keysymunicodevalue(sequence.pop(), filename_compose, linenum_compose)))
604
605				if skipping_this:
606					unisequence = []
607				for perm in all_permutations(unisequence):
608					# print counter, original_sequence, unichr(basechar) + "".join(perm)
609					# print counter, map(unichr, perm)
610					normalized = normalize('NFC', unichr(basechar) + "".join(perm))
611					if len(normalized) == 1:
612						# print 'Base: %(base)s [%(basechar)s], produces [%(unichar)s] (0x%(codepoint)04X)' \
613						# % { "base": base, "basechar": unichr(basechar), "unichar": unichar, "codepoint": codepoint },
614						# print "Normalized: [%(normalized)s] SUCCESS %(c)d" % { "normalized": normalized, "c": counter }
615						stats_sequence_data = map(keysymunicodevalue, stats_sequence)
616						stats_sequence_data.append(normalized)
617						xorg_compose_sequences_algorithmic.append(stats_sequence_data)
618						not_normalised = False
619						break;
620					counter += 1
621				if not_normalised:
622					if enable_32bit:
623						for cp in codepoints:
624							original_sequence.append(cp)
625						length = len(codepoints)
626						original_sequence.append(length)
627						xorg_compose_sequences_32bit.append(original_sequence)
628					else:
629						original_sequence.append(codepoint)
630						original_sequence.append(1)
631						xorg_compose_sequences.append(original_sequence)
632
633			else:
634				print "Error in base char !?!"
635				exit(-2)
636	else:
637		if enable_32bit:
638			for cp in codepoints:
639				sequence.append(cp)
640			length = len(codepoints)
641			sequence.append(length)
642			xorg_compose_sequences_32bit.append(sequence)
643		else:
644			sequence.append(codepoint)
645			sequence.append(1)
646			xorg_compose_sequences.append(sequence)
647
648def sequence_cmp(x, y):
649	length_x = len(x) - x[-1] - 1
650	length_y = len(y) - y[-1] - 1
651	if keysymvalue(x[0]) > keysymvalue(y[0]):
652		return 1
653	elif keysymvalue(x[0]) < keysymvalue(y[0]):
654		return -1
655	elif length_x > length_y:
656		return 1
657	elif length_x < length_y:
658		return -1
659	elif keysymvalue(x[1]) > keysymvalue(y[1]):
660		return 1
661	elif keysymvalue(x[1]) < keysymvalue(y[1]):
662		return -1
663        for i in range(2, length_x):
664		if keysymvalue(x[i]) > keysymvalue(y[i]):
665			return 1
666		elif keysymvalue(x[i]) < keysymvalue(y[i]):
667			return -1
668	return 0
669
670def sequence_unicode_cmp(x, y):
671	length_x = len(x) - x[-1] - 1
672	length_y = len(y) - y[-1] - 1
673	if keysymunicodevalue(x[0]) > keysymunicodevalue(y[0]):
674		return 1
675	elif keysymunicodevalue(x[0]) < keysymunicodevalue(y[0]):
676		return -1
677	elif length_x > length_y:
678		return 1
679	elif length_x < length_y:
680		return -1
681	elif keysymunicodevalue(x[1]) > keysymunicodevalue(y[1]):
682		return 1
683	elif keysymunicodevalue(x[1]) < keysymunicodevalue(y[1]):
684		return -1
685        for i in range(2, length_x):
686		if keysymunicodevalue(x[i]) > keysymunicodevalue(y[i]):
687			return 1
688		elif keysymunicodevalue(x[i]) < keysymunicodevalue(y[i]):
689			return -1
690	return 0
691
692def sequence_algorithmic_cmp(x, y):
693	if len(x) < len(y):
694		return -1
695	elif len(x) > len(y):
696		return 1
697	else:
698		for i in range(len(x)):
699			if x[i] < y[i]:
700				return -1
701			elif x[i] > y[i]:
702				return 1
703	return 0
704
705
706xorg_compose_sequences.sort(sequence_cmp)
707xorg_compose_sequences_32bit.sort(sequence_cmp)
708
709def num_of_keysyms(seq):
710	value_length = seq[-1]
711	return len(seq) - value_length - 1
712
713
714def check_max_width_of_compose_table(compose_sequences):
715	for sequence in xorg_compose_sequences:
716		num = num_of_keysyms(sequence)
717		global WIDTHOFCOMPOSETABLE
718		if num > WIDTHOFCOMPOSETABLE:
719			print("Extend the sequence length: %s" % num)
720			WIDTHOFCOMPOSETABLE = num
721
722def make_compose_sequences_unique(compose_sequences):
723	xorg_compose_sequences_uniqued = []
724	first_time = True
725	item = None
726	for next_item in compose_sequences:
727		if first_time:
728			first_time = False
729			item = next_item
730			xorg_compose_sequences_uniqued.append(next_item)
731		if sequence_unicode_cmp(item, next_item) != 0:
732			xorg_compose_sequences_uniqued.append(next_item)
733		item = next_item
734
735	return copy(xorg_compose_sequences_uniqued)
736
737check_max_width_of_compose_table(xorg_compose_sequences)
738check_max_width_of_compose_table(xorg_compose_sequences_32bit)
739
740xorg_compose_sequences = make_compose_sequences_unique(xorg_compose_sequences)
741xorg_compose_sequences_32bit = make_compose_sequences_unique(xorg_compose_sequences_32bit)
742
743counter_multikey = 0
744for item in xorg_compose_sequences + xorg_compose_sequences_32bit:
745	length = item[-1]
746	if findall('Multi_key', "".join(item[:len(item) - length - 1])) != []:
747		counter_multikey += 1
748
749xorg_compose_sequences_algorithmic.sort(sequence_algorithmic_cmp)
750xorg_compose_sequences_algorithmic_uniqued = uniq(xorg_compose_sequences_algorithmic)
751
752firstitem = ""
753num_first_keysyms = 0
754num_first_keysyms_32bit = 0
755zeroes = 0
756num_entries = 0
757num_algorithmic_greek = 0
758for sequence in xorg_compose_sequences:
759	if keysymvalue(firstitem) != keysymvalue(sequence[0]):
760		firstitem = sequence[0]
761		num_first_keysyms += 1
762	# max length of sequences + length of unichar(== 1) - length of the
763        # current sequence + common offset (== 1)
764	zeroes += WIDTHOFCOMPOSETABLE - num_of_keysyms(sequence)
765	num_entries += 1
766
767firstitem = ""
768for sequence in xorg_compose_sequences_32bit:
769	if keysymvalue(firstitem) != keysymvalue(sequence[0]):
770		firstitem = sequence[0]
771		num_first_keysyms_32bit += 1
772
773for sequence in xorg_compose_sequences_algorithmic_uniqued:
774	ch = ord(sequence[-1:][0])
775	if ch >= 0x370 and ch <= 0x3ff or ch >= 0x1f00 and ch <= 0x1fff:
776		num_algorithmic_greek += 1
777
778
779if opt_algorithmic:
780	for sequence in xorg_compose_sequences_algorithmic_uniqued:
781		letter = "".join(sequence[-1:])
782		print '0x%(cp)04X, %(uni)s, seq: [ <0x%(base)04X>,' % { 'cp': ord(unicode(letter)), 'uni': letter.encode('utf-8'), 'base': sequence[-2] },
783		for elem in sequence[:-2]:
784			print "<0x%(keysym)04X>," % { 'keysym': elem },
785		""" Yeah, verified... We just want to keep the output similar to -u, so we can compare/sort easily """
786		print "], recomposed as", letter.encode('utf-8'), "verified"
787
788def convert_UnotationToHex(arg):
789	if isinstance(arg, str):
790		if match('^U[0-9A-F][0-9A-F][0-9A-F][0-9A-F]$', arg):
791			return sub('^U', '0x', arg)
792	return arg
793
794def addprefix_GDK(arg):
795	if match('^0x', arg):
796		return '%(arg)s, ' % { 'arg': arg }
797	elif match('^U[0-9A-F][0-9A-F][0-9A-F][0-9A-F]$', arg.upper()):
798                keysym = ''
799                for k, c in keysymunicodedatabase.items():
800                    if c == keysymvalue(arg):
801                        keysym = k
802                        break
803                if keysym != '':
804		    return 'IBUS_KEY_%(arg)s, ' % { 'arg': keysym }
805                else:
806		    return '0x%(arg)04X, ' % { 'arg': keysymvalue(arg) }
807	else:
808		return 'IBUS_KEY_%(arg)s, ' % { 'arg': arg }
809
810def make_compose_table(compose_sequences,
811                       compose_table,
812                       ct_second_part,
813                       start_offset,
814                       is_32bit):
815	first_keysym = ""
816	sequence = []
817	we_finished = False
818	counter = 0
819
820	sequence_iterator = iter(compose_sequences)
821	sequence = sequence_iterator.next()
822	while True:
823		first_keysym = sequence[0]					# Set the first keysym
824		compose_table_sequence = [first_keysym] + \
825			map(lambda x: 0, range(WIDTHOFCOMPOSETABLE))
826		compose_table.append(compose_table_sequence)
827		while sequence[0] == first_keysym:
828			compose_table[counter][num_of_keysyms(sequence)-1] += 1
829			try:
830				sequence = sequence_iterator.next()
831			except StopIteration:
832				we_finished = True
833				break
834		if we_finished:
835			break
836		counter += 1
837
838	ct_index = start_offset
839	for line_num in range(len(compose_table)):
840		for i in range(WIDTHOFCOMPOSETABLE):
841			occurrences = compose_table[line_num][i+1]
842			compose_table[line_num][i+1] = ct_index
843			# If not 32bit, i + 1 is the next index in for loop
844			# and i + 2 is the next index + unichar size (== 1)
845			# If 32bit, i + 1 is the next index in for loop
846			# and i + 3 is the next index + unichar index size
847			# (== 1) + unichar length size (== 1)
848			if is_32bit:
849				ct_index += occurrences * (i+3)
850			else:
851				ct_index += occurrences * (i+2)
852
853	for sequence in compose_sequences:
854		ct_second_part.append(map(convert_UnotationToHex, sequence))
855
856
857def print_compose_table_keysyms(compose_table):
858	for i in compose_table:
859		if opt_gtkexpanded:
860			print "0x%(ks)04X," % { "ks": keysymvalue(i[0]) },
861			print '%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i[1:])) }
862		elif not match('^0x', i[0]):
863			print 'IBUS_KEY_%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i)) }
864		else:
865			print '%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i)) }
866
867
868def print_compose_table_values(ct_second_part, is_32bit, is_second):
869	i = 0
870	for s in ct_second_part:
871		length = s[-1]
872		seq_length = len(s) - length - 1
873		if opt_numeric:
874			if not is_32bit or not is_second:
875				for ks in s[:seq_length][1:]:
876					print '0x%(seq)04X,' % { 'seq': keysymvalue(ks) },
877			if is_32bit and not is_second:
878				print '%(i)d, %(l)d,' % { 'i':i, 'l':length }
879			if not is_32bit or is_second:
880				for v in range(seq_length, seq_length + length):
881					print '0x%(cp)04X,' % { 'cp':s[v] },
882				print ''
883		elif opt_gtkexpanded:
884			if not is_32bit or not is_second:
885				print '%(seq)s' % { 'seq': "".join(map(addprefix_GDK, s[:seq_length][1:])) },
886			if is_32bit and not is_second:
887				print '%(i)d, %(l)d,' % { 'i':i, 'l':length }
888			if not is_32bit or is_second:
889				for v in range(seq_length, seq_length + length):
890					print '0x%(cp)04X,' % { 'cp':s[v] },
891				print ''
892		else:
893			if not is_32bit or not is_second:
894				print '%(seq)s' % { 'seq': "".join(map(addprefix_GDK, s[:seq_length][1:])) },
895			if is_32bit and not is_second:
896				print '%(i)d, %(l)d,' % { 'i':i, 'l':length }
897			if not is_32bit or is_second:
898				for v in range(seq_length, seq_length + length):
899					print '0x%(cp)04X,' % { 'cp':s[v] },
900				print ''
901		i += length
902
903
904def compose_gtk():
905	compose_table = []
906	compose_table_32bit = []
907	ct_second_part = []
908	ct_second_part_32bit = []
909	start_offset = num_first_keysyms * (WIDTHOFCOMPOSETABLE+1)
910
911	make_compose_table(xorg_compose_sequences,
912	                   compose_table,
913	                   ct_second_part,
914	                   start_offset,
915	                   False)
916	print headerfile_start
917	print_compose_table_keysyms(compose_table)
918	print_compose_table_values(ct_second_part, False, False)
919	if len(xorg_compose_sequences_32bit) == 0:
920		print headerfile_end
921		return
922	print declare_compose_sequence_32bit_first
923	start_offset = num_first_keysyms_32bit * (WIDTHOFCOMPOSETABLE+1)
924	make_compose_table(xorg_compose_sequences_32bit,
925	                   compose_table_32bit,
926	                   ct_second_part_32bit,
927	                   start_offset,
928	                   True)
929	print_compose_table_keysyms(compose_table_32bit)
930	print_compose_table_values(ct_second_part_32bit, True, False)
931	print declare_compose_sequence_32bit_second
932	print_compose_table_values(ct_second_part_32bit, True, True)
933	print headerfile_end
934
935
936if opt_gtk:
937	compose_gtk()
938
939
940def redecompose(codepoint):
941	(name, decomposition, combiningclass) = unicodedatabase[codepoint]
942	if decomposition[0] == '' or decomposition[0] == '0':
943		return [codepoint]
944	if match('<\w+>', decomposition[0]):
945		numdecomposition = map(stringtohex, decomposition[1:])
946		return map(redecompose, numdecomposition)
947	numdecomposition = map(stringtohex, decomposition)
948	return map(redecompose, numdecomposition)
949
950def process_unicodedata_file(verbose = False):
951	""" Grab from wget http://www.unicode.org/Public/UNIDATA/UnicodeData.txt """
952	filename_unicodedatatxt = download_file(URL_UNICODEDATATXT)
953	try:
954		unicodedatatxt = open(filename_unicodedatatxt, 'r')
955	except IOError, (errno, strerror):
956		print "I/O error(%s): %s" % (errno, strerror)
957		sys.exit(-1)
958	except:
959		print "Unexpected error: ", sys.exc_info()[0]
960		sys.exit(-1)
961	for line in unicodedatatxt.readlines():
962		if line[0] == "" or line[0] == '#':
963			continue
964		line = line[:-1]
965		uniproperties = split(';', line)
966		codepoint = stringtohex(uniproperties[0])
967		""" We don't do Plane 1 or CJK blocks. The latter require reading additional files. """
968		if codepoint > 0xFFFF or (codepoint >= 0x4E00 and codepoint <= 0x9FFF) or (codepoint >= 0xF900 and codepoint <= 0xFAFF):
969			continue
970		name = uniproperties[1]
971		category = uniproperties[2]
972		combiningclass = uniproperties[3]
973		decomposition = uniproperties[5]
974		unicodedatabase[codepoint] = [name, split('\s+', decomposition), combiningclass]
975
976	counter_combinations = 0
977	counter_combinations_greek = 0
978	counter_entries = 0
979	counter_entries_greek = 0
980
981	for item in unicodedatabase.keys():
982		(name, decomposition, combiningclass) = unicodedatabase[item]
983		if decomposition[0] == '':
984			continue
985			print name, "is empty"
986		elif match('<\w+>', decomposition[0]):
987			continue
988			print name, "has weird", decomposition[0]
989		else:
990			sequence = map(stringtohex, decomposition)
991			chrsequence = map(unichr, sequence)
992			normalized = normalize('NFC', "".join(chrsequence))
993
994			""" print name, sequence, "Combining: ", "".join(chrsequence), normalized, len(normalized),  """
995			decomposedsequence = []
996			for subseq in map(redecompose, sequence):
997				for seqitem in subseq:
998					if isinstance(seqitem, list):
999						for i in seqitem:
1000							if isinstance(i, list):
1001								for j in i:
1002									decomposedsequence.append(j)
1003							else:
1004								decomposedsequence.append(i)
1005					else:
1006						decomposedsequence.append(seqitem)
1007			recomposedchar = normalize('NFC', "".join(map(unichr, decomposedsequence)))
1008			if len(recomposedchar) == 1 and len(decomposedsequence) > 1:
1009				counter_entries += 1
1010				counter_combinations += factorial(len(decomposedsequence)-1)
1011				ch = item
1012				if ch >= 0x370 and ch <= 0x3ff or ch >= 0x1f00 and ch <= 0x1fff:
1013					counter_entries_greek += 1
1014					counter_combinations_greek += factorial(len(decomposedsequence)-1)
1015				if verbose:
1016					print "0x%(cp)04X, %(uni)c, seq:" % { 'cp':item, 'uni':unichr(item) },
1017					print "[",
1018					for elem in decomposedsequence:
1019						print '<0x%(hex)04X>,' % { 'hex': elem },
1020					print "], recomposed as", recomposedchar,
1021					if unichr(item) == recomposedchar:
1022						print "verified"
1023
1024	if verbose == False:
1025		print "Unicode statistics from UnicodeData.txt"
1026		print "Number of entries that can be algorithmically produced     :", counter_entries
1027		print "  of which are for Greek                                   :", counter_entries_greek
1028		print "Number of compose sequence combinations requiring          :", counter_combinations
1029		print "  of which are for Greek                                   :", counter_combinations_greek
1030		print "Note: We do not include partial compositions, "
1031		print "thus the slight discrepancy in the figures"
1032		print
1033
1034if opt_unicodedatatxt:
1035	process_unicodedata_file(True)
1036
1037if opt_statistics:
1038	print
1039	print "Total number of compose sequences (from file)              :", len(xorg_compose_sequences) + len(xorg_compose_sequences_algorithmic)
1040	print "  of which can be expressed algorithmically                :", len(xorg_compose_sequences_algorithmic)
1041	print "  of which cannot be expressed algorithmically             :", len(xorg_compose_sequences)
1042	print "    of which have Multi_key                                :", counter_multikey
1043	print
1044	print "Algorithmic (stats for Xorg Compose file)"
1045	print "Number of sequences off due to algo from file (len(array)) :", len(xorg_compose_sequences_algorithmic)
1046	print "Number of sequences off due to algo (uniq(sort(array)))    :", len(xorg_compose_sequences_algorithmic_uniqued)
1047	print "  of which are for Greek                                   :", num_algorithmic_greek
1048	print
1049	process_unicodedata_file()
1050	print "Not algorithmic (stats from Xorg Compose file)"
1051	print "Number of sequences                                        :", len(xorg_compose_sequences)
1052	print "Flat array looks like                                      :", len(xorg_compose_sequences), "rows of 6 integers (2 bytes per int, or 12 bytes per row)"
1053	print "Flat array would have taken up (in bytes)                  :", num_entries * 2 * 6, "bytes from the GTK+ library"
1054	print "Number of items in flat array                              :", len(xorg_compose_sequences) * 6
1055	print "  of which are zeroes                                      :", zeroes, "or ", (100 * zeroes) / (len(xorg_compose_sequences) * 6), " per cent"
1056	print "Number of different first items                            :", num_first_keysyms
1057	print "Number of different first items 32bit                      :", num_first_keysyms_32bit
1058	print "Number of max keysym sequences                             :", WIDTHOFCOMPOSETABLE
1059	print "Number of max bytes (if using flat array)                  :", num_entries * 2 * 6
1060	print "Number of savings                                          :", zeroes * 2 - num_first_keysyms * 2 * 5
1061	print
1062	print "Memory needs if both algorithmic+optimised table in latest Xorg compose file"
1063	print "                                                           :", num_entries * 2 * 6 - zeroes * 2 + num_first_keysyms * 2 * 5
1064	print
1065	print "Existing (old) implementation in GTK+"
1066	print "Number of sequences in old gtkimcontextsimple.c            :", 691
1067	print "The existing (old) implementation in GTK+ takes up         :", 691 * 2 * 12, "bytes"
1068