import re
import os

def bsdconv01(dt):
	# Encode a UCD hex code point as a bsdconv Unicode token: the "01"
	# type prefix plus the hex digits without leading zeros, padded to
	# an even number of digits.
	dt=dt.strip().lstrip("0").upper()
	if not dt:
		dt="0"
	if len(dt) & 1:
		return "010"+dt
	else:
		return "01"+dt
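# Examples:
#   bsdconv01("0041") -> "0141"    (LATIN CAPITAL LETTER A)
#   bsdconv01("0300") -> "010300"  (COMBINING GRAVE ACCENT)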


f_ambiguous=open("modules/inter/_AMBIGUOUS.h", "w")
f_width=open("modules/inter/_WIDTH.h", "w")
f_ccc=open("modules/inter/_NF-CCC.h", "w")
f_nfd=open("modules/inter/_NFD.txt", "w")
f_nfkd=open("modules/inter/_NFKD.txt", "w")
f_nfc=open("modules/inter/_NFC-MAP.txt", "w")
f_upper=open("modules/inter/UPPER.txt", "w")
f_lower=open("modules/inter/LOWER.txt", "w")
f_casefold=open("modules/inter/CASEFOLD.txt", "w")
f_cjkvar=open("tmp/cjkvar.txt", "w")

ccc_start=-1
ccc_end=-1
ccc_value=0
t_ccc={}       # code point (bsdconv form) -> canonical combining class
m_nfd={}       # canonical decompositions, fully expanded below
m_nfd_raw={}   # canonical decompositions as given (single level)
m_nfkd={}      # canonical + compatibility decompositions

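# tmp/map.txt is assumed to be tab-separated, one "filename<TAB>url"
# pair per line, recording where each UCD file was downloaded from.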
m_url={}
f_map=open("tmp/map.txt")
for l in f_map:
	l=l.strip().split("\t")
	if len(l)==2:
		m_url[l[0]]=l[1]
f_map.close()
f_ambiguous.write("/* Generated from {url} */\n".format(url=m_url["EastAsianWidth.txt"]))
f_width.write("/* Generated from {url} */\n".format(url=m_url["EastAsianWidth.txt"]))
f_ccc.write("/* Generated from {url} */\n".format(url=m_url["UnicodeData.txt"]))
for f in [f_nfc, f_nfd, f_nfkd, f_upper, f_lower]:
	f.write("Source: {url}\n".format(url=m_url["UnicodeData.txt"]))

f_upper.write("Source: {url}\n".format(url=m_url["SpecialCasing.txt"]))
f_lower.write("Source: {url}\n".format(url=m_url["SpecialCasing.txt"]))
f_casefold.write("Source: {url}\n".format(url=m_url["CaseFolding.txt"]))

f_ccc.write("""
struct ccc_interval {
	int beg;
	int end;
	int ccc;
};

static const struct ccc_interval ccc_table[] = {
""")

def lookup(l,m):
	# Recursively expand every element of l through the decomposition
	# mapping m until no element decomposes further.
	ret=[]
	for e in l:
		if e in m:
			ret.extend(lookup(m[e], m))
		else:
			ret.append(e)
	return ret
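# Example: U+01D5 decomposes to 00DC 0304 and U+00DC to 0055 0308, so
# lookup(m_nfd["0101D5"], m_nfd) yields ["0155", "010308", "010304"].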

def nf_order(l):
	# Canonical Ordering Algorithm: within each maximal run of two or
	# more combining marks (ccc > 0), sort by combining class.  The
	# sort is stable, so marks with equal ccc keep their order.
	r=[]
	b=-1
	e=-1
	for i in range(len(l)):
		if l[i] in t_ccc:
			if b==-1:
				b=i
			e=i
		else:
			if b!=-1 and b!=e:
				r.append((b,e))
			b=-1
			e=-1
	if b!=-1 and b!=e:
		r.append((b,e))
	for b,e in r:
		a=sorted(l[b:e+1], key=lambda x:t_ccc[x])
		for i in range(b,e+1):
			l[i]=a[i-b]
	return l
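# Example: ccc(U+0323)=220 and ccc(U+0308)=230, so a trailing sequence
# ...,010308,010323 is reordered to ...,010323,010308.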

def in_range(s,rs):
	# s and the range endpoints are bsdconv tokens ("01" + hex);
	# strip the prefix and compare numerically.
	s=int(s[2:], 16)
	for r0,r1 in rs:
		r0=int(r0[2:], 16)
		r1=int(r1[2:], 16)
		if s>=r0 and s<=r1:
			return True
	return False
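# Example: in_range("010341", [("010340", "010341")]) is True.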

# UnicodeData.txt field layout:
# ftp://ftp.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.html
l_nfd=[]
l_nfkd=[]
ud=open("tmp/UnicodeData.txt")
for l in ud:
	if not l.strip():
		continue
	a=l.split(";")
	cp=bsdconv01(a[0])
	code_point=int(a[0], 16)
	# a[1]: character name; a[5]: decomposition mapping.  A CJK
	# character with a single-code-point decomposition is recorded
	# as a variant pair.
	if "CJK" in a[1] and a[5] and " " not in a[5]:
		f_cjkvar.write("{}\t{}\n".format(a[0], a[5]))
	# a[3]: canonical combining class; emit as run-length intervals.
	if a[3]!="0":
		ccc=int(a[3])
		t_ccc[cp]=ccc
		if ccc==ccc_value and code_point==ccc_end+1:
			ccc_end=code_point
		else:
			if ccc_value!=0:
				f_ccc.write("{0x%x, 0x%x, %d},\n" % (ccc_start, ccc_end, ccc_value))
			ccc_start=code_point
			ccc_end=code_point
			ccc_value=ccc
	if a[5]:
		dt=a[5].split(" ")
		compat=False
		# Compatibility decompositions are tagged "<tag> ...".
		if dt[0][0]=="<":
			tag=dt[0][1:-1]
			dt=dt[1:]
			compat=True
		dt=[bsdconv01(x) for x in dt]
		if compat:
			l_nfkd.append((cp,tag))
			m_nfkd[cp]=dt
		else:
			l_nfkd.append((cp,"canonical"))
			m_nfkd[cp]=dt
			l_nfd.append(cp)
			m_nfd[cp]=dt
			m_nfd_raw[cp]=dt
	# a[12]/a[13]: simple uppercase/lowercase mappings.
	if a[12]:
		f_upper.write("{f}\t{t}\n".format(f=cp, t=bsdconv01(a[12])))
	if a[13]:
		f_lower.write("{f}\t{t}\n".format(f=cp, t=bsdconv01(a[13])))
ud.close()

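# Flush the final combining-class interval.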
if ccc_value!=0:
	f_ccc.write("{0x%x, 0x%x, %d},\n" % (ccc_start, ccc_end, ccc_value))
f_ccc.write("};\n")
f_ccc.close()
f_cjkvar.close()

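# SpecialCasing.txt supplies case mappings that are not one-to-one
# (e.g. U+00DF uppercases to "SS").  Only unconditional mappings are
# taken; language- and context-dependent rules (non-empty condition
# list) are skipped.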
sc=open("tmp/SpecialCasing.txt")
for l in sc:
	l=l.strip()
	if not l:
		continue
	if l.startswith("#"):
		continue
	d,c=l.split("#", 1)
	d=d.split(";")
	# Fields: code; lowercase; titlecase; uppercase; conditions.
	code=",".join([bsdconv01(x) for x in d[0].strip().split(" ")])
	lower=",".join([bsdconv01(x) for x in d[1].strip().split(" ")])
	title=",".join([bsdconv01(x) for x in d[2].strip().split(" ")])
	upper=",".join([bsdconv01(x) for x in d[3].strip().split(" ")])
	cond=d[4].strip()
	if cond=="":
		if code!=upper:
			f_upper.write("{f}\t{t}\n".format(f=code, t=upper))
		if code!=lower:
			f_lower.write("{f}\t{t}\n".format(f=code, t=lower))
sc.close()

f_upper.close()
f_lower.close()

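# Full_Composition_Exclusion marks characters that must never be
# produced by composition; their decompositions are kept out of the
# NFC map built below.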
l_fce=[]
dnp=open("tmp/DerivedNormalizationProps.txt")
for l in dnp:
	l=l.strip()
	if not l:
		continue
	if l.startswith("#"):
		continue
	a=l.split(";")
	if not a[1].strip().startswith("Full_Composition_Exclusion"):
		continue
	# The first field is a single code point or a "beg..end" range.
	r=a[0].strip().split("..")
	if len(r)==1:
		r.append(r[0])
	l_fce.append((bsdconv01(r[0]),bsdconv01(r[1])))
dnp.close()

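# Emit fully expanded, canonically ordered decompositions: NFD from the
# canonical mappings only, NFKD from canonical plus compatibility
# mappings (each NFKD entry keeps its tag as a trailing comment).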
for cp in l_nfd:
	d=nf_order(lookup(m_nfd[cp], m_nfd))
	m_nfd[cp]=d
	f_nfd.write("{f}\t{t}\n".format(f=cp, t=",".join(d)))
f_nfd.close()

for cp,tag in l_nfkd:
	d=nf_order(lookup(m_nfkd[cp], m_nfkd))
	m_nfkd[cp]=d
	f_nfkd.write("{f}\t{t}\t#{c}\n".format(f=cp, t=",".join(d), c=tag))
f_nfkd.close()

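# The NFC map pairs each single-level canonical decomposition with the
# character it recomposes to; characters excluded from composition are
# skipped.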
for cp in l_nfd:
	if in_range(cp, l_fce):
		continue
	d=m_nfd_raw[cp]
	f_nfc.write("{f}\t{t}\n".format(f=",".join(d), t=cp))
f_nfc.close()

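# CaseFolding.txt fields: code; status (C/F/S/T); mapping.  The status
# field is read but not filtered on, so every listed folding is emitted.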
cf=open("tmp/CaseFolding.txt")
for l in cf:
	l=l.strip()
	if not l:
		continue
	if l.startswith("#"):
		continue
	csm, name = l.split("#", 1)
	code, status, mapping, _ = csm.split(";")
	code = bsdconv01(code)
	status = status.strip()
	mapping = ",".join([bsdconv01(x) for x in mapping.strip().split(" ")])
	f_casefold.write("{f}\t{t}\n".format(f=code, t=mapping))
cf.close()
f_casefold.close()


# Blocks

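# Map Unicode block names onto filter-module categories.  Every pattern
# is tried, so one block can land in several categories (e.g. the
# Hiragana block feeds both HIRAGANA and CJK).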
blocks={
	"^.*Arabian$": "ARABIC",
	"^Arabic.*$": "ARABIC",
	"^Armenian$": "ARMENIAN",
	"^.*Arrows.*$": "ARROWS",
	"^Bopomofo.*$": "CJK",
	"^Braille.*$": "BRAILLE",
	"^Cherokee$": "CHEROKEE",
	"^.*CJK.*$": "CJK",
	"^Cuneiform.*$": "CUNEIFORM",
	"^Currency.*$": "CURRENCY",
	"^Cyrillic.*$": "CYRILLIC",
	"^Devanagari.*$": "DEVANAGARI",
	"^Egyptian.*$": "EGYPTIAN",
	"^Emoticons$": "EMOTICON",
	"^Ethiopic.*$": "ETHIOPIC",
	"^Georgian.*$": "GEORGIAN",
	"^.*Greek.*$": "GREEK",
	"^Hangul.*$": ["HANGUL", "CJK"],
	"^Hebrew$": "HEBREW",
	"^Hiragana$": ["HIRAGANA", "CJK"],
	"^Ideographic Description Characters$": "CJK",
	"^IPA.*$": ["IPA", "PHONETIC"],
	"^Javanese$": "JAVANESE",
	"^Katakana.*$": ["KATAKANA", "CJK"],
	"^Kana .*$": "CJK",
	"^Kanbun.*$": "CJK",
	"^Kangxi Radicals$": "CJK",
	"^Kannada$": "KANNADA",
	"^Khmer.*$": "KHMER",
	"^Lao$": "LAO",
	"^.*Latin.*$": "LATIN",
	"^Miao$": "MIAO",
	"^Mahjong.*$": "MAHJONG",
	"^Malayalam$": "MALAYALAM",
	"^.*Mathematical.*$": "MATH",
	"^Mongolian$": "MONGOLIAN",
	"^.*Musical.*$": "MUSIC",
	"^Myanmar.*$": "MYANMAR",
	"^Phonetic.*$": "PHONETIC",
	"^.*Private Use Area.*$": "PUA",
	"^.*Punctuation.*$": "PUNCTUATION",
	"^Samaritan$": "SAMARITAN",
	"^Sinhala.*$": "SINHALA",
	"^Sundanese.*$": "SUNDANESE",
	"^Syriac$": "SYRIAC",
	"^Tagalog$": "TAGALOG",
	"^Tai Xuan Jing.*$": "CJK",
	"^Tamil$": "TAMIL",
	"^Telugu$": "TELUGU",
	"^Thai$": "THAI",
	"^Tibetan$": "TIBETAN",
	"^Tifinagh$": "TIFINAGH",
	"^Yi .*$": ["YI", "CJK"],
	"^Yijing.*$": "CJK",
}

m={}
blk=open("tmp/Blocks.txt")
for l in blk:
	l=l.strip()
	if l=="" or l[0]=="#":
		continue
	r, d = l.split(";")
	d=d.strip()
	cl=[]
	for pt in blocks:
		if re.match(pt, d):
			c=blocks[pt]
			if isinstance(c, list):
				cl.extend(c)
			else:
				cl.append(c)
	print(r, d, cl)
	for c in cl:
		if c not in m:
			# Open the module lazily and emit its header once.
			m[c]=open(os.path.join("modules/filter", c+".c"), "w")
			m[c].write("/*\n"
			" * Generated from: "+m_url["Blocks.txt"]+"\n"
			" */\n"
			"\n"
			"#include \"../../src/bsdconv.h\"\n"
			"\n"
			"static const struct uint32_range ranges[] = {\n"
			)
		b,e=r.split("..")
		m[c].write("\t{{ 0x{beg}, 0x{end} }}, // {desc}\n".format(beg=b, end=e, desc=d))
blk.close()

for c in m:
	m[c].write("};\n"
	"#include \"unicode_range.c\"\n")
	m[c].close()

f_ambiguous.write("""
struct interval {
	int first;
	int last;
};

static const struct interval ambiguous[] = {
""")

f_width.write("""
struct width_interval {
	int beg;
	int end;
	int width;
};

static const struct width_interval width_table[] = {
""")

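# EastAsianWidth.txt data lines look like "range;prop # gc name".
# Ambiguous (A) ranges are collected into ambiguous[], and all
# non-control ranges are folded to HALF/FULL/AMBI in width_table[],
# merging adjacent ranges that share the same property.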
propmap = {"A":"AMBI", "F":"FULL", "H":"HALF", "N":"HALF", "Na":"HALF", "W":"FULL"}
ambi_beg = None
ambi_end = None
width_beg = None
width_end = None
width_prop = None
eaw=open("tmp/EastAsianWidth.txt")
for l in eaw:
	l = l.strip()
	if not l:
		continue
	if l.startswith("#"):
		continue
	l, desc = l.split("#", 1)
	a=l.strip().split(";")
	# The comment begins with the general category, e.g. "# Cc ...".
	desc = desc[1:3]
	w = a[1]
	r = a[0].split("..")
	b = r[0]
	if len(r)==1:
		e = b
	else:
		e = r[1]

	if w == "A":
		if ambi_beg is None:
			ambi_beg = b
			ambi_end = e
		elif int(ambi_end, 16)+1==int(b, 16):
			ambi_end = e
		else:
			f_ambiguous.write("{{ 0x{beg}, 0x{end} }},\n".format(beg=ambi_beg, end=ambi_end))
			ambi_beg = b
			ambi_end = e

	# Skip control characters when building the width table.
	if desc != "Cc":
		p = propmap[w]
		if width_prop is None:
			width_prop = p
			width_beg = b
			width_end = e
		elif p == width_prop:
			if int(width_end, 16)+1==int(b, 16):
				width_end = e
			else:
				f_width.write("{{ 0x{beg}, 0x{end}, {prop} }},\n".format(beg=width_beg, end=width_end, prop=width_prop))
				width_beg = b
				width_end = e
		else:
			f_width.write("{{ 0x{beg}, 0x{end}, {prop} }},\n".format(beg=width_beg, end=width_end, prop=width_prop))
			width_prop = p
			width_beg = b
			width_end = e
eaw.close()

# Flush the pending intervals and close the tables.
f_ambiguous.write("{{ 0x{beg}, 0x{end} }},\n".format(beg=ambi_beg, end=ambi_end))
f_ambiguous.write("};\n")
f_ambiguous.close()

f_width.write("{{ 0x{beg}, 0x{end}, {prop} }},\n".format(beg=width_beg, end=width_end, prop=width_prop))
f_width.write("};\n")
f_width.close()