1# coding=utf-8
2from __future__ import absolute_import
3from __future__ import unicode_literals
4
5import re
6import os
7import pprint
8from collections import OrderedDict
9
10from bs4 import BeautifulSoup
11
# Header of the generated data.py: the imports that the pretty-printed literal
# and the footer rely on, followed by the start of the ``data = ...``
# assignment that the pprint output completes.
TEMPLATE = """\
import re
from collections import OrderedDict
data = """

# Footer of the generated data.py: replaces every non-empty pattern string with
# its compiled regex at import time. ``items()`` and plain dict iteration are
# used instead of ``iteritems()``/``iterkeys()`` so the generated module can be
# imported on Python 3 as well as Python 2.
TEMPLATE_END = """\

for lang, grps in data.items():
    for grp in grps:
        if data[lang][grp]["pattern"]:
            data[lang][grp]["pattern"] = re.compile(data[lang][grp]["pattern"])
"""
24
25
# Extra OCR replacements maintained in this file, keyed by language code and
# then by replacement group name. The __main__ section merges each group into
# the entries parsed from the same-named group of that language's XML list, so
# an entry here overrides an XML entry with the same key.
SZ_FIX_DATA = {
    "eng": {
        "PartialWordsAlways": {
            "°x°": "%",
            "compiete": "complete",
            "Âs": "'s",
            "ÃÂs": "'s",
            "a/ion": "ation",
            "at/on": "ation",
            "l/an": "lian",
            "lljust": "ll just",
            " L ": " I ",
            " l ": " I ",
            "'sjust": "'s just",
            "'tjust": "'t just",
            "\";": "'s",
        },
        "WholeWords": {
            "I'11": "I'll",
            "III'll": "I'll",
            "Tun": "Run",
            "pan'": "part",
            "al'": "at",
            "a re": "are",
            "wail'": "wait",
            "he)'": "hey",
            "he)\"": "hey",
            "He)'": "Hey",
            "He)\"": "Hey",
            "He)’": "Hey",
            "Yea h": "Yeah",
            "yea h": "yeah",
            "h is": "his",
            " 're ": "'re ",
            "LAst": "Last",
            "forthis": "for this",
            "Ls": "Is",
            "Iam": "I am",
            "Ican": "I can",
        },
        "PartialLines": {
            "L know": "I know",
            "L should": "I should",
            "L do": "I do",
            "L would": "I would",
            "L could": "I could",
            "L can": "I can",
            "L happen": "I happen",
            "L might": "I might",
            # The key used to end with a space that the replacement lacked,
            # which glued the following word on ("L have a" -> "I havea").
            "L have": "I have",
            "L had": "I had",
            "L want": "I want",
            "L was": "I was",
            "L am": "I am",
            "L will": "I will",
            "L suggest": "I suggest",
            "L think": "I think",
            "L reckon": "I reckon",
            "L like": "I like",
            "L love": "I love",
            "L don't": "I don't",
            "L didn't": "I didn't",
            "L wasn't": "I wasn't",
            "L haven't": "I haven't",
            "L couldn't": "I couldn't",
            "L won't": "I won't",
            "H i": "Hi",
        },
        "BeginLines": {
            "l ": "I ",
            "L ": "I ",
        },
    },
    "nld": {
        "PartialWordsAlways": {
            "ט": "è",
            "י": "é",
            "כ": "ë",
            "צ": "ë",
            "ן": "ï",
            "ף": "ó",
            "א": "à",
            "Iֻ": "I",
            "č": "è",
            "פ": "o",
            "ם": "i",
        },
    },
    "swe": {
        "PartialWordsAlways": {
            "ĺ": "å",
            "Ĺ": "Å",
        },
    },
}
121
# Replacements merged into every language's data, keyed by replacement group
# name with the same per-group mapping shape as SZ_FIX_DATA. Currently empty.
SZ_FIX_DATA_GLOBAL = {}
124
if __name__ == "__main__":
    # Regenerate data.py next to this script from every
    # "<lang>_OCRFixReplaceList.xml" file in ./xml, merged with the
    # SZ_FIX_DATA / SZ_FIX_DATA_GLOBAL tables defined in this module.
    cur_dir = os.path.dirname(os.path.realpath(__file__))
    xml_dir = os.path.join(cur_dir, "xml")

    def alternation(replacements):
        """Return the keys of *replacements*, regex-escaped and joined by ``|``."""
        return "|".join(re.escape(key) for key in replacements)

    # (XML group tag, XML item tag, pattern builder or None).
    # A builder receives the group's replacement mapping and returns the source
    # of one regex matching any of its keys, or None when the mapping is empty.
    # The table does not depend on the file being processed, so it is built
    # once instead of once per XML file.
    fetch_data = (
        ("WholeLines", "Line", None),
        ("WholeWords", "Word",
         lambda d: (r"(?um)(\b|^)(?:" + alternation(d) + r")(\b|$)") if d else None),
        ("PartialWordsAlways", "WordPart", None),
        ("PartialLines", "LinePart",
         lambda d: (r"(?um)(?:(?<=\s)|(?<=^)|(?<=\b))(?:" + alternation(d) +
                    r")(?:(?=\s)|(?=$)|(?=\b))") if d else None),
        ("BeginLines", "Beginning",
         lambda d: (r"(?um)^(?:" + alternation(d) + r")") if d else None),
        ("EndLines", "Ending",
         lambda d: (r"(?um)(?:" + alternation(d) + r")$") if d else None),
    )

    data = {}

    for fn in os.listdir(xml_dir):
        if not fn.endswith("_OCRFixReplaceList.xml"):
            continue

        # The language code is the file name part before the first underscore.
        lang = fn.split("_")[0]

        # Hand BeautifulSoup raw bytes so that it, not the platform's default
        # text codec, decides how to decode the XML; the with-block makes sure
        # the handle is closed once the document has been parsed.
        with open(os.path.join(xml_dir, fn), "rb") as xml_file:
            soup = BeautifulSoup(xml_file, "xml")

        data[lang] = {grp: {"data": OrderedDict(), "pattern": None}
                      for grp, _item_name, _pattern in fetch_data}

        for grp, item_name, pattern in fetch_data:
            entries = data[lang][grp]["data"]

            # Entries from the XML list, in document order.
            for grp_data in soup.find_all(grp):
                for line in grp_data.find_all(item_name):
                    entries[line["from"]] = line["to"]

            # add our own dictionaries (they override XML entries with equal keys)
            entries.update(SZ_FIX_DATA.get(lang, {}).get(grp, {}))
            entries.update(SZ_FIX_DATA_GLOBAL.get(grp, {}))

            if pattern:
                data[lang][grp]["pattern"] = pattern(entries)

    # The context manager closes (and flushes) data.py even if a write fails.
    with open(os.path.join(cur_dir, "data.py"), "w") as f:
        f.write(TEMPLATE)
        f.write(pprint.pformat(data, width=1))
        f.write(TEMPLATE_END)
174