# coding=utf-8
#
# Generator script: reads every ``<lang>_OCRFixReplaceList.xml`` found in the
# ``xml`` directory next to this file, merges the extra replacements declared
# in SZ_FIX_DATA / SZ_FIX_DATA_GLOBAL on top of them, and writes the result
# (plus regex source strings for some groups) to ``data.py`` next to this file.
from __future__ import absolute_import
from __future__ import unicode_literals

import re
import os
import pprint
from collections import OrderedDict

try:
    from bs4 import BeautifulSoup
except ImportError:
    # bs4 is only needed when the XML lists are actually parsed; keeping the
    # module importable without it lets the tables below be used on their own.
    BeautifulSoup = None

# Header of the generated module; the pprint'ed data dict is appended directly
# after ``data = ``.
TEMPLATE = """\
import re
from collections import OrderedDict
data = """

# Footer of the generated module: compiles every non-empty pattern string in
# place. Uses ``items()`` / plain iteration (not ``iteritems`` / ``iterkeys``)
# so the generated file runs on both Python 2 and Python 3.
TEMPLATE_END = """\

for lang, grps in data.items():
    for grp in grps:
        if data[lang][grp]["pattern"]:
            data[lang][grp]["pattern"] = re.compile(data[lang][grp]["pattern"])
"""


# Extra replacements per language and group, merged over the XML entries
# (same key -> the value given here wins).
SZ_FIX_DATA = {
    "eng": {
        "PartialWordsAlways": {
            "°x°": "%",
            "compiete": "complete",
            "Âs": "'s",
            "ÃÂs": "'s",
            "a/ion": "ation",
            "at/on": "ation",
            "l/an": "lian",
            "lljust": "ll just",
            " L ": " I ",
            " l ": " I ",
            "'sjust": "'s just",
            "'tjust": "'t just",
            "\";": "'s",
        },
        "WholeWords": {
            "I'11": "I'll",
            "III'll": "I'll",
            "Tun": "Run",
            "pan'": "part",
            "al'": "at",
            "a re": "are",
            "wail'": "wait",
            "he)'": "hey",
            "he)\"": "hey",
            "He)'": "Hey",
            "He)\"": "Hey",
            "He)’": "Hey",
            "Yea h": "Yeah",
            "yea h": "yeah",
            "h is": "his",
            " 're ": "'re ",
            "LAst": "Last",
            "forthis": "for this",
            "Ls": "Is",
            "Iam": "I am",
            "Ican": "I can",
        },
        "PartialLines": {
            "L know": "I know",
            "L should": "I should",
            "L do": "I do",
            "L would": "I would",
            "L could": "I could",
            "L can": "I can",
            "L happen": "I happen",
            "L might": "I might",
            # The key carries a trailing space, so the replacement must keep
            # it; otherwise the following word gets glued onto "have".
            "L have ": "I have ",
            "L had": "I had",
            "L want": "I want",
            "L was": "I was",
            "L am": "I am",
            "L will": "I will",
            "L suggest": "I suggest",
            "L think": "I think",
            "L reckon": "I reckon",
            "L like": "I like",
            "L love": "I love",
            "L don't": "I don't",
            "L didn't": "I didn't",
            "L wasn't": "I wasn't",
            "L haven't": "I haven't",
            "L couldn't": "I couldn't",
            "L won't": "I won't",
            "H i": "Hi",
        },
        "BeginLines": {
            "l ": "I ",
            "L ": "I ",
        }
    },
    "nld": {
        "PartialWordsAlways": {
            "ט": "è",
            "י": "é",
            "כ": "ë",
            "צ": "ë",
            "ן": "ï",
            "ף": "ó",
            "א": "à",
            "Iֻ": "I",
            "č": "è",
            "פ": "o",
            "ם": "i",
        },
    },
    "swe": {
        "PartialWordsAlways": {
            "ĺ": "å",
            "Ĺ": "Å",
        }
    }
}

# Extra replacements per group that are merged into every language.
SZ_FIX_DATA_GLOBAL = {
}


def _alternation(replacements):
    """Return the keys of *replacements*, regex-escaped and joined with ``|``."""
    return "|".join(re.escape(key) for key in replacements)


def _whole_words_pattern(replacements):
    """Return the regex source for the WholeWords group, or None if empty."""
    if not replacements:
        return None
    return r"(?um)(\b|^)(?:" + _alternation(replacements) + r")(\b|$)"


def _partial_lines_pattern(replacements):
    """Return the regex source for the PartialLines group, or None if empty."""
    if not replacements:
        return None
    return (r"(?um)(?:(?<=\s)|(?<=^)|(?<=\b))(?:" + _alternation(replacements) +
            r")(?:(?=\s)|(?=$)|(?=\b))")


def _begin_lines_pattern(replacements):
    """Return the regex source for the BeginLines group, or None if empty."""
    if not replacements:
        return None
    return r"(?um)^(?:" + _alternation(replacements) + r")"


def _end_lines_pattern(replacements):
    """Return the regex source for the EndLines group, or None if empty."""
    if not replacements:
        return None
    return r"(?um)(?:" + _alternation(replacements) + r")$"


# (group tag, item tag, pattern builder); a builder of None means no pattern
# is generated for that group.
FETCH_DATA = (
    ("WholeLines", "Line", None),
    ("WholeWords", "Word", _whole_words_pattern),
    ("PartialWordsAlways", "WordPart", None),
    ("PartialLines", "LinePart", _partial_lines_pattern),
    ("BeginLines", "Beginning", _begin_lines_pattern),
    ("EndLines", "Ending", _end_lines_pattern),
)


def _load_language(xml_path, lang):
    """Build the per-group replacement tables for one language.

    Reads the ``from``/``to`` attributes of every item in *xml_path*, merges
    the SZ_FIX_DATA entries for *lang* and the SZ_FIX_DATA_GLOBAL entries on
    top, and attaches the pattern source string where a builder exists.

    Returns a dict mapping group name to ``{"data": OrderedDict, "pattern":
    str or None}``. Raises ImportError if bs4 is not installed.
    """
    if BeautifulSoup is None:
        raise ImportError("bs4 (BeautifulSoup) is required to parse " + xml_path)

    # Binary mode: let the parser apply the XML encoding declaration instead
    # of decoding with the locale default; the handle is closed right away.
    with open(xml_path, "rb") as xml_file:
        soup = BeautifulSoup(xml_file, "xml")

    groups = dict((grp, {"data": OrderedDict(), "pattern": None})
                  for grp, _item_name, _builder in FETCH_DATA)

    for grp, item_name, build_pattern in FETCH_DATA:
        replacements = groups[grp]["data"]
        for grp_data in soup.find_all(grp):
            for line in grp_data.find_all(item_name):
                replacements[line["from"]] = line["to"]

        # add our own dictionaries
        if lang in SZ_FIX_DATA and grp in SZ_FIX_DATA[lang]:
            replacements.update(SZ_FIX_DATA[lang][grp])

        if grp in SZ_FIX_DATA_GLOBAL:
            replacements.update(SZ_FIX_DATA_GLOBAL[grp])

        if build_pattern:
            groups[grp]["pattern"] = build_pattern(replacements)

    return groups


def main():
    """Regenerate ``data.py`` from the XML replace lists beside this script."""
    cur_dir = os.path.dirname(os.path.realpath(__file__))
    xml_dir = os.path.join(cur_dir, "xml")

    data = {}
    for fn in os.listdir(xml_dir):
        if fn.endswith("_OCRFixReplaceList.xml"):
            # the language code is the file name part before the first "_"
            lang = fn.split("_")[0]
            data[lang] = _load_language(os.path.join(xml_dir, fn), lang)

    # "with" guarantees the output is flushed and closed even if a write fails
    with open(os.path.join(cur_dir, "data.py"), "w") as out:
        out.write(TEMPLATE)
        out.write(pprint.pformat(data, width=1))
        out.write(TEMPLATE_END)


if __name__ == "__main__":
    main()