1""" 2Text formatter 3""" 4 5import re 6 7 8dReplTable = { 9 # surnumerary_spaces 10 "start_of_paragraph": [("^[ ]+", "")], 11 "end_of_paragraph": [("[ ]+$", "")], 12 "between_words": [(" | ", " "), # espace + espace insécable -> espace 13 (" +", " "), # espaces surnuméraires 14 (" +", " ")], # espaces insécables surnuméraires 15 "before_punctuation": [(" +(?=[.,…])", "")], 16 "within_parenthesis": [("\\([ ]+", "("), 17 ("[ ]+\\)", ")")], 18 "within_square_brackets": [("\\[[ ]+", "["), 19 ("[ ]+\\]", "]")], 20 "within_quotation_marks": [("“[ ]+", "“"), 21 ("[ ]”", "”")], 22 ## non-breaking spaces 23 # espaces insécables 24 "nbsp_before_punctuation": [("(?<=[]\\w…)»}])([:;?!])[ …]", " \\1 "), 25 ("(?<=[]\\w…)»}])([:;?!])$", " \\1"), 26 ("[ ]+([:;?!])", " \\1")], 27 "nbsp_within_quotation_marks": [("«(?=\\w)", "« "), 28 ("«[ ]+", "« "), 29 ("(?<=[\\w.!?])»", " »"), 30 ("[ ]+»", " »")], 31 "nbsp_within_numbers": [("(\\d)[ ](?=\\d)", "\\1 ")], 32 # espaces insécables fines 33 "nnbsp_before_punctuation": [("(?<=[]\\w…)»}])([;?!])[ …]", " \\1 "), 34 ("(?<=[]\\w…)»}])([;?!])$", " \\1"), 35 ("[ ]+([;?!])", " \\1"), 36 ("(?<=[]\\w…)»}]):", " :"), 37 ("[ ]+:", " :")], 38 "nnbsp_within_quotation_marks":[("«(?=\\w)", "« "), 39 ("«[ ]+", "« "), 40 ("(?<=[\\w.!?])»", " »"), 41 ("[ ]+»", " »")], 42 "nnbsp_within_numbers": [("(\\d)[ ](\\d)", "\\1 \\2")], 43 # common 44 "nbsp_titles": [("\\bM(mes?|ᵐᵉˢ?|grs?|ᵍʳˢ?|lles?|ˡˡᵉˢ?|rs?|ʳˢ?|M\\.) ", "M\\1 "), 45 ("\\bP(re?s?|ʳᵉ?ˢ?) ", "P\\1 "), 46 ("\\bD(re?s?|ʳᵉ?ˢ?) ", "D\\1 "), 47 ("\\bV(ves?|ᵛᵉˢ?) ", "V\\1 ")], 48 "nbsp_before_symbol": [("(\\d) ?([%‰€$£¥˚Ω℃])", "\\1 \\2")], 49 "nbsp_before_units": [("(?<=[0-9⁰¹²³⁴⁵⁶⁷⁸⁹]) ?([kcmµn]?(?:[slgJKΩ]|m[²³]?|Wh?|Hz|dB)|[%‰]|°C)\\b", " \\1")], 50 "nbsp_repair": [("(?<=[\\[(])[ ]([!?:;])", "\\1"), 51 ("(https?|ftp)[ ]:(?=//)", "\\1:"), 52 ("&([a-z]+)[ ];", "&\\1;"), 53 ("&#([0-9]+|x[0-9a-fA-F]+)[ ];", "&#\\1;")], 54 ## missing spaces 55 "add_space_after_punctuation": [("([;!…])(?=\\w)", "\\1 "), 56 ("[?](?=[A-ZÉÈÊÂÀÎ])", "? "), 57 ("\\.(?=[A-ZÉÈÎ][a-zA-ZàâÂéÉèÈêÊîÎïÏôÔöÖûÛüÜùÙ])", ". "), 58 ("\\.(?=À)", ". "), 59 ("(?i)([,:])(?=[a-zàâäéèêëîïôöûüù])", "\\1 "), 60 ("(?i)([a-zàâäéèêëîïôöûüù]),(?=[0-9])", "\\1, ")], 61 "add_space_around_hyphens": [(" ([-–—])(?=[a-zàâäéèêëîïôöûüù\"«“'‘])", " \\1 "), 62 ("(?<=[a-zàâäéèêëîïôöûüù\"»”'’])([-–—]) ", " \\1 ")], 63 "add_space_repair": [("DnT, ([wA])\\b", "DnT,\\1")], 64 ## erase 65 "erase_non_breaking_hyphens": [("", "")], 66 ## typographic signs 67 "ts_apostrophe": [ ("(?i)\\b([ldnjmtscç])['´‘′`](?=\\w)", "\\1’"), 68 ("(?i)(qu|jusqu|lorsqu|puisqu|quoiqu|quelqu|presqu|entr|aujourd|prud)['´‘′`]", "\\1’") ], 69 "ts_ellipsis": [ ("\\.\\.\\.", "…"), 70 ("(?<=…)[.][.]", "…"), 71 ("…[.](?![.])", "…") ], 72 "ts_n_dash_middle": [ (" [-—] ", " – "), 73 (" [-—],", " –,") ], 74 "ts_m_dash_middle": [ (" [-–] ", " — "), 75 (" [-–],", " —,") ], 76 "ts_n_dash_start": [ ("^[-—][ ]", "– "), 77 ("^– ", "– "), 78 ("^[-–—](?=[\\w.…])", "– ") ], 79 "ts_m_dash_start": [ ("^[-–][ ]", "— "), 80 ("^— ", "— "), 81 ("^«[ ][—–-][ ]", "« — "), 82 ("^[-–—](?=[\\w.…])", "— ") ], 83 "ts_quotation_marks": [ ('"(\\w+)"', "“$1”"), 84 ("''(\\w+)''", "“$1”"), 85 ("'(\\w+)'", "“$1”"), 86 ("^(?:\"|'')(?=\\w)", "« "), 87 (" (?:\"|'')(?=\\w)", " « "), 88 ("\\((?:\"|'')(?=\\w)", "(« "), 89 ("(?<=\\w)(?:\"|'')$", " »"), 90 ("(?<=\\w)(?:\"|'')(?=[] ,.:;?!…)])", " »"), 91 ('(?<=[.!?…])" ', " » "), 92 ('(?<=[.!?…])"$', " »") ], 93 "ts_spell": [ ("coeur", "cœur"), ("Coeur", "Cœur"), 94 ("coel(?=[aeio])", "cœl"), ("Coel(?=[aeio])", "Cœl"), 95 ("choeur", "chœur"), ("Choeur", "Chœur"), 96 ("foet", "fœt"), ("Foet", "Fœt"), 97 ("oeil", "œil"), ("Oeil", "Œil"), 98 ("oeno", "œno"), ("Oeno", "Œno"), 99 ("oesoph", "œsoph"), ("Oesoph", "Œsoph"), 100 ("oestro", "œstro"), ("Oestro", "Œstro"), 101 ("oeuf", "œuf"), ("Oeuf", "Œuf"), 102 ("oeuvr", "œuvr"), ("Oeuvr", "Œuvr"), 103 ("moeur", "mœur"), ("Moeur", "Mœur"), 104 ("noeu", "nœu"), ("Noeu", "Nœu"), 105 ("soeur", "sœur"), ("Soeur", "Sœur"), 106 ("voeu", "vœu"), ("Voeu", "Vœu"), 107 ("aequo", "æquo"), ("Aequo", "Æquo"), 108 ("\\bCa\\b", "Ça"), (" ca\\b", " ça"), 109 ("\\bdej[aà]\\b", "déjà"), ("\\bplutot\\b", "plutôt"), 110 ("\\bmeme\\b", "même"), ("\\bmemes\\b", "mêmes"), ("\\bMeme\\b", "Même"), 111 ("\\b([cC]e(?:ux|lles?|lui))-la\\b", "$1-là"), 112 ("\\bmalgre\\b", "malgré"), ("\\bMalgre\\b", "Malgré"), 113 ("\\betre\\b", "être"), ("\\bEtre\\b", "Être"), 114 ("\\btres\\b", "très"), ("\\bTres\\b", "Très"), 115 ("\\bEtai([ts]|ent)\\b", "Étai$1"), 116 ("\\bE(tat|cole|crit|poque|tude|ducation|glise|conomi(?:qu|)e|videmment|lysée|tienne|thiopie|cosse|gypt(?:e|ien)|rythrée|pinal|vreux)", "É$1") ], 117 "ts_ligature_ffi_on": [("ffi", "ffi")], 118 "ts_ligature_ffl_on": [("ffl", "ffl")], 119 "ts_ligature_fi_on": [("fi", "fi")], 120 "ts_ligature_fl_on": [("fl", "fl")], 121 "ts_ligature_ff_on": [("ff", "ff")], 122 "ts_ligature_ft_on": [("ft", "ſt")], 123 "ts_ligature_st_on": [("st", "st")], 124 "ts_ligature_fi_off": [("fi", "fi")], 125 "ts_ligature_fl_off": [("fl", "fl")], 126 "ts_ligature_ff_off": [("ff", "ff")], 127 "ts_ligature_ffi_off": [("ffi", "ffi")], 128 "ts_ligature_ffl_off": [("ffl", "ffl")], 129 "ts_ligature_ft_off": [("ſt", "ft")], 130 "ts_ligature_st_off": [("st", "st")], 131 "ts_units": [ ("\\bN\\.([ms])\\b", "N·\\1"), # N·m et N·m-1, N·s 132 ("\\bW\\.h\\b", "W·h"), 133 ("\\bPa\\.s\\b", "Pa·s"), 134 ("\\bA\\.h\\b", "A·h"), 135 ("\\bΩ\\.m\\b", "Ω·m"), 136 ("\\bS\\.m\\b", "S·m"), 137 ("\\bg\\.s(?=-1)\\b", "g·s"), 138 ("\\bm\\.s(?=-[12])\\b", "m·s"), 139 ("\\bg\\.m(?=2|-3)\\b", "g·m"), 140 ("\\bA\\.m(?=-1)\\b", "A·m"), 141 ("\\bJ\\.K(?=-1)\\b", "J·K"), 142 ("\\bW\\.m(?=-2)\\b", "W·m"), 143 ("\\bcd\\.m(?=-2)\\b", "cd·m"), 144 ("\\bC\\.kg(?=-1)\\b", "C·kg"), 145 ("\\bH\\.m(?=-1)\\b", "H·m"), 146 ("\\bJ\\.kg(?=-1)\\b", "J·kg"), 147 ("\\bJ\\.m(?=-3)\\b", "J·m"), 148 ("\\bm[2²]\\.s\\b", "m²·s"), 149 ("\\bm[3³]\\.s(?=-1)\\b", "m³·s"), 150 #("\\bJ.kg-1.K-1\\b", "J·kg-1·K-1"), 151 #("\\bW.m-1.K-1\\b", "W·m-1·K-1"), 152 #("\\bW.m-2.K-1\\b", "W·m-2·K-1"), 153 ("\\b(Y|Z|E|P|T|G|M|k|h|da|d|c|m|µ|n|p|f|a|z|y)Ω\\b", "\\1Ω") ], 154 ## misc 155 "ordinals_exponant": [ ("\\b([0-9]+)(?:i?[èe]me|è|e)\\b", "\\1ᵉ"), 156 ("\\b([XVICL]+)(?:i?[èe]me|è)\\b", "\\1ᵉ"), 157 ("(?<=\\b(au|l[ea]|du) [XVICL])e\\b", "ᵉ"), 158 ("(?<=\\b[XVI])e(?= siècle)", "ᵉ"), 159 ("(?<=\\b[1I])er\\b", "ᵉʳ"), 160 ("(?<=\\b[1I])re\\b", "ʳᵉ") ], 161 "ordinals_no_exponant": [ ("\\b([0-9]+)(?:i?[èe]me|è)\\b", "\\1e"), 162 ("\\b([XVICL]+)(?:i?[èe]me|è)\\b", "\\1e"), 163 ("(?<=\\b[1I])ᵉʳ\\b", "er"), 164 ("(?<=\\b[1I])ʳᵉ\\b", "er")], 165 "etc": [ ("etc(…|[.][.][.]?)", "etc."), 166 ("(?<!,) etc[.]", ", etc.") ], 167 ## missing hyphens 168 "mh_interrogatives": [ ("[ -]t[’'](?=il\\b|elle|on\\b)", "-t-"), 169 (" t-(?=il|elle|on)", "-t-"), 170 ("[ -]t[’'-](?=ils|elles)", "-"), 171 ("(?<=[td])-t-(?=il|elle|on)", "-") ], 172 "mh_numbers": [ ("dix (sept|huit|neuf)", "dix-\\1"), 173 ("quatre vingt", "quatre-vingt"), 174 ("(soixante|quatre-vingt) dix", "\\1-dix"), 175 ("(vingt|trente|quarante|cinquante|soixante(?:-dix|)|quatre-vingt(?:-dix|)) (deux|trois|quatre|cinq|six|sept|huit|neuf)\\b", "\\1-\\2")], 176 "mh_frequent_words": [ ("(?i)ce(lles?|lui|ux) (ci|là)\\b", "ce\\1-\\2"), 177 ("(?i)(?<!-)\\b(ci) (joint|desso?us|contre|devant|avant|après|incluse|g[îi]t|gisent)", "\\1-\\2"), 178 ("vis à vis", "vis-à-vis"), 179 ("Vis à vis", "Vis-à-vis"), 180 ("week end", "week-end"), 181 ("Week end", "Week-end"), 182 ("(?i)(plus|moins) value", "\\1-value") ], 183 ## missing apostrophes 184 "ma_word": [("(?i)(qu|lorsqu|puisqu|quoiqu|presqu|jusqu|aujourd|entr|quelqu|prud) ", "\\1’")], 185 "ma_1letter_lowercase": [("\\b([ldjnmtscç]) (?=[aàeéêiîoôuyhAÀEÉÊIÎOÔUYH])", "\\1’")], 186 "ma_1letter_uppercase": [("\\b([LDJNMTSCÇ]) (?=[aàeéêiîoôuyhAÀEÉÊIÎOÔUYH])", "\\1’")] 187} 188 189 190dDefaultOptions = { 191 "ts_units": True, 192 "start_of_paragraph": True, 193 "end_of_paragraph": True, 194 "between_words": True, 195 "before_punctuation": True, 196 "within_parenthesis": True, 197 "within_square_brackets": True, 198 "within_quotation_marks": True, 199 "nbsp_before_punctuation": True, 200 "nbsp_within_quotation_marks": True, 201 "nbsp_within_numbers": True, 202 "nnbsp_before_punctuation": False, 203 "nnbsp_within_quotation_marks": False, 204 "nnbsp_within_numbers": False, 205 "nbsp_titles": False, 206 "nbsp_before_symbol": True, 207 "nbsp_before_units": True, 208 "nbsp_repair": True, 209 "add_space_after_punctuation": True, 210 "add_space_around_hyphens": True, 211 "add_space_repair": True, 212 "erase_non_breaking_hyphens": False, 213 "ts_apostrophe": True, 214 "ts_ellipsis": True, 215 "ts_n_dash_middle": True, 216 "ts_m_dash_middle": False, 217 "ts_n_dash_start": False, 218 "ts_m_dash_start": True, 219 "ts_quotation_marks": True, 220 "ts_spell": True, 221 "ts_ligature_ffi_on": False, 222 "ts_ligature_ffl_on": False, 223 "ts_ligature_fi_on": False, 224 "ts_ligature_fl_on": False, 225 "ts_ligature_ff_on": False, 226 "ts_ligature_ft_on": False, 227 "ts_ligature_st_on": False, 228 "ts_ligature_fi_off": False, 229 "ts_ligature_fl_off": False, 230 "ts_ligature_ff_off": False, 231 "ts_ligature_ffi_off": False, 232 "ts_ligature_ffl_off": False, 233 "ts_ligature_ft_off": False, 234 "ts_ligature_st_off": False, 235 "ordinals_exponant": False, 236 "ordinals_no_exponant": True, 237 "etc": True, 238 "mh_interrogatives": True, 239 "mh_numbers": True, 240 "mh_frequent_words": True, 241 "ma_word": True, 242 "ma_1letter_lowercase": False, 243 "ma_1letter_uppercase": False 244} 245 246 247class TextFormatter: 248 "Text Formatter: purge typographic mistakes from text" 249 250 def __init__ (self): 251 for _, lTup in dReplTable.items(): 252 for i, t in enumerate(lTup): 253 lTup[i] = (re.compile(t[0]), t[1]) 254 255 def formatText (self, sText): 256 "returns formatted text" 257 for sOptName, bVal in dDefaultOptions.items(): 258 if bVal: 259 for zRgx, sRep in dReplTable[sOptName]: 260 sText = zRgx.sub(sRep, sText) 261 return sText 262 263 def getDefaultOptions (self): 264 "returns default options" 265 return dDefaultOptions.copy() 266