1"""
2Text formatter
3"""
4
5import re
6
7
# Replacement table of the text formatter.
# Each key is an option name (see dDefaultOptions); each value is the ordered list
# of (regex pattern, replacement) pairs applied when that option is enabled.
# Replacements use the Python `re` template syntax: groups are written \\1, \\2…
# (the JavaScript form $1 would be inserted literally).
# NOTE: several patterns and replacements contain non-breaking spaces, which look
# like regular spaces in most editors: edit these lines with care.
dReplTable = {
    # supernumerary spaces
    "start_of_paragraph":          [("^[  ]+", "")],
    "end_of_paragraph":            [("[  ]+$", "")],
    "between_words":               [("  |  ", " "),  # space + non-breaking space -> space
                                    ("  +", " "),    # supernumerary spaces
                                    ("  +", " ")],   # supernumerary non-breaking spaces
    "before_punctuation":          [(" +(?=[.,…])", "")],
    "within_parenthesis":          [("\\([  ]+", "("),
                                    ("[  ]+\\)", ")")],
    "within_square_brackets":      [("\\[[  ]+", "["),
                                    ("[  ]+\\]", "]")],
    "within_quotation_marks":      [("“[  ]+", "“"),
                                    ("[  ]”", "”")],
    ## non-breaking spaces
    # non-breaking spaces
    "nbsp_before_punctuation":     [("(?<=[]\\w…)»}])([:;?!])[   …]", " \\1 "),
                                    ("(?<=[]\\w…)»}])([:;?!])$", " \\1"),
                                    ("[  ]+([:;?!])", " \\1")],
    "nbsp_within_quotation_marks": [("«(?=\\w)", "« "),
                                    ("«[  ]+", "« "),
                                    ("(?<=[\\w.!?])»", " »"),
                                    ("[  ]+»", " »")],
    "nbsp_within_numbers":         [("(\\d)[  ](?=\\d)", "\\1 ")],
    # narrow non-breaking spaces
    "nnbsp_before_punctuation":    [("(?<=[]\\w…)»}])([;?!])[   …]", " \\1 "),
                                    ("(?<=[]\\w…)»}])([;?!])$", " \\1"),
                                    ("[  ]+([;?!])", " \\1"),
                                    ("(?<=[]\\w…)»}]):", " :"),
                                    ("[  ]+:", " :")],
    "nnbsp_within_quotation_marks":[("«(?=\\w)", "« "),
                                    ("«[  ]+", "« "),
                                    ("(?<=[\\w.!?])»", " »"),
                                    ("[  ]+»", " »")],
    "nnbsp_within_numbers":        [("(\\d)[  ](\\d)", "\\1 \\2")],
    # common
    "nbsp_titles":                 [("\\bM(mes?|ᵐᵉˢ?|grs?|ᵍʳˢ?|lles?|ˡˡᵉˢ?|rs?|ʳˢ?|M\\.) ", "M\\1 "),
                                    ("\\bP(re?s?|ʳᵉ?ˢ?) ", "P\\1 "),
                                    ("\\bD(re?s?|ʳᵉ?ˢ?) ", "D\\1 "),
                                    ("\\bV(ves?|ᵛᵉˢ?) ", "V\\1 ")],
    "nbsp_before_symbol":          [("(\\d) ?([%‰€$£¥˚Ω℃])", "\\1 \\2")],
    "nbsp_before_units":           [("(?<=[0-9⁰¹²³⁴⁵⁶⁷⁸⁹]) ?([kcmµn]?(?:[slgJKΩ]|m[²³]?|Wh?|Hz|dB)|[%‰]|°C)\\b", " \\1")],
    "nbsp_repair":                 [("(?<=[\\[(])[   ]([!?:;])", "\\1"),
                                    ("(https?|ftp)[   ]:(?=//)", "\\1:"),
                                    ("&([a-z]+)[   ];", "&\\1;"),
                                    ("&#([0-9]+|x[0-9a-fA-F]+)[   ];", "&#\\1;")],
    ## missing spaces
    "add_space_after_punctuation": [("([;!…])(?=\\w)", "\\1 "),
                                    ("[?](?=[A-ZÉÈÊÂÀÎ])", "? "),
                                    ("\\.(?=[A-ZÉÈÎ][a-zA-ZàâÂéÉèÈêÊîÎïÏôÔöÖûÛüÜùÙ])", ". "),
                                    ("\\.(?=À)", ". "),
                                    ("(?i)([,:])(?=[a-zàâäéèêëîïôöûüù])", "\\1 "),
                                    ("(?i)([a-zàâäéèêëîïôöûüù]),(?=[0-9])", "\\1, ")],
    "add_space_around_hyphens":    [(" ([-–—])(?=[a-zàâäéèêëîïôöûüù\"«“'‘])", " \\1 "),
                                    ("(?<=[a-zàâäéèêëîïôöûüù\"»”'’])([-–—]) ", " \\1 ")],
    "add_space_repair":            [("DnT, ([wA])\\b", "DnT,\\1")],
    ## erase
    "erase_non_breaking_hyphens":  [("­", "")],
    ## typographic signs
    "ts_apostrophe":          [ ("(?i)\\b([ldnjmtscç])['´‘′`](?=\\w)", "\\1’"),
                                ("(?i)(qu|jusqu|lorsqu|puisqu|quoiqu|quelqu|presqu|entr|aujourd|prud)['´‘′`]", "\\1’") ],
    "ts_ellipsis":            [ ("\\.\\.\\.", "…"),
                                ("(?<=…)[.][.]", "…"),
                                ("…[.](?![.])", "…") ],
    "ts_n_dash_middle":       [ (" [-—] ", " – "),
                                (" [-—],", " –,") ],
    "ts_m_dash_middle":       [ (" [-–] ", " — "),
                                (" [-–],", " —,") ],
    "ts_n_dash_start":        [ ("^[-—][  ]", "– "),
                                ("^– ", "– "),
                                ("^[-–—](?=[\\w.…])", "– ") ],
    "ts_m_dash_start":        [ ("^[-–][  ]", "— "),
                                ("^— ", "— "),
                                ("^«[  ][—–-][  ]", "« — "),
                                ("^[-–—](?=[\\w.…])", "— ") ],
    # group references must be written \\1 here: "$1" is not expanded by Python
    "ts_quotation_marks":     [ ('"(\\w+)"', "“\\1”"),
                                ("''(\\w+)''", "“\\1”"),
                                ("'(\\w+)'", "“\\1”"),
                                ("^(?:\"|'')(?=\\w)", "« "),
                                (" (?:\"|'')(?=\\w)", " « "),
                                ("\\((?:\"|'')(?=\\w)", "(« "),
                                ("(?<=\\w)(?:\"|'')$", " »"),
                                ("(?<=\\w)(?:\"|'')(?=[] ,.:;?!…)])", " »"),
                                ('(?<=[.!?…])" ', " » "),
                                ('(?<=[.!?…])"$', " »") ],
    "ts_spell":               [ ("coeur", "cœur"), ("Coeur", "Cœur"),
                                ("coel(?=[aeio])", "cœl"), ("Coel(?=[aeio])", "Cœl"),
                                ("choeur", "chœur"), ("Choeur", "Chœur"),
                                ("foet", "fœt"), ("Foet", "Fœt"),
                                ("oeil", "œil"), ("Oeil", "Œil"),
                                ("oeno", "œno"), ("Oeno", "Œno"),
                                ("oesoph", "œsoph"), ("Oesoph", "Œsoph"),
                                ("oestro", "œstro"), ("Oestro", "Œstro"),
                                ("oeuf", "œuf"), ("Oeuf", "Œuf"),
                                ("oeuvr", "œuvr"), ("Oeuvr", "Œuvr"),
                                ("moeur", "mœur"), ("Moeur", "Mœur"),
                                ("noeu", "nœu"), ("Noeu", "Nœu"),
                                ("soeur", "sœur"), ("Soeur", "Sœur"),
                                ("voeu", "vœu"), ("Voeu", "Vœu"),
                                ("aequo", "æquo"), ("Aequo", "Æquo"),
                                ("\\bCa\\b", "Ça"), (" ca\\b", " ça"),
                                ("\\bdej[aà]\\b", "déjà"), ("\\bplutot\\b", "plutôt"),
                                ("\\bmeme\\b", "même"), ("\\bmemes\\b", "mêmes"), ("\\bMeme\\b", "Même"),
                                ("\\b([cC]e(?:ux|lles?|lui))-la\\b", "\\1-là"),
                                ("\\bmalgre\\b", "malgré"), ("\\bMalgre\\b", "Malgré"),
                                ("\\betre\\b", "être"), ("\\bEtre\\b", "Être"),
                                ("\\btres\\b", "très"), ("\\bTres\\b", "Très"),
                                ("\\bEtai([ts]|ent)\\b", "Étai\\1"),
                                ("\\bE(tat|cole|crit|poque|tude|ducation|glise|conomi(?:qu|)e|videmment|lysée|tienne|thiopie|cosse|gypt(?:e|ien)|rythrée|pinal|vreux)", "É\\1") ],
    # Ligatures are written as escapes so that the pattern and the replacement
    # cannot be confused: U+FB00 ff, U+FB01 fi, U+FB02 fl, U+FB03 ffi, U+FB04 ffl,
    # U+FB05 long s + t, U+FB06 st.
    "ts_ligature_ffi_on":       [("ffi", "\uFB03")],
    "ts_ligature_ffl_on":       [("ffl", "\uFB04")],
    "ts_ligature_fi_on":        [("fi", "\uFB01")],
    "ts_ligature_fl_on":        [("fl", "\uFB02")],
    "ts_ligature_ff_on":        [("ff", "\uFB00")],
    "ts_ligature_ft_on":        [("ft", "\uFB05")],
    "ts_ligature_st_on":        [("st", "\uFB06")],
    "ts_ligature_fi_off":       [("\uFB01", "fi")],
    "ts_ligature_fl_off":       [("\uFB02", "fl")],
    "ts_ligature_ff_off":       [("\uFB00", "ff")],
    "ts_ligature_ffi_off":      [("\uFB03", "ffi")],
    "ts_ligature_ffl_off":      [("\uFB04", "ffl")],
    "ts_ligature_ft_off":       [("\uFB05", "ft")],
    "ts_ligature_st_off":       [("\uFB06", "st")],
    "ts_units":               [ ("\\bN\\.([ms])\\b", "N·\\1"), # N·m and N·m-1, N·s
                                ("\\bW\\.h\\b", "W·h"),
                                ("\\bPa\\.s\\b", "Pa·s"),
                                ("\\bA\\.h\\b", "A·h"),
                                ("\\bΩ\\.m\\b", "Ω·m"),
                                ("\\bS\\.m\\b", "S·m"),
                                ("\\bg\\.s(?=-1)\\b", "g·s"),
                                ("\\bm\\.s(?=-[12])\\b", "m·s"),
                                ("\\bg\\.m(?=2|-3)\\b", "g·m"),
                                ("\\bA\\.m(?=-1)\\b", "A·m"),
                                ("\\bJ\\.K(?=-1)\\b", "J·K"),
                                ("\\bW\\.m(?=-2)\\b", "W·m"),
                                ("\\bcd\\.m(?=-2)\\b", "cd·m"),
                                ("\\bC\\.kg(?=-1)\\b", "C·kg"),
                                ("\\bH\\.m(?=-1)\\b", "H·m"),
                                ("\\bJ\\.kg(?=-1)\\b", "J·kg"),
                                ("\\bJ\\.m(?=-3)\\b", "J·m"),
                                ("\\bm[2²]\\.s\\b", "m²·s"),
                                ("\\bm[3³]\\.s(?=-1)\\b", "m³·s"),
                                #("\\bJ.kg-1.K-1\\b", "J·kg-1·K-1"),
                                #("\\bW.m-1.K-1\\b", "W·m-1·K-1"),
                                #("\\bW.m-2.K-1\\b", "W·m-2·K-1"),
                                ("\\b(Y|Z|E|P|T|G|M|k|h|da|d|c|m|µ|n|p|f|a|z|y)Ω\\b", "\\1Ω") ],
    ## misc
    "ordinals_exponant":      [ ("\\b([0-9]+)(?:i?[èe]me|è|e)\\b", "\\1ᵉ"),
                                ("\\b([XVICL]+)(?:i?[èe]me|è)\\b", "\\1ᵉ"),
                                ("(?<=\\b(au|l[ea]|du) [XVICL])e\\b", "ᵉ"),
                                ("(?<=\\b[XVI])e(?= siècle)", "ᵉ"),
                                ("(?<=\\b[1I])er\\b", "ᵉʳ"),
                                ("(?<=\\b[1I])re\\b", "ʳᵉ") ],
    "ordinals_no_exponant":   [ ("\\b([0-9]+)(?:i?[èe]me|è)\\b", "\\1e"),
                                ("\\b([XVICL]+)(?:i?[èe]me|è)\\b", "\\1e"),
                                ("(?<=\\b[1I])ᵉʳ\\b", "er"),
                                ("(?<=\\b[1I])ʳᵉ\\b", "re")],  # superscript "re" -> "re" (1re), not "er"
    "etc":                    [ ("etc(…|[.][.][.]?)", "etc."),
                                ("(?<!,) etc[.]", ", etc.") ],
    ## missing hyphens
    "mh_interrogatives":      [ ("[ -]t[’'](?=il\\b|elle|on\\b)", "-t-"),
                                (" t-(?=il|elle|on)", "-t-"),
                                ("[ -]t[’'-](?=ils|elles)", "-"),
                                ("(?<=[td])-t-(?=il|elle|on)", "-") ],
    "mh_numbers": [ ("dix (sept|huit|neuf)", "dix-\\1"),
                    ("quatre vingt", "quatre-vingt"),
                    ("(soixante|quatre-vingt) dix", "\\1-dix"),
                    ("(vingt|trente|quarante|cinquante|soixante(?:-dix|)|quatre-vingt(?:-dix|)) (deux|trois|quatre|cinq|six|sept|huit|neuf)\\b", "\\1-\\2")],
    "mh_frequent_words":      [ ("(?i)ce(lles?|lui|ux) (ci|là)\\b", "ce\\1-\\2"),
                                ("(?i)(?<!-)\\b(ci) (joint|desso?us|contre|devant|avant|après|incluse|g[îi]t|gisent)", "\\1-\\2"),
                                ("vis à vis", "vis-à-vis"),
                                ("Vis à vis", "Vis-à-vis"),
                                ("week end", "week-end"),
                                ("Week end", "Week-end"),
                                ("(?i)(plus|moins) value", "\\1-value") ],
    ## missing apostrophes
    "ma_word":                  [("(?i)(qu|lorsqu|puisqu|quoiqu|presqu|jusqu|aujourd|entr|quelqu|prud) ", "\\1’")],
    "ma_1letter_lowercase":     [("\\b([ldjnmtscç]) (?=[aàeéêiîoôuyhAÀEÉÊIÎOÔUYH])", "\\1’")],
    "ma_1letter_uppercase":     [("\\b([LDJNMTSCÇ]) (?=[aàeéêiîoôuyhAÀEÉÊIÎOÔUYH])", "\\1’")]
}
188
189
# Default on/off state of every formatting option.
# Each keyword is a key of dReplTable. Insertion order is kept on purpose:
# TextFormatter.formatText walks this mapping in order when applying the rules.
dDefaultOptions = dict(
    # units
    ts_units=True,
    # supernumerary spaces
    start_of_paragraph=True,
    end_of_paragraph=True,
    between_words=True,
    before_punctuation=True,
    within_parenthesis=True,
    within_square_brackets=True,
    within_quotation_marks=True,
    # non-breaking spaces
    nbsp_before_punctuation=True,
    nbsp_within_quotation_marks=True,
    nbsp_within_numbers=True,
    # narrow non-breaking spaces
    nnbsp_before_punctuation=False,
    nnbsp_within_quotation_marks=False,
    nnbsp_within_numbers=False,
    # common non-breaking spaces
    nbsp_titles=False,
    nbsp_before_symbol=True,
    nbsp_before_units=True,
    nbsp_repair=True,
    # missing spaces
    add_space_after_punctuation=True,
    add_space_around_hyphens=True,
    add_space_repair=True,
    # erase
    erase_non_breaking_hyphens=False,
    # typographic signs
    ts_apostrophe=True,
    ts_ellipsis=True,
    ts_n_dash_middle=True,
    ts_m_dash_middle=False,
    ts_n_dash_start=False,
    ts_m_dash_start=True,
    ts_quotation_marks=True,
    ts_spell=True,
    # ligatures
    ts_ligature_ffi_on=False,
    ts_ligature_ffl_on=False,
    ts_ligature_fi_on=False,
    ts_ligature_fl_on=False,
    ts_ligature_ff_on=False,
    ts_ligature_ft_on=False,
    ts_ligature_st_on=False,
    ts_ligature_fi_off=False,
    ts_ligature_fl_off=False,
    ts_ligature_ff_off=False,
    ts_ligature_ffi_off=False,
    ts_ligature_ffl_off=False,
    ts_ligature_ft_off=False,
    ts_ligature_st_off=False,
    # misc
    ordinals_exponant=False,
    ordinals_no_exponant=True,
    etc=True,
    # missing hyphens
    mh_interrogatives=True,
    mh_numbers=True,
    mh_frequent_words=True,
    # missing apostrophes
    ma_word=True,
    ma_1letter_lowercase=False,
    ma_1letter_uppercase=False,
)
245
246
class TextFormatter:
    "Text Formatter: purge typographic mistakes from text"

    def __init__ (self):
        # Compile every (pattern, replacement) pair of the module-level table in
        # place. re.compile() returns an already compiled pattern unchanged, so
        # creating several TextFormatter instances is safe.
        for lTup in dReplTable.values():
            lTup[:] = [ (re.compile(sPattern), sRepl)  for sPattern, sRepl in lTup ]

    def formatText (self, sText, dOpt=None):
        """returns formatted text

        sText: the text to clean up.
        dOpt:  optional dict {option name: bool} overriding the default options.
               Options it does not mention keep their default value; names that
               are not in the default options are ignored.
        """
        if dOpt is None:
            dOpt = {}
        # Always walk the default options so that the rules are applied in the
        # same order whatever the caller passes.
        for sOptName, bDefault in dDefaultOptions.items():
            if dOpt.get(sOptName, bDefault):
                for zRgx, sRep in dReplTable[sOptName]:
                    sText = zRgx.sub(sRep, sText)
        return sText

    def getDefaultOptions (self):
        "returns a copy of the default options (safe to modify and pass to formatText)"
        return dDefaultOptions.copy()
266