1# coding=utf-8
2
3from __future__ import absolute_import
4from __future__ import unicode_literals
5import re
6
7from subzero.language import Language
8from subzero.modification.mods import SubtitleTextModification, empty_line_post_processors, SubtitleModification
9from subzero.modification.processors import FuncProcessor
10from subzero.modification.processors.re_processor import NReProcessor
11from subzero.modification import registry
12from tld import get_tld
13
14
# Language object for English ("eng"); compared against a processor's language below to gate the
# English-only lowercase-"i" fix.
ENGLISH = Language("eng")
16
17
class CommonFixes(SubtitleTextModification):
    # Collection of regex based clean-ups for whitespace, punctuation, quotes, dashes and a few
    # character mix-ups in subtitle text.
    # NOTE(review): the processors are presumably applied in list order, so later patterns may rely on
    # the normalisation done by earlier ones -- keep the sequence intact.
    identifier = "common"
    description = "Basic common fixes"
    exclusive = True
    order = 40

    long_description = "Fix common and whitespace/punctuation issues in subtitles"

    processors = [
        # map the various unicode hyphen characters onto a plain ASCII hyphen
        NReProcessor(
            re.compile(r'(?u)([‑‐﹘﹣])'),
            u"-",
            name="CM_hyphens",
        ),

        # "--", "- -" or "---" becomes an em dash; whatever preceded it (group 1) is kept
        NReProcessor(
            re.compile(r'(?u)(\w|\b|\s|^)(-\s?-{1,2})'),
            r"\1—",
            name="CM_multidash",
        ),

        # blank out lines that contain no word characters, only symbols such as - _ . : < > ~ " '
        NReProcessor(
            re.compile(r'(?u)(^\W*[-_.:<>~"\']+\W*$)'),
            "",
            name="CM_non_word_only",
        ),

        # strip a leading ">>" (with optional single leading whitespace and any trailing whitespace)
        NReProcessor(
            re.compile(r'(?u)^\s?>>\s*'),
            "",
            name="CM_leading_crocodiles",
        ),

        # strip a leading ":" (plus surrounding non-word characters/whitespace) when text follows it
        NReProcessor(
            re.compile(r'(?u)(^\W*:\s*(?=\w+))'),
            "",
            name="CM_empty_colon_start",
        ),

        # turn * # ¶ markers into a music note: "♪ " when the marker opens the line (group 1),
        # " ♪" when it closes the line (group 2)
        NReProcessor(
            re.compile(r'(?u)(^[-\s>~]*[*#¶]+\s+)|(\s*[*#¶]+\s*$)'),
            lambda found: u" ♪" if not found.group(1) else u"♪ ",
            name="CM_music_symbols",
        ),

        # two or more consecutive apostrophe-like characters collapse into one double quote
        NReProcessor(
            re.compile(r'(?u)([\'’ʼ❜‘‛][\'’ʼ❜‘‛]+)'),
            u'"',
            name="CM_double_apostrophe",
        ),

        # a double quote squeezed between two letters is turned into an apostrophe
        NReProcessor(
            re.compile(r'(?u)([A-zÀ-ž])"([A-zÀ-ž])'),
            r"\1'\2",
            name="CM_double_as_single",
        ),

        # a run of two or more double-quote characters becomes a single ASCII double quote; a trailing
        # space is kept only if the matched run ended with one
        NReProcessor(
            re.compile(r'(?u)(\s*["”“‟„])\s*(["”“‟„]["”“‟„\s]*)'),
            lambda found: '" ' if found.group(2).endswith(" ") else '"',
            name="CM_normalize_quotes",
        ),

        # map every apostrophe-like character onto the ASCII apostrophe
        NReProcessor(
            re.compile(r'(?u)([\'’ʼ❜‘‛])'),
            u"'",
            name="CM_normalize_squotes",
        ),

        # drop an ellipsis (and the whitespace after it) at the very start
        NReProcessor(
            re.compile(r'(?u)^\.\.\.[\s]*'),
            "",
            name="CM_leading_ellipsis",
        ),

        # blank out text containing "downloaded from" (case-insensitive) with content on both sides
        NReProcessor(
            re.compile(r'(?ui).+downloaded\s+from.+'),
            "",
            name="CM_crap",
        ),

        # insert a space after an ellipsis unless whitespace, punctuation, a quote or the end follows
        NReProcessor(
            re.compile(r'(?u)\.\.\.(?![\s.,!?\'"])(?!$)'),
            "... ",
            name="CM_ellipsis_no_space",
        ),

        # insert a space in front of a spaced ellipsis (". . .") that directly follows a character
        NReProcessor(
            re.compile(r'(?u)(?<=[^\s])(?<!\s)\. \. \.'),
            " . . .",
            name="CM_ellipsis_no_space2",
        ),

        # squash runs of two or more whitespace characters into one space
        NReProcessor(
            re.compile(r'(?u)[\s]{2,}'),
            " ",
            name="CM_multiple_spaces",
        ),

        # three or more dots in a row become exactly three
        NReProcessor(
            re.compile(r'(?u)\.{3,}'),
            "...",
            name="CM_dots",
        ),

        # a leading dash gets a space after it unless whitespace or another dash follows
        NReProcessor(
            re.compile(r'(?u)^-(?![\s-])'),
            "- ",
            name="CM_dash_space",
        ),

        # strip leading dots/whitespace that are followed by whitespace, leaving real ellipses alone
        NReProcessor(
            re.compile(r'(?u)^(?!\s?(\.\s\.\s\.)|(\s?\.{3}))(?=\.+\s+)[\s.]*'),
            "",
            name="CM_starting_spacedots",
        ),

        # disabled: space missing before doublequote
        # ReProcessor(re.compile(r'(?u)(?<!^)(?<![\s(\["])("[^"]+")'), r' \1', name="CM_space_before_dblquote"),

        # disabled: space missing after doublequote
        # ReProcessor(re.compile(r'(?u)("[^"\s][^"]+")([^\s.,!?)\]]+)'), r"\1 \2", name="CM_space_after_dblquote"),

        # open question from the original author: space before ending doublequote?

        # uppercase "I" characters that follow lowercase letters are swapped for the same number of
        # lowercase "l" characters
        NReProcessor(
            re.compile(r'(?u)([a-zà-ž]+)(I+)'),
            lambda found: found.group(1) + "l" * len(found.group(2)),
            name="CM_uppercase_i_in_word",
        ),

        # join numbers that were split by whitespace (punctuation , . : ' is tolerated; comma/dot only
        # after the space, as those may be countdowns otherwise); ellipses are not broken up.
        # The space is removed only when the match contains exactly one space.
        NReProcessor(
            re.compile(r'(?u)(\b[0-9]+[0-9:\']*(?<!\.\.)\s+(?!\.\.)[0-9,.:\'\s]*(?=[0-9]+)[0-9,.:\'])'),
            lambda found: (
                found.group(1)
                if found.group(1).count(" ") != 1
                else found.group(1).replace(" ", "")
            ),
            name="CM_spaces_in_numbers",
        ),

        # uppercase the lowercase letter (group 2) that follows "<text>. " (group 1)
        NReProcessor(
            re.compile(r'(?u)((?<!(?=\s*[A-ZÀ-Ž-_0-9.]\s*))(?:[^.\s])+\.\s+)([a-zà-ž])'),
            lambda found: found.group(1) + found.group(2).upper(),
            name="CM_uppercase_after_dot",
        ),

        # collapse stacked punctuation: keep the first mark (stripped of whitespace) and preserve a
        # trailing space only if the matched run ended with one
        NReProcessor(
            re.compile(r'(?u)(\s*[,!?])\s*([,.!?][,.!?\s]*)'),
            lambda found: found.group(1).strip() + ("" if not found.group(2).endswith(" ") else " "),
            name="CM_double_interpunct",
        ),

        # drop spaces in front of a punctuation mark; spaced ellipses are left untouched
        NReProcessor(
            re.compile(r'(?u)(?:(?<=^)|(?<=\w)) +([!?.,](?![!?.,]| \.))'),
            r"\1",
            name="CM_punctuation_space",
        ),

        # insert a space after punctuation that is directly followed by 2+ letters, unless get_tld()
        # finds a top-level domain in the whole match, in which case it is returned unchanged
        NReProcessor(
            re.compile(r'(?u)(([^\s]*)([!?.,:])([A-zÀ-ž]{2,}))'),
            lambda found: (
                found.group(1)
                if get_tld(found.group(1), fail_silently=True, fix_protocol=True)
                else u"%s%s %s" % (found.group(2), found.group(3), found.group(4))
            ),
            name="CM_punctuation_space2",
        ),

        # a standalone lowercase "i" becomes "I"; only enabled when the language equals ENGLISH
        NReProcessor(
            re.compile(r'(?u)(\b)i(\b)'),
            r"\1I\2",
            name="CM_EN_lowercase_i",
            supported=lambda proc: proc.language == ENGLISH,
        ),
    ]

    post_processors = empty_line_post_processors
129
130
class RemoveTags(SubtitleModification):
    # Whole-file modification that strips style tags by round-tripping every entry through its
    # plaintext representation.
    identifier = "remove_tags"
    description = "Remove all style tags"
    exclusive = True
    modifies_whole_file = True

    long_description = "Removes all possible style tags from the subtitle, such as font, bold, color etc."

    def modify(self, content, debug=False, parent=None, **kwargs):
        """Rewrite each entry of ``parent.f`` with its own plaintext.

        ``content`` and ``debug`` are accepted but not used here. ``parent`` must be supplied and must
        expose an iterable ``f`` of entries that have a ``plaintext`` attribute.
        """
        for line in parent.f:
            # reading plaintext yields the tag-free text; writing it back stores that text on the entry
            # (per the original author's note this also turns \n back into \N)
            text_only = line.plaintext
            line.plaintext = text_only
143
144
class ReverseRTL(SubtitleModification):
    # Moves leading punctuation to the end of the text (and a trailing dash to the front) for the
    # right-to-left languages listed in `languages`.
    identifier = "reverse_rtl"
    description = "Reverse punctuation in RTL languages"
    exclusive = True
    order = 50
    # language codes this modification is declared for (see long_description for the plain names)
    languages = [Language(lang_code) for lang_code in ('heb', 'ara', 'fas')]

    long_description = (
        "Some playback devices don't properly handle right-to-left markers for punctuation. "
        "Physically swap punctuation. Applicable to languages: hebrew, arabic, farsi, persian"
    )

    processors = [
        # alternative patterns tried by the original author, kept for reference:
        #   new? (?u)(^([\s.!?]*)(.+?)(\s*)(-?\s*)$); \5\4\3\2
        #   NReProcessor(re.compile(r"(?u)((?=(?<=\b|^)|(?<=\s))([.!?-]+)([^.!?-]+)(?=\b|$|\s))"), r"\3\2",
        #                name="CM_RTL_reverse")
        #
        # active pattern: group 2 = leading whitespace/punctuation, group 3 = the text itself,
        # group 4 = trailing whitespace, group 5 = optional trailing dash plus whitespace;
        # the replacement emits them in the order 5, 4, 3, 2
        NReProcessor(
            re.compile(r"(?u)(^([\s.!?:,'-]*)(.+?)(\s*)(-?\s*)$)"),
            r"\5\4\3\2",
            name="CM_RTL_reverse",
        ),
    ]
162
163
# Delimiters at which capitalisation restarts: one of . ! ? ♪ - together with any surrounding
# whitespace. The capture group makes re.split() keep the delimiters in its result, so joining the
# pieces again reproduces the full text.
split_upper_re = re.compile(r"(\s*[.!?♪\-]\s*)")


class FixUppercase(SubtitleModification):
    # Whole-file modification that re-cases the text of every entry, segment by segment.
    # NOTE(review): `only_uppercase` / `apply_last` are interpreted by the modification framework,
    # which is not visible in this file.
    identifier = "fix_uppercase"
    description = "Fixes all-uppercase subtitles"
    modifies_whole_file = True
    exclusive = True
    order = 41
    only_uppercase = True
    apply_last = True

    long_description = "Some subtitles are in all-uppercase letters. This at least makes them readable."

    def capitalize(self, c):
        """Return ``c`` with every delimiter-separated segment passed through ``str.capitalize()``.

        The text is split with ``split_upper_re``; each piece (delimiters included) is capitalized,
        i.e. first character uppercased and the rest lowercased, and the pieces are joined back.
        """
        pieces = split_upper_re.split(c)
        return u"".join(piece.capitalize() for piece in pieces)

    def modify(self, content, debug=False, parent=None, **kwargs):
        """Replace the plaintext of each entry in ``parent.f`` with its re-cased version.

        ``content`` and ``debug`` are accepted but not used here. ``parent`` must be supplied and must
        expose an iterable ``f`` of entries that have a ``plaintext`` attribute.
        """
        recase = self.capitalize
        for line in parent.f:
            line.plaintext = recase(line.plaintext)
184
185
# Register every modification defined in this module, in the same order as they are declared.
for _modification in (CommonFixes, RemoveTags, ReverseRTL, FixUppercase):
    registry.register(_modification)
del _modification  # do not leave the loop variable behind in the module namespace
190