1# coding=utf-8 2 3from __future__ import absolute_import 4from __future__ import unicode_literals 5import re 6 7from subzero.language import Language 8from subzero.modification.mods import SubtitleTextModification, empty_line_post_processors, SubtitleModification 9from subzero.modification.processors import FuncProcessor 10from subzero.modification.processors.re_processor import NReProcessor 11from subzero.modification import registry 12from tld import get_tld 13 14 15ENGLISH = Language("eng") 16 17 18class CommonFixes(SubtitleTextModification): 19 identifier = "common" 20 description = "Basic common fixes" 21 exclusive = True 22 order = 40 23 24 long_description = "Fix common and whitespace/punctuation issues in subtitles" 25 26 processors = [ 27 # normalize hyphens 28 NReProcessor(re.compile(r'(?u)([‑‐﹘﹣])'), u"-", name="CM_hyphens"), 29 30 # -- = em dash 31 NReProcessor(re.compile(r'(?u)(\w|\b|\s|^)(-\s?-{1,2})'), r"\1—", name="CM_multidash"), 32 33 # line = _/-/\s 34 NReProcessor(re.compile(r'(?u)(^\W*[-_.:<>~"\']+\W*$)'), "", name="CM_non_word_only"), 35 36 # remove >> 37 NReProcessor(re.compile(r'(?u)^\s?>>\s*'), "", name="CM_leading_crocodiles"), 38 39 # line = : text 40 NReProcessor(re.compile(r'(?u)(^\W*:\s*(?=\w+))'), "", name="CM_empty_colon_start"), 41 42 # fix music symbols 43 NReProcessor(re.compile(r'(?u)(^[-\s>~]*[*#¶]+\s+)|(\s*[*#¶]+\s*$)'), 44 lambda x: u"♪ " if x.group(1) else u" ♪", 45 name="CM_music_symbols"), 46 47 # '' = " 48 NReProcessor(re.compile(r'(?u)([\'’ʼ❜‘‛][\'’ʼ❜‘‛]+)'), u'"', name="CM_double_apostrophe"), 49 50 # double quotes instead of single quotes inside words 51 NReProcessor(re.compile(r'(?u)([A-zÀ-ž])"([A-zÀ-ž])'), r"\1'\2", name="CM_double_as_single"), 52 53 # normalize quotes 54 NReProcessor(re.compile(r'(?u)(\s*["”“‟„])\s*(["”“‟„]["”“‟„\s]*)'), 55 lambda match: '"' + (" " if match.group(2).endswith(" ") else ""), 56 name="CM_normalize_quotes"), 57 58 # normalize single quotes 59 NReProcessor(re.compile(r'(?u)([\'’ʼ❜‘‛])'), u"'", name="CM_normalize_squotes"), 60 61 # remove leading ... 62 NReProcessor(re.compile(r'(?u)^\.\.\.[\s]*'), "", name="CM_leading_ellipsis"), 63 64 # remove "downloaded from" tags 65 NReProcessor(re.compile(r'(?ui).+downloaded\s+from.+'), "", name="CM_crap"), 66 67 # no space after ellipsis 68 NReProcessor(re.compile(r'(?u)\.\.\.(?![\s.,!?\'"])(?!$)'), "... ", name="CM_ellipsis_no_space"), 69 70 # no space before spaced ellipsis 71 NReProcessor(re.compile(r'(?u)(?<=[^\s])(?<!\s)\. \. \.'), " . . .", name="CM_ellipsis_no_space2"), 72 73 # multiple spaces 74 NReProcessor(re.compile(r'(?u)[\s]{2,}'), " ", name="CM_multiple_spaces"), 75 76 # more than 3 dots 77 NReProcessor(re.compile(r'(?u)\.{3,}'), "...", name="CM_dots"), 78 79 # no space after starting dash 80 NReProcessor(re.compile(r'(?u)^-(?![\s-])'), "- ", name="CM_dash_space"), 81 82 # remove starting spaced dots (not matching ellipses) 83 NReProcessor(re.compile(r'(?u)^(?!\s?(\.\s\.\s\.)|(\s?\.{3}))(?=\.+\s+)[\s.]*'), "", 84 name="CM_starting_spacedots"), 85 86 # space missing before doublequote 87 # ReProcessor(re.compile(r'(?u)(?<!^)(?<![\s(\["])("[^"]+")'), r' \1', name="CM_space_before_dblquote"), 88 89 # space missing after doublequote 90 # ReProcessor(re.compile(r'(?u)("[^"\s][^"]+")([^\s.,!?)\]]+)'), r"\1 \2", name="CM_space_after_dblquote"), 91 92 # space before ending doublequote? 93 94 # replace uppercase I with lowercase L in words 95 NReProcessor(re.compile(r'(?u)([a-zà-ž]+)(I+)'), 96 lambda match: r'%s%s' % (match.group(1), "l" * len(match.group(2))), 97 name="CM_uppercase_i_in_word"), 98 99 # fix spaces in numbers (allows for punctuation: ,.:' (comma/dot only fixed if after space, those may be 100 # countdowns otherwise); don't break up ellipses 101 NReProcessor( 102 re.compile(r'(?u)(\b[0-9]+[0-9:\']*(?<!\.\.)\s+(?!\.\.)[0-9,.:\'\s]*(?=[0-9]+)[0-9,.:\'])'), 103 lambda match: match.group(1).replace(" ", "") if match.group(1).count(" ") == 1 else match.group(1), 104 name="CM_spaces_in_numbers"), 105 106 # uppercase after dot 107 NReProcessor(re.compile(r'(?u)((?<!(?=\s*[A-ZÀ-Ž-_0-9.]\s*))(?:[^.\s])+\.\s+)([a-zà-ž])'), 108 lambda match: r'%s%s' % (match.group(1), match.group(2).upper()), name="CM_uppercase_after_dot"), 109 110 # remove double interpunction 111 NReProcessor(re.compile(r'(?u)(\s*[,!?])\s*([,.!?][,.!?\s]*)'), 112 lambda match: match.group(1).strip() + (" " if match.group(2).endswith(" ") else ""), 113 name="CM_double_interpunct"), 114 115 # remove spaces before punctuation; don't break spaced ellipses 116 NReProcessor(re.compile(r'(?u)(?:(?<=^)|(?<=\w)) +([!?.,](?![!?.,]| \.))'), r"\1", name="CM_punctuation_space"), 117 118 # add space after punctuation 119 NReProcessor(re.compile(r'(?u)(([^\s]*)([!?.,:])([A-zÀ-ž]{2,}))'), 120 lambda match: u"%s%s %s" % (match.group(2), match.group(3), match.group(4)) if not get_tld(match.group(1), fail_silently=True, fix_protocol=True) else match.group(1), 121 name="CM_punctuation_space2"), 122 123 # fix lowercase I in english 124 NReProcessor(re.compile(r'(?u)(\b)i(\b)'), r"\1I\2", name="CM_EN_lowercase_i", 125 supported=lambda p: p.language == ENGLISH), 126 ] 127 128 post_processors = empty_line_post_processors 129 130 131class RemoveTags(SubtitleModification): 132 identifier = "remove_tags" 133 description = "Remove all style tags" 134 exclusive = True 135 modifies_whole_file = True 136 137 long_description = "Removes all possible style tags from the subtitle, such as font, bold, color etc." 138 139 def modify(self, content, debug=False, parent=None, **kwargs): 140 for entry in parent.f: 141 # this actually plaintexts the entry and by re-assigning it to plaintext, it replaces \n with \N again 142 entry.plaintext = entry.plaintext 143 144 145class ReverseRTL(SubtitleModification): 146 identifier = "reverse_rtl" 147 description = "Reverse punctuation in RTL languages" 148 exclusive = True 149 order = 50 150 languages = [Language(l) for l in ('heb', 'ara', 'fas')] 151 152 long_description = "Some playback devices don't properly handle right-to-left markers for punctuation. " \ 153 "Physically swap punctuation. Applicable to languages: hebrew, arabic, farsi, persian" 154 155 processors = [ 156 # new? (?u)(^([\s.!?]*)(.+?)(\s*)(-?\s*)$); \5\4\3\2 157 #NReProcessor(re.compile(r"(?u)((?=(?<=\b|^)|(?<=\s))([.!?-]+)([^.!?-]+)(?=\b|$|\s))"), r"\3\2", 158 # name="CM_RTL_reverse") 159 NReProcessor(re.compile(r"(?u)(^([\s.!?:,'-]*)(.+?)(\s*)(-?\s*)$)"), r"\5\4\3\2", 160 name="CM_RTL_reverse") 161 ] 162 163 164split_upper_re = re.compile(r"(\s*[.!?♪\-]\s*)") 165 166 167class FixUppercase(SubtitleModification): 168 identifier = "fix_uppercase" 169 description = "Fixes all-uppercase subtitles" 170 modifies_whole_file = True 171 exclusive = True 172 order = 41 173 only_uppercase = True 174 apply_last = True 175 176 long_description = "Some subtitles are in all-uppercase letters. This at least makes them readable." 177 178 def capitalize(self, c): 179 return u"".join([s.capitalize() for s in split_upper_re.split(c)]) 180 181 def modify(self, content, debug=False, parent=None, **kwargs): 182 for entry in parent.f: 183 entry.plaintext = self.capitalize(entry.plaintext) 184 185 186registry.register(CommonFixes) 187registry.register(RemoveTags) 188registry.register(ReverseRTL) 189registry.register(FixUppercase) 190