1#
2# Copyright 2004-2011 Zuza Software Foundation
3# 2013, 2016 F Wolff
4#
5# This file is part of translate.
6#
7# translate is free software; you can redistribute it and/or modify
8# it under the terms of the GNU General Public License as published by
9# the Free Software Foundation; either version 2 of the License, or
10# (at your option) any later version.
11#
12# translate is distributed in the hope that it will be useful,
13# but WITHOUT ANY WARRANTY; without even the implied warranty of
14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15# GNU General Public License for more details.
16#
17# You should have received a copy of the GNU General Public License
18# along with this program; if not, see <http://www.gnu.org/licenses/>.
19
20"""This is a set of validation checks that can be performed on translation
21units.
22
23Derivatives of UnitChecker (like StandardUnitChecker) check translation units,
24and derivatives of TranslationChecker (like StandardChecker) check
25(source, target) translation pairs.
26
When adding a new test here, please document and explain its behaviour on the
:doc:`pofilter tests </commands/pofilter_tests>` page.
29"""
30
31import logging
32import re
33
34from translate.filters import decoration, helpers, prefilters, spelling
35from translate.filters.decorators import cosmetic, critical, extraction, functional
36from translate.lang import data, factory
37
38
39logger = logging.getLogger(__name__)
40
41# These are some regular expressions that are compiled for use in some tests
42
# printf syntax based on http://en.wikipedia.org/wiki/Printf, which doesn't
# cover everything. We leave \w instead of specifying the exact letters, as
# this should capture printf types defined on other platforms.
# Extended to support Python named format specifiers and the Objective-C
# special "%@" format specifier
# (see https://developer.apple.com/library/mac/documentation/Cocoa/Conceptual/Strings/Articles/formatSpecifiers.html)
printf_pat = re.compile(
    r"""
        %(                          # initial %
        (?P<boost_ord>\d+)%         # boost::format style variable order, like %1%
        |
              (?:(?P<ord>\d+)\$|    # variable order, like %1$s
              \((?P<key>\w+)\))?    # Python style variables, like %(var)s
        (?P<fullvar>
            [+#-]*                  # flags
            (?:\d+)?                # width
            (?:\.\d+)?              # precision
            (hh|ll|h|l)?            # length modifier; two-letter forms are tried first
            (?P<type>[\w@]))        # type (%s, %d, etc.)
        )""",
    re.VERBOSE,
)
65
# The name of the XML tag.
# Group 1 captures the run of word characters and "/" that follows the opening
# "<" (after optional whitespace); group 2 captures an optional "/" found just
# before the closing ">". re.DOTALL lets "." cross line breaks inside the tag.
tagname_re = re.compile(r"<[\s]*([\w\/]*).*?(/)?[\s]*>", re.DOTALL)

# One " name=value" attribute pair, where the value is wrapped in double or
# single quotes. Group 1 is the attribute name and group 2 the value including
# its quotes; groups 3 and 4 are the double- and single-quoted alternatives.
# We allow escaped quotes, probably for old escaping style of OOo helpcontent
# TODO: remove escaped strings once usage is audited
property_re = re.compile(" (\\w*)=((\\\\?\".*?\\\\?\")|(\\\\?'.*?\\\\?'))")

# The whole tag: "<", at least one character that is not ">", then ">".
tag_re = re.compile("<[^>]+>")

# A double-quoted run of lowercase letters and underscores.
gconf_attribute_re = re.compile('"[a-z_]+?"')

# XML/HTML tags in LibreOffice help and readme, exclude short tags
# (the named group "tag" needs at least two characters).
lo_tag_re = re.compile("""</?(?P<tag>[a-z][a-z_-]+)(?: +[a-z]+="[^"]+")* */?>""")
# Tag names kept in an immutable set for membership tests.
# NOTE(review): presumably the LibreOffice tags that need no closing tag - confirm.
lo_emptytags = frozenset(["br", "embed", "embedvar", "object", "help-id-missing"])
81
82
def tagname(string):
    """Return the tag name found in the XML/HTML tag ``string``.

    The name is whatever ``tagname_re`` captures as its first group, followed
    by "/" when the pattern's second group matched a slash before the closing
    ">".
    """
    found = tagname_re.match(string)
    name = found.group(1)
    # Group 2 only takes part in the match when that slash is present.
    closing_slash = found.group(2) or ""
    return name + closing_slash
87
88
def intuplelist(pair, list):
    """Look up the triple ``pair`` == (a, b, c) in ``list``, treating ``None``
    entries in ``list`` as wildcards (only honoured in positions "a" and "c").

    Returns the first matching pattern from ``list``. Returns ``pair`` itself
    when nothing matches, or when ``pair`` is a bare tag name entry (both "b"
    and "c" are ``None``). Position "c" is only looked at once "a" and "b"
    have matched.
    """
    wanted_tag, wanted_name, wanted_value = pair

    # This is a tagname entry: there is no attribute to look up.
    if (wanted_name, wanted_value) == (None, None):
        return pair

    acceptable_heads = [(wanted_tag, wanted_name), (None, wanted_name)]
    acceptable_values = [None, wanted_value]

    for pattern in list:
        tag, name, value = pattern

        if (tag, name) not in acceptable_heads:
            continue

        if value in acceptable_values:
            return pattern

    return pair
108
109
def tagproperties(strings, ignore):
    """Returns all the properties in the XML/HTML tag strings as (tagname,
    propertyname, propertyvalue), but ignore those combinations specified in
    ignore.

    :param strings: iterable of tag strings, each handed to :func:`tagname`
        and scanned with ``property_re``
    :param ignore: list of (tag, property, value) triples to leave out;
        ``None`` entries act as wildcards as implemented by
        :func:`intuplelist`
    :return: list of triples; every tag contributes ``(tag, None, None)``
        followed by one triple for each property that is not ignored
    """
    properties = []

    for string in strings:
        tag = tagname(string)
        properties.append((tag, None, None))

        # Now we isolate the attribute pairs. property_re has four groups; only
        # the name and the quoted value are needed here.
        for name, quoted_value, _double, _single in property_re.findall(string):
            # Strip the quotes:
            candidate = (tag, name, quoted_value[1:-1])

            # intuplelist hands back the matching pattern rather than the
            # triple itself when a wildcard applies, hence the inequality test.
            ignorable = (
                candidate in ignore or intuplelist(candidate, ignore) != candidate
            )

            # Only this property is skipped. The previous code used ``break``
            # here, which also dropped every later property of the same tag.
            if ignorable:
                continue

            properties.append(candidate)

    return properties
139
140
class FilterFailure(Exception):
    """Raised by a filter to signal that it did not pass.

    The explanation is kept in :attr:`messages`, a list of strings.
    """

    def __init__(self, messages):
        # A lone message is accepted for convenience and wrapped in a list.
        wrapped = messages if isinstance(messages, list) else [messages]

        assert isinstance(wrapped[0], str)  # Assumption: all of same type

        self.messages = wrapped

    def __str__(self):
        separator = ", "
        return separator.join(self.messages)
156
157
class SeriousFilterFailure(FilterFailure):
    """A :class:`FilterFailure` for translations that might break an
    application (so the string will be marked fuzzy).
    """
165
166
# (tag, attribute, value) specifies a certain attribute which can be changed/
# ignored if it exists inside tag. In the case where there is a third element
# in the tuple, it indicates a property value that can be ignored if present
# (like defaults, for example)
# If a certain item is None, it indicates that it is relevant for all values of
# the property/tag that is specified as None. A non-None value of "value"
# indicates that the value of the attribute must be taken into account.
# CheckerConfig uses these two lists as the defaults for its ``ignoretags``
# and ``canchangetags`` settings.
common_ignoretags = [(None, "xml-lang", None)]
common_canchangetags = [
    ("img", "alt", None),
    (None, "title", None),
    (None, "dir", None),
    (None, "lang", None),
]
# Actually the title attribute is allowed on many tags in HTML (but probably
# not all)
182
183
class CheckerConfig:
    """Holds the settings that tailor a checker to a project and language."""

    def __init__(
        self,
        targetlanguage=None,
        accelmarkers=None,
        varmatches=None,
        notranslatewords=None,
        musttranslatewords=None,
        validchars=None,
        punctuation=None,
        endpunctuation=None,
        ignoretags=None,
        canchangetags=None,
        criticaltests=None,
        credit_sources=None,
    ):
        # List-valued settings: a missing value becomes an empty list.
        for attribute, supplied in (
            ("accelmarkers", accelmarkers),
            ("varmatches", varmatches),
            ("criticaltests", criticaltests),
            ("credit_sources", credit_sources),
        ):
            setattr(self, attribute, self._init_list(supplied))

        # Language data: the target comes from the caller, the source is
        # always looked up as "en".
        self.updatetargetlanguage(targetlanguage)
        self.sourcelang = factory.getlanguage("en")

        # Settings that fall back on the target language or on the module
        # level defaults when the caller supplied nothing.
        self.punctuation = self._init_default(
            data.normalize(punctuation), self.lang.punctuation
        )
        self.endpunctuation = self._init_default(
            data.normalize(endpunctuation), self.lang.sentenceend
        )
        self.ignoretags = self._init_default(ignoretags, common_ignoretags)
        self.canchangetags = self._init_default(canchangetags, common_canchangetags)

        # Word lists are stored as dictionaries keyed on the normalized word.
        # TODO: allow user configuration of untranslatable words
        self.notranslatewords = {
            data.normalize(word): None for word in self._init_list(notranslatewords)
        }
        self.musttranslatewords = {
            data.normalize(word): None for word in self._init_list(musttranslatewords)
        }

        self.validcharsmap = {}
        self.updatevalidchars(data.normalize(validchars))

    def _init_list(self, list):
        """Initialise a configuration parameter that is a list.

        :type list: List
        :param list: None (a blank list is created) or a list parameter
        :rtype: List
        """
        return [] if list is None else list

    def _init_default(self, param, default):
        """Initialise a parameter that can fall back on a default.

        :param param: the user supplied parameter value
        :param default: value to use when param is not specified
        :return: ``param`` unless it is None, in which case ``default``
        """
        return default if param is None else param

    def update(self, otherconfig):
        """Combines the info in ``otherconfig`` into this config object.

        The target language of ``otherconfig`` wins when it is set. The list
        and dictionary settings are merged, the punctuation settings are
        concatenated, and ``ignoretags``, ``canchangetags`` and
        ``credit_sources`` are replaced outright.
        """
        # updatetargetlanguage stores the code before refreshing the language.
        self.updatetargetlanguage(otherconfig.targetlanguage or self.targetlanguage)

        # Work out the unseen markers before extending, so the comparison is
        # made against the markers held before this call.
        unseen_markers = [
            marker
            for marker in otherconfig.accelmarkers
            if marker not in self.accelmarkers
        ]
        self.accelmarkers.extend(unseen_markers)
        self.varmatches.extend(otherconfig.varmatches)

        self.notranslatewords.update(otherconfig.notranslatewords)
        self.musttranslatewords.update(otherconfig.musttranslatewords)
        self.validcharsmap.update(otherconfig.validcharsmap)

        self.punctuation += otherconfig.punctuation
        self.endpunctuation += otherconfig.endpunctuation

        # TODO: consider also updating in the following cases:
        self.ignoretags = otherconfig.ignoretags
        self.canchangetags = otherconfig.canchangetags
        self.criticaltests.extend(otherconfig.criticaltests)
        self.credit_sources = otherconfig.credit_sources

    def updatevalidchars(self, validchars):
        """Updates the map that eliminates valid characters.

        Returns True without touching the map when ``validchars`` is None.
        """
        if validchars is None:
            return True

        # Every valid character maps to None, keyed on its code point.
        code_points = map(ord, data.normalize(validchars))
        self.validcharsmap.update(dict.fromkeys(code_points))

    def updatetargetlanguage(self, langcode):
        """Updates the target language in the config to the given target
        language and sets its script.

        The script stays an empty string when neither ``langcode`` nor its
        simpler form is listed in ``data.scripts``.
        """
        self.targetlanguage = langcode
        self.lang = factory.getlanguage(langcode)
        # The first script that lists the language wins.
        self.language_script = next(
            (
                script
                for script, langs in data.scripts.items()
                if langcode in langs or data.simplercode(langcode) in langs
            ),
            "",
        )
299
300
def cache_results(f):
    """Decorator that memoises a one-argument method in ``self.results_cache``.

    Results are stored under the key ``(f.__name__, param1)``, so ``param1``
    must be hashable and the instance must expose a ``results_cache`` mapping.
    The wrapper keeps the name and docstring of ``f``.

    :param f: method taking ``(self, param1)``
    :return: the caching wrapper around ``f``
    """
    # Imported here so the decorator does not rely on the module's import block.
    from functools import wraps

    @wraps(f)
    def cached_f(self, param1):
        key = (f.__name__, param1)
        res_cache = self.results_cache

        if key in res_cache:
            return res_cache[key]

        value = f(self, param1)
        res_cache[key] = value
        return value

    return cached_f
314
315
class UnitChecker:
    """Base checker: treats the callables found on derived classes as tests
    and runs them against translation units.
    """

    # Maps a test name to the names of the tests that are skipped once that
    # test has failed (see run_filters).
    preconditions = {}

    def __init__(
        self,
        checkerconfig=None,
        excludefilters=None,
        limitfilters=None,
        errorhandler=None,
    ):
        self.errorhandler = errorhandler

        #: Categories where each checking function falls into
        #: Function names are used as keys, categories are the values
        self.categories = {}

        self.setconfig(CheckerConfig() if checkerconfig is None else checkerconfig)

        # Everything callable that UnitChecker itself provides is
        # infrastructure, so it is recorded here to keep it out of the tests.
        self.helperfunctions = {}

        for name in dir(UnitChecker):
            member = getattr(self, name)

            if callable(member):
                self.helperfunctions[name] = member

        self.defaultfilters = self.getfilters(excludefilters, limitfilters)
        self.results_cache = {}

    def getfilters(self, excludefilters=None, limitfilters=None):
        """Returns dictionary of available filters, including/excluding those
        in the given lists.

        Without ``limitfilters`` every attribute of the checker is a
        candidate. Helper functions, ``errorhandler`` and anything that is not
        callable are always left out.
        """
        # use everything available unless instructed
        candidates = dir(self) if limitfilters is None else limitfilters
        excluded = {} if excludefilters is None else excludefilters

        selected = {}

        for name in candidates:
            if (
                name in excluded
                or name in self.helperfunctions
                or name == "errorhandler"
            ):
                continue

            candidate = getattr(self, name, None)

            if callable(candidate):
                selected[name] = candidate

        return selected

    def setconfig(self, config):
        """Stores ``config`` and builds the accelerator and variable
        prefilters from it.
        """
        self.config = config

        filteraccelerators = prefilters.filteraccelerators
        filtervariables = prefilters.filtervariables

        self.accfilters = [
            filteraccelerators(marker) for marker in self.config.accelmarkers
        ]
        # Same variable delimiters, handed a different third argument: varname
        # for the filtering list and varnone for the removal list.
        self.varfilters = [
            filtervariables(start, end, prefilters.varname)
            for start, end in self.config.varmatches
        ]
        self.removevarfilter = [
            filtervariables(start, end, prefilters.varnone)
            for start, end in self.config.varmatches
        ]

    def setsuggestionstore(self, store):
        """Sets the store that a checker should use for evaluating
        suggestions.

        A store that is truthy is asked to build its index.
        """
        self.suggestion_store = store

        if store:
            store.require_index()

    @cache_results
    def filtervariables(self, str1):
        """Filter out variables from ``str1`` (result is cached)."""
        return helpers.multifilter(str1, self.varfilters)

    @cache_results
    def removevariables(self, str1):
        """Remove variables from ``str1`` (result is cached)."""
        return helpers.multifilter(str1, self.removevarfilter)

    @cache_results
    def filteraccelerators(self, str1):
        """Filter out accelerators from ``str1`` (result is cached)."""
        return helpers.multifilter(str1, self.accfilters, None)

    def filteraccelerators_by_list(self, str1, acceptlist=None):
        """Filter out accelerators from ``str1``, passing ``acceptlist`` on
        to the accelerator filters.
        """
        return helpers.multifilter(str1, self.accfilters, acceptlist)

    @cache_results
    def filterwordswithpunctuation(self, str1):
        """Replaces words with punctuation with their unpunctuated
        equivalents (result is cached).
        """
        return prefilters.filterwordswithpunctuation(str1)

    @cache_results
    def filterxml(self, str1):
        """Filter out XML from the string so only text remains (result is
        cached).
        """
        return tag_re.sub("", str1)

    def run_test(self, test, unit):
        """Runs the given test on the given unit.

        Note that this can raise a :exc:`FilterFailure` as part of normal operation.
        """
        return test(unit)

    @property
    def checker_name(self):
        """Extract checker name, for example 'mozilla' from MozillaChecker."""
        class_name = str(self.__class__.__name__).lower()
        return class_name[: -len("checker")]

    def get_ignored_filters(self):
        """Return checker's additional filters for current language."""
        ignoretests = self.config.lang.ignoretests
        for_this_checker = ignoretests.get(self.checker_name, [])
        for_every_checker = ignoretests.get("all", [])
        return list(set(for_this_checker + for_every_checker))

    def run_filters(self, unit, categorised=False):
        """Run all the tests in this suite.

        :rtype: Dictionary
        :return: Content of the dictionary is as follows::

           {'testname': { 'message': message_or_exception, 'category': failure_category } }

        Without ``categorised`` each test name maps straight to its message.
        """
        self.results_cache = {}
        failures = {}
        ignores = self.get_ignored_filters()

        # Preconditions go first so that a failing one can switch off the
        # tests that depend on it before they are reached.
        ordered_names = list(self.preconditions.keys()) + [
            name
            for name in self.defaultfilters.keys()
            if name not in self.preconditions
        ]

        for name in ordered_names:
            if name in ignores:
                continue

            test = getattr(self, name, None)

            # This test may only be defined on another checker if using
            # TeeChecker
            if test is None:
                continue

            message = ""

            try:
                passed = self.run_test(test, unit)
            except FilterFailure as failure:
                passed = False
                message = str(failure)
            except Exception as e:
                if self.errorhandler is None:
                    raise ValueError(
                        "error in filter %s: %r, %r, %s"
                        % (name, unit.source, unit.target, e)
                    )

                passed = self.errorhandler(name, unit.source, unit.target, e)

            if passed:
                continue

            if not message:
                # Should be quite rare
                import pydoc

                # Fall back on the docstring, stripped of unnecessary
                # whitespace
                message = pydoc.getdoc(test)

            # We test some preconditions that aren't actually a cause for
            # failure
            if name in self.defaultfilters:
                failures[name] = {
                    "message": message,
                    "category": self.categories[name],
                }

            if name in self.preconditions:
                ignores.extend(self.preconditions[name])

        self.results_cache = {}

        if categorised:
            return failures

        return {name: info["message"] for name, info in failures.items()}
537
538
class TranslationChecker(UnitChecker):
    """A checker whose tests receive the source and target strings instead
    of the whole unit.

    This provides some speedup and simplifies testing.
    """

    def __init__(
        self,
        checkerconfig=None,
        excludefilters=None,
        limitfilters=None,
        errorhandler=None,
    ):
        super().__init__(checkerconfig, excludefilters, limitfilters, errorhandler)

        # Replaced with the locations of the current unit by run_filters.
        self.locations = []

    def run_test(self, test, unit):
        """Runs the given test on the given unit.

        For a unit with plurals the test runs once for every plural form of
        the target, and the messages of all failing forms are combined.

        Note that this can raise a :exc:`FilterFailure` as part of normal
        operation.
        """
        if not self.hasplural:
            return test(self.str1, self.str2)

        collected_messages = []
        all_passed = True

        for pluralform in unit.target.strings:
            try:
                form_passed = test(self.str1, str(pluralform))
            except FilterFailure as failure:
                all_passed = False
                collected_messages.extend(failure.messages)
            else:
                if not form_passed:
                    all_passed = False

        # Only raise when there is something to report. A plain falsy result
        # is handed back as it is.
        if collected_messages and not all_passed:
            raise FilterFailure(collected_messages)

        return all_passed

    def run_filters(self, unit, categorised=False):
        """Do some optimisation by caching some data of the unit for the
        benefit of :meth:`~TranslationChecker.run_test`.
        """
        # A falsy normalisation result is replaced with an empty string.
        normalized_source = data.normalize(unit.source)
        self.str1 = normalized_source or ""
        normalized_target = data.normalize(unit.target)
        self.str2 = normalized_target or ""

        self.hasplural = unit.hasplural()
        self.locations = unit.getlocations()

        return super().run_filters(unit, categorised)
592
593
class TeeChecker:
    """A checker that fans its work out over several other checkers."""

    #: Categories where each checking function falls into
    #: Function names are used as keys, categories are the values
    categories = {}

    def __init__(
        self,
        checkerconfig=None,
        excludefilters=None,
        limitfilters=None,
        checkerclasses=None,
        errorhandler=None,
        languagecode=None,
    ):
        """Construct a TeeChecker from the given checker classes.

        Every class in ``checkerclasses`` (``StandardChecker`` alone when
        none are given) is instantiated with the same arguments. With a
        ``languagecode`` each checker's config is switched to that language
        and the language's own checker, if it has one, is added as well.
        """
        self.limitfilters = limitfilters

        if checkerclasses is None:
            checkerclasses = [StandardChecker]

        shared_arguments = {
            "checkerconfig": checkerconfig,
            "excludefilters": excludefilters,
            "limitfilters": limitfilters,
            "errorhandler": errorhandler,
        }
        self.checkers = [
            checkerclass(**shared_arguments) for checkerclass in checkerclasses
        ]

        if languagecode:
            for checker in self.checkers:
                checker.config.updatetargetlanguage(languagecode)

            # Let's hook up the language specific checker
            language_checker = self.checkers[0].config.lang.checker

            if language_checker:
                self.checkers.append(language_checker)

        self.combinedfilters = self.getfilters(excludefilters, limitfilters)
        self.config = checkerconfig or self.checkers[0].config

    def getfilters(self, excludefilters=None, limitfilters=None):
        """Returns a dictionary of available filters, including/excluding
        those in the given lists.

        The result is also stored as ``self.combinedfilters``. A warning is
        logged for every name in ``limitfilters`` that no checker provides.
        """
        excluded = {} if excludefilters is None else excludefilters

        # Ask every checker first, then merge, later checkers overriding
        # earlier ones on a name clash.
        per_checker = [
            checker.getfilters(excluded, limitfilters) for checker in self.checkers
        ]
        merged = {}

        for found in per_checker:
            merged.update(found)

        self.combinedfilters = merged

        # TODO: move this somewhere more sensible (a checkfilters method?)
        for filtername in limitfilters or ():
            if filtername not in merged:
                logger.warning("could not find filter %s", filtername)

        return self.combinedfilters

    def run_filters(self, unit, categorised=False):
        """Run all the tests in the checker's suites and merge the failures."""
        combined_failures = {}

        for checker in self.checkers:
            checker_failures = checker.run_filters(unit, categorised)
            combined_failures.update(checker_failures)

        return combined_failures

    def setsuggestionstore(self, store):
        """Sets the store that every wrapped checker should use for
        evaluating suggestions.
        """
        for checker in self.checkers:
            checker.setsuggestionstore(store)
681
682class StandardChecker(TranslationChecker):
683    """The basic test suite for source -> target translations."""
684
685    @extraction
686    def untranslated(self, str1, str2):
687        """Checks whether a string has been translated at all.
688
689        This check is really only useful if you want to extract untranslated
690        strings so that they can be translated independently of the main work.
691        """
692        str2 = prefilters.removekdecomments(str2)
693
694        return not (len(str1.strip()) > 0 and len(str2) == 0)
695
696    @functional
697    def unchanged(self, str1, str2):
698        """Checks whether a translation is basically identical to the original
699        string.
700
701        This checks to see if the translation isn’t just a copy of the English
702        original. Sometimes, this is what you want, but other times you will
703        detect words that should have been translated.
704        """
705        str1 = self.filteraccelerators(self.removevariables(str1)).strip()
706        str2 = self.filteraccelerators(self.removevariables(str2)).strip()
707
708        if len(str1) < 2:
709            return True
710
711        # If the whole string is upperase, or nothing in the string can go
712        # towards uppercase, let's assume there is nothing translatable
713        # TODO: reconsider
714        if (str1.isupper() or str1.upper() == str1) and str1 == str2:
715            return True
716
717        if self.config.notranslatewords:
718            words1 = str1.split()
719            if len(words1) == 1 and [
720                word for word in words1 if word in self.config.notranslatewords
721            ]:
722                # currently equivalent to:
723                #   if len(words1) == 1 and words1[0] in self.config.notranslatewords:
724                # why do we only test for one notranslate word?
725                return True
726
727        # we could also check for things like str1.isnumeric(), but the test
728        # above (str1.upper() == str1) makes this unnecessary
729        if str1.lower() == str2.lower():
730            raise FilterFailure("Consider translating")
731
732        return True
733
734    @functional
735    def blank(self, str1, str2):
736        """Checks whether a translation is totally blank.
737
738        This will check to see if a translation has inadvertently been
739        translated as blank i.e. as spaces. This is different from untranslated
740        which is completely empty. This test is useful in that if something is
741        translated as "  " it will appear to most tools as if it is translated.
742        """
743        len1 = len(str1.strip())
744        len2 = len(str2.strip())
745
746        if len1 > 0 and len(str2) != 0 and len2 == 0:
747            raise FilterFailure("Translation is empty")
748        else:
749            return True
750
751    @functional
752    def short(self, str1, str2):
753        """Checks whether a translation is much shorter than the original
754        string.
755
756        This is most useful in the special case where the translation is 1
757        characters long while the source text is multiple characters long.
758        Otherwise, we use a general ratio that will catch very big differences
759        but is set conservatively to limit the number of false positives.
760        """
761        len1 = len(str1.strip())
762        len2 = len(str2.strip())
763
764        if (len1 > 0) and (0 < len2 < (len1 * 0.1)) or ((len1 > 1) and (len2 == 1)):
765            raise FilterFailure("The translation is much shorter than the original")
766        else:
767            return True
768
769    @functional
770    def long(self, str1, str2):
771        """Checks whether a translation is much longer than the original
772        string.
773
774        This is most useful in the special case where the translation is
775        multiple characters long while the source text is only 1 character
776        long. Otherwise, we use a general ratio that will catch very big
777        differences but is set conservatively to limit the number of false
778        positives.
779        """
780        len1 = len(str1.strip())
781        len2 = len(str2.strip())
782
783        if (len1 > 0) and (0 < len1 < (len2 * 0.1)) or ((len1 == 1) and (len2 > 1)):
784            raise FilterFailure("The translation is much longer than the original")
785        else:
786            return True
787
788    @critical
789    def escapes(self, str1, str2):
790        """Checks whether escaping is consistent between the two strings.
791
792        Checks escapes such as ``\\n`` ``\u0000`` to ensure that if they exist
793        in the original string you also have them in the translation.
794        """
795        if not helpers.countsmatch(str1, str2, ("\\", "\\\\")):
796            escapes1 = ", ".join("'%s'" % word for word in str1.split() if "\\" in word)
797            escapes2 = ", ".join("'%s'" % word for word in str2.split() if "\\" in word)
798
799            raise SeriousFilterFailure(
800                "Escapes in original (%s) don't match "
801                "escapes in translation (%s)" % (escapes1, escapes2)
802            )
803        else:
804            return True
805
806    @critical
807    def newlines(self, str1, str2):
808        """Checks whether newlines are consistent between the two strings.
809
810        Counts the number of ``\\n`` newlines (and variants such as ``\\r\\n``)
811        and reports and error if they differ.
812        """
813        if not helpers.countsmatch(str1, str2, ("\n", "\r")):
814            raise FilterFailure("Different line endings")
815
816        if str1.endswith("\n") and not str2.endswith("\n"):
817            raise FilterFailure("Newlines different at end")
818
819        if str1.startswith("\n") and not str2.startswith("\n"):
820            raise FilterFailure("Newlines different at beginning")
821
822        return True
823
824    @critical
825    def tabs(self, str1, str2):
826        """Checks whether tabs are consistent between the two strings.
827
828        Counts the number of ``\\t`` tab markers and reports an error if they
829        differ.
830        """
831        if not helpers.countmatch(str1, str2, "\t"):
832            raise SeriousFilterFailure("Different tabs")
833        else:
834            return True
835
836    @cosmetic
837    def singlequoting(self, str1, str2):
838        """Checks whether singlequoting is consistent between the two strings.
839
840        The same as doublequoting but checks for the ``'`` character. Because
841        this is used in contractions like it's and in possessive forms like
842        user's, this test can output spurious errors if your language doesn't
843        use such forms. If a quote appears at the end of a sentence in the
844        translation, i.e. ``'.``, this might not be detected properly by the
845        check.
846        """
847        str1 = self.filterwordswithpunctuation(
848            self.filteraccelerators(self.filtervariables(str1))
849        )
850        str1 = self.config.lang.punctranslate(str1)
851
852        str2 = self.filterwordswithpunctuation(
853            self.filteraccelerators(self.filtervariables(str2))
854        )
855
856        if helpers.countsmatch(str1, str2, ("'", "''", "\\'")):
857            return True
858        else:
859            raise FilterFailure("Different quotation marks")
860
861    @cosmetic
862    def doublequoting(self, str1, str2):
863        """Checks whether doublequoting is consistent between the two strings.
864
865        Checks on double quotes ``"`` to ensure that you have the same number
866        in both the original and the translated string. This tests takes into
867        account that several languages use different quoting characters, and
868        will test for them instead.
869        """
870        str1 = self.filteraccelerators(self.filtervariables(str1))
871        str1 = self.filterxml(str1)
872        str1 = self.config.lang.punctranslate(str1)
873
874        str2 = self.filteraccelerators(self.filtervariables(str2))
875        str2 = self.filterxml(str2)
876
877        if helpers.countsmatch(str1, str2, ('"', '""', '\\"', "«", "»", "“", "”")):
878            return True
879        else:
880            raise FilterFailure("Different quotation marks")
881
882    @cosmetic
883    def doublespacing(self, str1, str2):
884        """Checks for bad double-spaces by comparing to original.
885
886        This will identify if you have [space][space] in when you don't have it
887        in the original or it appears in the original but not in your
888        translation. Some of these are spurious and how you correct them
889        depends on the conventions of your language.
890        """
891        str1 = self.filteraccelerators(str1)
892        str2 = self.filteraccelerators(str2)
893
894        if helpers.countmatch(str1, str2, "  "):
895            return True
896        else:
897            raise FilterFailure("Different use of double spaces")
898
899    @cosmetic
900    def puncspacing(self, str1, str2):
901        """Checks for bad spacing after punctuation.
902
903        In the case of [full-stop][space] in the original, this test checks
904        that your translation does not remove the space. It checks also for
905        [comma], [colon], etc.
906
907        Some languages don't use spaces after common punctuation marks,
908        especially where full-width punctuation marks are used. This check will
909        take that into account.
910        """
911        # Convert all nbsp to space, and just check spaces. Useful intermediate
912        # step to stricter nbsp checking?
913        str1 = self.filteraccelerators(self.filtervariables(str1))
914        str1 = self.config.lang.punctranslate(str1)
915        str1 = str1.replace("\u00a0", " ")
916
917        if str1.find(" ") == -1:
918            return True
919
920        str2 = self.filteraccelerators(self.filtervariables(str2))
921        # Substitute: nbsp
922        str2 = str2.replace("\u00a0", " ")
923        # Strip: Bidi markers and ZW* chars
924        str2 = str2.translate(
925            {
926                ord(c): None
927                for c in (
928                    # Bidi markers
929                    "\u200e",  # LRM
930                    "\u200f",  # RLM
931                    "\u202b",  # RLE
932                    "\u202a",  # LRE
933                    "\u202e",  # RLO
934                    "\u202d",  # LRO
935                    "\u202c",  # PDF
936                    "\u2069",  # PDI
937                    "\u2068",  # FSI
938                    "\u2067",  # RLI
939                    "\u2066",  # LRI
940                    # ZW*
941                    "\u200d",  # ZWJ
942                    "\u200c",  # ZWNJ
943                )
944            }
945        )
946
947        for puncchar in self.config.punctuation:
948            plaincount1 = str1.count(puncchar)
949
950            if not plaincount1:
951                continue
952
953            plaincount2 = str2.count(puncchar)
954
955            if plaincount1 != plaincount2:
956                continue
957
958            spacecount1 = str1.count(puncchar + " ")
959            spacecount2 = str2.count(puncchar + " ")
960
961            if spacecount1 != spacecount2:
962                # Handle extra spaces that are because of transposed punctuation
963
964                if abs(spacecount1 - spacecount2) == 1 and str1.endswith(
965                    puncchar
966                ) != str2.endswith(puncchar):
967                    continue
968
969                raise FilterFailure("Different spacing around punctuation")
970
971        return True
972
973    @critical
    def printf(self, str1, str2):
        """Checks whether printf format strings match.

        If the printf formatting variables are not identical, then this will
        indicate an error. Printf statements are used by programs to format
        output in a human readable form (they are placeholders for variable
        data). They allow you to specify lengths of string variables, string
        padding, number padding, precision, etc. Generally they will look like
        this: ``%d``, ``%5.2f``, ``%100s``, etc. The test can also manage
        variables-reordering using the ``%1$s`` syntax. The variables' type and
        details following data are tested to ensure that they are strictly
        identical, but they may be reordered.

        See also `printf Format String
        <http://en.wikipedia.org/wiki/Printf_format_string>`_.

        Every variable that ``printf_pat`` finds in the translation (``str2``)
        is compared against the variables found in the original (``str1``) in
        one of three ways: by explicit ordinal (``ord`` or ``boost_ord``
        group), by name (``key`` group), or by position in the string.

        Returns ``1`` when no mismatch is found and raises ``FilterFailure``
        when a variable was added, changed or dropped, or when the number of
        variables differs.
        """
        # count1/count2 stay None until at least one variable has been seen in
        # the original/translation respectively.
        count1 = count2 = plural = None

        # self.hasplural only set by run_filters, not always available
        if "hasplural" in self.__dict__:
            plural = self.hasplural

        for var_num2, match2 in enumerate(printf_pat.finditer(str2)):
            count2 = var_num2 + 1
            # Explicit ordinal of the translated variable; the "boost_ord"
            # group takes precedence over "ord" when it matched.
            str2ord = (
                match2.group("ord")
                if not match2.group("boost_ord")
                else match2.group("boost_ord")
            )
            str2key = match2.group("key")
            # Variables matched through "boost_ord" are all represented by "%"
            # instead of their "fullvar" group.
            str2fullvar = (
                match2.group("fullvar") if not match2.group("boost_ord") else "%"
            )

            if str2ord:
                # Case 1: the translated variable carries an explicit ordinal.
                # str1ord records that the original has a variable for that
                # ordinal; gotmatch records that its format is the same.
                str1ord = None
                gotmatch = False

                for var_num1, match1 in enumerate(printf_pat.finditer(str1)):
                    count1 = var_num1 + 1
                    localstr1ord = (
                        match1.group("ord")
                        if not match1.group("boost_ord")
                        else match1.group("boost_ord")
                    )

                    if localstr1ord:
                        # Original variable has an explicit ordinal too:
                        # compare the ordinals as strings.
                        if str2ord == localstr1ord:
                            str1ord = str2ord
                            str1fullvar = (
                                match1.group("fullvar")
                                if not match1.group("boost_ord")
                                else "%"
                            )

                            if str2fullvar == str1fullvar:
                                gotmatch = True
                    elif int(str2ord) == var_num1 + 1:
                        # Original variable has no ordinal: its 1-based
                        # position in the string stands in for one.
                        str1ord = str2ord
                        str1fullvar = (
                            match1.group("fullvar")
                            if not match1.group("boost_ord")
                            else "%"
                        )

                        if str2fullvar == str1fullvar:
                            gotmatch = True

                if str1ord is None:
                    raise FilterFailure("Added printf variable: %s" % match2.group())

                if not gotmatch:
                    raise FilterFailure(
                        "Different printf variable: %s" % match2.group()
                    )
            elif str2key:
                # Case 2: the translated variable is named; look for a variable
                # with the same key in the original.
                str1key = None

                for var_num1, match1 in enumerate(printf_pat.finditer(str1)):
                    count1 = var_num1 + 1
                    str1fullvar = (
                        match1.group("fullvar")
                        if not match1.group("boost_ord")
                        else "%"
                    )

                    if match1.group("key") and str2key == match1.group("key"):
                        str1key = match1.group("key")

                        # '%.0s' "placeholder" in plural will match anything
                        if plural and str2fullvar == ".0s":
                            continue

                        if str1fullvar != str2fullvar:
                            raise FilterFailure(
                                "Different printf variable: %s" % match2.group()
                            )

                if str1key is None:
                    raise FilterFailure("Added printf variable: %s" % match2.group())
            else:
                # Case 3: neither ordinal nor key; the translated variable is
                # compared with the original variable at the same position.
                for var_num1, match1 in enumerate(printf_pat.finditer(str1)):
                    count1 = var_num1 + 1
                    str1fullvar = (
                        match1.group("fullvar")
                        if not match1.group("boost_ord")
                        else "%"
                    )

                    # '%.0s' "placeholder" in plural will match anything
                    if plural and str2fullvar == ".0s":
                        continue

                    if (var_num1 == var_num2) and (str1fullvar != str2fullvar):
                        raise FilterFailure(
                            "Different printf variable: %s" % match2.group()
                        )

        # The translation has no variables at all: every variable in the
        # original is therefore missing.
        if count2 is None:
            str1_variables = list(m.group() for m in printf_pat.finditer(str1))

            if str1_variables:
                raise FilterFailure(
                    "Missing printf variable: %s" % ", ".join(str1_variables)
                )

        # count1 is only updated inside the loops above, i.e. when the
        # translation had at least one variable to compare against.
        if (count1 or count2) and (count1 != count2):
            raise FilterFailure("Different number of printf variables")

        return 1
1104
1105    @critical
1106    def pythonbraceformat(self, str1, str2):
1107        """Checks whether python brace format strings match."""
1108
1109        # Helper function
1110        def max_anons(anons):
1111            """
1112            Takes a list of anonymous placeholder variables, e.g.
1113            ['', '1', ...]
1114            Determines how many anonymous formatting args the string
1115            they come from requires. Motivation for this function:
1116              * max_anons(vars_from_original) tells us how many
1117                anonymous placeholders are supported (at least).
1118              * max_anons(vars_from_translation) should not
1119                exceed it.
1120            """
1121
1122            # implicit_n: you need at least as many anonymous args as
1123            # there are anonymous placeholders.
1124            implicit_n = anons.count("")
1125            # explicit_n: you need at least as many anonymous args as
1126            # the highest '{99}'-style placeholder. (The `+ 1` is to
1127            # correct for 0-indexing)
1128            try:
1129                explicit_n = max(
1130                    int(numbered_anon) + 1
1131                    for numbered_anon in anons
1132                    if len(numbered_anon) >= 1
1133                )
1134            except ValueError:
1135                explicit_n = 0
1136
1137            highest_n = max(implicit_n, explicit_n)
1138
1139            return highest_n
1140
1141        messages = []
1142        # Possible failure states: 0 = ok, 1 = mild, 2 = serious
1143        STATE_OK, STATE_MILD, STATE_SERIOUS = 0, 1, 2
1144        failure_state = STATE_OK
1145        pythonbraceformat_pat = re.compile("{[^}]*}")
1146        data1 = {}
1147        data2 = {}
1148
1149        # Populate the data1 and data2 dicts.
1150        for data_, str_ in [(data1, str1), (data2, str2)]:
1151            # Remove all escaped braces {{ and }}
1152            data_["strclean"] = re.sub("{{|}}", "", str_)
1153            data_["allvars"] = pythonbraceformat_pat.findall(data_["strclean"])
1154            data_["anonvars"] = [
1155                var[1:-1] for var in data_["allvars"] if re.match(r"^{[0-9]*}$", var)
1156            ]
1157            data_["namedvars"] = [
1158                var for var in data_["allvars"] if not re.match(r"^{[0-9]*}$", var)
1159            ]
1160
1161        max1 = max_anons(data1["anonvars"])
1162        max2 = max_anons(data2["anonvars"])
1163
1164        if max1 == max2:
1165            pass
1166        elif max1 < max2:
1167            failure_state = max(failure_state, STATE_SERIOUS)
1168            messages.append(
1169                "Translation requires %s anonymous formatting args, original only %s"
1170                % (max2, max1)
1171            )
1172        else:
1173            failure_state = max(failure_state, STATE_MILD)
1174            messages.append(
1175                "Highest anonymous placeholder in original is %s, in translation %s"
1176                % (max1, max2)
1177            )
1178
1179        if set(data1["namedvars"]) == set(data2["namedvars"]):
1180            pass
1181
1182        extra_in_2 = set(data2["namedvars"]).difference(set(data1["namedvars"]))
1183        if 0 < len(extra_in_2):
1184            failure_state = max(failure_state, STATE_SERIOUS)
1185            messages.append(
1186                "Unknown named placeholders in translation: %s" % ", ".join(extra_in_2)
1187            )
1188
1189        extra_in_1 = set(data1["namedvars"]).difference(set(data2["namedvars"]))
1190        if 0 < len(extra_in_1):
1191            failure_state = max(failure_state, STATE_MILD)
1192            messages.append(
1193                "Named placeholders absent in translation: %s" % ", ".join(extra_in_1)
1194            )
1195
1196        if failure_state == STATE_OK:
1197            return 1
1198        elif failure_state == STATE_MILD:
1199            raise FilterFailure(messages)
1200        elif failure_state == STATE_SERIOUS:
1201            raise SeriousFilterFailure(messages)
1202        else:
1203            raise ValueError(
1204                "Something wrong in python brace checks: unreachable state reached"
1205            )
1206
1207    @functional
1208    def accelerators(self, str1, str2):
1209        """Checks whether accelerators are consistent between the two strings.
1210
1211        This test is capable of checking the different type of accelerators
1212        that are used in different projects, like Mozilla or KDE. The test will
1213        pick up accelerators that are missing and ones that shouldn't be there.
1214
1215        See `accelerators on the localization guide
1216        <http://docs.translatehouse.org/projects/localization-guide/en/latest/guide/translation/accelerators.html>`_
1217        for a full description on accelerators.
1218        """
1219        str1 = self.filtervariables(str1)
1220        str2 = self.filtervariables(str2)
1221        messages = []
1222
1223        for accelmarker in self.config.accelmarkers:
1224            counter1 = decoration.countaccelerators(
1225                accelmarker, self.config.sourcelang.validaccel
1226            )
1227            counter2 = decoration.countaccelerators(
1228                accelmarker, self.config.lang.validaccel
1229            )
1230            count1, countbad1 = counter1(str1)
1231            count2, countbad2 = counter2(str2)
1232            getaccel = decoration.getaccelerators(
1233                accelmarker, self.config.lang.validaccel
1234            )
1235            accel2, bad2 = getaccel(str2)
1236
1237            if count1 == count2:
1238                continue
1239
1240            if count1 == 1 and count2 == 0:
1241                if countbad2 == 1:
1242                    messages.append(
1243                        "Accelerator '%s' appears before an invalid "
1244                        "accelerator character '%s'" % (accelmarker, bad2[0])
1245                    )
1246                else:
1247                    messages.append("Missing accelerator '%s'" % accelmarker)
1248            elif count1 == 0:
1249                messages.append("Added accelerator '%s'" % accelmarker)
1250            elif count1 == 1 and count2 > count1:
1251                messages.append(
1252                    "Accelerator '%s' is repeated in translation" % accelmarker
1253                )
1254            else:
1255                messages.append(
1256                    "Accelerator '%s' occurs %d time(s) in original "
1257                    "and %d time(s) in translation" % (accelmarker, count1, count2)
1258                )
1259
1260        if messages:
1261            if "accelerators" in self.config.criticaltests:
1262                raise SeriousFilterFailure(messages)
1263            else:
1264                raise FilterFailure(messages)
1265
1266        return True
1267
1268    #    def acceleratedvariables(self, str1, str2):
1269    #        """checks that no variables are accelerated"""
1270    #        messages = []
1271    #        for accelerator in self.config.accelmarkers:
1272    #            for variablestart, variableend in self.config.varmatches:
1273    #                error = accelerator + variablestart
1274    #                if str1.find(error) >= 0:
1275    #                    messages.append("original has an accelerated variable")
1276    #                if str2.find(error) >= 0:
1277    #                    messages.append("translation has an accelerated variable")
1278    #        if messages:
1279    #            raise FilterFailure(messages)
1280    #        return True
1281
1282    @critical
1283    def variables(self, str1, str2):
1284        """Checks whether variables of various forms are consistent between the
1285        two strings.
1286
1287        This checks to make sure that variables that appear in the original
1288        also appear in the translation. It can handle variables from projects
1289        like KDE or OpenOffice. It does not at the moment cope with variables
1290        that use the reordering syntax of Gettext PO files.
1291        """
1292        messages = []
1293        mismatch1, mismatch2 = [], []
1294        varnames1, varnames2 = [], []
1295
1296        for startmarker, endmarker in self.config.varmatches:
1297            varchecker = decoration.getvariables(startmarker, endmarker)
1298
1299            if startmarker and endmarker:
1300                if isinstance(endmarker, int):
1301                    redecorate = lambda var: startmarker + var
1302                else:
1303                    redecorate = lambda var: startmarker + var + endmarker
1304            elif startmarker:
1305                redecorate = lambda var: startmarker + var
1306            else:
1307                redecorate = lambda var: var
1308
1309            vars1 = varchecker(str1)
1310            vars2 = varchecker(str2)
1311
1312            if vars1 != vars2:
1313                # we use counts to compare so we can handle multiple variables
1314                vars1, vars2 = [
1315                    var for var in vars1 if vars1.count(var) > vars2.count(var)
1316                ], [var for var in vars2 if vars1.count(var) < vars2.count(var)]
1317                # filter variable names we've already seen, so they aren't
1318                # matched by more than one filter...
1319                vars1, vars2 = [var for var in vars1 if var not in varnames1], [
1320                    var for var in vars2 if var not in varnames2
1321                ]
1322                varnames1.extend(vars1)
1323                varnames2.extend(vars2)
1324                vars1 = map(redecorate, vars1)
1325                vars2 = map(redecorate, vars2)
1326                mismatch1.extend(vars1)
1327                mismatch2.extend(vars2)
1328
1329        if mismatch1:
1330            messages.append("Do not translate: %s" % ", ".join(mismatch1))
1331        elif mismatch2:
1332            messages.append("Added variables: %s" % ", ".join(mismatch2))
1333
1334        if messages and mismatch1:
1335            raise SeriousFilterFailure(messages)
1336        elif messages:
1337            raise FilterFailure(messages)
1338
1339        return True
1340
1341    @functional
1342    def functions(self, str1, str2):
1343        """Checks that function names are not translated.
1344
1345        Checks that function names e.g. ``rgb()`` or ``getEntity.Name()`` are
1346        not translated.
1347        """
1348        # We can't just use helpers.funcmatch() since it doesn't ignore order
1349        if not set(decoration.getfunctions(str1)).symmetric_difference(
1350            set(decoration.getfunctions(str2))
1351        ):
1352            return True
1353        else:
1354            raise FilterFailure("Different functions")
1355
1356    @functional
1357    def emails(self, str1, str2):
1358        """Checks that emails are not translated.
1359
1360        Generally you should not be translating email addresses. This check
1361        will look to see that email addresses e.g. ``info@example.com`` are not
1362        translated. In some cases of course you should translate the address
1363        but generally you shouldn't.
1364        """
1365        if helpers.funcmatch(str1, str2, decoration.getemails):
1366            return True
1367        else:
1368            raise FilterFailure("Different e-mails")
1369
1370    @functional
1371    def urls(self, str1, str2):
1372        """Checks that URLs are not translated.
1373
1374        This checks only basic URLs (http, ftp, mailto etc.) not all URIs (e.g.
1375        afp, smb, file). Generally, you don't want to translate URLs, unless
1376        they are example URLs (http://your_server.com/filename.html). If the
1377        URL is for configuration information, then you need to query the
1378        developers about placing configuration information in PO files. It
1379        shouldn't really be there, unless it is very clearly marked: such
1380        information should go into a configuration file.
1381        """
1382        if helpers.funcmatch(str1, str2, decoration.geturls):
1383            return True
1384        else:
1385            raise FilterFailure("Different URLs")
1386
1387    @functional
1388    def numbers(self, str1, str2):
1389        """Checks whether numbers of various forms are consistent between the
1390        two strings.
1391
1392        You will see some errors where you have either written the number in
1393        full or converted it to the digit in your translation. Also changes in
1394        order will trigger this error.
1395        """
1396        str1 = self.config.lang.numbertranslate(str1)
1397
1398        if helpers.countsmatch(str1, str2, decoration.getnumbers(str1)):
1399            return True
1400        else:
1401            raise FilterFailure("Different numbers")
1402
1403    @cosmetic
1404    def startwhitespace(self, str1, str2):
1405        """Checks whether whitespace at the beginning of the strings matches.
1406
1407        As in endwhitespace but you will see fewer errors.
1408        """
1409        if helpers.funcmatch(str1, str2, decoration.spacestart):
1410            return True
1411        else:
1412            raise FilterFailure("Different whitespace at the start")
1413
1414    @cosmetic
1415    def endwhitespace(self, str1, str2):
1416        """Checks whether whitespace at the end of the strings matches.
1417
1418        Operates the same as endpunc but is only concerned with whitespace.
1419        This filter is particularly useful for those strings which will
1420        evidently be followed by another string in the program, e.g.
1421        [Password: ] or [Enter your username: ]. The whitespace is an inherent
1422        part of the string. This filter makes sure you don't miss those
1423        important but otherwise invisible spaces!
1424
1425        If your language uses full-width punctuation (like Chinese), the visual
1426        spacing in the character might be enough without an added extra space.
1427        """
1428        str1 = self.config.lang.punctranslate(str1)
1429
1430        if helpers.funcmatch(str1, str2, decoration.spaceend):
1431            return True
1432        else:
1433            raise FilterFailure("Different whitespace at the end")
1434
1435    @cosmetic
1436    def startpunc(self, str1, str2):
1437        """Checks whether punctuation at the beginning of the strings match.
1438
1439        Operates as endpunc but you will probably see fewer errors.
1440        """
1441        str1 = self.filterxml(
1442            self.filterwordswithpunctuation(
1443                self.filteraccelerators(self.filtervariables(str1))
1444            )
1445        )
1446        str1 = self.config.lang.punctranslate(str1)
1447        str2 = self.filterxml(
1448            self.filterwordswithpunctuation(
1449                self.filteraccelerators(self.filtervariables(str2))
1450            )
1451        )
1452
1453        if helpers.funcmatch(str1, str2, decoration.puncstart, self.config.punctuation):
1454            return True
1455        else:
1456            raise FilterFailure("Different punctuation at the start")
1457
1458    @cosmetic
1459    def endpunc(self, str1, str2):
1460        """Checks whether punctuation at the end of the strings match.
1461
1462        This will ensure that the ending of your translation has the same
1463        punctuation as the original. E.g. if it ends in :[space] then so should
1464        yours. It is useful for ensuring that you have ellipses [...] in all
1465        your translations, not simply three separate full-stops. You may pick
1466        up some errors in the original: feel free to keep your translation and
1467        notify the programmers. In some languages, characters such as ``?`` or
1468        ``!`` are always preceded by a space e.g. [space]? — do what your
1469        language customs dictate. Other false positives you will notice are,
1470        for example, if through changes in word-order you add "), etc. at the
1471        end of the sentence. Do not change these: your language word-order
1472        takes precedence.
1473
1474        It must be noted that if you are tempted to leave out [full-stop] or
1475        [colon] or add [full-stop] to a sentence, that often these have been
1476        done for a reason, e.g. a list where fullstops make it look cluttered.
1477        So, initially match them with the English, and make changes once the
1478        program is being used.
1479
1480        This check is aware of several language conventions for punctuation
1481        characters, such as the custom question marks for Greek and Arabic,
1482        Devanagari Danda, full-width punctuation for CJK languages, etc.
1483        Support for your language can be added easily if it is not there yet.
1484        """
1485        str1 = self.filtervariables(str1)
1486        str1 = self.config.lang.punctranslate(str1)
1487        str2 = self.filtervariables(str2)
1488        str1 = str1.rstrip()
1489        str2 = str2.rstrip()
1490
1491        if helpers.funcmatch(
1492            str1, str2, decoration.puncend, self.config.endpunctuation + ":"
1493        ):
1494            return True
1495        else:
1496            raise FilterFailure("Different punctuation at the end")
1497
1498    @functional
1499    def purepunc(self, str1, str2):
1500        """Checks that strings that are purely punctuation are not changed.
1501
1502        This extracts strings like ``+`` or ``-`` as these usually should not
1503        be changed.
1504        """
1505        # this test is a subset of startandend
1506        if decoration.ispurepunctuation(str1):
1507            success = str1 == str2
1508        else:
1509            success = not decoration.ispurepunctuation(str2)
1510
1511        if success:
1512            return True
1513        else:
1514            raise FilterFailure("Consider not translating punctuation")
1515
1516    @cosmetic
1517    def brackets(self, str1, str2):
1518        """Checks that the number of brackets in both strings match.
1519
1520        If ``([{`` or ``}])`` appear in the original this will check that the
1521        same number appear in the translation.
1522        """
1523        str1 = self.filtervariables(str1)
1524        str2 = self.filtervariables(str2)
1525
1526        messages = []
1527        missing = []
1528        extra = []
1529
1530        for bracket in ("[", "]", "{", "}", "(", ")"):
1531            count1 = str1.count(bracket)
1532            count2 = str2.count(bracket)
1533
1534            if count2 < count1:
1535                missing.append("'%s'" % bracket)
1536            elif count2 > count1:
1537                extra.append("'%s'" % bracket)
1538
1539        if missing:
1540            messages.append("Missing %s" % ", ".join(missing))
1541
1542        if extra:
1543            messages.append("Added %s" % ", ".join(extra))
1544
1545        if messages:
1546            raise FilterFailure(messages)
1547
1548        return True
1549
1550    @functional
1551    def sentencecount(self, str1, str2):
1552        """Checks that the number of sentences in both strings match.
1553
1554        Adds the number of sentences to see that the sentence count is the same
1555        between the original and translated string. You may not always want to
1556        use this test, if you find you often need to reformat your translation,
1557        because the original is badly-expressed, or because the structure of
1558        your language works better that way. Do what works best for your
1559        language: it's the meaning of the original you want to convey, not the
1560        exact way it was written in the English.
1561        """
1562        str1 = self.filteraccelerators(str1)
1563        str2 = self.filteraccelerators(str2)
1564
1565        sentences1 = len(self.config.sourcelang.sentences(str1))
1566        sentences2 = len(self.config.lang.sentences(str2))
1567
1568        if not sentences1 == sentences2:
1569            raise FilterFailure(
1570                "Different number of sentences: " "%d ≠ %d" % (sentences1, sentences2)
1571            )
1572
1573        return True
1574
1575    @functional
1576    def options(self, str1, str2):
1577        """Checks that command line options are not translated.
1578
1579        In messages that contain command line options, such as ``--help``,
1580        this test will check that these remain untranslated. These could be
1581        translated in the future if programs can create a mechanism to allow
1582        this, but currently they are not translated. If the options has a
1583        parameter, e.g. ``--file=FILE``, then the test will check that the
1584        parameter has been translated.
1585        """
1586        str1 = self.filtervariables(str1)
1587
1588        for word1 in str1.split():
1589            if word1 != "--" and word1.startswith("--") and word1[-1].isalnum():
1590                parts = word1.split("=")
1591
1592                if not parts[0] in str2:
1593                    raise FilterFailure("Missing or translated option '%s'" % parts[0])
1594
1595                if len(parts) > 1 and parts[1] in str2:
1596                    raise FilterFailure(
1597                        "Consider translating parameter "
1598                        "'%(param)s' of option '%(option)s'"
1599                        % {"param": parts[1], "option": parts[0]}
1600                    )
1601
1602        return True
1603
1604    @cosmetic
1605    def startcaps(self, str1, str2):
1606        """Checks that the message starts with the correct capitalisation.
1607
1608        After stripping whitespace and common punctuation characters, it then
1609        checks to see that the first remaining character is correctly
1610        capitalised. So, if the sentence starts with an upper-case letter, and
1611        the translation does not, an error is produced.
1612
1613        This check is entirely disabled for many languages that don't make a
1614        distinction between upper and lower case. Contact us if this is not yet
1615        disabled for your language.
1616        """
1617        str1 = self.filteraccelerators(str1)
1618        str2 = self.filteraccelerators(str2)
1619
1620        if len(str1) > 1 and len(str2) > 1:
1621            if self.config.sourcelang.capsstart(str1) == self.config.lang.capsstart(
1622                str2
1623            ):
1624                return True
1625            elif self.config.sourcelang.numstart(str1) or self.config.lang.numstart(
1626                str2
1627            ):
1628                return True
1629            else:
1630                raise FilterFailure("Different capitalization at the start")
1631
1632        if len(str1) == 0 and len(str2) == 0:
1633            return True
1634
1635        if len(str1) == 0 or len(str2) == 0:
1636            raise FilterFailure("Different capitalization at the start")
1637
1638        return True
1639
1640    @cosmetic
1641    def simplecaps(self, str1, str2):
1642        """Checks the capitalisation of two strings isn't wildly different.
1643
1644        This will pick up many false positives, so don't be a slave to it. It
1645        is useful for identifying translations that don't start with a capital
1646        letter (upper-case letter) when they should, or those that do when they
1647        shouldn't. It will also highlight sentences that have extra capitals;
1648        depending on the capitalisation convention of your language, you might
1649        want to change these to Title Case, or change them all to normal
1650        sentence case.
1651        """
1652        str1 = self.removevariables(str1)
1653        str2 = self.removevariables(str2)
1654        # TODO: review this. The 'I' is specific to English, so it probably
1655        # serves no purpose to get sourcelang.sentenceend
1656        str1 = re.sub("[^%s]( I )" % self.config.sourcelang.sentenceend, " i ", str1)
1657
1658        capitals1 = helpers.filtercount(str1, str.isupper)
1659        capitals2 = helpers.filtercount(str2, str.isupper)
1660
1661        alpha1 = helpers.filtercount(str1, str.isalpha)
1662        alpha2 = helpers.filtercount(str2, str.isalpha)
1663
1664        # Capture the all caps case
1665        if capitals1 == alpha1:
1666            if capitals2 == alpha2:
1667                return True
1668            else:
1669                raise FilterFailure("Different capitalization")
1670
1671        # some heuristic tests to try and see that the style of capitals is
1672        # vaguely the same
1673        if capitals1 == 0 or capitals1 == 1:
1674            success = capitals2 == capitals1
1675        elif capitals1 < len(str1) / 10:
1676            success = capitals2 <= len(str2) / 8
1677        elif len(str1) < 10:
1678            success = abs(capitals1 - capitals2) < 3
1679        elif capitals1 > len(str1) * 6 / 10:
1680            success = capitals2 > len(str2) * 6 / 10
1681        else:
1682            success = abs(capitals1 - capitals2) < (len(str1) + len(str2)) / 6
1683
1684        if success:
1685            return True
1686        else:
1687            raise FilterFailure("Different capitalization")
1688
1689    @functional
1690    def acronyms(self, str1, str2):
1691        """Checks that acronyms that appear are unchanged.
1692
1693        If an acronym appears in the original this test will check that it
1694        appears in the translation. Translating acronyms is a language decision
1695        but many languages leave them unchanged. In that case this test is
1696        useful for tracking down translations of the acronym and correcting
1697        them.
1698        """
1699        acronyms = []
1700        allowed = []
1701
1702        for startmatch, endmatch in self.config.varmatches:
1703            allowed += decoration.getvariables(startmatch, endmatch)(str1)
1704
1705        allowed += self.config.musttranslatewords.keys()
1706        str1 = self.filteraccelerators(self.filtervariables(str1))
1707        iter = self.config.lang.word_iter(str1)
1708        str2 = self.filteraccelerators(self.filtervariables(str2))
1709
1710        # TODO: strip XML? - should provide better error messsages
1711        # see mail/chrome/messanger/smime.properties.po
1712        # TODO: consider limiting the word length for recognising acronyms to
1713        # something like 5/6 characters
1714        for word in iter:
1715            if word.isupper() and len(word) > 1 and word not in allowed:
1716                if str2.find(word) == -1:
1717                    acronyms.append(word)
1718
1719        if acronyms:
1720            raise FilterFailure(
1721                "Consider not translating acronyms: %s" % ", ".join(acronyms)
1722            )
1723
1724        return True
1725
1726    @cosmetic
1727    def doublewords(self, str1, str2):
1728        """Checks for repeated words in the translation.
1729
1730        Words that have been repeated in a translation will be highlighted with
1731        this test e.g. "the the", "a a". These are generally typos that need
1732        correcting. Some languages may have valid repeated words in their
1733        structure, in that case either ignore those instances or switch this
1734        test off.
1735        """
1736        lastword = ""
1737        without_newlines = "\n".join(str2.split("\n"))
1738        words = (
1739            self.filteraccelerators(
1740                self.removevariables(self.filterxml(without_newlines))
1741            )
1742            .replace(".", "")
1743            .lower()
1744            .split()
1745        )
1746
1747        for word in words:
1748            if word == lastword and word not in self.config.lang.validdoublewords:
1749                raise FilterFailure("The word '%s' is repeated" % word)
1750            lastword = word
1751
1752        return True
1753
1754    @functional
1755    def notranslatewords(self, str1, str2):
1756        """Checks that words configured as untranslatable appear in the
1757        translation too.
1758
1759        Many brand names should not be translated, this test allows you to
1760        easily make sure that words like: Word, Excel, Impress, Calc, etc. are
1761        not translated. You must specify a file containing all of the
1762        *no translate* words using ``--notranslatefile``.
1763        """
1764        if not self.config.notranslatewords:
1765            return True
1766
1767        str1 = self.filtervariables(str1)
1768        str2 = self.filtervariables(str2)
1769
1770        # The above is full of strange quotes and things in utf-8 encoding.
1771        # single apostrophe perhaps problematic in words like "doesn't"
1772        for seperator in self.config.punctuation:
1773            str1 = str1.replace(seperator, " ")
1774            str2 = str2.replace(seperator, " ")
1775
1776        words1 = self.filteraccelerators(str1).split()
1777        words2 = self.filteraccelerators(str2).split()
1778        stopwords = [
1779            word
1780            for word in words1
1781            if word in self.config.notranslatewords and word not in words2
1782        ]
1783
1784        if stopwords:
1785            raise FilterFailure("Do not translate: %s" % (", ".join(stopwords)))
1786
1787        return True
1788
1789    @functional
1790    def musttranslatewords(self, str1, str2):
1791        """Checks that words configured as definitely translatable don't appear
1792        in the translation.
1793
1794        If for instance in your language you decide that you must translate
1795        'OK' then this test will flag any occurrences of 'OK' in the
1796        translation if it appeared in the source string. You must specify a
1797        file containing all of the *must translate* words using
1798        ``--musttranslatefile``.
1799        """
1800        if not self.config.musttranslatewords:
1801            return True
1802
1803        str1 = self.removevariables(str1)
1804        str2 = self.removevariables(str2)
1805
1806        # The above is full of strange quotes and things in utf-8 encoding.
1807        # single apostrophe perhaps problematic in words like "doesn't"
1808        for seperator in self.config.punctuation:
1809            str1 = str1.replace(seperator, " ")
1810            str2 = str2.replace(seperator, " ")
1811
1812        words1 = self.filteraccelerators(str1).split()
1813        words2 = self.filteraccelerators(str2).split()
1814        stopwords = [
1815            word
1816            for word in words1
1817            if word.lower() in self.config.musttranslatewords and word in words2
1818        ]
1819
1820        if stopwords:
1821            raise FilterFailure("Please translate: %s" % (", ".join(stopwords)))
1822
1823        return True
1824
1825    @cosmetic
1826    def validchars(self, str1, str2):
1827        """Checks that only characters specified as valid appear in the
1828        translation.
1829
1830        Often during character conversion to and from UTF-8 you get some
1831        strange characters appearing in your translation. This test presents a
1832        simple way to try and identify such errors.
1833
1834        This test will only run of you specify the ``--validcharsfile`` command
1835        line option. This file contains all the characters that are valid in
1836        your language. You must use UTF-8 encoding for the characters in the
1837        file.
1838
1839        If the test finds any characters not in your valid characters file then
1840        the test will print the character together with its Unicode value
1841        (e.g. 002B).
1842        """
1843        if not self.config.validcharsmap:
1844            return True
1845
1846        invalid1 = str1.translate(self.config.validcharsmap)
1847        invalid2 = str2.translate(self.config.validcharsmap)
1848        invalidchars = [
1849            f"'{invalidchar}' (\\u{ord(invalidchar):04x})"
1850            for invalidchar in invalid2
1851            if invalidchar not in invalid1
1852        ]
1853
1854        if invalidchars:
1855            raise FilterFailure("Invalid characters: %s" % (", ".join(invalidchars)))
1856
1857        return True
1858
1859    @functional
1860    def filepaths(self, str1, str2):
1861        """Checks that file paths have not been translated.
1862
1863        Checks that paths such as ``/home/user1`` have not been translated.
1864        Generally you do not translate a file path, unless it is being used as
1865        an example, e.g. ``your_user_name/path/to/filename.conf``.
1866        """
1867        for word1 in self.filteraccelerators(self.filterxml(str1)).split():
1868            if word1.startswith("/"):
1869                if not helpers.countsmatch(str1, str2, (word1,)):
1870                    raise FilterFailure("Different file paths")
1871
1872        return True
1873
1874    @critical
1875    def xmltags(self, str1, str2):
1876        """Checks that XML/HTML tags have not been translated.
1877
1878        This check finds the number of tags in the source string and checks
1879        that the same number are in the translation. If the counts don't match
1880        then either the tag is missing or it was mistakenly translated by the
1881        translator, both of which are errors.
1882
1883        The check ignores tags or things that look like tags that cover the
1884        whole string e.g. ``<Error>`` but will produce false positives for
1885        things like ``An <Error> occurred`` as here ``Error`` should be
1886        translated. It also will allow translation of the *alt* attribute in
1887        e.g. ``<img src="bob.png" alt="Image description">`` or similar
1888        translatable attributes in OpenOffice.org help files.
1889        """
1890        tags1 = tag_re.findall(str1)
1891
1892        if len(tags1) > 0:
1893            if (len(tags1[0]) == len(str1)) and "=" not in tags1[0]:
1894                return True
1895
1896            tags2 = tag_re.findall(str2)
1897            properties1 = tagproperties(tags1, self.config.ignoretags)
1898            properties2 = tagproperties(tags2, self.config.ignoretags)
1899
1900            filtered1 = []
1901            filtered2 = []
1902
1903            for property1 in properties1:
1904                filtered1 += [intuplelist(property1, self.config.canchangetags)]
1905
1906            for property2 in properties2:
1907                filtered2 += [intuplelist(property2, self.config.canchangetags)]
1908
1909            # TODO: consider the consequences of different ordering of
1910            # attributes/tags
1911            if filtered1 != filtered2:
1912                raise FilterFailure("Different XML tags")
1913        else:
1914            # No tags in str1, let's just check that none were added in str2.
1915            # This might be useful for fuzzy strings wrongly unfuzzied.
1916            tags2 = tag_re.findall(str2)
1917
1918            if len(tags2) > 0:
1919                raise FilterFailure("Added XML tags")
1920
1921        return True
1922
1923    @functional
1924    def kdecomments(self, str1, str2):
1925        """Checks to ensure that no KDE style comments appear in the
1926        translation.
1927
1928        KDE style translator comments appear in PO files as
1929        ``"_: comment\\n"``. New translators often translate the comment. This
1930        test tries to identify instances where the comment has been translated.
1931        """
1932        return str2.find("\n_:") == -1 and not str2.startswith("_:")
1933
1934    @extraction
1935    def compendiumconflicts(self, str1, str2):
1936        """Checks for Gettext compendium conflicts (#-#-#-#-#).
1937
1938        When you use msgcat to create a PO compendium it will insert
1939        ``#-#-#-#-#`` into entries that are not consistent. If the compendium
1940        is used later in a message merge then these conflicts will appear in
1941        your translations. This test quickly extracts those for correction.
1942        """
1943        return str2.find("#-#-#-#-#") == -1
1944
1945    @cosmetic
1946    def simpleplurals(self, str1, str2):
1947        """Checks for English style plural(s) for you to review.
1948
1949        This test will extract any message that contains words with a final
1950        "(s)" in the source text. You can then inspect the message, to check
1951        that the correct plural form has been used for your language. In some
1952        languages, plurals are made by adding text at the beginning of words,
1953        making the English style messy. In this case, they often revert to the
1954        plural form. This test allows an editor to check that the plurals used
1955        are correct. Be aware that this test may create a number of false
1956        positives.
1957
1958        For languages with no plural forms (only one noun form) this test will
1959        simply test that nothing like "(s)" was used in the translation.
1960        """
1961
1962        def numberofpatterns(string, patterns):
1963            number = 0
1964
1965            for pattern in patterns:
1966                number += len(re.findall(pattern, string))
1967
1968            return number
1969
1970        sourcepatterns = [r"\(s\)"]
1971        targetpatterns = [r"\(s\)"]
1972        sourcecount = numberofpatterns(str1, sourcepatterns)
1973        targetcount = numberofpatterns(str2, targetpatterns)
1974
1975        if self.config.lang.nplurals == 1:
1976            if targetcount:
1977                raise FilterFailure("Plural(s) were kept in translation")
1978            else:
1979                return True
1980
1981        if sourcecount == targetcount:
1982            return True
1983        else:
1984            raise FilterFailure("The original uses plural(s)")
1985
1986    @functional
1987    def spellcheck(self, str1, str2):
1988        """Checks words that don't pass a spell check.
1989
1990        This test will check for misspelled words in your translation. The test
1991        first checks for misspelled words in the original (usually English)
1992        text, and adds those to an exclusion list. The advantage of this
1993        exclusion is that many words that are specific to the application will
1994        not raise errors e.g. program names, brand names, function names.
1995
1996        The checker works with `PyEnchant
1997        <http://pythonhosted.org/pyenchant/>`_. You need to have PyEnchant
1998        installed as well as a dictionary for your language (for example, one
1999        of the `Hunspell <https://wiki.openoffice.org/wiki/Dictionaries>`_ or
2000        `aspell <http://ftp.gnu.org/gnu/aspell/dict/>`_ dictionaries). This
2001        test will only work if you have specified the ``--language`` option.
2002
2003        The pofilter error that is created, lists the misspelled word, plus
2004        suggestions returned from the spell checker. That makes it easy for you
2005        to identify the word and select a replacement.
2006        """
2007        if not self.config.targetlanguage:
2008            return True
2009
2010        if not spelling.available:
2011            return True
2012
2013        # TODO: filterxml?
2014        str1 = self.filteraccelerators_by_list(
2015            self.removevariables(str1), self.config.sourcelang.validaccel
2016        )
2017        str2 = self.filteraccelerators_by_list(
2018            self.removevariables(str2), self.config.lang.validaccel
2019        )
2020        errors = set()
2021
2022        # We cache spelling results of source texts:
2023        ignore1 = set(spelling.simple_check(str1, lang=self.config.sourcelang.code))
2024
2025        # We cache spelling results of target texts sentence-by-sentence. This
2026        # way we can reuse most of the results while someone is typing a long
2027        # segment in Virtaal.
2028        sentences2 = self.config.lang.sentences(str2)
2029        for sentence in sentences2:
2030            sentence_errors = spelling.simple_check(
2031                sentence, lang=self.config.targetlanguage
2032            )
2033            errors.update(sentence_errors)
2034
2035        errors.difference_update(ignore1, self.config.notranslatewords)
2036
2037        if errors:
2038            messages = ["Check the spelling of: %s" % ", ".join(errors)]
2039            raise FilterFailure(messages)
2040
2041        return True
2042
2043    @extraction
2044    def credits(self, str1, str2):
2045        """Checks for messages containing translation credits instead of
2046        normal translations.
2047
2048        Some projects have consistent ways of giving credit to translators by
2049        having a unit or two where translators can fill in their name and
2050        possibly their contact details. This test allows you to find these
2051        units easily to check that they are completed correctly and also
2052        disables other tests that might incorrectly get triggered for these
2053        units (such as urls, emails, etc.)
2054        """
2055        if str1 in self.config.credit_sources:
2056            raise FilterFailure("Don't translate. Just credit the translators.")
2057        else:
2058            return True
2059
    # If the precondition filter is run and fails then the other tests listed are ignored
    #
    # Maps the name of a precondition test to a tuple with the names of the
    # tests that are ignored once that precondition has failed.
    preconditions = {
        "untranslated": (
            "simplecaps",
            "variables",
            "startcaps",
            "accelerators",
            "brackets",
            "endpunc",
            "acronyms",
            "xmltags",
            "startpunc",
            "endwhitespace",
            "startwhitespace",
            "escapes",
            "doublequoting",
            "singlequoting",
            "filepaths",
            "purepunc",
            "doublespacing",
            "sentencecount",
            "numbers",
            "isfuzzy",
            "isreview",
            "notranslatewords",
            "musttranslatewords",
            "emails",
            "simpleplurals",
            "urls",
            "printf",
            "pythonbraceformat",
            "tabs",
            "newlines",
            "functions",
            "options",
            "blank",
            "nplurals",
            "gconf",
            "dialogsizes",
            "validxml",
        ),
        # Same list as for "untranslated", minus "blank" itself and "nplurals".
        "blank": (
            "simplecaps",
            "variables",
            "startcaps",
            "accelerators",
            "brackets",
            "endpunc",
            "acronyms",
            "xmltags",
            "startpunc",
            "endwhitespace",
            "startwhitespace",
            "escapes",
            "doublequoting",
            "singlequoting",
            "filepaths",
            "purepunc",
            "doublespacing",
            "sentencecount",
            "numbers",
            "isfuzzy",
            "isreview",
            "notranslatewords",
            "musttranslatewords",
            "emails",
            "simpleplurals",
            "urls",
            "printf",
            "pythonbraceformat",
            "tabs",
            "newlines",
            "functions",
            "options",
            "gconf",
            "dialogsizes",
            "validxml",
        ),
        # Tests ignored for units that the "credits" test has flagged.
        "credits": (
            "simplecaps",
            "variables",
            "startcaps",
            "accelerators",
            "brackets",
            "endpunc",
            "acronyms",
            "xmltags",
            "startpunc",
            "escapes",
            "doublequoting",
            "singlequoting",
            "filepaths",
            "doublespacing",
            "sentencecount",
            "numbers",
            "emails",
            "simpleplurals",
            "urls",
            "printf",
            "pythonbraceformat",
            "tabs",
            "newlines",
            "functions",
            "options",
            "validxml",
        ),
        "purepunc": ("startcaps", "options"),
        # The entry below is deliberately commented out.
        # This is causing some problems since Python 2.6, as
        # startcaps is now seen as an important one to always execute
        # and could now be done before it is blocked by a failing
        # "untranslated" or "blank" test. This is probably happening
        # due to slightly different implementation of the internal
        # dict handling since Python 2.6. We should never have relied
        # on this ordering anyway.
        # "startcaps": ("simplecaps",),
        "endwhitespace": ("endpunc",),
        "startwhitespace": ("startpunc",),
        "unchanged": ("doublewords",),
        # Tests ignored for units that still contain msgcat conflict markers.
        "compendiumconflicts": (
            "accelerators",
            "brackets",
            "escapes",
            "numbers",
            "startpunc",
            "long",
            "variables",
            "startcaps",
            "sentencecount",
            "simplecaps",
            "doublespacing",
            "endpunc",
            "xmltags",
            "startwhitespace",
            "endwhitespace",
            "singlequoting",
            "doublequoting",
            "filepaths",
            "purepunc",
            "doublewords",
            "printf",
            "newlines",
            "validxml",
        ),
    }
2204
2205
2206# code to actually run the tests (use unittest?)
2207
2208
# Checker settings for OpenOffice.org translations. They are merged into the
# active configuration by OpenOfficeChecker.__init__ and, in addition to
# libreofficeconfig, by LibreOfficeChecker.__init__.
#
# - accelmarkers: "~" marks an accelerator key.
# - varmatches: (start, end) marker pairs delimiting variables. The end
#   element is a string, None or an integer.
#   NOTE(review): the meaning of a None/integer end marker is defined by the
#   variable matching code, which is not visible here.
# - ignoretags / canchangetags: 3-tuples handed to the xmltags check.
#   NOTE(review): presumably (tag, attribute, value) - confirm against
#   tagproperties()/intuplelist().
openofficeconfig = CheckerConfig(
    accelmarkers=["~"],
    varmatches=[
        ("&", ";"),
        ("%", "%"),
        ("%", None),
        ("%", 0),
        ("$(", ")"),
        ("$", "$"),
        ("${", "}"),
        ("#", "#"),
        ("#", 1),
        ("#", 0),
        ("($", ")"),
        ("$[", "]"),
        ("[", "]"),
        ("@", "@"),
        ("$", None),
    ],
    ignoretags=[
        ("alt", "xml-lang", None),
        ("ahelp", "visibility", "visible"),
        ("img", "width", None),
        ("img", "height", None),
    ],
    canchangetags=[("link", "name", None)],
)
2236
2237
class OpenOfficeChecker(StandardChecker):
    """StandardChecker with the OpenOffice.org settings merged in."""

    def __init__(self, **kwargs):
        """Merge ``openofficeconfig`` into the checker configuration.

        A ``checkerconfig`` passed by the caller is updated in place; when
        none (or ``None``) is given, a fresh ``CheckerConfig`` is created and
        handed on to the parent class.
        """
        config = kwargs.get("checkerconfig")

        if config is None:
            config = kwargs["checkerconfig"] = CheckerConfig()

        config.update(openofficeconfig)
        super().__init__(**kwargs)
2248
2249
# Checker settings for LibreOffice translations, merged into the active
# configuration by LibreOfficeChecker.__init__. The values are currently
# identical, entry for entry, to those of openofficeconfig.
#
# - accelmarkers: "~" marks an accelerator key.
# - varmatches: (start, end) marker pairs delimiting variables. The end
#   element is a string, None or an integer.
#   NOTE(review): the meaning of a None/integer end marker is defined by the
#   variable matching code, which is not visible here.
# - ignoretags / canchangetags: 3-tuples handed to the xmltags check.
#   NOTE(review): presumably (tag, attribute, value) - confirm against
#   tagproperties()/intuplelist().
libreofficeconfig = CheckerConfig(
    accelmarkers=["~"],
    varmatches=[
        ("&", ";"),
        ("%", "%"),
        ("%", None),
        ("%", 0),
        ("$(", ")"),
        ("$", "$"),
        ("${", "}"),
        ("#", "#"),
        ("#", 1),
        ("#", 0),
        ("($", ")"),
        ("$[", "]"),
        ("[", "]"),
        ("@", "@"),
        ("$", None),
    ],
    ignoretags=[
        ("alt", "xml-lang", None),
        ("ahelp", "visibility", "visible"),
        ("img", "width", None),
        ("img", "height", None),
    ],
    canchangetags=[("link", "name", None)],
)
2277
2278
class LibreOfficeChecker(StandardChecker):
    """StandardChecker with the LibreOffice settings merged in.

    Adds a tag-pairing check (``validxml``) for help files and switches the
    Python brace format check off.
    """

    def __init__(self, **kwargs):
        """Merge the LibreOffice and OpenOffice.org settings into the config.

        A ``checkerconfig`` passed by the caller is updated in place; when
        none (or ``None``) is given, a fresh ``CheckerConfig`` is created and
        handed on to the parent class.
        """
        config = kwargs.get("checkerconfig")

        if config is None:
            config = kwargs["checkerconfig"] = CheckerConfig()

        # Both sets of settings are applied, LibreOffice first.
        for extra in (libreofficeconfig, openofficeconfig):
            config.update(extra)

        super().__init__(**kwargs)

    @critical
    def validxml(self, str1, str2):
        """Check that every XML/HTML open/close tag in the translation has
        its close/open counterpart.

        Only units with a location ending in ``.xrm`` or ``.xhp`` are
        examined. Tags listed in ``lo_emptytags`` must be written as
        self-closing tags, and no other tag may be self-closing.
        """
        remainder = str2

        for location in self.locations:
            if not location.endswith((".xrm", ".xhp")):
                continue

            # Open tags that are still waiting for their close tag.
            unclosed = []
            found = re.search(lo_tag_re, remainder)

            while found:
                tag = found.group(0)

                if tag.startswith("</"):
                    if found.group("tag") in lo_emptytags:
                        raise FilterFailure(
                            "»%s« should be self-closing/empty" % tag
                        )
                    if not unclosed:
                        raise FilterFailure("There is no open tag for »%s«" % tag)
                    opener = unclosed.pop()
                    if tagname(tag) != "/" + tagname(opener):
                        raise FilterFailure(
                            "Open tag »%s« and close tag »%s« don't match"
                            % (opener, tag)
                        )
                elif tag.endswith("/>"):
                    if found.group("tag") not in lo_emptytags:
                        raise FilterFailure(
                            "»%s« should not be self-closing/empty" % tag
                        )
                else:
                    unclosed.append(tag)

                # Continue searching in the text after the tag just handled.
                remainder = remainder[found.end(0) :]
                found = re.search(lo_tag_re, remainder)

            if unclosed:
                raise FilterFailure(
                    "There is no close tag for »%s«" % unclosed.pop()
                )

        return True

    @critical
    def pythonbraceformat(self, str1, str2):
        """Always passes: Python brace formatting is not used in LibreOffice."""
        return True
2336
2337
# Checker settings for Mozilla translations, merged into the active
# configuration by MozillaChecker.__init__.
#
# - accelmarkers: "&" marks an accelerator key.
# - varmatches: (start, end) marker pairs delimiting variables. The end
#   element is a string, None or an integer.
#   NOTE(review): the meaning of a None/integer end marker is defined by the
#   variable matching code, which is not visible here.
# - criticaltests: names the "accelerators" test.
#   NOTE(review): presumably this raises that test to critical severity -
#   confirm against CheckerConfig.
mozillaconfig = CheckerConfig(
    accelmarkers=["&"],
    varmatches=[
        ("&", ";"),
        ("%", "%"),
        ("%", 1),
        ("$", "$"),
        ("$", None),
        ("#", 1),
        ("${", "}"),
        ("$(^", ")"),
        ("{{", "}}"),
    ],
    criticaltests=["accelerators"],
)
2353
2354
2355class MozillaChecker(StandardChecker):
    # Script identifiers for which accelerators are skipped.
    # NOTE(review): presumably consulted by the accelerators check, which is
    # not visible in this part of the file - confirm where it is read.
    # The entries mix four-letter script codes (e.g. "Deva") with lower-case
    # names (e.g. "chinese").
    accelerators_skipped_scripts = [
        "Deva",
        "Beng",
        "Tibt",
        "Orya",
        "Gujr",
        "Khmr",
        "Knda",
        "Laoo",
        "Mlym",
        "Mymr",
        "Sind",
        "Taml",
        "assamese",
        "perso-arabic",
        "mon",
        "chinese",
    ]
2374
2375    def __init__(self, **kwargs):
2376        checkerconfig = kwargs.get("checkerconfig", None)
2377
2378        if checkerconfig is None:
2379            checkerconfig = CheckerConfig()
2380            kwargs["checkerconfig"] = checkerconfig
2381
2382        checkerconfig.update(mozillaconfig)
2383        super().__init__(**kwargs)
2384
2385    @extraction
2386    def credits(self, str1, str2):
2387        """Checks for messages containing translation credits instead of
2388        normal translations.
2389
2390        Some projects have consistent ways of giving credit to translators by
2391        having a unit or two where translators can fill in their name and
2392        possibly their contact details. This test allows you to find these
2393        units easily to check that they are completed correctly and also
2394        disables other tests that might incorrectly get triggered for these
2395        units (such as urls, emails, etc.)
2396        """
2397        for location in self.locations:
2398            if location in ["MOZ_LANGPACK_CONTRIBUTORS", "credit.translation"]:
2399                raise FilterFailure("Don't translate. Just credit the translators.")
2400
2401        return True
2402
    # Matches one or more CSS-like "key: <number><unit>;" pairs, e.g.
    # "width: 635px;". The colon must be followed by whitespace, the number
    # may have a decimal part, the unit is exactly two lower-case letters and
    # the trailing semicolon is optional.
    # The pattern has four capturing groups (the whole pair, key, number and
    # unit), so findall() yields 4-tuples in that order.
    mozilla_dialog_re = re.compile(
        r"""(                         # option pair "key: value;"
                                      (?P<key>[-a-z]+)           # key
                                      :\s+                       # seperator
                                      (?P<number>\d+(?:[.]\d+)?) # number
                                      (?P<unit>[a-z][a-z]);?     # units
                                      )+                         # multiple pairs
                                   """,
        re.VERBOSE,
    )
    # Units accepted by dialogsizes() in a translated dialog size.
    mozilla_dialog_valid_units = ["em", "px", "ch"]
2414
2415    @critical
2416    def dialogsizes(self, str1, str2):
2417        """Checks that dialog sizes are not translated.
2418
2419        This is a Mozilla specific test. Mozilla uses a language called XUL to
2420        define dialogues and screens. This can make use of CSS to specify
2421        properties of the dialogue. These properties include things such as the
2422        width and height of the box. The size might need to be changed if the
2423        dialogue size changes due to longer translations. Thus translators can
2424        change these settings. But you are only meant to change the number not
2425        translate the words 'width' or 'height'. This check capture instances
2426        where these are translated. It will also catch other types of errors in
2427        these units.
2428        """
2429        # Example: "width: 635px; height: 400px;"
2430        if "width" in str1 or "height" in str1:
2431            str1pairs = self.mozilla_dialog_re.findall(str1)
2432
2433            if str1pairs:
2434                str2pairs = self.mozilla_dialog_re.findall(str2)
2435
2436                if len(str1pairs) != len(str2pairs):
2437                    raise FilterFailure("A dialog pair is missing")
2438
2439                for i, pair1 in enumerate(str1pairs):
2440                    pair2 = str2pairs[i]
2441
2442                    if pair1[0] != pair2[0]:  # Only check pairs that differ
2443                        if len(pair2) != 4:
2444                            raise FilterFailure("A part of the dialog pair is missing")
2445
2446                        if pair1[1] not in pair2:  # key
2447                            raise FilterFailure(
2448                                "Do not translate the key '%s'" % pair1[1]
2449                            )
2450
2451                        # FIXME we could check more carefully for numbers in pair1[2]
2452                        if pair2[3] not in self.mozilla_dialog_valid_units:
2453                            raise FilterFailure(
2454                                "Units should be one of '%s'. "
2455                                "The source string uses '%s'"
2456                                % (", ".join(self.mozilla_dialog_valid_units), pair1[3])
2457                            )
2458
2459        return True
2460
2461    @functional
2462    def numbers(self, str1, str2):
2463        """Checks that numbers are not translated.
2464
2465        Special handling for Mozilla to ignore entries that are dialog sizes.
2466        """
2467        if self.mozilla_dialog_re.findall(str1):
2468            return True
2469
2470        return super().numbers(str1, str2)
2471
2472    @functional
2473    def unchanged(self, str1, str2):
2474        """Checks whether a translation is basically identical to the original
2475        string.
2476
2477        Special handling for Mozilla to ignore entries that are dialog sizes.
2478        """
2479        if (
2480            self.mozilla_dialog_re.findall(str1)
2481            or str1.strip().lstrip("0123456789") in self.mozilla_dialog_valid_units
2482        ):
2483            return True
2484
2485        return super().unchanged(str1, str2)
2486
2487    @cosmetic
2488    def accelerators(self, str1, str2):
2489        """Checks whether accelerators are consistent between the
2490        two strings.
2491
2492        For Mozilla we lower the severity to cosmetic, and for some languages
2493        it also ensures accelerators are absent in the target string since some
2494        languages do not use accelerators, for example Indic languages.
2495        """
2496        # Mozilla's specific no-accelerators behavior.
2497        if self.config.language_script in self.accelerators_skipped_scripts:
2498            str2 = self.filtervariables(str2)
2499            messages = []
2500
2501            for accelmarker in self.config.accelmarkers:
2502                counter2 = decoration.countaccelerators(
2503                    accelmarker,
2504                    self.config.lang.validaccel,
2505                )
2506                if counter2(str2)[0] > 0:
2507                    messages.append(
2508                        "Accelerator '%s' should not appear in "
2509                        "translation" % accelmarker
2510                    )
2511
2512            if messages:
2513                raise FilterFailure(messages)
2514
2515            return True
2516
2517        # Default accelerators behavior.
2518        return super().accelerators(str1, str2)
2519
2520
# Settings that DrupalChecker merges into its checker configuration: three
# variable styles introduced by "%", "@" and "!". The second element of each
# pair is None (presumably "no closing marker" - confirm against CheckerConfig).
drupalconfig = CheckerConfig(
    varmatches=[("%", None), ("@", None), ("!", None)],
)
2524
2525
class DrupalChecker(StandardChecker):
    """Standard checker with the ``drupalconfig`` settings merged in."""

    def __init__(self, **kwargs):
        """Apply the Drupal settings, then defer to the parent constructor.

        A ``checkerconfig`` keyword that is missing or None is replaced by a
        fresh ``CheckerConfig``; a supplied one has ``update`` called on it
        directly.
        """
        config = kwargs.get("checkerconfig")

        if config is None:
            config = kwargs["checkerconfig"] = CheckerConfig()

        config.update(drupalconfig)
        super().__init__(**kwargs)
2536
2537
# Settings that GnomeChecker merges into its checker configuration: "_" as the
# accelerator marker, two variable styles, and "translator-credits" as the
# credit source.
gnomeconfig = CheckerConfig(
    accelmarkers=["_"],
    varmatches=[("%", 1), ("$(", ")")],
    credit_sources=["translator-credits"],
)
2543
2544
class GnomeChecker(StandardChecker):
    """Standard checker with the ``gnomeconfig`` settings merged in, plus a
    GNOME-specific GConf check.
    """

    def __init__(self, **kwargs):
        """Apply the GNOME settings, then defer to the parent constructor.

        A ``checkerconfig`` keyword that is missing or None is replaced by a
        fresh ``CheckerConfig``; a supplied one has ``update`` called on it
        directly.
        """
        config = kwargs.get("checkerconfig")

        if config is None:
            config = kwargs["checkerconfig"] = CheckerConfig()

        config.update(gnomeconfig)
        super().__init__(**kwargs)

    @functional
    def gconf(self, str1, str2):
        """Checks if we have any gconf config settings translated.

        Gconf settings should not be translated so this check checks that gconf
        settings such as "name" or "modification_date" are not translated in
        the translation. It allows you to change the surrounding quotes but
        will ensure that the setting values remain untranslated.

        The check only applies when one of the unit's locations mentions
        "schemas.in" or "gschema.xml.in".
        """
        schema_markers = ("schemas.in", "gschema.xml.in")
        from_schema = any(
            marker in location
            for location in self.locations
            for marker in schema_markers
        )

        if not from_schema:
            return True

        # Each attribute is compared without its first and last character
        # (the surrounding quotes), so the quoting style may change.
        translated = [
            attribute
            for attribute in gconf_attribute_re.findall(str1)
            if attribute[1:-1] not in str2
        ]

        if translated:
            raise FilterFailure(
                "Do not translate GConf attributes: %s" % (", ".join(translated))
            )

        return True
2584
2585
# Settings that KdeChecker merges into its checker configuration: "&" as the
# accelerator marker, one "%"-based variable style, and the source strings
# that KDE uses for translator credits.
kdeconfig = CheckerConfig(
    accelmarkers=["&"],
    varmatches=[("%", 1)],
    credit_sources=["Your names", "Your emails", "ROLES_OF_TRANSLATORS"],
)
2591
2592
class KdeChecker(StandardChecker):
    """Standard checker with the ``kdeconfig`` settings merged in."""

    def __init__(self, **kwargs):
        """Apply the KDE settings, then defer to the parent constructor.

        A ``checkerconfig`` keyword that is missing or None is replaced by a
        fresh ``CheckerConfig``; a supplied one has ``update`` called on it
        directly.
        """
        # TODO allow setup of KDE plural and translator comments so that they do
        # not create false positives
        config = kwargs.get("checkerconfig")

        if config is None:
            config = kwargs["checkerconfig"] = CheckerConfig()

        config.update(kdeconfig)
        super().__init__(**kwargs)
2605
2606
# Settings that CCLicenseChecker merges into its checker configuration: a
# single variable style whose pair is ("@", "@") (presumably variables wrapped
# in "@" on both sides - confirm against CheckerConfig).
cclicenseconfig = CheckerConfig(varmatches=[("@", "@")])
2608
2609
class CCLicenseChecker(StandardChecker):
    """Standard checker with the ``cclicenseconfig`` settings merged in."""

    def __init__(self, **kwargs):
        """Apply the Creative Commons settings, then defer to the parent
        constructor.

        A ``checkerconfig`` keyword that is missing or None is replaced by a
        fresh ``CheckerConfig``; a supplied one has ``update`` called on it
        directly.
        """
        config = kwargs.get("checkerconfig")

        if config is None:
            config = kwargs["checkerconfig"] = CheckerConfig()

        config.update(cclicenseconfig)
        super().__init__(**kwargs)
2620
2621
# Default-constructed configuration that MinimalChecker merges into its
# checker configuration.
minimalconfig = CheckerConfig()
2623
2624
class MinimalChecker(StandardChecker):
    """Standard checker that, unless told otherwise, is limited to the
    "untranslated", "unchanged" and "blank" filters.
    """

    def __init__(self, **kwargs):
        """Fill in default keyword arguments, then defer to the parent.

        A ``checkerconfig`` that is missing or None is replaced by a fresh
        ``CheckerConfig``, and ``minimalconfig`` is merged into whichever
        configuration is used. A ``limitfilters`` that is missing or None is
        replaced by the minimal filter list.
        """
        config = kwargs.get("checkerconfig")

        if config is None:
            config = kwargs["checkerconfig"] = CheckerConfig()

        if kwargs.get("limitfilters") is None:
            kwargs["limitfilters"] = ["untranslated", "unchanged", "blank"]

        config.update(minimalconfig)
        super().__init__(**kwargs)
2641
2642
# Default-constructed configuration, by its name intended for the
# ReducedChecker defined below.
reducedconfig = CheckerConfig()
2644
2645
class ReducedChecker(StandardChecker):
    """Standard checker that, unless told otherwise, is limited to a reduced
    set of filters (the minimal ones plus spacing, doubled-word and spelling
    checks).
    """

    def __init__(self, **kwargs):
        """Fill in default keyword arguments, then defer to the parent.

        A ``checkerconfig`` that is missing or None is replaced by a fresh
        ``CheckerConfig``, and ``reducedconfig`` is merged into whichever
        configuration is used. A ``limitfilters`` that is missing or None is
        replaced by the reduced filter list.
        """
        checkerconfig = kwargs.get("checkerconfig", None)

        if checkerconfig is None:
            checkerconfig = CheckerConfig()
            kwargs["checkerconfig"] = checkerconfig

        limitfilters = kwargs.get("limitfilters", None)

        if limitfilters is None:
            limitfilters = [
                "untranslated",
                "unchanged",
                "blank",
                "doublespacing",
                "doublewords",
                "spellcheck",
            ]
            kwargs["limitfilters"] = limitfilters

        # Merge this checker's own settings rather than MinimalChecker's, so
        # that later additions to either configuration reach the right class.
        checkerconfig.update(reducedconfig)
        super().__init__(**kwargs)
2669
2670
# Default-constructed configuration that TermChecker merges into its checker
# configuration.
termconfig = CheckerConfig()
2672
2673
class TermChecker(StandardChecker):
    """Standard checker with the ``termconfig`` settings merged in."""

    def __init__(self, **kwargs):
        """Apply the terminology settings, then defer to the parent
        constructor.

        A ``checkerconfig`` keyword that is missing or None is replaced by a
        fresh ``CheckerConfig``; a supplied one has ``update`` called on it
        directly.
        """
        config = kwargs.get("checkerconfig")

        if config is None:
            config = kwargs["checkerconfig"] = CheckerConfig()

        config.update(termconfig)
        super().__init__(**kwargs)
2684
2685
class L20nChecker(MozillaChecker):
    """Mozilla checker that skips a set of filters for "complex" units.

    A unit counts as complex when its source or target contains
    ``complex_unit_pattern``.
    """

    # Filters that are not run against complex units.
    excluded_filters_for_complex_units = [
        "escapes",
        "newlines",
        "tabs",
        "singlequoting",
        "doublequoting",
        "doublespacing",
        "brackets",
        "pythonbraceformat",
        "sentencecount",
        "variables",
    ]
    # Substring whose presence marks a unit as complex.
    complex_unit_pattern = "->"

    def __init__(self, **kwargs):
        """Ensure a ``checkerconfig`` keyword exists, then defer to the parent.

        A ``checkerconfig`` that is missing or None is replaced by a fresh
        ``CheckerConfig``.
        """
        checkerconfig = kwargs.get("checkerconfig", None)

        if checkerconfig is None:
            checkerconfig = CheckerConfig()
            kwargs["checkerconfig"] = checkerconfig

        super().__init__(**kwargs)

    def run_filters(self, unit, categorised=False):
        """Run the filters on ``unit``, leaving some out for complex units.

        :param unit: the translation unit to check
        :param categorised: passed through unchanged to the parent's
            ``run_filters``
        :return: whatever the parent's ``run_filters`` returns
        """
        # "or ''" keeps the membership test from raising TypeError when a
        # unit has no source or target text (None).
        is_unit_complex = self.complex_unit_pattern in (
            unit.source or ""
        ) or self.complex_unit_pattern in (unit.target or "")

        if not is_unit_complex:
            return super().run_filters(unit, categorised=categorised)

        saved_default_filters = self.defaultfilters
        self.defaultfilters = {
            key: value
            for (key, value) in saved_default_filters.items()
            if key not in self.excluded_filters_for_complex_units
        }

        try:
            return super().run_filters(unit, categorised=categorised)
        finally:
            # Restore the full filter set even if a filter raised, otherwise
            # every later unit would be checked with the reduced set.
            self.defaultfilters = saved_default_filters
2731
2732
# Settings that IOSChecker merges into its checker configuration: two
# variable styles given as the pairs ("$(", ")") and ("%", "@").
iosconfig = CheckerConfig(
    varmatches=[("$(", ")"), ("%", "@")],
)
2736
2737
class IOSChecker(StandardChecker):
    """Standard checker with the ``iosconfig`` settings merged in."""

    def __init__(self, **kwargs):
        """Apply the iOS settings, then defer to the parent constructor.

        A ``checkerconfig`` keyword that is missing or None is replaced by a
        fresh ``CheckerConfig``; a supplied one has ``update`` called on it
        directly.
        """
        config = kwargs.get("checkerconfig")

        if config is None:
            config = kwargs["checkerconfig"] = CheckerConfig()

        config.update(iosconfig)
        super().__init__(**kwargs)
2748
2749
# Maps a project style name to the checker class implementing it.
# Note that "wx" reuses KdeChecker, and that L20nChecker has no entry here.
projectcheckers = {
    "minimal": MinimalChecker,
    "standard": StandardChecker,
    "reduced": ReducedChecker,
    "openoffice": OpenOfficeChecker,
    "libreoffice": LibreOfficeChecker,
    "mozilla": MozillaChecker,
    "kde": KdeChecker,
    "wx": KdeChecker,
    "gnome": GnomeChecker,
    "creativecommons": CCLicenseChecker,
    "drupal": DrupalChecker,
    "terminology": TermChecker,
    "ios": IOSChecker,
}
2765
2766
class StandardUnitChecker(UnitChecker):
    """The standard set of checks that inspect a translation unit as a whole."""

    @extraction
    def isfuzzy(self, unit):
        """Check if the unit has been marked fuzzy.

        A message that is marked fuzzy in the PO file is extracted. This is
        not the same as the ``--fuzzy`` and ``--nofuzzy`` options, which
        decide whether tests are run against fuzzy messages at all.
        """
        marked_fuzzy = unit.isfuzzy()
        return not marked_fuzzy

    @extraction
    def isreview(self, unit):
        """Check if the unit has been marked review.

        If you have made use of the 'review' flags in your translations::

          # (review) reason for review
          # (pofilter) testname: explanation for translator

        then a message marked for review in the PO file is extracted. This is
        not the same as the ``--review`` and ``--noreview`` options, which
        decide whether tests are run against messages already under review.
        """
        marked_for_review = unit.isreview()
        return not marked_for_review

    @critical
    def nplurals(self, unit):
        """Checks for the correct number of noun forms for plural translations.

        The expected count comes from the plural information in the language
        module of the Translate Toolkit, which is the same as the Gettext
        nplural value. The number of non-empty plural forms supplied in the
        translation must equal that count.
        """
        if not unit.hasplural():
            return True

        expected = self.config.lang.nplurals

        # if we don't have a valid nplurals value, don't run the test
        if expected > 0:
            supplied = [form for form in unit.target.strings if form]
            return len(supplied) == expected

        return True

    @extraction
    def hassuggestion(self, unit):
        """Checks if there is at least one suggested translation for this unit.

        A message with a suggestion (an alternate translation stored in
        alt-trans units in XLIFF and .pending files in PO) is extracted. This
        is used by Pootle and is probably only useful in pofilter when using
        XLIFF files.
        """
        # Make sure the attribute exists on the checker, defaulting to None.
        store = getattr(self, "suggestion_store", None)
        self.suggestion_store = store

        if store:
            found = store.findunits(unit.source)
        elif getattr(unit, "getalttrans", None):
            # TODO: we probably want to filter them somehow
            found = unit.getalttrans()
        else:
            found = []

        return not found
2834
2835
def runtests(str1, str2, ignorelist=()):
    """Run the standard checks on one (source, target) pair.

    Both strings are normalized, wrapped in a translation unit and passed to
    a ``StandardChecker`` that excludes the filters named in ``ignorelist``.
    Every failure is printed, and the failures returned by the checker are
    handed back to the caller.
    """
    from translate.storage import base

    source = data.normalize(str1)
    target = data.normalize(str2)

    unit = base.TranslationUnit(source)
    unit.target = target

    failures = StandardChecker(excludefilters=ignorelist).run_filters(unit)
    report = "failure: %s: %s\n  %r\n  %r"

    for name in failures:
        print(report % (name, failures[name]["message"], source, target))

    return failures
2854
2855
def batchruntests(pairs):
    """Run the standard checks on a batch of string pairs and print a summary.

    :param pairs: a sized collection of (source, target) string pairs
    """
    numpairs = len(pairs)

    # runtests() returns the failures it found, so a pair has passed only
    # when that result is empty.
    passed = sum(1 for str1, str2 in pairs if not runtests(str1, str2))

    print("\ntotal: %d/%d pairs passed" % (passed, numpairs))
2865
2866
# Ad-hoc smoke test: when the module is run as a script, push a few sample
# (source, target) pairs through batchruntests, which prints each failure
# and a final summary line.
if __name__ == "__main__":
    testset = [
        (r"simple", r"somple"),
        (r"\this equals \that", r"does \this equal \that?"),
        (r"this \'equals\' that", r"this 'equals' that"),
        (r" start and end! they must match.", r"start and end! they must match."),
        (
            r"check for matching %variables marked like %this",
            r"%this %variable is marked",
        ),
        (
            r"check for mismatching %variables marked like %this",
            r"%that %variable is marked",
        ),
        (r"check for mismatching %variables% too", r"how many %variable% are marked"),
        (r"%% %%", r"%%"),
        (r"Row: %1, Column: %2", r"Mothalo: %1, Kholomo: %2"),
        (r"simple lowercase", r"it is all lowercase"),
        (r"simple lowercase", r"It Is All Lowercase"),
        (r"Simple First Letter Capitals", r"First Letters"),
        (r"SIMPLE CAPITALS", r"First Letters"),
        (r"SIMPLE CAPITALS", r"ALL CAPITALS"),
        (r"forgot to translate", r"  "),
    ]
    batchruntests(testset)
2892