1# 2# Copyright 2004-2011 Zuza Software Foundation 3# 2013, 2016 F Wolff 4# 5# This file is part of translate. 6# 7# translate is free software; you can redistribute it and/or modify 8# it under the terms of the GNU General Public License as published by 9# the Free Software Foundation; either version 2 of the License, or 10# (at your option) any later version. 11# 12# translate is distributed in the hope that it will be useful, 13# but WITHOUT ANY WARRANTY; without even the implied warranty of 14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15# GNU General Public License for more details. 16# 17# You should have received a copy of the GNU General Public License 18# along with this program; if not, see <http://www.gnu.org/licenses/>. 19 20"""This is a set of validation checks that can be performed on translation 21units. 22 23Derivatives of UnitChecker (like StandardUnitChecker) check translation units, 24and derivatives of TranslationChecker (like StandardChecker) check 25(source, target) translation pairs. 26 27When adding a new test here, please document and explain their behaviour on the 28:doc:`pofilter tests </commands/pofilter_tests>` page. 29""" 30 31import logging 32import re 33 34from translate.filters import decoration, helpers, prefilters, spelling 35from translate.filters.decorators import cosmetic, critical, extraction, functional 36from translate.lang import data, factory 37 38 39logger = logging.getLogger(__name__) 40 41# These are some regular expressions that are compiled for use in some tests 42 43# printf syntax based on http://en.wikipedia.org/wiki/Printf which doesn't 44# cover everything we leave \w instead of specifying the exact letters as 45# this should capture printf types defined in other platforms. 46# Extended to support Python named format specifiers and objective-C special 47# "%@" format specifier 48# (see https://developer.apple.com/library/mac/documentation/Cocoa/Conceptual/Strings/Articles/formatSpecifiers.html) 49printf_pat = re.compile( 50 r""" 51 %( # initial % 52 (?P<boost_ord>\d+)% # boost::format style variable order, like %1% 53 | 54 (?:(?P<ord>\d+)\$| # variable order, like %1$s 55 \((?P<key>\w+)\))? # Python style variables, like %(var)s 56 (?P<fullvar> 57 [+#-]* # flags 58 (?:\d+)? # width 59 (?:\.\d+)? # precision 60 (hh\|h\|l\|ll)? # length formatting 61 (?P<type>[\w@])) # type (%s, %d, etc.) 62 )""", 63 re.VERBOSE, 64) 65 66# The name of the XML tag 67tagname_re = re.compile(r"<[\s]*([\w\/]*).*?(/)?[\s]*>", re.DOTALL) 68 69# We allow escaped quotes, probably for old escaping style of OOo helpcontent 70# TODO: remove escaped strings once usage is audited 71property_re = re.compile(" (\\w*)=((\\\\?\".*?\\\\?\")|(\\\\?'.*?\\\\?'))") 72 73# The whole tag 74tag_re = re.compile("<[^>]+>") 75 76gconf_attribute_re = re.compile('"[a-z_]+?"') 77 78# XML/HTML tags in LibreOffice help and readme, exclude short tags 79lo_tag_re = re.compile("""</?(?P<tag>[a-z][a-z_-]+)(?: +[a-z]+="[^"]+")* */?>""") 80lo_emptytags = frozenset(["br", "embed", "embedvar", "object", "help-id-missing"]) 81 82 83def tagname(string): 84 """Returns the name of the XML/HTML tag in string""" 85 tagname_match = tagname_re.match(string) 86 return tagname_match.groups(1)[0] + tagname_match.groups("")[1] 87 88 89def intuplelist(pair, list): 90 """Tests to see if pair == (a,b,c) is in list, but handles None entries in 91 list as wildcards (only allowed in positions "a" and "c"). We take a 92 shortcut by only considering "c" if "b" has already matched. 93 """ 94 a, b, c = pair 95 96 if (b, c) == (None, None): 97 # This is a tagname 98 return pair 99 100 for pattern in list: 101 x, y, z = pattern 102 103 if (x, y) in [(a, b), (None, b)]: 104 if z in [None, c]: 105 return pattern 106 107 return pair 108 109 110def tagproperties(strings, ignore): 111 """Returns all the properties in the XML/HTML tag string as (tagname, 112 propertyname, propertyvalue), but ignore those combinations specified in 113 ignore. 114 """ 115 properties = [] 116 117 for string in strings: 118 tag = tagname(string) 119 properties += [(tag, None, None)] 120 # Now we isolate the attribute pairs. 121 pairs = property_re.findall(string) 122 123 for property, value, a, b in pairs: 124 # Strip the quotes: 125 value = value[1:-1] 126 127 canignore = False 128 129 if (tag, property, value) in ignore or intuplelist( 130 (tag, property, value), ignore 131 ) != (tag, property, value): 132 canignore = True 133 break 134 135 if not canignore: 136 properties += [(tag, property, value)] 137 138 return properties 139 140 141class FilterFailure(Exception): 142 """This exception signals that a Filter didn't pass, and gives an 143 explanation or a comment. 144 """ 145 146 def __init__(self, messages): 147 if not isinstance(messages, list): 148 messages = [messages] 149 150 assert isinstance(messages[0], str) # Assumption: all of same type 151 152 self.messages = messages 153 154 def __str__(self): 155 return ", ".join(self.messages) 156 157 158class SeriousFilterFailure(FilterFailure): 159 """This exception signals that a Filter didn't pass, and the bad 160 translation might break an application (so the string will be marked 161 fuzzy) 162 """ 163 164 pass 165 166 167# (tag, attribute, value) specifies a certain attribute which can be changed/ 168# ignored if it exists inside tag. In the case where there is a third element 169# in the tuple, it indicates a property value that can be ignored if present 170# (like defaults, for example) 171# If a certain item is None, it indicates that it is relevant for all values of 172# the property/tag that is specified as None. A non-None value of "value" 173# indicates that the value of the attribute must be taken into account. 174common_ignoretags = [(None, "xml-lang", None)] 175common_canchangetags = [ 176 ("img", "alt", None), 177 (None, "title", None), 178 (None, "dir", None), 179 (None, "lang", None), 180] 181# Actually the title tag is allowed on many tags in HTML (but probably not all) 182 183 184class CheckerConfig: 185 """Object representing the configuration of a checker.""" 186 187 def __init__( 188 self, 189 targetlanguage=None, 190 accelmarkers=None, 191 varmatches=None, 192 notranslatewords=None, 193 musttranslatewords=None, 194 validchars=None, 195 punctuation=None, 196 endpunctuation=None, 197 ignoretags=None, 198 canchangetags=None, 199 criticaltests=None, 200 credit_sources=None, 201 ): 202 # Init lists 203 self.accelmarkers = self._init_list(accelmarkers) 204 self.varmatches = self._init_list(varmatches) 205 self.criticaltests = self._init_list(criticaltests) 206 self.credit_sources = self._init_list(credit_sources) 207 208 # Lang data 209 self.updatetargetlanguage(targetlanguage) 210 self.sourcelang = factory.getlanguage("en") 211 212 # Inits with default values 213 self.punctuation = self._init_default( 214 data.normalize(punctuation), self.lang.punctuation 215 ) 216 self.endpunctuation = self._init_default( 217 data.normalize(endpunctuation), self.lang.sentenceend 218 ) 219 self.ignoretags = self._init_default(ignoretags, common_ignoretags) 220 self.canchangetags = self._init_default(canchangetags, common_canchangetags) 221 222 # Other data 223 # TODO: allow user configuration of untranslatable words 224 self.notranslatewords = dict.fromkeys( 225 [data.normalize(key) for key in self._init_list(notranslatewords)] 226 ) 227 self.musttranslatewords = dict.fromkeys( 228 [data.normalize(key) for key in self._init_list(musttranslatewords)] 229 ) 230 validchars = data.normalize(validchars) 231 self.validcharsmap = {} 232 self.updatevalidchars(validchars) 233 234 def _init_list(self, list): 235 """initialise configuration paramaters that are lists 236 237 :type list: List 238 :param list: None (we'll initialise a blank list) or a list paramater 239 :rtype: List 240 """ 241 if list is None: 242 list = [] 243 244 return list 245 246 def _init_default(self, param, default): 247 """Initialise parameters that can have default options. 248 249 :param param: the user supplied paramater value 250 :param default: default values when param is not specified 251 :return: the paramater as specified by the user of the default settings 252 """ 253 if param is None: 254 return default 255 256 return param 257 258 def update(self, otherconfig): 259 """Combines the info in ``otherconfig`` into this config object.""" 260 self.targetlanguage = otherconfig.targetlanguage or self.targetlanguage 261 self.updatetargetlanguage(self.targetlanguage) 262 self.accelmarkers.extend( 263 [c for c in otherconfig.accelmarkers if c not in self.accelmarkers] 264 ) 265 self.varmatches.extend(otherconfig.varmatches) 266 self.notranslatewords.update(otherconfig.notranslatewords) 267 self.musttranslatewords.update(otherconfig.musttranslatewords) 268 self.validcharsmap.update(otherconfig.validcharsmap) 269 self.punctuation += otherconfig.punctuation 270 self.endpunctuation += otherconfig.endpunctuation 271 # TODO: consider also updating in the following cases: 272 self.ignoretags = otherconfig.ignoretags 273 self.canchangetags = otherconfig.canchangetags 274 self.criticaltests.extend(otherconfig.criticaltests) 275 self.credit_sources = otherconfig.credit_sources 276 277 def updatevalidchars(self, validchars): 278 """Updates the map that eliminates valid characters.""" 279 if validchars is None: 280 return True 281 282 validcharsmap = { 283 ord(validchar): None for validchar in data.normalize(validchars) 284 } 285 self.validcharsmap.update(validcharsmap) 286 287 def updatetargetlanguage(self, langcode): 288 """Updates the target language in the config to the given target 289 language and sets its script. 290 """ 291 self.targetlanguage = langcode 292 self.lang = factory.getlanguage(langcode) 293 self.language_script = "" 294 295 for script, langs in data.scripts.items(): 296 if langcode in langs or data.simplercode(langcode) in langs: 297 self.language_script = script 298 break 299 300 301def cache_results(f): 302 def cached_f(self, param1): 303 key = (f.__name__, param1) 304 res_cache = self.results_cache 305 306 if key in res_cache: 307 return res_cache[key] 308 else: 309 value = f(self, param1) 310 res_cache[key] = value 311 return value 312 313 return cached_f 314 315 316class UnitChecker: 317 """Parent Checker class which does the checking based on functions 318 available in derived classes. 319 """ 320 321 preconditions = {} 322 323 def __init__( 324 self, 325 checkerconfig=None, 326 excludefilters=None, 327 limitfilters=None, 328 errorhandler=None, 329 ): 330 self.errorhandler = errorhandler 331 332 #: Categories where each checking function falls into 333 #: Function names are used as keys, categories are the values 334 self.categories = {} 335 336 if checkerconfig is None: 337 self.setconfig(CheckerConfig()) 338 else: 339 self.setconfig(checkerconfig) 340 341 # Exclude functions defined in UnitChecker from being treated as tests. 342 self.helperfunctions = {} 343 344 for functionname in dir(UnitChecker): 345 function = getattr(self, functionname) 346 347 if callable(function): 348 self.helperfunctions[functionname] = function 349 350 self.defaultfilters = self.getfilters(excludefilters, limitfilters) 351 self.results_cache = {} 352 353 def getfilters(self, excludefilters=None, limitfilters=None): 354 """Returns dictionary of available filters, including/excluding those 355 in the given lists. 356 """ 357 filters = {} 358 359 if limitfilters is None: 360 # use everything available unless instructed 361 limitfilters = dir(self) 362 363 if excludefilters is None: 364 excludefilters = {} 365 366 for functionname in limitfilters: 367 368 if functionname in excludefilters: 369 continue 370 371 if functionname in self.helperfunctions: 372 continue 373 374 if functionname == "errorhandler": 375 continue 376 377 filterfunction = getattr(self, functionname, None) 378 if not callable(filterfunction): 379 continue 380 381 filters[functionname] = filterfunction 382 383 return filters 384 385 def setconfig(self, config): 386 """Sets the accelerator list.""" 387 self.config = config 388 self.accfilters = [ 389 prefilters.filteraccelerators(accelmarker) 390 for accelmarker in self.config.accelmarkers 391 ] 392 self.varfilters = [ 393 prefilters.filtervariables(startmatch, endmatch, prefilters.varname) 394 for startmatch, endmatch in self.config.varmatches 395 ] 396 self.removevarfilter = [ 397 prefilters.filtervariables(startmatch, endmatch, prefilters.varnone) 398 for startmatch, endmatch in self.config.varmatches 399 ] 400 401 def setsuggestionstore(self, store): 402 """Sets the filename that a checker should use for evaluating 403 suggestions. 404 """ 405 self.suggestion_store = store 406 407 if self.suggestion_store: 408 self.suggestion_store.require_index() 409 410 def filtervariables(self, str1): 411 """Filter out variables from ``str1``.""" 412 return helpers.multifilter(str1, self.varfilters) 413 414 filtervariables = cache_results(filtervariables) 415 416 def removevariables(self, str1): 417 """Remove variables from ``str1``.""" 418 return helpers.multifilter(str1, self.removevarfilter) 419 420 removevariables = cache_results(removevariables) 421 422 def filteraccelerators(self, str1): 423 """Filter out accelerators from ``str1``.""" 424 return helpers.multifilter(str1, self.accfilters, None) 425 426 filteraccelerators = cache_results(filteraccelerators) 427 428 def filteraccelerators_by_list(self, str1, acceptlist=None): 429 """Filter out accelerators from ``str1``.""" 430 return helpers.multifilter(str1, self.accfilters, acceptlist) 431 432 def filterwordswithpunctuation(self, str1): 433 """Replaces words with punctuation with their unpunctuated 434 equivalents. 435 """ 436 return prefilters.filterwordswithpunctuation(str1) 437 438 filterwordswithpunctuation = cache_results(filterwordswithpunctuation) 439 440 def filterxml(self, str1): 441 """Filter out XML from the string so only text remains.""" 442 return tag_re.sub("", str1) 443 444 filterxml = cache_results(filterxml) 445 446 def run_test(self, test, unit): 447 """Runs the given test on the given unit. 448 449 Note that this can raise a :exc:`FilterFailure` as part of normal operation. 450 """ 451 return test(unit) 452 453 @property 454 def checker_name(self): 455 """Extract checker name, for example 'mozilla' from MozillaChecker.""" 456 return str(self.__class__.__name__).lower()[: -len("checker")] 457 458 def get_ignored_filters(self): 459 """Return checker's additional filters for current language.""" 460 return list( 461 set( 462 self.config.lang.ignoretests.get(self.checker_name, []) 463 + self.config.lang.ignoretests.get("all", []) 464 ) 465 ) 466 467 def run_filters(self, unit, categorised=False): 468 """Run all the tests in this suite. 469 470 :rtype: Dictionary 471 :return: Content of the dictionary is as follows:: 472 473 {'testname': { 'message': message_or_exception, 'category': failure_category } } 474 """ 475 self.results_cache = {} 476 failures = {} 477 ignores = self.get_ignored_filters() 478 functionnames = self.defaultfilters.keys() 479 priorityfunctionnames = self.preconditions.keys() 480 otherfunctionnames = filter( 481 lambda functionname: functionname not in self.preconditions, functionnames 482 ) 483 484 for functionname in list(priorityfunctionnames) + list(otherfunctionnames): 485 if functionname in ignores: 486 continue 487 488 filterfunction = getattr(self, functionname, None) 489 490 # This filterfunction may only be defined on another checker if 491 # using TeeChecker 492 if filterfunction is None: 493 continue 494 495 filtermessage = "" 496 497 try: 498 filterresult = self.run_test(filterfunction, unit) 499 except FilterFailure as e: 500 filterresult = False 501 filtermessage = str(e) 502 except Exception as e: 503 if self.errorhandler is None: 504 raise ValueError( 505 "error in filter %s: %r, %r, %s" 506 % (functionname, unit.source, unit.target, e) 507 ) 508 else: 509 filterresult = self.errorhandler( 510 functionname, unit.source, unit.target, e 511 ) 512 if not filterresult: 513 if not filtermessage: 514 # Should be quite rare 515 import pydoc 516 517 # Strip out unnecessary whitespace from docstring 518 filtermessage = pydoc.getdoc(filterfunction) 519 # We test some preconditions that aren't actually a cause for 520 # failure 521 if functionname in self.defaultfilters: 522 failures[functionname] = { 523 "message": filtermessage, 524 "category": self.categories[functionname], 525 } 526 527 if functionname in self.preconditions: 528 for ignoredfunctionname in self.preconditions[functionname]: 529 ignores.append(ignoredfunctionname) 530 531 self.results_cache = {} 532 533 if not categorised: 534 for name, info in failures.items(): 535 failures[name] = info["message"] 536 return failures 537 538 539class TranslationChecker(UnitChecker): 540 """A checker that passes source and target strings to the checks, not the 541 whole unit. 542 543 This provides some speedup and simplifies testing. 544 """ 545 546 def __init__( 547 self, 548 checkerconfig=None, 549 excludefilters=None, 550 limitfilters=None, 551 errorhandler=None, 552 ): 553 super().__init__(checkerconfig, excludefilters, limitfilters, errorhandler) 554 555 self.locations = [] 556 557 def run_test(self, test, unit): 558 """Runs the given test on the given unit. 559 560 Note that this can raise a :exc:`FilterFailure` as part of normal 561 operation. 562 """ 563 if self.hasplural: 564 filtermessages = [] 565 filterresult = True 566 567 for pluralform in unit.target.strings: 568 try: 569 if not test(self.str1, str(pluralform)): 570 filterresult = False 571 except FilterFailure as e: 572 filterresult = False 573 filtermessages.extend(e.messages) 574 575 if not filterresult and filtermessages: 576 raise FilterFailure(filtermessages) 577 else: 578 return filterresult 579 else: 580 return test(self.str1, self.str2) 581 582 def run_filters(self, unit, categorised=False): 583 """Do some optimisation by caching some data of the unit for the 584 benefit of :meth:`~TranslationChecker.run_test`. 585 """ 586 self.str1 = data.normalize(unit.source) or "" 587 self.str2 = data.normalize(unit.target) or "" 588 self.hasplural = unit.hasplural() 589 self.locations = unit.getlocations() 590 591 return super().run_filters(unit, categorised) 592 593 594class TeeChecker: 595 """A Checker that controls multiple checkers.""" 596 597 #: Categories where each checking function falls into 598 #: Function names are used as keys, categories are the values 599 categories = {} 600 601 def __init__( 602 self, 603 checkerconfig=None, 604 excludefilters=None, 605 limitfilters=None, 606 checkerclasses=None, 607 errorhandler=None, 608 languagecode=None, 609 ): 610 """construct a TeeChecker from the given checkers""" 611 self.limitfilters = limitfilters 612 613 if checkerclasses is None: 614 checkerclasses = [StandardChecker] 615 616 self.checkers = [ 617 checkerclass( 618 checkerconfig=checkerconfig, 619 excludefilters=excludefilters, 620 limitfilters=limitfilters, 621 errorhandler=errorhandler, 622 ) 623 for checkerclass in checkerclasses 624 ] 625 626 if languagecode: 627 for checker in self.checkers: 628 checker.config.updatetargetlanguage(languagecode) 629 630 # Let's hook up the language specific checker 631 lang_checker = self.checkers[0].config.lang.checker 632 633 if lang_checker: 634 self.checkers.append(lang_checker) 635 636 self.combinedfilters = self.getfilters(excludefilters, limitfilters) 637 self.config = checkerconfig or self.checkers[0].config 638 639 def getfilters(self, excludefilters=None, limitfilters=None): 640 """Returns a dictionary of available filters, including/excluding 641 those in the given lists. 642 """ 643 if excludefilters is None: 644 excludefilters = {} 645 646 filterslist = [ 647 checker.getfilters(excludefilters, limitfilters) 648 for checker in self.checkers 649 ] 650 self.combinedfilters = {} 651 652 for filters in filterslist: 653 self.combinedfilters.update(filters) 654 655 # TODO: move this somewhere more sensible (a checkfilters method?) 656 if limitfilters is not None: 657 658 for filtername in limitfilters: 659 660 if filtername not in self.combinedfilters: 661 logger.warning("could not find filter %s", filtername) 662 663 return self.combinedfilters 664 665 def run_filters(self, unit, categorised=False): 666 """Run all the tests in the checker's suites.""" 667 failures = {} 668 669 for checker in self.checkers: 670 failures.update(checker.run_filters(unit, categorised)) 671 672 return failures 673 674 def setsuggestionstore(self, store): 675 """Sets the filename that a checker should use for evaluating 676 suggestions. 677 """ 678 for checker in self.checkers: 679 checker.setsuggestionstore(store) 680 681 682class StandardChecker(TranslationChecker): 683 """The basic test suite for source -> target translations.""" 684 685 @extraction 686 def untranslated(self, str1, str2): 687 """Checks whether a string has been translated at all. 688 689 This check is really only useful if you want to extract untranslated 690 strings so that they can be translated independently of the main work. 691 """ 692 str2 = prefilters.removekdecomments(str2) 693 694 return not (len(str1.strip()) > 0 and len(str2) == 0) 695 696 @functional 697 def unchanged(self, str1, str2): 698 """Checks whether a translation is basically identical to the original 699 string. 700 701 This checks to see if the translation isn’t just a copy of the English 702 original. Sometimes, this is what you want, but other times you will 703 detect words that should have been translated. 704 """ 705 str1 = self.filteraccelerators(self.removevariables(str1)).strip() 706 str2 = self.filteraccelerators(self.removevariables(str2)).strip() 707 708 if len(str1) < 2: 709 return True 710 711 # If the whole string is upperase, or nothing in the string can go 712 # towards uppercase, let's assume there is nothing translatable 713 # TODO: reconsider 714 if (str1.isupper() or str1.upper() == str1) and str1 == str2: 715 return True 716 717 if self.config.notranslatewords: 718 words1 = str1.split() 719 if len(words1) == 1 and [ 720 word for word in words1 if word in self.config.notranslatewords 721 ]: 722 # currently equivalent to: 723 # if len(words1) == 1 and words1[0] in self.config.notranslatewords: 724 # why do we only test for one notranslate word? 725 return True 726 727 # we could also check for things like str1.isnumeric(), but the test 728 # above (str1.upper() == str1) makes this unnecessary 729 if str1.lower() == str2.lower(): 730 raise FilterFailure("Consider translating") 731 732 return True 733 734 @functional 735 def blank(self, str1, str2): 736 """Checks whether a translation is totally blank. 737 738 This will check to see if a translation has inadvertently been 739 translated as blank i.e. as spaces. This is different from untranslated 740 which is completely empty. This test is useful in that if something is 741 translated as " " it will appear to most tools as if it is translated. 742 """ 743 len1 = len(str1.strip()) 744 len2 = len(str2.strip()) 745 746 if len1 > 0 and len(str2) != 0 and len2 == 0: 747 raise FilterFailure("Translation is empty") 748 else: 749 return True 750 751 @functional 752 def short(self, str1, str2): 753 """Checks whether a translation is much shorter than the original 754 string. 755 756 This is most useful in the special case where the translation is 1 757 characters long while the source text is multiple characters long. 758 Otherwise, we use a general ratio that will catch very big differences 759 but is set conservatively to limit the number of false positives. 760 """ 761 len1 = len(str1.strip()) 762 len2 = len(str2.strip()) 763 764 if (len1 > 0) and (0 < len2 < (len1 * 0.1)) or ((len1 > 1) and (len2 == 1)): 765 raise FilterFailure("The translation is much shorter than the original") 766 else: 767 return True 768 769 @functional 770 def long(self, str1, str2): 771 """Checks whether a translation is much longer than the original 772 string. 773 774 This is most useful in the special case where the translation is 775 multiple characters long while the source text is only 1 character 776 long. Otherwise, we use a general ratio that will catch very big 777 differences but is set conservatively to limit the number of false 778 positives. 779 """ 780 len1 = len(str1.strip()) 781 len2 = len(str2.strip()) 782 783 if (len1 > 0) and (0 < len1 < (len2 * 0.1)) or ((len1 == 1) and (len2 > 1)): 784 raise FilterFailure("The translation is much longer than the original") 785 else: 786 return True 787 788 @critical 789 def escapes(self, str1, str2): 790 """Checks whether escaping is consistent between the two strings. 791 792 Checks escapes such as ``\\n`` ``\u0000`` to ensure that if they exist 793 in the original string you also have them in the translation. 794 """ 795 if not helpers.countsmatch(str1, str2, ("\\", "\\\\")): 796 escapes1 = ", ".join("'%s'" % word for word in str1.split() if "\\" in word) 797 escapes2 = ", ".join("'%s'" % word for word in str2.split() if "\\" in word) 798 799 raise SeriousFilterFailure( 800 "Escapes in original (%s) don't match " 801 "escapes in translation (%s)" % (escapes1, escapes2) 802 ) 803 else: 804 return True 805 806 @critical 807 def newlines(self, str1, str2): 808 """Checks whether newlines are consistent between the two strings. 809 810 Counts the number of ``\\n`` newlines (and variants such as ``\\r\\n``) 811 and reports and error if they differ. 812 """ 813 if not helpers.countsmatch(str1, str2, ("\n", "\r")): 814 raise FilterFailure("Different line endings") 815 816 if str1.endswith("\n") and not str2.endswith("\n"): 817 raise FilterFailure("Newlines different at end") 818 819 if str1.startswith("\n") and not str2.startswith("\n"): 820 raise FilterFailure("Newlines different at beginning") 821 822 return True 823 824 @critical 825 def tabs(self, str1, str2): 826 """Checks whether tabs are consistent between the two strings. 827 828 Counts the number of ``\\t`` tab markers and reports an error if they 829 differ. 830 """ 831 if not helpers.countmatch(str1, str2, "\t"): 832 raise SeriousFilterFailure("Different tabs") 833 else: 834 return True 835 836 @cosmetic 837 def singlequoting(self, str1, str2): 838 """Checks whether singlequoting is consistent between the two strings. 839 840 The same as doublequoting but checks for the ``'`` character. Because 841 this is used in contractions like it's and in possessive forms like 842 user's, this test can output spurious errors if your language doesn't 843 use such forms. If a quote appears at the end of a sentence in the 844 translation, i.e. ``'.``, this might not be detected properly by the 845 check. 846 """ 847 str1 = self.filterwordswithpunctuation( 848 self.filteraccelerators(self.filtervariables(str1)) 849 ) 850 str1 = self.config.lang.punctranslate(str1) 851 852 str2 = self.filterwordswithpunctuation( 853 self.filteraccelerators(self.filtervariables(str2)) 854 ) 855 856 if helpers.countsmatch(str1, str2, ("'", "''", "\\'")): 857 return True 858 else: 859 raise FilterFailure("Different quotation marks") 860 861 @cosmetic 862 def doublequoting(self, str1, str2): 863 """Checks whether doublequoting is consistent between the two strings. 864 865 Checks on double quotes ``"`` to ensure that you have the same number 866 in both the original and the translated string. This tests takes into 867 account that several languages use different quoting characters, and 868 will test for them instead. 869 """ 870 str1 = self.filteraccelerators(self.filtervariables(str1)) 871 str1 = self.filterxml(str1) 872 str1 = self.config.lang.punctranslate(str1) 873 874 str2 = self.filteraccelerators(self.filtervariables(str2)) 875 str2 = self.filterxml(str2) 876 877 if helpers.countsmatch(str1, str2, ('"', '""', '\\"', "«", "»", "“", "”")): 878 return True 879 else: 880 raise FilterFailure("Different quotation marks") 881 882 @cosmetic 883 def doublespacing(self, str1, str2): 884 """Checks for bad double-spaces by comparing to original. 885 886 This will identify if you have [space][space] in when you don't have it 887 in the original or it appears in the original but not in your 888 translation. Some of these are spurious and how you correct them 889 depends on the conventions of your language. 890 """ 891 str1 = self.filteraccelerators(str1) 892 str2 = self.filteraccelerators(str2) 893 894 if helpers.countmatch(str1, str2, " "): 895 return True 896 else: 897 raise FilterFailure("Different use of double spaces") 898 899 @cosmetic 900 def puncspacing(self, str1, str2): 901 """Checks for bad spacing after punctuation. 902 903 In the case of [full-stop][space] in the original, this test checks 904 that your translation does not remove the space. It checks also for 905 [comma], [colon], etc. 906 907 Some languages don't use spaces after common punctuation marks, 908 especially where full-width punctuation marks are used. This check will 909 take that into account. 910 """ 911 # Convert all nbsp to space, and just check spaces. Useful intermediate 912 # step to stricter nbsp checking? 913 str1 = self.filteraccelerators(self.filtervariables(str1)) 914 str1 = self.config.lang.punctranslate(str1) 915 str1 = str1.replace("\u00a0", " ") 916 917 if str1.find(" ") == -1: 918 return True 919 920 str2 = self.filteraccelerators(self.filtervariables(str2)) 921 # Substitute: nbsp 922 str2 = str2.replace("\u00a0", " ") 923 # Strip: Bidi markers and ZW* chars 924 str2 = str2.translate( 925 { 926 ord(c): None 927 for c in ( 928 # Bidi markers 929 "\u200e", # LRM 930 "\u200f", # RLM 931 "\u202b", # RLE 932 "\u202a", # LRE 933 "\u202e", # RLO 934 "\u202d", # LRO 935 "\u202c", # PDF 936 "\u2069", # PDI 937 "\u2068", # FSI 938 "\u2067", # RLI 939 "\u2066", # LRI 940 # ZW* 941 "\u200d", # ZWJ 942 "\u200c", # ZWNJ 943 ) 944 } 945 ) 946 947 for puncchar in self.config.punctuation: 948 plaincount1 = str1.count(puncchar) 949 950 if not plaincount1: 951 continue 952 953 plaincount2 = str2.count(puncchar) 954 955 if plaincount1 != plaincount2: 956 continue 957 958 spacecount1 = str1.count(puncchar + " ") 959 spacecount2 = str2.count(puncchar + " ") 960 961 if spacecount1 != spacecount2: 962 # Handle extra spaces that are because of transposed punctuation 963 964 if abs(spacecount1 - spacecount2) == 1 and str1.endswith( 965 puncchar 966 ) != str2.endswith(puncchar): 967 continue 968 969 raise FilterFailure("Different spacing around punctuation") 970 971 return True 972 973 @critical 974 def printf(self, str1, str2): 975 """Checks whether printf format strings match. 976 977 If the printf formatting variables are not identical, then this will 978 indicate an error. Printf statements are used by programs to format 979 output in a human readable form (they are placeholders for variable 980 data). They allow you to specify lengths of string variables, string 981 padding, number padding, precision, etc. Generally they will look like 982 this: ``%d``, ``%5.2f``, ``%100s``, etc. The test can also manage 983 variables-reordering using the ``%1$s`` syntax. The variables' type and 984 details following data are tested to ensure that they are strictly 985 identical, but they may be reordered. 986 987 See also `printf Format String 988 <http://en.wikipedia.org/wiki/Printf_format_string>`_. 989 """ 990 count1 = count2 = plural = None 991 992 # self.hasplural only set by run_filters, not always available 993 if "hasplural" in self.__dict__: 994 plural = self.hasplural 995 996 for var_num2, match2 in enumerate(printf_pat.finditer(str2)): 997 count2 = var_num2 + 1 998 str2ord = ( 999 match2.group("ord") 1000 if not match2.group("boost_ord") 1001 else match2.group("boost_ord") 1002 ) 1003 str2key = match2.group("key") 1004 str2fullvar = ( 1005 match2.group("fullvar") if not match2.group("boost_ord") else "%" 1006 ) 1007 1008 if str2ord: 1009 str1ord = None 1010 gotmatch = False 1011 1012 for var_num1, match1 in enumerate(printf_pat.finditer(str1)): 1013 count1 = var_num1 + 1 1014 localstr1ord = ( 1015 match1.group("ord") 1016 if not match1.group("boost_ord") 1017 else match1.group("boost_ord") 1018 ) 1019 1020 if localstr1ord: 1021 if str2ord == localstr1ord: 1022 str1ord = str2ord 1023 str1fullvar = ( 1024 match1.group("fullvar") 1025 if not match1.group("boost_ord") 1026 else "%" 1027 ) 1028 1029 if str2fullvar == str1fullvar: 1030 gotmatch = True 1031 elif int(str2ord) == var_num1 + 1: 1032 str1ord = str2ord 1033 str1fullvar = ( 1034 match1.group("fullvar") 1035 if not match1.group("boost_ord") 1036 else "%" 1037 ) 1038 1039 if str2fullvar == str1fullvar: 1040 gotmatch = True 1041 1042 if str1ord is None: 1043 raise FilterFailure("Added printf variable: %s" % match2.group()) 1044 1045 if not gotmatch: 1046 raise FilterFailure( 1047 "Different printf variable: %s" % match2.group() 1048 ) 1049 elif str2key: 1050 str1key = None 1051 1052 for var_num1, match1 in enumerate(printf_pat.finditer(str1)): 1053 count1 = var_num1 + 1 1054 str1fullvar = ( 1055 match1.group("fullvar") 1056 if not match1.group("boost_ord") 1057 else "%" 1058 ) 1059 1060 if match1.group("key") and str2key == match1.group("key"): 1061 str1key = match1.group("key") 1062 1063 # '%.0s' "placeholder" in plural will match anything 1064 if plural and str2fullvar == ".0s": 1065 continue 1066 1067 if str1fullvar != str2fullvar: 1068 raise FilterFailure( 1069 "Different printf variable: %s" % match2.group() 1070 ) 1071 1072 if str1key is None: 1073 raise FilterFailure("Added printf variable: %s" % match2.group()) 1074 else: 1075 for var_num1, match1 in enumerate(printf_pat.finditer(str1)): 1076 count1 = var_num1 + 1 1077 str1fullvar = ( 1078 match1.group("fullvar") 1079 if not match1.group("boost_ord") 1080 else "%" 1081 ) 1082 1083 # '%.0s' "placeholder" in plural will match anything 1084 if plural and str2fullvar == ".0s": 1085 continue 1086 1087 if (var_num1 == var_num2) and (str1fullvar != str2fullvar): 1088 raise FilterFailure( 1089 "Different printf variable: %s" % match2.group() 1090 ) 1091 1092 if count2 is None: 1093 str1_variables = list(m.group() for m in printf_pat.finditer(str1)) 1094 1095 if str1_variables: 1096 raise FilterFailure( 1097 "Missing printf variable: %s" % ", ".join(str1_variables) 1098 ) 1099 1100 if (count1 or count2) and (count1 != count2): 1101 raise FilterFailure("Different number of printf variables") 1102 1103 return 1 1104 1105 @critical 1106 def pythonbraceformat(self, str1, str2): 1107 """Checks whether python brace format strings match.""" 1108 1109 # Helper function 1110 def max_anons(anons): 1111 """ 1112 Takes a list of anonymous placeholder variables, e.g. 1113 ['', '1', ...] 1114 Determines how many anonymous formatting args the string 1115 they come from requires. Motivation for this function: 1116 * max_anons(vars_from_original) tells us how many 1117 anonymous placeholders are supported (at least). 1118 * max_anons(vars_from_translation) should not 1119 exceed it. 1120 """ 1121 1122 # implicit_n: you need at least as many anonymous args as 1123 # there are anonymous placeholders. 1124 implicit_n = anons.count("") 1125 # explicit_n: you need at least as many anonymous args as 1126 # the highest '{99}'-style placeholder. (The `+ 1` is to 1127 # correct for 0-indexing) 1128 try: 1129 explicit_n = max( 1130 int(numbered_anon) + 1 1131 for numbered_anon in anons 1132 if len(numbered_anon) >= 1 1133 ) 1134 except ValueError: 1135 explicit_n = 0 1136 1137 highest_n = max(implicit_n, explicit_n) 1138 1139 return highest_n 1140 1141 messages = [] 1142 # Possible failure states: 0 = ok, 1 = mild, 2 = serious 1143 STATE_OK, STATE_MILD, STATE_SERIOUS = 0, 1, 2 1144 failure_state = STATE_OK 1145 pythonbraceformat_pat = re.compile("{[^}]*}") 1146 data1 = {} 1147 data2 = {} 1148 1149 # Populate the data1 and data2 dicts. 1150 for data_, str_ in [(data1, str1), (data2, str2)]: 1151 # Remove all escaped braces {{ and }} 1152 data_["strclean"] = re.sub("{{|}}", "", str_) 1153 data_["allvars"] = pythonbraceformat_pat.findall(data_["strclean"]) 1154 data_["anonvars"] = [ 1155 var[1:-1] for var in data_["allvars"] if re.match(r"^{[0-9]*}$", var) 1156 ] 1157 data_["namedvars"] = [ 1158 var for var in data_["allvars"] if not re.match(r"^{[0-9]*}$", var) 1159 ] 1160 1161 max1 = max_anons(data1["anonvars"]) 1162 max2 = max_anons(data2["anonvars"]) 1163 1164 if max1 == max2: 1165 pass 1166 elif max1 < max2: 1167 failure_state = max(failure_state, STATE_SERIOUS) 1168 messages.append( 1169 "Translation requires %s anonymous formatting args, original only %s" 1170 % (max2, max1) 1171 ) 1172 else: 1173 failure_state = max(failure_state, STATE_MILD) 1174 messages.append( 1175 "Highest anonymous placeholder in original is %s, in translation %s" 1176 % (max1, max2) 1177 ) 1178 1179 if set(data1["namedvars"]) == set(data2["namedvars"]): 1180 pass 1181 1182 extra_in_2 = set(data2["namedvars"]).difference(set(data1["namedvars"])) 1183 if 0 < len(extra_in_2): 1184 failure_state = max(failure_state, STATE_SERIOUS) 1185 messages.append( 1186 "Unknown named placeholders in translation: %s" % ", ".join(extra_in_2) 1187 ) 1188 1189 extra_in_1 = set(data1["namedvars"]).difference(set(data2["namedvars"])) 1190 if 0 < len(extra_in_1): 1191 failure_state = max(failure_state, STATE_MILD) 1192 messages.append( 1193 "Named placeholders absent in translation: %s" % ", ".join(extra_in_1) 1194 ) 1195 1196 if failure_state == STATE_OK: 1197 return 1 1198 elif failure_state == STATE_MILD: 1199 raise FilterFailure(messages) 1200 elif failure_state == STATE_SERIOUS: 1201 raise SeriousFilterFailure(messages) 1202 else: 1203 raise ValueError( 1204 "Something wrong in python brace checks: unreachable state reached" 1205 ) 1206 1207 @functional 1208 def accelerators(self, str1, str2): 1209 """Checks whether accelerators are consistent between the two strings. 1210 1211 This test is capable of checking the different type of accelerators 1212 that are used in different projects, like Mozilla or KDE. The test will 1213 pick up accelerators that are missing and ones that shouldn't be there. 1214 1215 See `accelerators on the localization guide 1216 <http://docs.translatehouse.org/projects/localization-guide/en/latest/guide/translation/accelerators.html>`_ 1217 for a full description on accelerators. 1218 """ 1219 str1 = self.filtervariables(str1) 1220 str2 = self.filtervariables(str2) 1221 messages = [] 1222 1223 for accelmarker in self.config.accelmarkers: 1224 counter1 = decoration.countaccelerators( 1225 accelmarker, self.config.sourcelang.validaccel 1226 ) 1227 counter2 = decoration.countaccelerators( 1228 accelmarker, self.config.lang.validaccel 1229 ) 1230 count1, countbad1 = counter1(str1) 1231 count2, countbad2 = counter2(str2) 1232 getaccel = decoration.getaccelerators( 1233 accelmarker, self.config.lang.validaccel 1234 ) 1235 accel2, bad2 = getaccel(str2) 1236 1237 if count1 == count2: 1238 continue 1239 1240 if count1 == 1 and count2 == 0: 1241 if countbad2 == 1: 1242 messages.append( 1243 "Accelerator '%s' appears before an invalid " 1244 "accelerator character '%s'" % (accelmarker, bad2[0]) 1245 ) 1246 else: 1247 messages.append("Missing accelerator '%s'" % accelmarker) 1248 elif count1 == 0: 1249 messages.append("Added accelerator '%s'" % accelmarker) 1250 elif count1 == 1 and count2 > count1: 1251 messages.append( 1252 "Accelerator '%s' is repeated in translation" % accelmarker 1253 ) 1254 else: 1255 messages.append( 1256 "Accelerator '%s' occurs %d time(s) in original " 1257 "and %d time(s) in translation" % (accelmarker, count1, count2) 1258 ) 1259 1260 if messages: 1261 if "accelerators" in self.config.criticaltests: 1262 raise SeriousFilterFailure(messages) 1263 else: 1264 raise FilterFailure(messages) 1265 1266 return True 1267 1268 # def acceleratedvariables(self, str1, str2): 1269 # """checks that no variables are accelerated""" 1270 # messages = [] 1271 # for accelerator in self.config.accelmarkers: 1272 # for variablestart, variableend in self.config.varmatches: 1273 # error = accelerator + variablestart 1274 # if str1.find(error) >= 0: 1275 # messages.append("original has an accelerated variable") 1276 # if str2.find(error) >= 0: 1277 # messages.append("translation has an accelerated variable") 1278 # if messages: 1279 # raise FilterFailure(messages) 1280 # return True 1281 1282 @critical 1283 def variables(self, str1, str2): 1284 """Checks whether variables of various forms are consistent between the 1285 two strings. 1286 1287 This checks to make sure that variables that appear in the original 1288 also appear in the translation. It can handle variables from projects 1289 like KDE or OpenOffice. It does not at the moment cope with variables 1290 that use the reordering syntax of Gettext PO files. 1291 """ 1292 messages = [] 1293 mismatch1, mismatch2 = [], [] 1294 varnames1, varnames2 = [], [] 1295 1296 for startmarker, endmarker in self.config.varmatches: 1297 varchecker = decoration.getvariables(startmarker, endmarker) 1298 1299 if startmarker and endmarker: 1300 if isinstance(endmarker, int): 1301 redecorate = lambda var: startmarker + var 1302 else: 1303 redecorate = lambda var: startmarker + var + endmarker 1304 elif startmarker: 1305 redecorate = lambda var: startmarker + var 1306 else: 1307 redecorate = lambda var: var 1308 1309 vars1 = varchecker(str1) 1310 vars2 = varchecker(str2) 1311 1312 if vars1 != vars2: 1313 # we use counts to compare so we can handle multiple variables 1314 vars1, vars2 = [ 1315 var for var in vars1 if vars1.count(var) > vars2.count(var) 1316 ], [var for var in vars2 if vars1.count(var) < vars2.count(var)] 1317 # filter variable names we've already seen, so they aren't 1318 # matched by more than one filter... 1319 vars1, vars2 = [var for var in vars1 if var not in varnames1], [ 1320 var for var in vars2 if var not in varnames2 1321 ] 1322 varnames1.extend(vars1) 1323 varnames2.extend(vars2) 1324 vars1 = map(redecorate, vars1) 1325 vars2 = map(redecorate, vars2) 1326 mismatch1.extend(vars1) 1327 mismatch2.extend(vars2) 1328 1329 if mismatch1: 1330 messages.append("Do not translate: %s" % ", ".join(mismatch1)) 1331 elif mismatch2: 1332 messages.append("Added variables: %s" % ", ".join(mismatch2)) 1333 1334 if messages and mismatch1: 1335 raise SeriousFilterFailure(messages) 1336 elif messages: 1337 raise FilterFailure(messages) 1338 1339 return True 1340 1341 @functional 1342 def functions(self, str1, str2): 1343 """Checks that function names are not translated. 1344 1345 Checks that function names e.g. ``rgb()`` or ``getEntity.Name()`` are 1346 not translated. 1347 """ 1348 # We can't just use helpers.funcmatch() since it doesn't ignore order 1349 if not set(decoration.getfunctions(str1)).symmetric_difference( 1350 set(decoration.getfunctions(str2)) 1351 ): 1352 return True 1353 else: 1354 raise FilterFailure("Different functions") 1355 1356 @functional 1357 def emails(self, str1, str2): 1358 """Checks that emails are not translated. 1359 1360 Generally you should not be translating email addresses. This check 1361 will look to see that email addresses e.g. ``info@example.com`` are not 1362 translated. In some cases of course you should translate the address 1363 but generally you shouldn't. 1364 """ 1365 if helpers.funcmatch(str1, str2, decoration.getemails): 1366 return True 1367 else: 1368 raise FilterFailure("Different e-mails") 1369 1370 @functional 1371 def urls(self, str1, str2): 1372 """Checks that URLs are not translated. 1373 1374 This checks only basic URLs (http, ftp, mailto etc.) not all URIs (e.g. 1375 afp, smb, file). Generally, you don't want to translate URLs, unless 1376 they are example URLs (http://your_server.com/filename.html). If the 1377 URL is for configuration information, then you need to query the 1378 developers about placing configuration information in PO files. It 1379 shouldn't really be there, unless it is very clearly marked: such 1380 information should go into a configuration file. 1381 """ 1382 if helpers.funcmatch(str1, str2, decoration.geturls): 1383 return True 1384 else: 1385 raise FilterFailure("Different URLs") 1386 1387 @functional 1388 def numbers(self, str1, str2): 1389 """Checks whether numbers of various forms are consistent between the 1390 two strings. 1391 1392 You will see some errors where you have either written the number in 1393 full or converted it to the digit in your translation. Also changes in 1394 order will trigger this error. 1395 """ 1396 str1 = self.config.lang.numbertranslate(str1) 1397 1398 if helpers.countsmatch(str1, str2, decoration.getnumbers(str1)): 1399 return True 1400 else: 1401 raise FilterFailure("Different numbers") 1402 1403 @cosmetic 1404 def startwhitespace(self, str1, str2): 1405 """Checks whether whitespace at the beginning of the strings matches. 1406 1407 As in endwhitespace but you will see fewer errors. 1408 """ 1409 if helpers.funcmatch(str1, str2, decoration.spacestart): 1410 return True 1411 else: 1412 raise FilterFailure("Different whitespace at the start") 1413 1414 @cosmetic 1415 def endwhitespace(self, str1, str2): 1416 """Checks whether whitespace at the end of the strings matches. 1417 1418 Operates the same as endpunc but is only concerned with whitespace. 1419 This filter is particularly useful for those strings which will 1420 evidently be followed by another string in the program, e.g. 1421 [Password: ] or [Enter your username: ]. The whitespace is an inherent 1422 part of the string. This filter makes sure you don't miss those 1423 important but otherwise invisible spaces! 1424 1425 If your language uses full-width punctuation (like Chinese), the visual 1426 spacing in the character might be enough without an added extra space. 1427 """ 1428 str1 = self.config.lang.punctranslate(str1) 1429 1430 if helpers.funcmatch(str1, str2, decoration.spaceend): 1431 return True 1432 else: 1433 raise FilterFailure("Different whitespace at the end") 1434 1435 @cosmetic 1436 def startpunc(self, str1, str2): 1437 """Checks whether punctuation at the beginning of the strings match. 1438 1439 Operates as endpunc but you will probably see fewer errors. 1440 """ 1441 str1 = self.filterxml( 1442 self.filterwordswithpunctuation( 1443 self.filteraccelerators(self.filtervariables(str1)) 1444 ) 1445 ) 1446 str1 = self.config.lang.punctranslate(str1) 1447 str2 = self.filterxml( 1448 self.filterwordswithpunctuation( 1449 self.filteraccelerators(self.filtervariables(str2)) 1450 ) 1451 ) 1452 1453 if helpers.funcmatch(str1, str2, decoration.puncstart, self.config.punctuation): 1454 return True 1455 else: 1456 raise FilterFailure("Different punctuation at the start") 1457 1458 @cosmetic 1459 def endpunc(self, str1, str2): 1460 """Checks whether punctuation at the end of the strings match. 1461 1462 This will ensure that the ending of your translation has the same 1463 punctuation as the original. E.g. if it ends in :[space] then so should 1464 yours. It is useful for ensuring that you have ellipses [...] in all 1465 your translations, not simply three separate full-stops. You may pick 1466 up some errors in the original: feel free to keep your translation and 1467 notify the programmers. In some languages, characters such as ``?`` or 1468 ``!`` are always preceded by a space e.g. [space]? — do what your 1469 language customs dictate. Other false positives you will notice are, 1470 for example, if through changes in word-order you add "), etc. at the 1471 end of the sentence. Do not change these: your language word-order 1472 takes precedence. 1473 1474 It must be noted that if you are tempted to leave out [full-stop] or 1475 [colon] or add [full-stop] to a sentence, that often these have been 1476 done for a reason, e.g. a list where fullstops make it look cluttered. 1477 So, initially match them with the English, and make changes once the 1478 program is being used. 1479 1480 This check is aware of several language conventions for punctuation 1481 characters, such as the custom question marks for Greek and Arabic, 1482 Devanagari Danda, full-width punctuation for CJK languages, etc. 1483 Support for your language can be added easily if it is not there yet. 1484 """ 1485 str1 = self.filtervariables(str1) 1486 str1 = self.config.lang.punctranslate(str1) 1487 str2 = self.filtervariables(str2) 1488 str1 = str1.rstrip() 1489 str2 = str2.rstrip() 1490 1491 if helpers.funcmatch( 1492 str1, str2, decoration.puncend, self.config.endpunctuation + ":" 1493 ): 1494 return True 1495 else: 1496 raise FilterFailure("Different punctuation at the end") 1497 1498 @functional 1499 def purepunc(self, str1, str2): 1500 """Checks that strings that are purely punctuation are not changed. 1501 1502 This extracts strings like ``+`` or ``-`` as these usually should not 1503 be changed. 1504 """ 1505 # this test is a subset of startandend 1506 if decoration.ispurepunctuation(str1): 1507 success = str1 == str2 1508 else: 1509 success = not decoration.ispurepunctuation(str2) 1510 1511 if success: 1512 return True 1513 else: 1514 raise FilterFailure("Consider not translating punctuation") 1515 1516 @cosmetic 1517 def brackets(self, str1, str2): 1518 """Checks that the number of brackets in both strings match. 1519 1520 If ``([{`` or ``}])`` appear in the original this will check that the 1521 same number appear in the translation. 1522 """ 1523 str1 = self.filtervariables(str1) 1524 str2 = self.filtervariables(str2) 1525 1526 messages = [] 1527 missing = [] 1528 extra = [] 1529 1530 for bracket in ("[", "]", "{", "}", "(", ")"): 1531 count1 = str1.count(bracket) 1532 count2 = str2.count(bracket) 1533 1534 if count2 < count1: 1535 missing.append("'%s'" % bracket) 1536 elif count2 > count1: 1537 extra.append("'%s'" % bracket) 1538 1539 if missing: 1540 messages.append("Missing %s" % ", ".join(missing)) 1541 1542 if extra: 1543 messages.append("Added %s" % ", ".join(extra)) 1544 1545 if messages: 1546 raise FilterFailure(messages) 1547 1548 return True 1549 1550 @functional 1551 def sentencecount(self, str1, str2): 1552 """Checks that the number of sentences in both strings match. 1553 1554 Adds the number of sentences to see that the sentence count is the same 1555 between the original and translated string. You may not always want to 1556 use this test, if you find you often need to reformat your translation, 1557 because the original is badly-expressed, or because the structure of 1558 your language works better that way. Do what works best for your 1559 language: it's the meaning of the original you want to convey, not the 1560 exact way it was written in the English. 1561 """ 1562 str1 = self.filteraccelerators(str1) 1563 str2 = self.filteraccelerators(str2) 1564 1565 sentences1 = len(self.config.sourcelang.sentences(str1)) 1566 sentences2 = len(self.config.lang.sentences(str2)) 1567 1568 if not sentences1 == sentences2: 1569 raise FilterFailure( 1570 "Different number of sentences: " "%d ≠ %d" % (sentences1, sentences2) 1571 ) 1572 1573 return True 1574 1575 @functional 1576 def options(self, str1, str2): 1577 """Checks that command line options are not translated. 1578 1579 In messages that contain command line options, such as ``--help``, 1580 this test will check that these remain untranslated. These could be 1581 translated in the future if programs can create a mechanism to allow 1582 this, but currently they are not translated. If the options has a 1583 parameter, e.g. ``--file=FILE``, then the test will check that the 1584 parameter has been translated. 1585 """ 1586 str1 = self.filtervariables(str1) 1587 1588 for word1 in str1.split(): 1589 if word1 != "--" and word1.startswith("--") and word1[-1].isalnum(): 1590 parts = word1.split("=") 1591 1592 if not parts[0] in str2: 1593 raise FilterFailure("Missing or translated option '%s'" % parts[0]) 1594 1595 if len(parts) > 1 and parts[1] in str2: 1596 raise FilterFailure( 1597 "Consider translating parameter " 1598 "'%(param)s' of option '%(option)s'" 1599 % {"param": parts[1], "option": parts[0]} 1600 ) 1601 1602 return True 1603 1604 @cosmetic 1605 def startcaps(self, str1, str2): 1606 """Checks that the message starts with the correct capitalisation. 1607 1608 After stripping whitespace and common punctuation characters, it then 1609 checks to see that the first remaining character is correctly 1610 capitalised. So, if the sentence starts with an upper-case letter, and 1611 the translation does not, an error is produced. 1612 1613 This check is entirely disabled for many languages that don't make a 1614 distinction between upper and lower case. Contact us if this is not yet 1615 disabled for your language. 1616 """ 1617 str1 = self.filteraccelerators(str1) 1618 str2 = self.filteraccelerators(str2) 1619 1620 if len(str1) > 1 and len(str2) > 1: 1621 if self.config.sourcelang.capsstart(str1) == self.config.lang.capsstart( 1622 str2 1623 ): 1624 return True 1625 elif self.config.sourcelang.numstart(str1) or self.config.lang.numstart( 1626 str2 1627 ): 1628 return True 1629 else: 1630 raise FilterFailure("Different capitalization at the start") 1631 1632 if len(str1) == 0 and len(str2) == 0: 1633 return True 1634 1635 if len(str1) == 0 or len(str2) == 0: 1636 raise FilterFailure("Different capitalization at the start") 1637 1638 return True 1639 1640 @cosmetic 1641 def simplecaps(self, str1, str2): 1642 """Checks the capitalisation of two strings isn't wildly different. 1643 1644 This will pick up many false positives, so don't be a slave to it. It 1645 is useful for identifying translations that don't start with a capital 1646 letter (upper-case letter) when they should, or those that do when they 1647 shouldn't. It will also highlight sentences that have extra capitals; 1648 depending on the capitalisation convention of your language, you might 1649 want to change these to Title Case, or change them all to normal 1650 sentence case. 1651 """ 1652 str1 = self.removevariables(str1) 1653 str2 = self.removevariables(str2) 1654 # TODO: review this. The 'I' is specific to English, so it probably 1655 # serves no purpose to get sourcelang.sentenceend 1656 str1 = re.sub("[^%s]( I )" % self.config.sourcelang.sentenceend, " i ", str1) 1657 1658 capitals1 = helpers.filtercount(str1, str.isupper) 1659 capitals2 = helpers.filtercount(str2, str.isupper) 1660 1661 alpha1 = helpers.filtercount(str1, str.isalpha) 1662 alpha2 = helpers.filtercount(str2, str.isalpha) 1663 1664 # Capture the all caps case 1665 if capitals1 == alpha1: 1666 if capitals2 == alpha2: 1667 return True 1668 else: 1669 raise FilterFailure("Different capitalization") 1670 1671 # some heuristic tests to try and see that the style of capitals is 1672 # vaguely the same 1673 if capitals1 == 0 or capitals1 == 1: 1674 success = capitals2 == capitals1 1675 elif capitals1 < len(str1) / 10: 1676 success = capitals2 <= len(str2) / 8 1677 elif len(str1) < 10: 1678 success = abs(capitals1 - capitals2) < 3 1679 elif capitals1 > len(str1) * 6 / 10: 1680 success = capitals2 > len(str2) * 6 / 10 1681 else: 1682 success = abs(capitals1 - capitals2) < (len(str1) + len(str2)) / 6 1683 1684 if success: 1685 return True 1686 else: 1687 raise FilterFailure("Different capitalization") 1688 1689 @functional 1690 def acronyms(self, str1, str2): 1691 """Checks that acronyms that appear are unchanged. 1692 1693 If an acronym appears in the original this test will check that it 1694 appears in the translation. Translating acronyms is a language decision 1695 but many languages leave them unchanged. In that case this test is 1696 useful for tracking down translations of the acronym and correcting 1697 them. 1698 """ 1699 acronyms = [] 1700 allowed = [] 1701 1702 for startmatch, endmatch in self.config.varmatches: 1703 allowed += decoration.getvariables(startmatch, endmatch)(str1) 1704 1705 allowed += self.config.musttranslatewords.keys() 1706 str1 = self.filteraccelerators(self.filtervariables(str1)) 1707 iter = self.config.lang.word_iter(str1) 1708 str2 = self.filteraccelerators(self.filtervariables(str2)) 1709 1710 # TODO: strip XML? - should provide better error messsages 1711 # see mail/chrome/messanger/smime.properties.po 1712 # TODO: consider limiting the word length for recognising acronyms to 1713 # something like 5/6 characters 1714 for word in iter: 1715 if word.isupper() and len(word) > 1 and word not in allowed: 1716 if str2.find(word) == -1: 1717 acronyms.append(word) 1718 1719 if acronyms: 1720 raise FilterFailure( 1721 "Consider not translating acronyms: %s" % ", ".join(acronyms) 1722 ) 1723 1724 return True 1725 1726 @cosmetic 1727 def doublewords(self, str1, str2): 1728 """Checks for repeated words in the translation. 1729 1730 Words that have been repeated in a translation will be highlighted with 1731 this test e.g. "the the", "a a". These are generally typos that need 1732 correcting. Some languages may have valid repeated words in their 1733 structure, in that case either ignore those instances or switch this 1734 test off. 1735 """ 1736 lastword = "" 1737 without_newlines = "\n".join(str2.split("\n")) 1738 words = ( 1739 self.filteraccelerators( 1740 self.removevariables(self.filterxml(without_newlines)) 1741 ) 1742 .replace(".", "") 1743 .lower() 1744 .split() 1745 ) 1746 1747 for word in words: 1748 if word == lastword and word not in self.config.lang.validdoublewords: 1749 raise FilterFailure("The word '%s' is repeated" % word) 1750 lastword = word 1751 1752 return True 1753 1754 @functional 1755 def notranslatewords(self, str1, str2): 1756 """Checks that words configured as untranslatable appear in the 1757 translation too. 1758 1759 Many brand names should not be translated, this test allows you to 1760 easily make sure that words like: Word, Excel, Impress, Calc, etc. are 1761 not translated. You must specify a file containing all of the 1762 *no translate* words using ``--notranslatefile``. 1763 """ 1764 if not self.config.notranslatewords: 1765 return True 1766 1767 str1 = self.filtervariables(str1) 1768 str2 = self.filtervariables(str2) 1769 1770 # The above is full of strange quotes and things in utf-8 encoding. 1771 # single apostrophe perhaps problematic in words like "doesn't" 1772 for seperator in self.config.punctuation: 1773 str1 = str1.replace(seperator, " ") 1774 str2 = str2.replace(seperator, " ") 1775 1776 words1 = self.filteraccelerators(str1).split() 1777 words2 = self.filteraccelerators(str2).split() 1778 stopwords = [ 1779 word 1780 for word in words1 1781 if word in self.config.notranslatewords and word not in words2 1782 ] 1783 1784 if stopwords: 1785 raise FilterFailure("Do not translate: %s" % (", ".join(stopwords))) 1786 1787 return True 1788 1789 @functional 1790 def musttranslatewords(self, str1, str2): 1791 """Checks that words configured as definitely translatable don't appear 1792 in the translation. 1793 1794 If for instance in your language you decide that you must translate 1795 'OK' then this test will flag any occurrences of 'OK' in the 1796 translation if it appeared in the source string. You must specify a 1797 file containing all of the *must translate* words using 1798 ``--musttranslatefile``. 1799 """ 1800 if not self.config.musttranslatewords: 1801 return True 1802 1803 str1 = self.removevariables(str1) 1804 str2 = self.removevariables(str2) 1805 1806 # The above is full of strange quotes and things in utf-8 encoding. 1807 # single apostrophe perhaps problematic in words like "doesn't" 1808 for seperator in self.config.punctuation: 1809 str1 = str1.replace(seperator, " ") 1810 str2 = str2.replace(seperator, " ") 1811 1812 words1 = self.filteraccelerators(str1).split() 1813 words2 = self.filteraccelerators(str2).split() 1814 stopwords = [ 1815 word 1816 for word in words1 1817 if word.lower() in self.config.musttranslatewords and word in words2 1818 ] 1819 1820 if stopwords: 1821 raise FilterFailure("Please translate: %s" % (", ".join(stopwords))) 1822 1823 return True 1824 1825 @cosmetic 1826 def validchars(self, str1, str2): 1827 """Checks that only characters specified as valid appear in the 1828 translation. 1829 1830 Often during character conversion to and from UTF-8 you get some 1831 strange characters appearing in your translation. This test presents a 1832 simple way to try and identify such errors. 1833 1834 This test will only run of you specify the ``--validcharsfile`` command 1835 line option. This file contains all the characters that are valid in 1836 your language. You must use UTF-8 encoding for the characters in the 1837 file. 1838 1839 If the test finds any characters not in your valid characters file then 1840 the test will print the character together with its Unicode value 1841 (e.g. 002B). 1842 """ 1843 if not self.config.validcharsmap: 1844 return True 1845 1846 invalid1 = str1.translate(self.config.validcharsmap) 1847 invalid2 = str2.translate(self.config.validcharsmap) 1848 invalidchars = [ 1849 f"'{invalidchar}' (\\u{ord(invalidchar):04x})" 1850 for invalidchar in invalid2 1851 if invalidchar not in invalid1 1852 ] 1853 1854 if invalidchars: 1855 raise FilterFailure("Invalid characters: %s" % (", ".join(invalidchars))) 1856 1857 return True 1858 1859 @functional 1860 def filepaths(self, str1, str2): 1861 """Checks that file paths have not been translated. 1862 1863 Checks that paths such as ``/home/user1`` have not been translated. 1864 Generally you do not translate a file path, unless it is being used as 1865 an example, e.g. ``your_user_name/path/to/filename.conf``. 1866 """ 1867 for word1 in self.filteraccelerators(self.filterxml(str1)).split(): 1868 if word1.startswith("/"): 1869 if not helpers.countsmatch(str1, str2, (word1,)): 1870 raise FilterFailure("Different file paths") 1871 1872 return True 1873 1874 @critical 1875 def xmltags(self, str1, str2): 1876 """Checks that XML/HTML tags have not been translated. 1877 1878 This check finds the number of tags in the source string and checks 1879 that the same number are in the translation. If the counts don't match 1880 then either the tag is missing or it was mistakenly translated by the 1881 translator, both of which are errors. 1882 1883 The check ignores tags or things that look like tags that cover the 1884 whole string e.g. ``<Error>`` but will produce false positives for 1885 things like ``An <Error> occurred`` as here ``Error`` should be 1886 translated. It also will allow translation of the *alt* attribute in 1887 e.g. ``<img src="bob.png" alt="Image description">`` or similar 1888 translatable attributes in OpenOffice.org help files. 1889 """ 1890 tags1 = tag_re.findall(str1) 1891 1892 if len(tags1) > 0: 1893 if (len(tags1[0]) == len(str1)) and "=" not in tags1[0]: 1894 return True 1895 1896 tags2 = tag_re.findall(str2) 1897 properties1 = tagproperties(tags1, self.config.ignoretags) 1898 properties2 = tagproperties(tags2, self.config.ignoretags) 1899 1900 filtered1 = [] 1901 filtered2 = [] 1902 1903 for property1 in properties1: 1904 filtered1 += [intuplelist(property1, self.config.canchangetags)] 1905 1906 for property2 in properties2: 1907 filtered2 += [intuplelist(property2, self.config.canchangetags)] 1908 1909 # TODO: consider the consequences of different ordering of 1910 # attributes/tags 1911 if filtered1 != filtered2: 1912 raise FilterFailure("Different XML tags") 1913 else: 1914 # No tags in str1, let's just check that none were added in str2. 1915 # This might be useful for fuzzy strings wrongly unfuzzied. 1916 tags2 = tag_re.findall(str2) 1917 1918 if len(tags2) > 0: 1919 raise FilterFailure("Added XML tags") 1920 1921 return True 1922 1923 @functional 1924 def kdecomments(self, str1, str2): 1925 """Checks to ensure that no KDE style comments appear in the 1926 translation. 1927 1928 KDE style translator comments appear in PO files as 1929 ``"_: comment\\n"``. New translators often translate the comment. This 1930 test tries to identify instances where the comment has been translated. 1931 """ 1932 return str2.find("\n_:") == -1 and not str2.startswith("_:") 1933 1934 @extraction 1935 def compendiumconflicts(self, str1, str2): 1936 """Checks for Gettext compendium conflicts (#-#-#-#-#). 1937 1938 When you use msgcat to create a PO compendium it will insert 1939 ``#-#-#-#-#`` into entries that are not consistent. If the compendium 1940 is used later in a message merge then these conflicts will appear in 1941 your translations. This test quickly extracts those for correction. 1942 """ 1943 return str2.find("#-#-#-#-#") == -1 1944 1945 @cosmetic 1946 def simpleplurals(self, str1, str2): 1947 """Checks for English style plural(s) for you to review. 1948 1949 This test will extract any message that contains words with a final 1950 "(s)" in the source text. You can then inspect the message, to check 1951 that the correct plural form has been used for your language. In some 1952 languages, plurals are made by adding text at the beginning of words, 1953 making the English style messy. In this case, they often revert to the 1954 plural form. This test allows an editor to check that the plurals used 1955 are correct. Be aware that this test may create a number of false 1956 positives. 1957 1958 For languages with no plural forms (only one noun form) this test will 1959 simply test that nothing like "(s)" was used in the translation. 1960 """ 1961 1962 def numberofpatterns(string, patterns): 1963 number = 0 1964 1965 for pattern in patterns: 1966 number += len(re.findall(pattern, string)) 1967 1968 return number 1969 1970 sourcepatterns = [r"\(s\)"] 1971 targetpatterns = [r"\(s\)"] 1972 sourcecount = numberofpatterns(str1, sourcepatterns) 1973 targetcount = numberofpatterns(str2, targetpatterns) 1974 1975 if self.config.lang.nplurals == 1: 1976 if targetcount: 1977 raise FilterFailure("Plural(s) were kept in translation") 1978 else: 1979 return True 1980 1981 if sourcecount == targetcount: 1982 return True 1983 else: 1984 raise FilterFailure("The original uses plural(s)") 1985 1986 @functional 1987 def spellcheck(self, str1, str2): 1988 """Checks words that don't pass a spell check. 1989 1990 This test will check for misspelled words in your translation. The test 1991 first checks for misspelled words in the original (usually English) 1992 text, and adds those to an exclusion list. The advantage of this 1993 exclusion is that many words that are specific to the application will 1994 not raise errors e.g. program names, brand names, function names. 1995 1996 The checker works with `PyEnchant 1997 <http://pythonhosted.org/pyenchant/>`_. You need to have PyEnchant 1998 installed as well as a dictionary for your language (for example, one 1999 of the `Hunspell <https://wiki.openoffice.org/wiki/Dictionaries>`_ or 2000 `aspell <http://ftp.gnu.org/gnu/aspell/dict/>`_ dictionaries). This 2001 test will only work if you have specified the ``--language`` option. 2002 2003 The pofilter error that is created, lists the misspelled word, plus 2004 suggestions returned from the spell checker. That makes it easy for you 2005 to identify the word and select a replacement. 2006 """ 2007 if not self.config.targetlanguage: 2008 return True 2009 2010 if not spelling.available: 2011 return True 2012 2013 # TODO: filterxml? 2014 str1 = self.filteraccelerators_by_list( 2015 self.removevariables(str1), self.config.sourcelang.validaccel 2016 ) 2017 str2 = self.filteraccelerators_by_list( 2018 self.removevariables(str2), self.config.lang.validaccel 2019 ) 2020 errors = set() 2021 2022 # We cache spelling results of source texts: 2023 ignore1 = set(spelling.simple_check(str1, lang=self.config.sourcelang.code)) 2024 2025 # We cache spelling results of target texts sentence-by-sentence. This 2026 # way we can reuse most of the results while someone is typing a long 2027 # segment in Virtaal. 2028 sentences2 = self.config.lang.sentences(str2) 2029 for sentence in sentences2: 2030 sentence_errors = spelling.simple_check( 2031 sentence, lang=self.config.targetlanguage 2032 ) 2033 errors.update(sentence_errors) 2034 2035 errors.difference_update(ignore1, self.config.notranslatewords) 2036 2037 if errors: 2038 messages = ["Check the spelling of: %s" % ", ".join(errors)] 2039 raise FilterFailure(messages) 2040 2041 return True 2042 2043 @extraction 2044 def credits(self, str1, str2): 2045 """Checks for messages containing translation credits instead of 2046 normal translations. 2047 2048 Some projects have consistent ways of giving credit to translators by 2049 having a unit or two where translators can fill in their name and 2050 possibly their contact details. This test allows you to find these 2051 units easily to check that they are completed correctly and also 2052 disables other tests that might incorrectly get triggered for these 2053 units (such as urls, emails, etc.) 2054 """ 2055 if str1 in self.config.credit_sources: 2056 raise FilterFailure("Don't translate. Just credit the translators.") 2057 else: 2058 return True 2059 2060 # If the precondition filter is run and fails then the other tests listed are ignored 2061 preconditions = { 2062 "untranslated": ( 2063 "simplecaps", 2064 "variables", 2065 "startcaps", 2066 "accelerators", 2067 "brackets", 2068 "endpunc", 2069 "acronyms", 2070 "xmltags", 2071 "startpunc", 2072 "endwhitespace", 2073 "startwhitespace", 2074 "escapes", 2075 "doublequoting", 2076 "singlequoting", 2077 "filepaths", 2078 "purepunc", 2079 "doublespacing", 2080 "sentencecount", 2081 "numbers", 2082 "isfuzzy", 2083 "isreview", 2084 "notranslatewords", 2085 "musttranslatewords", 2086 "emails", 2087 "simpleplurals", 2088 "urls", 2089 "printf", 2090 "pythonbraceformat", 2091 "tabs", 2092 "newlines", 2093 "functions", 2094 "options", 2095 "blank", 2096 "nplurals", 2097 "gconf", 2098 "dialogsizes", 2099 "validxml", 2100 ), 2101 "blank": ( 2102 "simplecaps", 2103 "variables", 2104 "startcaps", 2105 "accelerators", 2106 "brackets", 2107 "endpunc", 2108 "acronyms", 2109 "xmltags", 2110 "startpunc", 2111 "endwhitespace", 2112 "startwhitespace", 2113 "escapes", 2114 "doublequoting", 2115 "singlequoting", 2116 "filepaths", 2117 "purepunc", 2118 "doublespacing", 2119 "sentencecount", 2120 "numbers", 2121 "isfuzzy", 2122 "isreview", 2123 "notranslatewords", 2124 "musttranslatewords", 2125 "emails", 2126 "simpleplurals", 2127 "urls", 2128 "printf", 2129 "pythonbraceformat", 2130 "tabs", 2131 "newlines", 2132 "functions", 2133 "options", 2134 "gconf", 2135 "dialogsizes", 2136 "validxml", 2137 ), 2138 "credits": ( 2139 "simplecaps", 2140 "variables", 2141 "startcaps", 2142 "accelerators", 2143 "brackets", 2144 "endpunc", 2145 "acronyms", 2146 "xmltags", 2147 "startpunc", 2148 "escapes", 2149 "doublequoting", 2150 "singlequoting", 2151 "filepaths", 2152 "doublespacing", 2153 "sentencecount", 2154 "numbers", 2155 "emails", 2156 "simpleplurals", 2157 "urls", 2158 "printf", 2159 "pythonbraceformat", 2160 "tabs", 2161 "newlines", 2162 "functions", 2163 "options", 2164 "validxml", 2165 ), 2166 "purepunc": ("startcaps", "options"), 2167 # This is causing some problems since Python 2.6, as 2168 # startcaps is now seen as an important one to always execute 2169 # and could now be done before it is blocked by a failing 2170 # "untranslated" or "blank" test. This is probably happening 2171 # due to slightly different implementation of the internal 2172 # dict handling since Python 2.6. We should never have relied 2173 # on this ordering anyway. 2174 # "startcaps": ("simplecaps",), 2175 "endwhitespace": ("endpunc",), 2176 "startwhitespace": ("startpunc",), 2177 "unchanged": ("doublewords",), 2178 "compendiumconflicts": ( 2179 "accelerators", 2180 "brackets", 2181 "escapes", 2182 "numbers", 2183 "startpunc", 2184 "long", 2185 "variables", 2186 "startcaps", 2187 "sentencecount", 2188 "simplecaps", 2189 "doublespacing", 2190 "endpunc", 2191 "xmltags", 2192 "startwhitespace", 2193 "endwhitespace", 2194 "singlequoting", 2195 "doublequoting", 2196 "filepaths", 2197 "purepunc", 2198 "doublewords", 2199 "printf", 2200 "newlines", 2201 "validxml", 2202 ), 2203 } 2204 2205 2206# code to actually run the tests (use unittest?) 2207 2208 2209openofficeconfig = CheckerConfig( 2210 accelmarkers=["~"], 2211 varmatches=[ 2212 ("&", ";"), 2213 ("%", "%"), 2214 ("%", None), 2215 ("%", 0), 2216 ("$(", ")"), 2217 ("$", "$"), 2218 ("${", "}"), 2219 ("#", "#"), 2220 ("#", 1), 2221 ("#", 0), 2222 ("($", ")"), 2223 ("$[", "]"), 2224 ("[", "]"), 2225 ("@", "@"), 2226 ("$", None), 2227 ], 2228 ignoretags=[ 2229 ("alt", "xml-lang", None), 2230 ("ahelp", "visibility", "visible"), 2231 ("img", "width", None), 2232 ("img", "height", None), 2233 ], 2234 canchangetags=[("link", "name", None)], 2235) 2236 2237 2238class OpenOfficeChecker(StandardChecker): 2239 def __init__(self, **kwargs): 2240 checkerconfig = kwargs.get("checkerconfig", None) 2241 2242 if checkerconfig is None: 2243 checkerconfig = CheckerConfig() 2244 kwargs["checkerconfig"] = checkerconfig 2245 2246 checkerconfig.update(openofficeconfig) 2247 super().__init__(**kwargs) 2248 2249 2250libreofficeconfig = CheckerConfig( 2251 accelmarkers=["~"], 2252 varmatches=[ 2253 ("&", ";"), 2254 ("%", "%"), 2255 ("%", None), 2256 ("%", 0), 2257 ("$(", ")"), 2258 ("$", "$"), 2259 ("${", "}"), 2260 ("#", "#"), 2261 ("#", 1), 2262 ("#", 0), 2263 ("($", ")"), 2264 ("$[", "]"), 2265 ("[", "]"), 2266 ("@", "@"), 2267 ("$", None), 2268 ], 2269 ignoretags=[ 2270 ("alt", "xml-lang", None), 2271 ("ahelp", "visibility", "visible"), 2272 ("img", "width", None), 2273 ("img", "height", None), 2274 ], 2275 canchangetags=[("link", "name", None)], 2276) 2277 2278 2279class LibreOfficeChecker(StandardChecker): 2280 def __init__(self, **kwargs): 2281 checkerconfig = kwargs.get("checkerconfig", None) 2282 2283 if checkerconfig is None: 2284 checkerconfig = CheckerConfig() 2285 kwargs["checkerconfig"] = checkerconfig 2286 2287 checkerconfig.update(libreofficeconfig) 2288 checkerconfig.update(openofficeconfig) 2289 super().__init__(**kwargs) 2290 2291 @critical 2292 def validxml(self, str1, str2): 2293 """Check that all XML/HTML open/close tags has close/open pair in the 2294 translation. 2295 """ 2296 for location in self.locations: 2297 if location.endswith(".xrm") or location.endswith(".xhp"): 2298 opentags = [] 2299 match = re.search(lo_tag_re, str2) 2300 while match: 2301 acttag = match.group(0) 2302 if acttag.startswith("</"): 2303 if match.group("tag") in lo_emptytags: 2304 raise FilterFailure( 2305 "»%s« should be self-closing/empty" % acttag 2306 ) 2307 if len(opentags) == 0: 2308 raise FilterFailure( 2309 "There is no open tag for »%s«" % acttag 2310 ) 2311 opentag = opentags.pop() 2312 if tagname(acttag) != "/" + tagname(opentag): 2313 raise FilterFailure( 2314 "Open tag »%s« and close tag »%s« " 2315 "don't match" % (opentag, acttag) 2316 ) 2317 elif acttag.endswith("/>"): 2318 if match.group("tag") not in lo_emptytags: 2319 raise FilterFailure( 2320 "»%s« should not be self-closing/empty" % acttag 2321 ) 2322 else: 2323 opentags.append(acttag) 2324 str2 = str2[match.end(0) :] 2325 match = re.search(lo_tag_re, str2) 2326 if len(opentags) != 0: 2327 raise FilterFailure( 2328 "There is no close tag for »%s«" % opentags.pop() 2329 ) 2330 return True 2331 2332 @critical 2333 def pythonbraceformat(self, str1, str2): 2334 """Not used in LibreOffice""" 2335 return True 2336 2337 2338mozillaconfig = CheckerConfig( 2339 accelmarkers=["&"], 2340 varmatches=[ 2341 ("&", ";"), 2342 ("%", "%"), 2343 ("%", 1), 2344 ("$", "$"), 2345 ("$", None), 2346 ("#", 1), 2347 ("${", "}"), 2348 ("$(^", ")"), 2349 ("{{", "}}"), 2350 ], 2351 criticaltests=["accelerators"], 2352) 2353 2354 2355class MozillaChecker(StandardChecker): 2356 accelerators_skipped_scripts = [ 2357 "Deva", 2358 "Beng", 2359 "Tibt", 2360 "Orya", 2361 "Gujr", 2362 "Khmr", 2363 "Knda", 2364 "Laoo", 2365 "Mlym", 2366 "Mymr", 2367 "Sind", 2368 "Taml", 2369 "assamese", 2370 "perso-arabic", 2371 "mon", 2372 "chinese", 2373 ] 2374 2375 def __init__(self, **kwargs): 2376 checkerconfig = kwargs.get("checkerconfig", None) 2377 2378 if checkerconfig is None: 2379 checkerconfig = CheckerConfig() 2380 kwargs["checkerconfig"] = checkerconfig 2381 2382 checkerconfig.update(mozillaconfig) 2383 super().__init__(**kwargs) 2384 2385 @extraction 2386 def credits(self, str1, str2): 2387 """Checks for messages containing translation credits instead of 2388 normal translations. 2389 2390 Some projects have consistent ways of giving credit to translators by 2391 having a unit or two where translators can fill in their name and 2392 possibly their contact details. This test allows you to find these 2393 units easily to check that they are completed correctly and also 2394 disables other tests that might incorrectly get triggered for these 2395 units (such as urls, emails, etc.) 2396 """ 2397 for location in self.locations: 2398 if location in ["MOZ_LANGPACK_CONTRIBUTORS", "credit.translation"]: 2399 raise FilterFailure("Don't translate. Just credit the translators.") 2400 2401 return True 2402 2403 mozilla_dialog_re = re.compile( 2404 r"""( # option pair "key: value;" 2405 (?P<key>[-a-z]+) # key 2406 :\s+ # seperator 2407 (?P<number>\d+(?:[.]\d+)?) # number 2408 (?P<unit>[a-z][a-z]);? # units 2409 )+ # multiple pairs 2410 """, 2411 re.VERBOSE, 2412 ) 2413 mozilla_dialog_valid_units = ["em", "px", "ch"] 2414 2415 @critical 2416 def dialogsizes(self, str1, str2): 2417 """Checks that dialog sizes are not translated. 2418 2419 This is a Mozilla specific test. Mozilla uses a language called XUL to 2420 define dialogues and screens. This can make use of CSS to specify 2421 properties of the dialogue. These properties include things such as the 2422 width and height of the box. The size might need to be changed if the 2423 dialogue size changes due to longer translations. Thus translators can 2424 change these settings. But you are only meant to change the number not 2425 translate the words 'width' or 'height'. This check capture instances 2426 where these are translated. It will also catch other types of errors in 2427 these units. 2428 """ 2429 # Example: "width: 635px; height: 400px;" 2430 if "width" in str1 or "height" in str1: 2431 str1pairs = self.mozilla_dialog_re.findall(str1) 2432 2433 if str1pairs: 2434 str2pairs = self.mozilla_dialog_re.findall(str2) 2435 2436 if len(str1pairs) != len(str2pairs): 2437 raise FilterFailure("A dialog pair is missing") 2438 2439 for i, pair1 in enumerate(str1pairs): 2440 pair2 = str2pairs[i] 2441 2442 if pair1[0] != pair2[0]: # Only check pairs that differ 2443 if len(pair2) != 4: 2444 raise FilterFailure("A part of the dialog pair is missing") 2445 2446 if pair1[1] not in pair2: # key 2447 raise FilterFailure( 2448 "Do not translate the key '%s'" % pair1[1] 2449 ) 2450 2451 # FIXME we could check more carefully for numbers in pair1[2] 2452 if pair2[3] not in self.mozilla_dialog_valid_units: 2453 raise FilterFailure( 2454 "Units should be one of '%s'. " 2455 "The source string uses '%s'" 2456 % (", ".join(self.mozilla_dialog_valid_units), pair1[3]) 2457 ) 2458 2459 return True 2460 2461 @functional 2462 def numbers(self, str1, str2): 2463 """Checks that numbers are not translated. 2464 2465 Special handling for Mozilla to ignore entries that are dialog sizes. 2466 """ 2467 if self.mozilla_dialog_re.findall(str1): 2468 return True 2469 2470 return super().numbers(str1, str2) 2471 2472 @functional 2473 def unchanged(self, str1, str2): 2474 """Checks whether a translation is basically identical to the original 2475 string. 2476 2477 Special handling for Mozilla to ignore entries that are dialog sizes. 2478 """ 2479 if ( 2480 self.mozilla_dialog_re.findall(str1) 2481 or str1.strip().lstrip("0123456789") in self.mozilla_dialog_valid_units 2482 ): 2483 return True 2484 2485 return super().unchanged(str1, str2) 2486 2487 @cosmetic 2488 def accelerators(self, str1, str2): 2489 """Checks whether accelerators are consistent between the 2490 two strings. 2491 2492 For Mozilla we lower the severity to cosmetic, and for some languages 2493 it also ensures accelerators are absent in the target string since some 2494 languages do not use accelerators, for example Indic languages. 2495 """ 2496 # Mozilla's specific no-accelerators behavior. 2497 if self.config.language_script in self.accelerators_skipped_scripts: 2498 str2 = self.filtervariables(str2) 2499 messages = [] 2500 2501 for accelmarker in self.config.accelmarkers: 2502 counter2 = decoration.countaccelerators( 2503 accelmarker, 2504 self.config.lang.validaccel, 2505 ) 2506 if counter2(str2)[0] > 0: 2507 messages.append( 2508 "Accelerator '%s' should not appear in " 2509 "translation" % accelmarker 2510 ) 2511 2512 if messages: 2513 raise FilterFailure(messages) 2514 2515 return True 2516 2517 # Default accelerators behavior. 2518 return super().accelerators(str1, str2) 2519 2520 2521drupalconfig = CheckerConfig( 2522 varmatches=[("%", None), ("@", None), ("!", None)], 2523) 2524 2525 2526class DrupalChecker(StandardChecker): 2527 def __init__(self, **kwargs): 2528 checkerconfig = kwargs.get("checkerconfig", None) 2529 2530 if checkerconfig is None: 2531 checkerconfig = CheckerConfig() 2532 kwargs["checkerconfig"] = checkerconfig 2533 2534 checkerconfig.update(drupalconfig) 2535 super().__init__(**kwargs) 2536 2537 2538gnomeconfig = CheckerConfig( 2539 accelmarkers=["_"], 2540 varmatches=[("%", 1), ("$(", ")")], 2541 credit_sources=["translator-credits"], 2542) 2543 2544 2545class GnomeChecker(StandardChecker): 2546 def __init__(self, **kwargs): 2547 checkerconfig = kwargs.get("checkerconfig", None) 2548 2549 if checkerconfig is None: 2550 checkerconfig = CheckerConfig() 2551 kwargs["checkerconfig"] = checkerconfig 2552 2553 checkerconfig.update(gnomeconfig) 2554 super().__init__(**kwargs) 2555 2556 @functional 2557 def gconf(self, str1, str2): 2558 """Checks if we have any gconf config settings translated. 2559 2560 Gconf settings should not be translated so this check checks that gconf 2561 settings such as "name" or "modification_date" are not translated in 2562 the translation. It allows you to change the surrounding quotes but 2563 will ensure that the setting values remain untranslated. 2564 """ 2565 for location in self.locations: 2566 if ( 2567 location.find("schemas.in") != -1 2568 or location.find("gschema.xml.in") != -1 2569 ): 2570 gconf_attributes = gconf_attribute_re.findall(str1) 2571 # stopwords = [word for word in words1 if word in self.config.notranslatewords and word not in words2] 2572 stopwords = [ 2573 word for word in gconf_attributes if word[1:-1] not in str2 2574 ] 2575 2576 if stopwords: 2577 raise FilterFailure( 2578 "Do not translate GConf attributes: %s" % (", ".join(stopwords)) 2579 ) 2580 2581 return True 2582 2583 return True 2584 2585 2586kdeconfig = CheckerConfig( 2587 accelmarkers=["&"], 2588 varmatches=[("%", 1)], 2589 credit_sources=["Your names", "Your emails", "ROLES_OF_TRANSLATORS"], 2590) 2591 2592 2593class KdeChecker(StandardChecker): 2594 def __init__(self, **kwargs): 2595 # TODO allow setup of KDE plural and translator comments so that they do 2596 # not create false postives 2597 checkerconfig = kwargs.get("checkerconfig", None) 2598 2599 if checkerconfig is None: 2600 checkerconfig = CheckerConfig() 2601 kwargs["checkerconfig"] = checkerconfig 2602 2603 checkerconfig.update(kdeconfig) 2604 super().__init__(**kwargs) 2605 2606 2607cclicenseconfig = CheckerConfig(varmatches=[("@", "@")]) 2608 2609 2610class CCLicenseChecker(StandardChecker): 2611 def __init__(self, **kwargs): 2612 checkerconfig = kwargs.get("checkerconfig", None) 2613 2614 if checkerconfig is None: 2615 checkerconfig = CheckerConfig() 2616 kwargs["checkerconfig"] = checkerconfig 2617 2618 checkerconfig.update(cclicenseconfig) 2619 super().__init__(**kwargs) 2620 2621 2622minimalconfig = CheckerConfig() 2623 2624 2625class MinimalChecker(StandardChecker): 2626 def __init__(self, **kwargs): 2627 checkerconfig = kwargs.get("checkerconfig", None) 2628 2629 if checkerconfig is None: 2630 checkerconfig = CheckerConfig() 2631 kwargs["checkerconfig"] = checkerconfig 2632 2633 limitfilters = kwargs.get("limitfilters", None) 2634 2635 if limitfilters is None: 2636 limitfilters = ["untranslated", "unchanged", "blank"] 2637 kwargs["limitfilters"] = limitfilters 2638 2639 checkerconfig.update(minimalconfig) 2640 super().__init__(**kwargs) 2641 2642 2643reducedconfig = CheckerConfig() 2644 2645 2646class ReducedChecker(StandardChecker): 2647 def __init__(self, **kwargs): 2648 checkerconfig = kwargs.get("checkerconfig", None) 2649 2650 if checkerconfig is None: 2651 checkerconfig = CheckerConfig() 2652 kwargs["checkerconfig"] = checkerconfig 2653 2654 limitfilters = kwargs.get("limitfilters", None) 2655 2656 if limitfilters is None: 2657 limitfilters = [ 2658 "untranslated", 2659 "unchanged", 2660 "blank", 2661 "doublespacing", 2662 "doublewords", 2663 "spellcheck", 2664 ] 2665 kwargs["limitfilters"] = limitfilters 2666 2667 checkerconfig.update(minimalconfig) 2668 super().__init__(**kwargs) 2669 2670 2671termconfig = CheckerConfig() 2672 2673 2674class TermChecker(StandardChecker): 2675 def __init__(self, **kwargs): 2676 checkerconfig = kwargs.get("checkerconfig", None) 2677 2678 if checkerconfig is None: 2679 checkerconfig = CheckerConfig() 2680 kwargs["checkerconfig"] = checkerconfig 2681 2682 checkerconfig.update(termconfig) 2683 super().__init__(**kwargs) 2684 2685 2686class L20nChecker(MozillaChecker): 2687 excluded_filters_for_complex_units = [ 2688 "escapes", 2689 "newlines", 2690 "tabs", 2691 "singlequoting", 2692 "doublequoting", 2693 "doublespacing", 2694 "brackets", 2695 "pythonbraceformat", 2696 "sentencecount", 2697 "variables", 2698 ] 2699 complex_unit_pattern = "->" 2700 2701 def __init__(self, **kwargs): 2702 checkerconfig = kwargs.get("checkerconfig", None) 2703 2704 if checkerconfig is None: 2705 checkerconfig = CheckerConfig() 2706 kwargs["checkerconfig"] = checkerconfig 2707 2708 super().__init__(**kwargs) 2709 2710 def run_filters(self, unit, categorised=False): 2711 is_unit_complex = ( 2712 self.complex_unit_pattern in unit.source 2713 or self.complex_unit_pattern in unit.target 2714 ) 2715 2716 saved_default_filters = {} 2717 if is_unit_complex: 2718 saved_default_filters = self.defaultfilters 2719 self.defaultfilters = { 2720 key: value 2721 for (key, value) in self.defaultfilters.items() 2722 if key not in self.excluded_filters_for_complex_units 2723 } 2724 2725 result = super().run_filters(unit, categorised=categorised) 2726 2727 if is_unit_complex: 2728 self.defaultfilters = saved_default_filters 2729 2730 return result 2731 2732 2733iosconfig = CheckerConfig( 2734 varmatches=[("$(", ")"), ("%", "@")], 2735) 2736 2737 2738class IOSChecker(StandardChecker): 2739 def __init__(self, **kwargs): 2740 checkerconfig = kwargs.get("checkerconfig", None) 2741 2742 if checkerconfig is None: 2743 checkerconfig = CheckerConfig() 2744 kwargs["checkerconfig"] = checkerconfig 2745 2746 checkerconfig.update(iosconfig) 2747 super().__init__(**kwargs) 2748 2749 2750projectcheckers = { 2751 "minimal": MinimalChecker, 2752 "standard": StandardChecker, 2753 "reduced": ReducedChecker, 2754 "openoffice": OpenOfficeChecker, 2755 "libreoffice": LibreOfficeChecker, 2756 "mozilla": MozillaChecker, 2757 "kde": KdeChecker, 2758 "wx": KdeChecker, 2759 "gnome": GnomeChecker, 2760 "creativecommons": CCLicenseChecker, 2761 "drupal": DrupalChecker, 2762 "terminology": TermChecker, 2763 "ios": IOSChecker, 2764} 2765 2766 2767class StandardUnitChecker(UnitChecker): 2768 """The standard checks for common checks on translation units.""" 2769 2770 @extraction 2771 def isfuzzy(self, unit): 2772 """Check if the unit has been marked fuzzy. 2773 2774 If a message is marked fuzzy in the PO file then it is extracted. 2775 Note this is different from ``--fuzzy`` and ``--nofuzzy`` options which 2776 specify whether tests should be performed against messages marked 2777 fuzzy. 2778 """ 2779 return not unit.isfuzzy() 2780 2781 @extraction 2782 def isreview(self, unit): 2783 """Check if the unit has been marked review. 2784 2785 If you have made use of the 'review' flags in your translations:: 2786 2787 # (review) reason for review 2788 # (pofilter) testname: explanation for translator 2789 2790 Then if a message is marked for review in the PO file it will be 2791 extracted. Note this is different from ``--review`` and ``--noreview`` 2792 options which specify whether tests should be performed against 2793 messages already marked as under review. 2794 """ 2795 return not unit.isreview() 2796 2797 @critical 2798 def nplurals(self, unit): 2799 """Checks for the correct number of noun forms for plural translations. 2800 2801 This uses the plural information in the language module of the 2802 Translate Toolkit. This is the same as the Gettext nplural value. It 2803 will check that the number of plurals required is the same as the 2804 number supplied in your translation. 2805 """ 2806 if unit.hasplural(): 2807 # if we don't have a valid nplurals value, don't run the test 2808 nplurals = self.config.lang.nplurals 2809 2810 if nplurals > 0: 2811 return len(list(filter(None, unit.target.strings))) == nplurals 2812 2813 return True 2814 2815 @extraction 2816 def hassuggestion(self, unit): 2817 """Checks if there is at least one suggested translation for this unit. 2818 2819 If a message has a suggestion (an alternate translation stored in 2820 alt-trans units in XLIFF and .pending files in PO) then these will be 2821 extracted. This is used by Pootle and is probably only useful in 2822 pofilter when using XLIFF files. 2823 """ 2824 self.suggestion_store = getattr(self, "suggestion_store", None) 2825 suggestions = [] 2826 2827 if self.suggestion_store: 2828 suggestions = self.suggestion_store.findunits(unit.source) 2829 elif getattr(unit, "getalttrans", None): 2830 # TODO: we probably want to filter them somehow 2831 suggestions = unit.getalttrans() 2832 2833 return not bool(suggestions) 2834 2835 2836def runtests(str1, str2, ignorelist=()): 2837 """Verifies that the tests pass for a pair of strings.""" 2838 from translate.storage import base 2839 2840 str1 = data.normalize(str1) 2841 str2 = data.normalize(str2) 2842 unit = base.TranslationUnit(str1) 2843 unit.target = str2 2844 checker = StandardChecker(excludefilters=ignorelist) 2845 failures = checker.run_filters(unit) 2846 2847 for test in failures: 2848 print( 2849 "failure: %s: %s\n %r\n %r" 2850 % (test, failures[test]["message"], str1, str2) 2851 ) 2852 2853 return failures 2854 2855 2856def batchruntests(pairs): 2857 """Runs test on a batch of string pairs.""" 2858 passed, numpairs = 0, len(pairs) 2859 2860 for str1, str2 in pairs: 2861 if runtests(str1, str2): 2862 passed += 1 2863 2864 print("\ntotal: %d/%d pairs passed" % (passed, numpairs)) 2865 2866 2867if __name__ == "__main__": 2868 testset = [ 2869 (r"simple", r"somple"), 2870 (r"\this equals \that", r"does \this equal \that?"), 2871 (r"this \'equals\' that", r"this 'equals' that"), 2872 (r" start and end! they must match.", r"start and end! they must match."), 2873 ( 2874 r"check for matching %variables marked like %this", 2875 r"%this %variable is marked", 2876 ), 2877 ( 2878 r"check for mismatching %variables marked like %this", 2879 r"%that %variable is marked", 2880 ), 2881 (r"check for mismatching %variables% too", r"how many %variable% are marked"), 2882 (r"%% %%", r"%%"), 2883 (r"Row: %1, Column: %2", r"Mothalo: %1, Kholomo: %2"), 2884 (r"simple lowercase", r"it is all lowercase"), 2885 (r"simple lowercase", r"It Is All Lowercase"), 2886 (r"Simple First Letter Capitals", r"First Letters"), 2887 (r"SIMPLE CAPITALS", r"First Letters"), 2888 (r"SIMPLE CAPITALS", r"ALL CAPITALS"), 2889 (r"forgot to translate", r" "), 2890 ] 2891 batchruntests(testset) 2892