1#!/usr/bin/env python3
2#
3# Check gcc.pot file for stylistic issues as described in
4# https://gcc.gnu.org/onlinedocs/gccint/Guidelines-for-Diagnostics.html,
5# especially in gcc-internal-format messages.
6#
7# This file is part of GCC.
8#
9# GCC is free software; you can redistribute it and/or modify it under
10# the terms of the GNU General Public License as published by the Free
11# Software Foundation; either version 3, or (at your option) any later
12# version.
13#
14# GCC is distributed in the hope that it will be useful, but WITHOUT ANY
15# WARRANTY; without even the implied warranty of MERCHANTABILITY or
16# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
17# for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with GCC; see the file COPYING3.  If not see
21# <http://www.gnu.org/licenses/>.
22
23import argparse
24import re
25from collections import Counter
26from typing import Dict, Match
27
28import polib
29
# Counts how often each diagnostic text has been reported.  warn()
# increments it, and main() prints the entries that were seen more
# than once as the summary.
seen_warnings = Counter()
31
32
def location(msg: polib.POEntry):
    """
    Returns where the message comes from, for use as the prefix of a
    warning.

    The result is built from the first two items of the first entry
    in msg.occurrences, joined by a colon.  When the message lists
    no occurrences, the fixed text '<unknown location>' is returned.
    """
    if not msg.occurrences:
        return '<unknown location>'

    # Only the first occurrence is reported, even if there are more.
    first = msg.occurrences[0]
    return '{}:{}'.format(first[0], first[1])
38
39
def warn(msg: polib.POEntry,
         diagnostic_id: str, diagnostic: str, include_msgid=True):
    """
    Prints one warning for a message and counts it for the summary.

    The warning has the form "<location>: <diagnostic>".  Unless
    include_msgid is false, " in <repr of the msgid>" is appended.

    To suppress a warning for a particular message,
    add a line "#, gcclint:ignore:{diagnostic_id}" to the message.
    """

    suppression_flag = f'gcclint:ignore:{diagnostic_id}'
    if suppression_flag in msg.flags:
        return

    # The summary is keyed by the diagnostic text, not by its id.
    seen_warnings[diagnostic] += 1

    text = f'{location(msg)}: {diagnostic}'
    if include_msgid:
        text = f'{text} in {repr(msg.msgid)}'
    print(text)
57
def lint_gcc_internal_format(msg: polib.POEntry):
    """
    Checks a single message that has the gcc-internal-format. These
    messages use a variety of placeholders like %qs, %<quotes%> and
    %q#E.

    Each nested lint_* function implements one check on the message
    and reports its findings through warn(). All checks are run, in
    a fixed order, at the end of this function.
    """

    msgid: str = msg.msgid

    def outside_quotes(m: Match[str]):
        """
        Tells whether the match m starts outside of %<quotes%>, which
        is assumed when the text before the match contains as many
        %< as %>.
        """
        before = msgid[:m.start(0)]
        return before.count("%<") == before.count("%>")

    def lint_matching_placeholders():
        """
        Warns when literal values in placeholders are not exactly equal
        in the translation. This can happen when doing copy-and-paste
        translations of similar messages.

        To avoid these mismatches in the first place,
        structurally equal messages are found by
        lint_diagnostics_differing_only_in_placeholders.

        This check only applies when checking a finished translation
        such as de.po, not gcc.pot.
        """

        if not msg.translated():
            return

        in_msgid = re.findall('%<[^%]+%>', msgid)
        in_msgstr = re.findall('%<[^%]+%>', msg.msgstr)

        # Compared as sets, so neither the order nor the number of
        # repetitions of a quoted literal matters.
        if set(in_msgid) != set(in_msgstr):
            warn(msg,
                 'placeholder-mismatch',
                 f'placeholder mismatch: msgid has {in_msgid}, '
                 f'msgstr has {in_msgstr}',
                 include_msgid=False)

    def lint_option_outside_quotes():
        """
        Warns about whitespace-separated words outside of %<quotes%>
        that look like a command line option (a dash followed by a
        letter, with the exception of the literal -INF) or that start
        with __builtin_.
        """
        for match in re.finditer(r'\S+', msgid):
            if not outside_quotes(match):
                continue
            part = match.group()

            looks_like_option = (part.startswith('-')
                                 and len(part) >= 2
                                 and part[1].isalpha()
                                 and part != '-INF')
            if looks_like_option:
                warn(msg,
                     'option-outside-quotes',
                     'command line option outside %<quotes%>')

            if part.startswith('__builtin_'):
                warn(msg,
                     'builtin-outside-quotes',
                     'builtin function outside %<quotes%>')

    def lint_plain_apostrophe():
        """
        Warns about each apostrophe outside of %<quotes%> that is not
        directly preceded by a percent sign.
        """
        # The lookbehind keeps the match on the apostrophe itself.
        # This way outside_quotes() judges the position of the
        # apostrophe and not of the character before it, which matters
        # directly after %< or %>.  It also finds an apostrophe at the
        # very start of the message and each of two adjacent ones.
        for match in re.finditer(r"(?<!%)'", msgid):
            if outside_quotes(match):
                warn(msg, 'apostrophe', 'apostrophe without leading %')

    def lint_space_before_quote():
        """
        A missing space before %< is often the result of string
        literals that are joined by the C compiler and neither literal
        has a space to separate the words.

        Warns when %< directly follows a letter or digit. A %s
        placeholder directly before %< is not reported.
        """

        # The group captures the letter or digit before %< plus, if
        # there is one, the character before that, so that it can be
        # compared with '%s'.
        for match in re.finditer("(.?[a-zA-Z0-9])%<", msgid):
            if match.group(1) != '%s':
                warn(msg,
                     'no-space-before-quote',
                     '%< directly following a letter or digit')

    def lint_underscore_outside_quotes():
        """
        An underscore outside of quotes is used in several contexts,
        and many of them violate the GCC Guidelines for Diagnostics:

        * names of GCC-internal compiler functions
        * names of GCC-internal data structures
        * static_cast and the like (which are legitimate)

        At most one warning is issued per message.
        """

        for match in re.finditer("_", msgid):
            if outside_quotes(match):
                warn(msg,
                     'underscore-outside-quotes',
                     'underscore outside of %<quotes%>')
                return

    def lint_may_not():
        """
        The term "may not" may either mean "it could be the case"
        or "should not". These two different meanings are sometimes
        hard to tell apart.
        """

        if re.search(r'\bmay not\b', msgid):
            warn(msg,
                 'ambiguous-may-not',
                 'the term "may not" is ambiguous')

    def lint_unbalanced_quotes():
        """
        Warns when the number of %< differs from the number of %>,
        first in the msgid and then, for translated messages, in the
        msgstr. Both cases use the same diagnostic text.
        """
        if msgid.count("%<") != msgid.count("%>"):
            warn(msg,
                 'unbalanced-quotes',
                 'unbalanced %< and %> quotes')

        if msg.translated():
            msgstr = msg.msgstr
            if msgstr.count("%<") != msgstr.count("%>"):
                warn(msg,
                     'unbalanced-quotes',
                     'unbalanced %< and %> quotes')

    def lint_single_space_after_sentence():
        """
        After a sentence there should be two spaces.

        Warns when a period is followed by exactly one space and an
        uppercase ASCII letter.
        """

        if re.search(r'[.] [A-Z]', msgid):
            warn(msg,
                 'single-space-after-sentence',
                 'single space after sentence')

    def lint_non_canonical_quotes():
        """
        Catches %<%s%>, which can be written in the shorter form %qs.
        The same applies to %s in plain single quotes, in double
        quotes, or between a backtick and a single quote.

        Only the first such placeholder of a message is reported.
        """
        match = re.search("%<%s%>|'%s'|\"%s\"|`%s'", msgid)
        if match:
            warn(msg,
                 'non-canonical-quotes',
                 f'placeholder {match.group()} should be written as %qs')

    lint_option_outside_quotes()
    lint_plain_apostrophe()
    lint_space_before_quote()
    lint_underscore_outside_quotes()
    lint_may_not()
    lint_unbalanced_quotes()
    lint_matching_placeholders()
    lint_single_space_after_sentence()
    lint_non_canonical_quotes()
206
207
def lint_diagnostics_differing_only_in_placeholders(po: polib.POFile):
    """
    Detects messages that are structurally the same, except that they
    use different plain strings inside %<quotes%>. These messages can
    be merged in order to prevent copy-and-paste mistakes by the
    translators.

    Each message is normalized by replacing every %<literal%> with
    %qs. A message whose normalized form was already produced by an
    earlier message is reported together with that earlier message.

    Obsolete entries are skipped, like lint_file does for the
    per-message checks.

    See bug 90119.
    """

    # Maps each normalized msgid to the first message that produced it.
    seen: Dict[str, polib.POEntry] = {}

    for msg in po:
        if msg.obsolete:
            continue

        msgid = msg.msgid
        normalized = re.sub('%<[^%]+%>', '%qs', msgid)

        prev = seen.get(normalized)
        if prev is None:
            seen[normalized] = msg
            continue

        warn(msg,
             'same-pattern',
             f'same pattern for {repr(msgid)} and '
             f'{repr(prev.msgid)} in {location(prev)}',
             include_msgid=False)
236
237
def lint_file(po: polib.POFile):
    """
    Runs all checks on a parsed PO file.

    Every entry that is neither obsolete nor fuzzy and that carries
    the gcc-internal-format flag is checked on its own. Afterwards
    the file as a whole is searched for messages that differ only in
    their quoted literals.
    """
    for entry in po:
        if entry.obsolete or entry.fuzzy:
            continue
        if 'gcc-internal-format' not in entry.flags:
            continue
        lint_gcc_internal_format(entry)

    lint_diagnostics_differing_only_in_placeholders(po)
247
248
def main():
    """
    Command line entry point.

    Parses the single positional argument, loads that file with
    polib, lints it, and finally prints a summary of the diagnostic
    texts that were reported more than once, most frequent first.
    """
    arg_parser = argparse.ArgumentParser(description='')
    arg_parser.add_argument('file', help='pot file')
    options = arg_parser.parse_args()

    lint_file(polib.pofile(options.file))

    print()
    print('summary:')
    for diagnostic, count in seen_warnings.most_common():
        # Diagnostics that occurred only once are left out.
        if count > 1:
            print(f'{count}\t{diagnostic}')
263
264
# Run the linter only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == '__main__':
    main()
267