1"""
2Try to detect suspicious constructs, resembling markup
3that has leaked into the final output.
4
Suspicious lines are reported in a comma-separated file,
6``suspicious.csv``, located in the output directory.
7
8The file is utf-8 encoded, and each line contains four fields:
9
10 * document name (normalized)
11 * line number in the source document
12 * problematic text
13 * complete line showing the problematic text in context
14
It is common to find many false positives. To avoid reporting them
again and again, they may be added to the ``susp-ignored.csv`` file
(located in the parent of the directory containing this module). The
file has the same format as ``suspicious.csv`` with a few differences:
19
20  - each line defines a rule; if the rule matches, the issue
21    is ignored.
22  - line number may be empty (that is, nothing between the
23    commas: ",,"). In this case, line numbers are ignored (the
24    rule matches anywhere in the file).
25  - the last field does not have to be a complete line; some
26    surrounding text (never more than a line) is enough for
27    context.
28
29Rules are processed sequentially. A rule matches when:
30
31 * document names are the same
32 * problematic texts are the same
33 * line numbers are close to each other (5 lines up or down)
 * the rule text is completely contained in the source line
35
The simplest way to create the susp-ignored.csv file is by copying
undesired entries from suspicious.csv (possibly trimming the last
field).
39
40Copyright 2009 Gabriel A. Genellina
41
42"""
43
44import os
45import re
46import csv
47import sys
48
49from docutils import nodes
50from sphinx.builders import Builder
51import sphinx.util
52
# ``detect_all(text)`` returns an iterator of match objects, one for every
# fragment of ``text`` that looks like leaked reST markup.  It is the bound
# ``finditer`` method of the compiled pattern, so the pattern is compiled
# only once, at import time.  Note that the first alternative needs a
# character after the ``::`` (anything but ``=``), so a ``::`` at the very
# end of the text is not reported.
detect_all = re.compile(r'''
    ::(?=[^=])|            # two :: (but NOT ::=)
    :[a-zA-Z][a-zA-Z0-9]+| # :foo
    `|                     # ` (seldom used by itself)
    (?<!\.)\.\.[ \t]*\w+:  # .. foo: (but NOT ... else:)
    ''', re.UNICODE | re.VERBOSE).finditer

# True when running on Python 3 or later.  Used to choose between text-mode
# and bytes-mode handling of the CSV files and of the logged messages.
py3 = sys.version_info >= (3, 0)
61
62
class Rule:
    """One entry of the ignore list: a known false positive to suppress."""

    def __init__(self, docname, lineno, issue, line):
        """Record the four fields of an ignore rule; it starts out unused."""
        # Where the rule applies.
        self.docname = docname  # document to which this rule applies
        self.lineno = lineno    # line number in the original source; the
                                # rule only matches near it (None -> any line)
        # What the rule matches.
        self.issue = issue      # the markup fragment that triggered this rule
        self.line = line        # text of the container element (one line only)
        # Flipped to True by the builder once the rule suppresses an issue.
        self.used = False

    def __repr__(self):
        # Rendered like a row of the ignore file, with the line-number
        # field deliberately left empty.
        template = '{0.docname},,{0.issue},{0.line}'
        return template.format(self)
76
77
78
class dialect(csv.excel):
    """CSV dialect identical to ``csv.excel`` except for the row terminator.

    Rows end with a single linefeed.
    """
    # Override the terminator inherited from csv.excel.
    lineterminator = '\n'
82
83
class CheckSuspiciousMarkupBuilder(Builder):
    """
    Checks for possibly invalid markup that may leak into the output.

    Every issue that is not covered by an ignore rule is appended to
    ``suspicious.csv`` in the output directory, logged as a warning, and
    makes the application status code 1.
    """
    name = 'suspicious'
    logger = sphinx.util.logging.getLogger("CheckSuspiciousMarkupBuilder")

    def init(self):
        """Create an empty report file and load the ignore rules."""
        # create (or truncate) the output file
        self.log_file_name = os.path.join(self.outdir, 'suspicious.csv')
        open(self.log_file_name, 'w').close()
        # load database of previously ignored issues
        self.load_rules(os.path.join(os.path.dirname(__file__), '..',
                                     'susp-ignored.csv'))

    def get_outdated_docs(self):
        """Treat every document found by the environment as outdated."""
        return self.env.found_docs

    def get_target_uri(self, docname, typ=None):
        """Return an empty URI; this builder produces no linkable output."""
        return ''

    def prepare_writing(self, docnames):
        """Nothing to prepare before the documents are checked."""
        pass

    def write_doc(self, docname, doctree):
        """Walk ``doctree`` and report the suspicious markup it contains."""
        # set when any issue is encountered in this document
        self.any_issue = False
        self.docname = docname
        visitor = SuspiciousVisitor(doctree, self)
        doctree.walk(visitor)

    def finish(self):
        """Warn about ignore rules that never matched during this build."""
        unused_rules = [rule for rule in self.rules if not rule.used]
        if unused_rules:
            self.logger.warning(
                'Found %s/%s unused rules: %s' % (
                    len(unused_rules), len(self.rules),
                    # One rule per line; an empty separator would run the
                    # rules together and make them unreadable.
                    '\n'.join(repr(rule) for rule in unused_rules),
                )
            )

    def check_issue(self, line, lineno, issue):
        """Report ``issue`` unless one of the ignore rules covers it."""
        if not self.is_ignored(line, lineno, issue):
            self.report_issue(line, lineno, issue)

    def is_ignored(self, line, lineno, issue):
        """Determine whether this issue should be ignored.

        Rules are tried in order.  The first rule that matches is marked
        as used and makes the method return True; False is returned when
        no rule matches.
        """
        docname = self.docname
        for rule in self.rules:
            if rule.docname != docname:
                continue
            if rule.issue != issue:
                continue
            # The rule text does not have to equal the whole line: it is
            # enough for the rule fragment to be contained in the document
            # line.
            if rule.line not in line:
                continue
            # Check both line numbers. If they're "near"
            # this rule matches. (lineno=None means "don't care")
            if rule.lineno is not None and abs(rule.lineno - lineno) > 5:
                continue
            # if it came this far, the rule matched
            rule.used = True
            return True
        return False

    def report_issue(self, text, lineno, issue):
        """Record an issue in the report file and log it as a warning.

        Also sets the application status code to 1.
        """
        self.any_issue = True
        self.write_log_entry(lineno, issue, text)
        if py3:
            # Strip the context line here too, so that both branches log
            # the same text that write_log_entry() stores in the report.
            self.logger.warning('[%s:%d] "%s" found in "%-.120s"' %
                                (self.docname, lineno, issue, text.strip()))
        else:
            encoding = sys.getdefaultencoding()
            self.logger.warning(
                '[%s:%d] "%s" found in "%-.120s"' % (
                    self.docname.encode(encoding, 'replace'),
                    lineno,
                    issue.encode(encoding, 'replace'),
                    text.strip().encode(encoding, 'replace')))
        self.app.statuscode = 1

    def write_log_entry(self, lineno, issue, text):
        """Append one row (docname, lineno, issue, text) to the report."""
        if py3:
            # The report is documented as utf-8; newline='' lets the csv
            # writer emit the dialect's terminator untranslated.
            with open(self.log_file_name, 'a',
                      encoding='utf-8', newline='') as f:
                writer = csv.writer(f, dialect)
                writer.writerow([self.docname, lineno, issue, text.strip()])
        else:
            with open(self.log_file_name, 'ab') as f:
                writer = csv.writer(f, dialect)
                writer.writerow([self.docname.encode('utf-8'),
                                 lineno,
                                 issue.encode('utf-8'),
                                 text.strip().encode('utf-8')])

    def load_rules(self, filename):
        """Load database of previously ignored issues.

        A csv file, with exactly the same format as suspicious.csv
        Fields: document name (normalized), line number, issue, surrounding text

        A file that cannot be opened leaves the rule list empty.  Raises
        ValueError when a row does not have exactly four fields.
        """
        self.logger.info("loading ignore rules... ", nonl=1)
        self.rules = rules = []
        try:
            if py3:
                f = open(filename, 'r', encoding='utf-8', newline='')
            else:
                f = open(filename, 'rb')
        except IOError as err:
            # Best effort: run without ignore rules, but finish the log
            # line started above instead of leaving it dangling.
            self.logger.info('skipped (%s)' % err)
            return
        # The with-block closes the file even when a malformed row raises.
        with f:
            for i, row in enumerate(csv.reader(f)):
                if len(row) != 4:
                    raise ValueError(
                        "wrong format in %s, line %d: %s" % (filename, i+1, row))
                docname, lineno, issue, text = row
                # an empty line-number field means "match on any line"
                lineno = int(lineno) if lineno else None
                if not py3:
                    docname = docname.decode('utf-8')
                    issue = issue.decode('utf-8')
                    text = text.decode('utf-8')
                rules.append(Rule(docname, lineno, issue, text))
        self.logger.info('done, %d rules loaded' % len(self.rules))
214
215
def get_lineno(node):
    """Obtain line number information for a node.

    The search starts at the parent of ``node`` and climbs the ancestor
    chain; the node's own ``line`` attribute is not consulted.

    Returns the ``line`` of the nearest ancestor that has one, or None
    when the top of the tree is reached without finding any.
    """
    while node:
        node = node.parent
        # Stop at the root instead of dereferencing a missing parent.
        if not node:
            break
        lineno = node.line
        if lineno is not None:
            return lineno
    return None
223
224
def extract_line(text, index):
    r"""Return the single line of *text* that holds character *index*.

    *text* may be a multiline string.  The result never includes the
    newline characters that delimit the line; an index pointing at a
    newline selects the line which that newline terminates.

    >>> extract_line("abc\ndefgh\ni", 6)
    'defgh'
    >>> for i in (0, 2, 3, 4, 10):
    ...     print(extract_line("abc\ndefgh\ni", i))
    abc
    abc
    abc
    defgh
    i
    """
    # First character after the newline preceding ``index`` (0 if none).
    start = text.rfind('\n', 0, index) + 1
    # Position of the newline at or after ``index`` (-1 if none).
    end = text.find('\n', index)
    if end < 0:
        return text[start:]
    return text[start:end]
245
246
class SuspiciousVisitor(nodes.GenericNodeVisitor):
    """Doctree visitor that hands suspicious markup fragments to a builder."""

    # Highest line number used so far; reported line numbers never go
    # below it.
    lastlineno = 0

    def __init__(self, document, builder):
        nodes.GenericNodeVisitor.__init__(self, document)
        self.builder = builder

    def default_visit(self, node):
        """Scan text and image nodes; every other node is left alone."""
        # only direct text containers are of interest
        if not isinstance(node, (nodes.Text, nodes.image)):
            return
        content = node.astext()
        # lineno seems to go backwards sometimes (?), so never report a
        # line number lower than one already used
        current = max(get_lineno(node) or 0, self.lastlineno)
        self.lastlineno = current
        # (issue, line) pairs already handed to the builder, so the same
        # issue is not reported more than once per line
        reported = set()
        for found in detect_all(content):
            key = (found.group(), extract_line(content, found.start()))
            if key in reported:
                continue
            fragment, context = key
            self.builder.check_issue(context, current, fragment)
            reported.add(key)

    unknown_visit = default_visit

    def visit_document(self, node):
        """Reset the line-number tracking at the start of a document."""
        self.lastlineno = 0

    def visit_comment(self, node):
        """Skip comment nodes entirely."""
        # Comments produce too many false positives.  Skipping them could
        # miss some errors, though: there were two sections
        # "commented-out" by mistake in the Python docs that would not be
        # caught.
        raise nodes.SkipNode
279