1"""
2Try to detect suspicious constructs, resembling markup
3that has leaked into the final output.
4
5Suspicious lines are reported in a comma-separated-file,
6``suspicious.csv``, located in the output directory.
7
8The file is utf-8 encoded, and each line contains four fields:
9
10 * document name (normalized)
11 * line number in the source document
12 * problematic text
13 * complete line showing the problematic text in context
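
For example, a (purely hypothetical) entry could look like:

    library/functions,42,::,if written as dict:: this leaks markup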

It is common to find many false positives. To avoid reporting them
again and again, they may be added to the ``ignored.csv`` file
(located in the configuration directory). The file has the same
format as ``suspicious.csv`` with a few differences:

  - each line defines a rule; if the rule matches, the issue
    is ignored.
  - line number may be empty (that is, nothing between the
    commas: ",,"). In this case, line numbers are ignored (the
    rule matches anywhere in the file).
  - the last field does not have to be a complete line; some
    surrounding text (never more than a line) is enough for
    context.

Rules are processed sequentially. A rule matches when:

 * document names are the same
 * problematic texts are the same
 * line numbers are close to each other (5 lines up or down)
 * the rule text is completely contained in the source line

The simplest way to create the ignored.csv file is by copying
undesired entries from suspicious.csv (possibly trimming the last
field).
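
Continuing the hypothetical example above, the corresponding rule (line
number dropped, context trimmed) would be:

    library/functions,,::,dict::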

Copyright 2009 Gabriel A. Genellina

"""

import os
import re
import csv
import sys

from docutils import nodes
from sphinx.builders import Builder
import sphinx.util

detect_all = re.compile(r'''
    ::(?=[^=])|            # two :: (but NOT ::=)
    :[a-zA-Z][a-zA-Z0-9]+| # :foo
    `|                     # ` (seldom used by itself)
    (?<!\.)\.\.[ \t]*\w+:  # .. foo: (but NOT ... else:)
    ''', re.UNICODE | re.VERBOSE).finditer
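
# A rough illustration of what the pattern flags (a hedged example, not taken
# from a real run): on a line like "Use dict.update:: or .. note: here" the
# pattern above yields two matches, "::" and ".. note:"; a stray backquote or
# a ":meth"-style role prefix left in plain text would be flagged the same way.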

py3 = sys.version_info >= (3, 0)


class Rule:
    def __init__(self, docname, lineno, issue, line):
        """A rule for ignoring issues"""
        self.docname = docname # document to which this rule applies
        self.lineno = lineno   # line number in the original source;
                               # this rule matches only near that.
                               # None -> don't care
        self.issue = issue     # the markup fragment that triggered this rule
        self.line = line       # text of the container element (single line only)
        self.used = False

    def __repr__(self):
        return '{0.docname},,{0.issue},{0.line}'.format(self)
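
# For illustration (a hedged example, not taken from real data): a rule
# constructed as Rule('library/functions', None, '::', 'dict::') has the
# repr "library/functions,,::,dict::", i.e. the ignored.csv format with an
# empty line-number field.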


class dialect(csv.excel):
    """Our dialect: uses only linefeed as newline."""
    lineterminator = '\n'


class CheckSuspiciousMarkupBuilder(Builder):
    """
    Checks for possibly invalid markup that may leak into the output.
    """
    name = 'suspicious'
    logger = sphinx.util.logging.getLogger("CheckSuspiciousMarkupBuilder")

    def init(self):
        # create output file
        self.log_file_name = os.path.join(self.outdir, 'suspicious.csv')
        open(self.log_file_name, 'w').close()
        # load database of previously ignored issues
        self.load_rules(os.path.join(os.path.dirname(__file__), '..',
                                     'susp-ignored.csv'))

    def get_outdated_docs(self):
        return self.env.found_docs

    def get_target_uri(self, docname, typ=None):
        return ''

    def prepare_writing(self, docnames):
        pass

    def write_doc(self, docname, doctree):
        # set when any issue is encountered in this document
        self.any_issue = False
        self.docname = docname
        visitor = SuspiciousVisitor(doctree, self)
        doctree.walk(visitor)

    def finish(self):
        unused_rules = [rule for rule in self.rules if not rule.used]
        if unused_rules:
            self.logger.warning('Found %s/%s unused rules:' %
                                (len(unused_rules), len(self.rules)))
            for rule in unused_rules:
                self.logger.info(repr(rule))
        return

    def check_issue(self, line, lineno, issue):
        if not self.is_ignored(line, lineno, issue):
            self.report_issue(line, lineno, issue)

    def is_ignored(self, line, lineno, issue):
        """Determine whether this issue should be ignored."""
        docname = self.docname
        for rule in self.rules:
            if rule.docname != docname: continue
            if rule.issue != issue: continue
            # Requiring both lines to match exactly would be too strict, and
            # fuzzy matching (e.g. Levenshtein distance) would mean pulling in
            # extra libraries, so just check whether the rule fragment is
            # contained in the document line.
            if rule.line not in line: continue
            # Check both line numbers. If they are "near" each other,
            # this rule matches. (lineno=None means "don't care")
            if (rule.lineno is not None) and \
                abs(rule.lineno - lineno) > 5: continue
            # if it came this far, the rule matched
            rule.used = True
            return True
        return False

    def report_issue(self, text, lineno, issue):
        self.any_issue = True
        self.write_log_entry(lineno, issue, text)
        if py3:
            self.logger.warning('[%s:%d] "%s" found in "%-.120s"' %
                                (self.docname, lineno, issue, text))
        else:
            self.logger.warning('[%s:%d] "%s" found in "%-.120s"' % (
                self.docname.encode(sys.getdefaultencoding(), 'replace'),
                lineno,
                issue.encode(sys.getdefaultencoding(), 'replace'),
                text.strip().encode(sys.getdefaultencoding(), 'replace')))
        self.app.statuscode = 1

    def write_log_entry(self, lineno, issue, text):
        if py3:
            f = open(self.log_file_name, 'a')
            writer = csv.writer(f, dialect)
            writer.writerow([self.docname, lineno, issue, text.strip()])
            f.close()
        else:
            f = open(self.log_file_name, 'ab')
            writer = csv.writer(f, dialect)
            writer.writerow([self.docname.encode('utf-8'),
                             lineno,
                             issue.encode('utf-8'),
                             text.strip().encode('utf-8')])
            f.close()

    def load_rules(self, filename):
        """Load database of previously ignored issues.

        A csv file, with exactly the same format as suspicious.csv
        Fields: document name (normalized), line number, issue, surrounding text
        """
        self.logger.info("loading ignore rules... ", nonl=1)
        self.rules = rules = []
        try:
            if py3:
                f = open(filename, 'r')
            else:
                f = open(filename, 'rb')
        except IOError:
            return
        for i, row in enumerate(csv.reader(f)):
            if len(row) != 4:
                raise ValueError(
                    "wrong format in %s, line %d: %s" % (filename, i+1, row))
            docname, lineno, issue, text = row
            if lineno:
                lineno = int(lineno)
            else:
                lineno = None
            if not py3:
                docname = docname.decode('utf-8')
                issue = issue.decode('utf-8')
                text = text.decode('utf-8')
            rule = Rule(docname, lineno, issue, text)
            rules.append(rule)
        f.close()
        self.logger.info('done, %d rules loaded' % len(self.rules))


def get_lineno(node):
    """Obtain line number information for a node."""
    lineno = None
    while lineno is None and node:
        node = node.parent
        lineno = node.line
    return lineno


def extract_line(text, index):
    r"""text may be a multiline string; extract
    only the line containing the given character index.

    >>> extract_line("abc\ndefgh\ni", 6)
    'defgh'
    >>> for i in (0, 2, 3, 4, 5, 10):
    ...     print(extract_line("abc\ndefgh\ni", i))
    abc
    abc
    abc
    defgh
    defgh
    i
    """
    p = text.rfind('\n', 0, index) + 1
    q = text.find('\n', index)
    if q < 0:
        q = len(text)
    return text[p:q]


class SuspiciousVisitor(nodes.GenericNodeVisitor):

    lastlineno = 0

    def __init__(self, document, builder):
        nodes.GenericNodeVisitor.__init__(self, document)
        self.builder = builder

    def default_visit(self, node):
        if isinstance(node, (nodes.Text, nodes.image)): # direct text containers
            text = node.astext()
            # lineno seems to go backwards sometimes (?)
            self.lastlineno = lineno = max(get_lineno(node) or 0, self.lastlineno)
            seen = set() # don't report the same issue more than once per line
            for match in detect_all(text):
                issue = match.group()
                line = extract_line(text, match.start())
                if (issue, line) not in seen:
                    self.builder.check_issue(line, lineno, issue)
                    seen.add((issue, line))

    unknown_visit = default_visit

    def visit_document(self, node):
        self.lastlineno = 0

    def visit_comment(self, node):
        # ignore comments -- too many false positives.
        # (although doing this could miss some errors;
        # there were two sections "commented-out" by mistake
        # in the Python docs that would not be caught)
        raise nodes.SkipNode
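

# Registering the builder as a Sphinx extension would need a setup() hook
# along these lines -- a sketch, assuming the module is loaded via conf.py's
# "extensions" list; the exact wiring may differ in the real build setup.
def setup(app):
    # Make "sphinx-build -b suspicious" find the builder defined above.
    app.add_builder(CheckSuspiciousMarkupBuilder)
    return {'parallel_read_safe': False}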