"""
Try to detect suspicious constructs, resembling markup
that has leaked into the final output.

Suspicious lines are reported in a comma-separated-file,
``suspicious.csv``, located in the output directory.

The file is utf-8 encoded, and each line contains four fields:

 * document name (normalized)
 * line number in the source document
 * problematic text
 * complete line showing the problematic text in context

It is common to find many false positives. To avoid reporting them
again and again, they may be added to the ignore file,
``susp-ignored.csv`` (loaded from the directory one level above this
module). The file has the same format as ``suspicious.csv`` with a
few differences:

  - each line defines a rule; if the rule matches, the issue
    is ignored.
  - line number may be empty (that is, nothing between the
    commas: ",,"). In this case, line numbers are ignored (the
    rule matches anywhere in the file).
  - the last field does not have to be a complete line; some
    surrounding text (never more than a line) is enough for
    context.

Rules are processed sequentially. A rule matches when:

 * document names are the same
 * problematic texts are the same
 * line numbers are close to each other (5 lines up or down)
 * the rule text is completely contained in the source line

The simplest way to create the ignore file is by copying
undesired entries from suspicious.csv (possibly trimming the last
field.)

Copyright 2009 Gabriel A. Genellina

"""

import os
import re
import csv
import sys

from docutils import nodes
from sphinx.builders import Builder
import sphinx.util
# Imported explicitly: the builder below uses sphinx.util.logging and
# should not rely on another module having imported the submodule first.
import sphinx.util.logging

# Bound ``finditer`` of the pattern matching every suspicious fragment.
detect_all = re.compile(r'''
    ::(?=[^=])|            # two :: (but NOT ::=)
    :[a-zA-Z][a-zA-Z0-9]+| # :foo
    `|                     # ` (seldom used by itself)
    (?<!\.)\.\.[ \t]*\w+:  # .. foo: (but NOT ... else:)
    ''', re.UNICODE | re.VERBOSE).finditer

py3 = sys.version_info >= (3, 0)


class Rule:
    """A rule for ignoring issues."""

    def __init__(self, docname, lineno, issue, line):
        """Store the four fields of one row of the ignore file."""
        self.docname = docname  # document to which this rule applies
        self.lineno = lineno    # line number in the original source;
                                # this rule matches only near that.
                                # None -> don't care
        self.issue = issue      # the markup fragment that triggered this rule
        self.line = line        # text of the container element (single line only)
        self.used = False       # set once the rule has matched an issue

    def __repr__(self):
        # Rendered with an empty line-number field ("docname,,issue,line").
        return '{0.docname},,{0.issue},{0.line}'.format(self)


class dialect(csv.excel):
    """Our dialect: uses only linefeed as newline."""
    lineterminator = '\n'


class CheckSuspiciousMarkupBuilder(Builder):
    """
    Checks for possibly invalid markup that may leak into the output.
    """
    name = 'suspicious'
    logger = sphinx.util.logging.getLogger("CheckSuspiciousMarkupBuilder")

    def init(self):
        """Create an empty report file and load the ignore rules."""
        # create output file
        self.log_file_name = os.path.join(self.outdir, 'suspicious.csv')
        open(self.log_file_name, 'w').close()
        # load database of previously ignored issues
        self.load_rules(os.path.join(os.path.dirname(__file__), '..',
                                     'susp-ignored.csv'))

    def get_outdated_docs(self):
        """Return every found document, so all of them are checked."""
        return self.env.found_docs

    def get_target_uri(self, docname, typ=None):
        """Return an empty URI for every document."""
        return ''

    def prepare_writing(self, docnames):
        """Nothing to prepare."""
        pass

    def write_doc(self, docname, doctree):
        """Walk *doctree* with a SuspiciousVisitor, reporting issues."""
        # set when any issue is encountered in this document
        self.any_issue = False
        self.docname = docname
        visitor = SuspiciousVisitor(doctree, self)
        doctree.walk(visitor)

    def finish(self):
        """Report the ignore rules that never matched any issue."""
        unused_rules = [rule for rule in self.rules if not rule.used]
        if unused_rules:
            # Use the module logger, like the rest of this class, instead
            # of the legacy ``Builder.warn`` helper.
            self.logger.warning('Found %s/%s unused rules:' %
                                (len(unused_rules), len(self.rules)))
            for rule in unused_rules:
                self.logger.info(repr(rule))
        return

    def check_issue(self, line, lineno, issue):
        """Report *issue* unless an ignore rule matches it."""
        if not self.is_ignored(line, lineno, issue):
            self.report_issue(line, lineno, issue)

    def is_ignored(self, line, lineno, issue):
        """Determine whether this issue should be ignored.

        Rules are tried in order; the first matching rule is marked as
        used and True is returned. Returns False when no rule matches.
        """
        docname = self.docname
        for rule in self.rules:
            if rule.docname != docname:
                continue
            if rule.issue != issue:
                continue
            # The rule fragment only has to be contained in the document
            # line; an exact match of both lines proved too strict.
            if rule.line not in line:
                continue
            # Check both line numbers. If they're "near"
            # this rule matches. (lineno=None means "don't care")
            if (rule.lineno is not None) and \
                    abs(rule.lineno - lineno) > 5:
                continue
            # if it came this far, the rule matched
            rule.used = True
            return True
        return False

    def report_issue(self, text, lineno, issue):
        """Log the issue to the CSV report, warn, and set statuscode to 1."""
        self.any_issue = True
        self.write_log_entry(lineno, issue, text)
        if py3:
            self.logger.warning('[%s:%d] "%s" found in "%-.120s"' %
                                (self.docname, lineno, issue, text))
        else:
            self.logger.warning('[%s:%d] "%s" found in "%-.120s"' % (
                self.docname.encode(sys.getdefaultencoding(), 'replace'),
                lineno,
                issue.encode(sys.getdefaultencoding(), 'replace'),
                text.strip().encode(sys.getdefaultencoding(), 'replace')))
        self.app.statuscode = 1

    def write_log_entry(self, lineno, issue, text):
        """Append one row (docname, lineno, issue, text) to the report."""
        if py3:
            # Explicit utf-8 so the report does not depend on the locale;
            # newline='' lets the csv dialect alone decide the line
            # terminator (linefeed only).
            with open(self.log_file_name, 'a',
                      encoding='utf-8', newline='') as f:
                writer = csv.writer(f, dialect)
                writer.writerow([self.docname, lineno, issue, text.strip()])
        else:
            with open(self.log_file_name, 'ab') as f:
                writer = csv.writer(f, dialect)
                writer.writerow([self.docname.encode('utf-8'),
                                 lineno,
                                 issue.encode('utf-8'),
                                 text.strip().encode('utf-8')])

    def load_rules(self, filename):
        """Load database of previously ignored issues.

        A csv file, with exactly the same format as suspicious.csv
        Fields: document name (normalized), line number, issue, surrounding text

        If the file cannot be opened, ``self.rules`` is left empty.
        Raises ValueError when a row does not have exactly four fields.
        """
        self.logger.info("loading ignore rules... ", nonl=1)
        self.rules = rules = []
        try:
            if py3:
                f = open(filename, 'r', encoding='utf-8', newline='')
            else:
                f = open(filename, 'rb')
        except IOError:
            return
        # ``with`` guarantees the file is closed even if a malformed row
        # raises ValueError below.
        with f:
            for i, row in enumerate(csv.reader(f)):
                if len(row) != 4:
                    raise ValueError(
                        "wrong format in %s, line %d: %s" % (filename, i+1, row))
                docname, lineno, issue, text = row
                if lineno:
                    lineno = int(lineno)
                else:
                    lineno = None
                if not py3:
                    docname = docname.decode('utf-8')
                    issue = issue.decode('utf-8')
                    text = text.decode('utf-8')
                rule = Rule(docname, lineno, issue, text)
                rules.append(rule)
        self.logger.info('done, %d rules loaded' % len(self.rules))


def get_lineno(node):
    """Obtain line number information for a node.

    Walks up through the ancestors of *node* (the node itself is not
    inspected) and returns the first ``line`` that is not None, or None
    when the top of the tree is reached without finding one.
    """
    while node is not None:
        node = node.parent
        if node is None:
            # Ran off the top of the tree; previously this dereferenced
            # ``None.line`` and raised AttributeError.
            return None
        if node.line is not None:
            return node.line
    return None


def extract_line(text, index):
    r"""text may be a multiline string; extract
    only the line containing the given character index.

    >>> extract_line("abc\ndefgh\ni", 6)
    'defgh'
    >>> for i in (0, 2, 3, 4, 10):
    ...     print(extract_line("abc\ndefgh\ni", i))
    abc
    abc
    abc
    defgh
    i
    """
    # Start just after the previous newline (or at 0 if there is none).
    p = text.rfind('\n', 0, index) + 1
    # End at the next newline at or after index (or at the end of text).
    q = text.find('\n', index)
    if q < 0:
        q = len(text)
    return text[p:q]


class SuspiciousVisitor(nodes.GenericNodeVisitor):
    """Node visitor that feeds suspicious text fragments to the builder."""

    lastlineno = 0

    def __init__(self, document, builder):
        nodes.GenericNodeVisitor.__init__(self, document)
        self.builder = builder

    def default_visit(self, node):
        """Scan text and image nodes for suspicious markup."""
        if isinstance(node, (nodes.Text, nodes.image)):  # direct text containers
            text = node.astext()
            # lineno seems to go backwards sometimes (?)
            self.lastlineno = lineno = max(get_lineno(node) or 0, self.lastlineno)
            seen = set()  # don't report the same issue more than once per line
            for match in detect_all(text):
                issue = match.group()
                line = extract_line(text, match.start())
                if (issue, line) not in seen:
                    self.builder.check_issue(line, lineno, issue)
                    seen.add((issue, line))

    unknown_visit = default_visit

    def visit_document(self, node):
        """Reset the running line number at the start of each document."""
        self.lastlineno = 0

    def visit_comment(self, node):
        # ignore comments -- too many false positives.
        # (although doing this could miss some errors;
        # there were two sections "commented-out" by mistake
        # in the Python docs that would not be caught)
        raise nodes.SkipNode