#!/usr/bin/env python3
# Wireshark - Network traffic analyzer
# By Gerald Combs <gerald@wireshark.org>
# Copyright 1998 Gerald Combs
#
# SPDX-License-Identifier: GPL-2.0-or-later

import argparse
import os
import re
import shutil
import signal
import subprocess
import sys

import requests

# This utility scans the dissector code for URLs, then attempts to
# fetch the links. The results are shown on stdout and, at the end
# of the run, also written to files:
# - URLs that couldn't be loaded are written to failures.txt
# - working URLs are written to successes.txt
# - any previous failures.txt is copied to failures_last_run.txt


# TODO:
# - option to write back to the dissector file when there is a failure?
# - make requests in parallel (a run takes around 35 minutes)?
# - optionally parse a previous successes.txt and avoid fetching those URLs again?
# - make sure URLs are really within comments in the code?
# - use urllib.parse or similar to better check URLs?
# - improve the regex to allow '+' in URLs (as Confluence uses)

# Try to exit soon after Ctrl-C is pressed.
should_exit = False


def signal_handler(sig, frame):
    global should_exit
    should_exit = True
    print('You pressed Ctrl+C - exiting')


signal.signal(signal.SIGINT, signal_handler)


class FailedLookup:

    def __init__(self):
        # Fake values that will be queried (in place of a requests.get() return value)
        self.status_code = 0
        self.headers = {'content-type': '<NONE>'}

    def __str__(self):
        return ('FailedLookup: status_code=' + str(self.status_code) +
                ' content-type=' + self.headers['content-type'])


# Dictionary from url -> result
cached_lookups = {}


class Link:

    def __init__(self, file, line_number, url):
        self.file = file
        self.line_number = line_number
        self.url = url
        self.tested = False
        self.r = None
        self.success = False
        self.result_from_cache = False

    def __str__(self):
        # Show the path relative to the epan folder where possible.
        epan_idx = self.file.find('epan')
        if epan_idx == -1:
            filename = self.file
        else:
            filename = self.file[epan_idx:]
        s = ('SUCCESS ' if self.success else 'FAILED  ') + \
            filename + ':' + str(self.line_number) + '  ' + self.url
        if self.r and self.r.status_code:
            s += '  status-code=' + str(self.r.status_code)
            if 'content-type' in self.r.headers:
                s += (' content-type="' +
                      self.r.headers['content-type'] + '"')
        else:
            s += '  <no response received>'
        return s

    def validate(self, session):
        # Fetch the URL, but look in the cache first.
        self.tested = True
        if self.url in cached_lookups:
            if args.verbose:
                print('[Using cached result for', self.url, ']')
            self.r = cached_lookups[self.url]
            self.result_from_cache = True
        else:
            try:
                # Try to fetch it.
                self.r = session.get(self.url, timeout=15)

                # Cache this result.
                cached_lookups[self.url] = self.r
            except (ValueError, requests.exceptions.RequestException):
                if args.verbose:
                    print(self.url, ': failed to make request')
                self.success = False
                # Add the bad result to cached_lookups.
                cached_lookups[self.url] = FailedLookup()
                self.r = cached_lookups[self.url]
                return

        # Check the status code - treat anything outside 2xx as a failure.
        if self.r.status_code < 200 or self.r.status_code >= 300:
            self.success = False
            return

        # Assume it's OK.
        self.success = True
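
# One possible shape for the 'make requests in parallel' TODO above: validate
# links on a thread pool so that slow hosts overlap. This is only a sketch and
# is not wired into the main logic below - the worker count is an arbitrary
# assumption, and cached_lookups would need a threading.Lock (or a session per
# thread) before this could be used safely.
def validate_links_in_parallel(links_to_check, session, max_workers=8):
    import concurrent.futures
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = [pool.submit(link.validate, session)
                   for link in links_to_check]
        concurrent.futures.wait(futures)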

links = []
files = []


def find_links_in_file(filename):
    with open(filename, 'r') as f:
        for line_number, line in enumerate(f, start=1):
            # TODO: not matching
            # https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol
            urls = re.findall(
                r'https?://(?:[a-zA-Z0-9./_?&=-]+|%[0-9a-fA-F]{2})+', line)

            for url in urls:
                # Lop off any trailing chars that are not part of the URL.
                url = url.rstrip(").',")

                # A URL must have a period somewhere.
                if '.' not in url:
                    continue
                if args.verbose:
                    print('Found URL:', url)
                links.append(Link(filename, line_number, url))


# Scan the given folder for links to test.
def find_links_in_folder(folder):
    # Look at files in sorted order, to give some idea of how far through
    # the run is.
    for filename in sorted(os.listdir(folder)):
        if filename.endswith('.c'):
            find_links_in_file(os.path.join(folder, filename))


#################################################################
# Main logic.

# Command-line args control which dissector files should be scanned.
# If no args are given, scan the whole epan/dissectors folder.
parser = argparse.ArgumentParser(description='Check URL links in dissectors')
parser.add_argument('--file', action='store', default='',
                    help='specify individual dissector file to test')
parser.add_argument('--commits', action='store',
                    help='last N commits to check')
parser.add_argument('--open', action='store_true',
                    help='check open files')
parser.add_argument('--verbose', action='store_true',
                    help='when enabled, show more output')

args = parser.parse_args()


def is_dissector_file(filename):
    p = re.compile(r'epan/dissectors/packet-.*\.c')
    return p.match(filename)


# Get files from wherever the command-line args indicate.
if args.file:
    # Fetch links from a single file.
    find_links_in_file(args.file)
    files.append(args.file)
elif args.commits:
    # Get files affected by the specified number of commits.
    command = ['git', 'diff', '--name-only', 'HEAD~' + args.commits]
    files = [f.decode('utf-8')
             for f in subprocess.check_output(command).splitlines()]
    # Fetch links from those files (dissector files only).
    files = [f for f in files if is_dissector_file(f)]
    for f in files:
        find_links_in_file(f)
elif args.open:
    # Unstaged changes.
    command = ['git', 'diff', '--name-only']
    files = [f.decode('utf-8')
             for f in subprocess.check_output(command).splitlines()]
    files = [f for f in files if is_dissector_file(f)]
    # Staged changes.
    command = ['git', 'diff', '--staged', '--name-only']
    files_staged = [f.decode('utf-8')
                    for f in subprocess.check_output(command).splitlines()]
    files_staged = [f for f in files_staged if is_dissector_file(f)]
    for f in files:
        find_links_in_file(f)
    for f in files_staged:
        if f not in files:
            find_links_in_file(f)
            files.append(f)
else:
    # Find links in the dissector folder.
    find_links_in_folder(os.path.join(os.path.dirname(
        __file__), '..', 'epan', 'dissectors'))


# If scanning a subset of files, list them here.
print('Examining:')
if args.file or args.commits or args.open:
    if files:
        print(' '.join(files), '\n')
    else:
        print('No files to check.\n')
else:
    print('All dissector modules\n')
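
# A sketch for the 'avoid fetching previous successes again' TODO at the top:
# pre-seed cached_lookups from an earlier successes.txt so those URLs are not
# re-fetched. It is not called by default; it assumes the line format written
# by Link.__str__ (the URL is the third whitespace-separated field) and reuses
# FailedLookup as a stand-in response object with a faked 2xx status.
def preload_previous_successes(path='successes.txt'):
    if not os.path.exists(path):
        return
    with open(path, 'r') as f:
        for line in f:
            fields = line.split()
            if len(fields) >= 3 and fields[0] == 'SUCCESS':
                fake_ok = FailedLookup()
                fake_ok.status_code = 200   # faked so validate() treats it as 2xx
                cached_lookups[fields[2]] = fake_ok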

# Prepare one session for all requests. For session args, see
# https://requests.readthedocs.io/en/master/
session = requests.Session()
# N.B. requests has no session-wide timeout setting, so a timeout is passed
# to each get() call instead.
# The default headers don't always get responses where proper browsers do.
session.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36',
                        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'})

# Try out the links.
for link in links:
    if should_exit:
        # i.e. if Ctrl-C has been pressed.
        sys.exit(0)
    link.validate(session)
    if args.verbose or not link.success:
        print(link)


# Write failures to a file, backing up any previous failures first.
if os.path.exists('failures.txt'):
    shutil.copyfile('failures.txt', 'failures_last_run.txt')
with open('failures.txt', 'w') as f_f:
    for link in links:
        if link.tested and not link.success:
            f_f.write(str(link) + '\n')
# And successes.
with open('successes.txt', 'w') as f_s:
    for link in links:
        if link.tested and link.success:
            f_s.write(str(link) + '\n')


# Count and show overall stats.
passed, failed, cached = 0, 0, 0
for link in links:
    if not link.result_from_cache:
        if link.tested:
            if link.success:
                passed += 1
            else:
                failed += 1
    else:
        cached += 1

print('-' * 98)
print('%d links checked: %d passed, %d failed (%d results from cache)' %
      (len(links), passed, failed, cached))
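
# Example invocations, assuming this script lives in the tools/ folder of a
# Wireshark source tree and is run from the tree's root:
#   python3 tools/check_dissector_urls.py --commits 5
#   python3 tools/check_dissector_urls.py --file epan/dissectors/packet-http.c --verbose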