#!/usr/bin/env python3
# Wireshark - Network traffic analyzer
# By Gerald Combs <gerald@wireshark.org>
# Copyright 1998 Gerald Combs
#
# SPDX-License-Identifier: GPL-2.0-or-later

import argparse
import os
import re
import shutil
import signal
import subprocess
import sys

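# Third-party dependency: the 'requests' package (https://pypi.org/project/requests/).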
import requests

# This utility scans the dissector code for URLs, then attempts to
# fetch the links.  The results are shown on stdout, but also, at
# the end of the run, written to files:
# - URLs that couldn't be loaded are written to failures.txt
# - working URLs are written to successes.txt
# - any previous failures.txt is also copied to failures_last_run.txt


# TODO:
# - option to write back to the dissector file when there is a failure?
# - make requests in parallel (a run takes around 35 minutes)?
# - optionally parse a previous successes.txt and avoid fetching those URLs again?
# - make sure URLs are really within comments in the code?
# - use urllib.parse or similar to better check URLs?
# - improve the regex to allow '+' in URLs (as confluence uses)

# Try to exit soon after Ctrl-C is pressed.
should_exit = False


def signal_handler(sig, frame):
    global should_exit
    should_exit = True
    print('You pressed Ctrl+C - exiting')


signal.signal(signal.SIGINT, signal_handler)
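# N.B. the handler above only sets a flag; the main loop below checks the flag
# between requests, so a request already in flight is allowed to finish.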


class FailedLookup:

    def __init__(self):
        # Fake values that will be queried (for a requests.get() return value)
        self.status_code = 0
        self.headers = {'content-type': '<NONE>'}

    def __str__(self):
        s = ('FailedLookup: status_code=' + str(self.status_code) +
             ' content-type=' + self.headers['content-type'])
        return s


# Dictionary from URL -> result
cached_lookups = {}
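# (A FailedLookup instance is cached for URLs whose request raised an exception,
# so a URL that appears in several dissectors is only ever fetched once.)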


class Link:

    def __init__(self, file, line_number, url):
        self.file = file
        self.line_number = line_number
        self.url = url
        self.tested = False
        self.r = None
        self.success = False
        self.result_from_cache = False

    def __str__(self):
        epan_idx = self.file.find('epan')
        if epan_idx == -1:
            filename = self.file
        else:
            filename = self.file[epan_idx:]
        s = ('SUCCESS  ' if self.success else 'FAILED  ') + \
            filename + ':' + str(self.line_number) + '   ' + self.url
        if self.r is not None:
            if self.r.status_code:
                s += "   status-code=" + str(self.r.status_code)
                if 'content-type' in self.r.headers:
                    s += (' content-type="' +
                          self.r.headers['content-type'] + '"')
            else:
                s += '    <No response received>'
        return s

    def validate(self, session):
        # Fetch the URL, but look in the cache first.
        global cached_lookups
        self.tested = True
        if self.url in cached_lookups:
            if args.verbose:
                print('[Using cached result for', self.url, ']')
            self.r = cached_lookups[self.url]
            self.result_from_cache = True
        else:
            try:
                # Try it.
                self.r = session.get(self.url, timeout=15)

                # Cache this result.
                cached_lookups[self.url] = self.r
            except Exception:
                if args.verbose:
                    print(self.url, ': failed to make request')
                self.success = False
                # Add the bad result to cached_lookups so the URL is not retried.
                cached_lookups[self.url] = FailedLookup()
                self.r = cached_lookups[self.url]
                return

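        # N.B. requests follows redirects by default, so self.r holds the final
        # response in the redirect chain.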
        # Check the return value.
        if self.r.status_code < 200 or self.r.status_code >= 300:
            self.success = False
            return

        # Assume it's OK.
        self.success = True


links = []
files = []


def find_links_in_file(filename):
    with open(filename, 'r') as f:
        for line_number, line in enumerate(f, start=1):
            # TODO: not matching
            # https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol
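            # (The '+' characters are the problem: the regex's character class
            # below does not include '+'.)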
            urls = re.findall(
                r'https?://(?:[a-zA-Z0-9./_?&=-]+|%[0-9a-fA-F]{2})+', line)

            for url in urls:
                # Lop off any trailing characters that are not part of the URL.
                url = url.rstrip(").',")

                # A URL must have a period somewhere.
                if '.' not in url:
                    continue
                if args.verbose:
                    print('Found URL:', url)
                links.append(Link(filename, line_number, url))


# Scan the given folder for links to test.
def find_links_in_folder(folder):
    # Look at files in sorted order, to give some idea of how far through
    # the run is.
    for filename in sorted(os.listdir(folder)):
        if filename.endswith('.c'):
            find_links_in_file(os.path.join(folder, filename))


#################################################################
# Main logic.

# Command-line args.  These control which dissector files should be scanned.
# If no args are given, just scan the epan/dissectors folder.
parser = argparse.ArgumentParser(description='Check URL links in dissectors')
parser.add_argument('--file', action='store', default='',
                    help='specify individual dissector file to test')
parser.add_argument('--commits', action='store',
                    help='last N commits to check')
parser.add_argument('--open', action='store_true',
                    help='check open files')
parser.add_argument('--verbose', action='store_true',
                    help='when enabled, show more output')
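
# Example invocations (a sketch; the script's file name and its location under
# tools/ are assumed here, not stated in this file):
#   python3 tools/check_dissector_urls.py                 (scan all dissectors)
#   python3 tools/check_dissector_urls.py --commits 3     (files from the last 3 commits)
#   python3 tools/check_dissector_urls.py --file epan/dissectors/packet-sip.c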

args = parser.parse_args()


def is_dissector_file(filename):
    # File paths from git are relative to the repo root,
    # e.g. 'epan/dissectors/packet-sip.c'.
    p = re.compile(r'epan/dissectors/packet-.*\.c')
    return p.match(filename)


# Get files from wherever the command-line args indicate.
if args.file:
    # Fetch links from a single file.
    files.append(args.file)
    find_links_in_file(args.file)
elif args.commits:
    # Get files affected by the specified number of commits.
    command = ['git', 'diff', '--name-only', 'HEAD~' + args.commits]
    files = [f.decode('utf-8')
             for f in subprocess.check_output(command).splitlines()]
    # Fetch links from those files (dissector files only).
    files = list(filter(is_dissector_file, files))
    for f in files:
        find_links_in_file(f)
elif args.open:
    # Unstaged changes.
    command = ['git', 'diff', '--name-only']
    files = [f.decode('utf-8')
             for f in subprocess.check_output(command).splitlines()]
    files = list(filter(is_dissector_file, files))
    # Staged changes.
    command = ['git', 'diff', '--staged', '--name-only']
    files_staged = [f.decode('utf-8')
                    for f in subprocess.check_output(command).splitlines()]
    files_staged = list(filter(is_dissector_file, files_staged))
    for f in files:
        find_links_in_file(f)
    for f in files_staged:
        if f not in files:
            find_links_in_file(f)
            files.append(f)
else:
    # Find links in the dissectors folder.
    find_links_in_folder(os.path.join(os.path.dirname(
        __file__), '..', 'epan', 'dissectors'))


# If scanning a subset of files, list them here.
print('Examining:')
if args.file or args.commits or args.open:
    if files:
        print(' '.join(files), '\n')
    else:
        print('No files to check.\n')
else:
    print('All dissector modules\n')


# Prepare one session for all requests.  For args, see
# https://requests.readthedocs.io/en/master/
session = requests.Session()
# N.B. a timeout can't usefully be set on the session itself, so it is passed
# per-request in Link.validate() instead.
# The default requests headers don't always get responses where proper browsers do.
session.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36',
                        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'})

# Try out the links.
for link in links:
    if should_exit:
        # i.e. if Ctrl-C has been pressed.
        sys.exit(0)
    link.validate(session)
    if args.verbose or not link.success:
        print(link)


# Write failures to a file.  Back up any previous failures first though.
if os.path.exists('failures.txt'):
    shutil.copyfile('failures.txt', 'failures_last_run.txt')
with open('failures.txt', 'w') as f_f:
    for link in links:
        if link.tested and not link.success:
            f_f.write(str(link) + '\n')
# And successes.
with open('successes.txt', 'w') as f_s:
    for link in links:
        if link.tested and link.success:
            f_s.write(str(link) + '\n')


# Count and show overall stats.
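# (Results served from the cache are counted separately and are not included
# in the passed/failed totals.)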
passed, failed, cached = 0, 0, 0
for link in links:
    if not link.result_from_cache:
        if link.tested:
            if link.success:
                passed += 1
            else:
                failed += 1
    else:
        cached += 1

print('--------------------------------------------------------------------------------------------------')
print(len(links), 'links checked:', passed, 'passed,',
      failed, 'failed (', cached, 'results from cache)')