#!/usr/bin/env python3
# Wireshark - Network traffic analyzer
# By Gerald Combs <gerald@wireshark.org>
# Copyright 1998 Gerald Combs
#
# SPDX-License-Identifier: GPL-2.0-or-later

import os
import re
import subprocess
import argparse
import signal
from collections import Counter

# Looks for spelling errors among strings found in source or documentation files.

# TODO: check structured doxygen comments?

# For text colouring/highlighting.
class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    ADDED = '\033[45m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'


# Try to exit soon after Ctrl-C is pressed.
should_exit = False

def signal_handler(sig, frame):
    global should_exit
    should_exit = True
    print('You pressed Ctrl+C - exiting')

signal.signal(signal.SIGINT, signal_handler)



# Create spellchecker, and augment with some Wireshark words.
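# (Uses the 'pyspellchecker' package - install with e.g. "pip install pyspellchecker".)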
from spellchecker import SpellChecker
# Set up our dict with words from text file.
spell = SpellChecker()
spell.word_frequency.load_text_file('./tools/wireshark_words.txt')


# Track words that were not found.
missing_words = []


# Split camelCase string into separate words.
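# e.g. camelCaseSplit('protoTreeAddItem') -> ['proto', 'Tree', 'Add', 'Item'] (illustrative identifier).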
def camelCaseSplit(identifier):
    matches = re.finditer(r'.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', identifier)
    return [m.group(0) for m in matches]


# A File object contains all of the strings to be checked for a given file.
class File:
    def __init__(self, file):
        self.file = file
        self.values = []

        filename, extension = os.path.splitext(file)
        self.code_file = extension in {'.c', '.cpp'}


        with open(file, 'r') as f:
            contents = f.read()

            if self.code_file:
                # Remove comments so as not to trip up RE.
                contents = removeComments(contents)

            # Find protocol name and add to dict.
            # N.B. doesn't work when a variable is used instead of a literal for the protocol name...
            matches = re.finditer(r'proto_register_protocol\s*\([\n\r\s]*\"(.*)\",[\n\r\s]*\"(.*)\",[\n\r\s]*\"(.*)\"', contents)
            for m in matches:
                protocol = m.group(3)
                # Add to dict.
                spell.word_frequency.load_words([protocol])
                spell.known([protocol])
                print('Protocol is: ' + bcolors.BOLD + protocol + bcolors.ENDC)

    # Add a string found in this file.
    def add(self, value):
        self.values.append(value)

    # Whole word is not recognised, but is it 2 words concatenated (without camelCase)?
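    # e.g. an unrecognised token like 'framelength' might split into 'frame' + 'length' (illustrative).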
    def checkMultiWords(self, word):
        if len(word) < 6:
            return False

        # Don't consider if mixed cases.
        if not (word.islower() or word.isupper()):
            # But make an exception if only the first letter is uppercase.
            if not word == (word[0].upper() + word[1:]):
                return False

        # Try splitting into 2 words recognised at various points.
        # Allow 3-letter words.
        length = len(word)
        for idx in range(3, length-3):
            word1 = word[0:idx]
            word2 = word[idx:]

            if not spell.unknown([word1, word2]):
                return True

        return self.checkMultiWordsRecursive(word)

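    # Greedily try known prefixes (of at least 4 letters) and recurse into the remainder,
    # to see whether the whole word can be covered by recognised words.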
    def checkMultiWordsRecursive(self, word):
        length = len(word)
        #print('word=', word)
        if length < 4:
            return False

        for idx in range(4, length+1):
            w = word[0:idx]
            #print('considering', w)
            if not spell.unknown([w]):
                #print('Recognised!')
                if idx == len(word):
                    #print('Was end of word, so TRUEE!!!!')
                    return True
                else:
                    #print('More to go..')
                    if self.checkMultiWordsRecursive(word[idx:]):
                        return True

        return False

    # Check the spelling of all the words we have found
    def spellCheck(self):

        num_values = len(self.values)
        this_value = 0
        for v in self.values:
            if should_exit:
                exit(1)

            this_value += 1

            # Ignore includes.
            if v.endswith('.h'):
                continue

            # Store the original (we want to include it for context in the error report).
            original = str(v)

            # Replace most punctuation with spaces, and eliminate common format specifiers.
            for punct in '.,`:;"\\+|()[]{}<>_-/!?=*%#&@$\'':
                v = v.replace(punct, ' ')
            v = v.replace('%u', '')
            v = v.replace('%d', '')
            v = v.replace('%s', '')

            # Split into words.
            value_words = v.split()
            # Further split up any camelCase words.
            words = []
            for w in value_words:
                words += camelCaseSplit(w)

            # Check each word within this string in turn.
            for word in words:
                # Strip trailing digits from word.
                word = word.rstrip('1234567890')

                # Quote marks found in some of the docs...
                word = word.replace('“', '')
                word = word.replace('”', '')

                if len(word) > 4 and spell.unknown([word]) and not self.checkMultiWords(word):
                    print(self.file, this_value, '/', num_values, '"' + original + '"', bcolors.FAIL + word + bcolors.ENDC,
                          ' -> ', '?')
                    # TODO: this can be interesting, but takes too long!
                    # bcolors.OKGREEN + spell.correction(word) + bcolors.ENDC
                    global missing_words
                    missing_words.append(word)

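# Replace escaped whitespace sequences (literal \n, \r, \t as they appear inside strings) with spaces.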
def removeWhitespaceControl(code_string):
    code_string = code_string.replace('\\n', ' ')
    code_string = code_string.replace('\\r', ' ')
    code_string = code_string.replace('\\t', ' ')
    return code_string

# Remove any contractions from the given string.
def removeContractions(code_string):
    contractions = [ "wireshark’s", "don’t", "let’s", "isn’t", "won’t", "user’s", "hasn’t", "you’re", "o’clock", "you’ll",
                     "you’d", "developer’s", "doesn’t", "what’s", "let’s", "haven’t", "can’t", "you’ve",
                     "shouldn’t", "didn’t", "wouldn’t", "aren’t", "there’s", "packet’s", "couldn’t", "world’s",
                     "needn’t", "graph’s", "table’s", "parent’s", "entity’s", "server’s", "node’s",
                     "querier’s", "sender’s", "receiver’s", "computer’s", "frame’s", "vendor’s", "system’s"]
    for c in contractions:
        code_string = code_string.replace(c, "")
        code_string = code_string.replace(c.capitalize(), "")
        code_string = code_string.replace(c.replace('’', "'"), "")
        code_string = code_string.replace(c.capitalize().replace('’', "'"), "")
    return code_string

def removeComments(code_string):
    code_string = re.sub(r"/\*.*?\*/", "", code_string, flags=re.DOTALL)          # C-style comments
    # Only remove // comments found at the start of a line, as otherwise this can get
    # tripped up by e.g. https://www.... within a string!
    code_string = re.sub(r"^\s*//.*?\n", "", code_string, flags=re.MULTILINE)     # C++-style comments
    return code_string

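# Remove escaped quotes, backslashes and ellipses that would otherwise confuse the
# string-matching RE.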
def removeSingleQuotes(code_string):
    code_string = code_string.replace('\\\\', " ")        # Separate at \\
    code_string = code_string.replace('\"\\\\\"', "")
    code_string = code_string.replace("\\\"", " ")
    code_string = code_string.replace("'\"'", "")
    code_string = code_string.replace('…', ' ')
    return code_string

def removeHexSpecifiers(code_string):
    # Find and remove all hex numbers.
    return re.sub(r'0x[0-9a-fA-F]*', '', code_string)


# Create a File object that knows about all of the strings in the given file.
def findStrings(filename):
    with open(filename, 'r') as f:
        contents = f.read()

        # Remove comments & embedded quotes so as not to trip up RE.
        contents = removeContractions(contents)
        contents = removeWhitespaceControl(contents)
        contents = removeSingleQuotes(contents)
        contents = removeHexSpecifiers(contents)

        # Create file object.
        file = File(filename)

        # What we check depends upon file type.
        if file.code_file:
            contents = removeComments(contents)
            # Code so only checking strings.
            matches = re.finditer(r'\"([^\"]*)\"', contents)
            for m in matches:
                file.add(m.group(1))
        else:
            # A documentation file, so examine all words.
            words = contents.split()
            for w in words:
                file.add(w)

        return file


# Test for whether the given file was automatically generated.
def isGeneratedFile(filename):
    if not filename.endswith('.c'):
        return False

    # Open the file.
    f_read = open(filename, 'r')
    lines_tested = 0
    for line in f_read:
        # The comment saying that it's generated is near the top, so give up once
        # we get a few lines down.
        if lines_tested > 10:
            f_read.close()
            return False
        if (line.find('Generated automatically') != -1 or
            line.find('Autogenerated from') != -1 or
            line.find('is autogenerated') != -1 or
            line.find('automatically generated by Pidl') != -1 or
            line.find('Created by: The Qt Meta Object Compiler') != -1 or
            line.find('This file was generated') != -1 or
            line.find('This filter was automatically generated') != -1):

            f_read.close()
            return True
        lines_tested = lines_tested + 1

    # OK, looks like a hand-written file!
    f_read.close()
    return False


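# Check whether this is a type of file that we want to spell-check.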
def isAppropriateFile(filename):
    file, extension = os.path.splitext(filename)
    return extension in {'.adoc', '.c', '.cpp', '.pod', '.nsi'} or file.endswith('README')


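# Find appropriate, non-generated files within the given folder (recursing by default).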
def findFilesInFolder(folder, recursive=True):
    files_to_check = []

    if recursive:
        for root, subfolders, files in os.walk(folder):
            for f in files:
                if should_exit:
                    return files_to_check
                f = os.path.join(root, f)
                if isAppropriateFile(f) and not isGeneratedFile(f):
                    files_to_check.append(f)
    else:
        for f in sorted(os.listdir(folder)):
            f = os.path.join(folder, f)
            if isAppropriateFile(f) and not isGeneratedFile(f):
                files_to_check.append(f)

    return files_to_check


# Check the given file.
def checkFile(filename):
    # Check file exists - e.g. may have been deleted in a recent commit.
    if not os.path.exists(filename):
        print(filename, 'does not exist!')
        return

    file = findStrings(filename)
    file.spellCheck()



#################################################################
# Main logic.

# command-line args.  Controls which files should be checked.
# If no args given, will just scan epan/dissectors folder.
parser = argparse.ArgumentParser(description='Check spellings in specified files')
parser.add_argument('--file', action='store', default='',
                    help='specify individual file to test')
parser.add_argument('--folder', action='store', default='',
                    help='specify folder to test')
parser.add_argument('--no-recurse', action='store_true', default=False,
                    help='do not recurse inside chosen folder')
parser.add_argument('--commits', action='store',
                    help='last N commits to check')
parser.add_argument('--open', action='store_true',
                    help='check open files')
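
# Example invocations (run from the top-level source directory, since the word
# list is loaded from ./tools/wireshark_words.txt):
#   --file <path>          check a single file
#   --folder <path>        check all appropriate files under a folder
#   --commits 3            check files touched by the last 3 commits
#   --open                 check files with staged or unstaged changes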

args = parser.parse_args()


# Get files from wherever command-line args indicate.
files = []
if args.file:
    # Add the single specified file.
    if not os.path.isfile(args.file):
        print('Chosen file', args.file, 'does not exist.')
        exit(1)
    else:
        files.append(args.file)
elif args.commits:
    # Get files affected by specified number of commits.
    command = ['git', 'diff', '--name-only', 'HEAD~' + args.commits]
    files = [f.decode('utf-8')
             for f in subprocess.check_output(command).splitlines()]
    # Filter files
    files = list(filter(lambda f: os.path.exists(f) and isAppropriateFile(f) and not isGeneratedFile(f), files))
elif args.open:
    # Unstaged changes.
    command = ['git', 'diff', '--name-only']
    files = [f.decode('utf-8')
             for f in subprocess.check_output(command).splitlines()]
    # Filter files.
    files = list(filter(lambda f: isAppropriateFile(f) and not isGeneratedFile(f), files))
    # Staged changes.
    command = ['git', 'diff', '--staged', '--name-only']
    files_staged = [f.decode('utf-8')
                    for f in subprocess.check_output(command).splitlines()]
    # Filter files.
    files_staged = list(filter(lambda f: isAppropriateFile(f) and not isGeneratedFile(f), files_staged))
    for f in files_staged:
        if f not in files:
            files.append(f)
else:
    # By default, scan dissectors directory
    folder = os.path.join('epan', 'dissectors')
    # But override it with any folder specified on the command line.
    if args.folder:
        folder = args.folder
        if not os.path.isdir(folder):
            print('Folder', folder, 'not found!')
            exit(1)

    # Find files from folder.
    print('Looking for files in', folder)
    files = findFilesInFolder(folder, not args.no_recurse)


# If scanning a subset of files, list them here.
print('Examining:')
if args.file or args.folder or args.commits or args.open:
    if files:
        print(' '.join(files), '\n')
    else:
        print('No files to check.\n')
else:
    print('All dissector modules\n')


# Now check the chosen files.
for f in files:
    # Jump out if control-C has been pressed.
    if should_exit:
        exit(1)
    checkFile(f)



# Show the most commonly not-recognised words.
print('')
counter = Counter(missing_words).most_common(100)
if len(counter) > 0:
    for c in counter:
        print(c[0], ':', c[1])

# Show error count.
print('\n' + bcolors.BOLD + str(len(missing_words)) + ' issues found' + bcolors.ENDC + '\n')