#!/usr/bin/env python3
# Wireshark - Network traffic analyzer
# By Gerald Combs <gerald@wireshark.org>
# Copyright 1998 Gerald Combs
#
# SPDX-License-Identifier: GPL-2.0-or-later

import os
import re
import subprocess
import argparse
import signal
from collections import Counter

# Looks for spelling errors among strings found in source or documentation files.

# TODO: check structured doxygen comments?

# For text colouring/highlighting.
class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    ADDED = '\033[45m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'


# Try to exit soon after Ctrl-C is pressed.
should_exit = False

def signal_handler(sig, frame):
    """SIGINT handler: set a flag so the scanning loops can exit promptly."""
    global should_exit
    should_exit = True
    print('You pressed Ctrl+C - exiting')

signal.signal(signal.SIGINT, signal_handler)



# Create spellchecker, and augment with some Wireshark words.
from spellchecker import SpellChecker
# Set up our dict with words from text file.
spell = SpellChecker()
spell.word_frequency.load_text_file('./tools/wireshark_words.txt')


# Track words that were not found.
missing_words = []


# Translation table mapping punctuation/format characters to spaces.
# Built once at module level: one C-level translate() pass per string
# instead of ~30 chained str.replace() calls.
_PUNCT_TO_SPACE = str.maketrans({c: ' ' for c in '.,`:;"\\+|()[]{}<>_-/!?=*%#&@$\''})


def camelCaseSplit(identifier):
    """Split a camelCase identifier into its component words."""
    matches = re.finditer(r'.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', identifier)
    return [m.group(0) for m in matches]


# A File object contains all of the strings to be checked for a given file.
class File:
    def __init__(self, file):
        self.file = file
        self.values = []

        filename, extension = os.path.splitext(file)
        # Only .c/.cpp files are treated as code (only their string
        # literals are checked; other files are checked word-by-word).
        self.code_file = extension in {'.c', '.cpp'}

        with open(file, 'r') as f:
            contents = f.read()

        if self.code_file:
            # Remove comments so as not to trip up RE.
            contents = removeComments(contents)

        # Find protocol name and add to dict.
        # N.B. doesn't work when a variable is used instead of a literal for the protocol name...
        matches = re.finditer(r'proto_register_protocol\s*\([\n\r\s]*\"(.*)\",[\n\r\s]*\"(.*)\",[\n\r\s]*\"(.*)\"', contents)
        for m in matches:
            # Group 3 is the short (filter) protocol name.
            protocol = m.group(3)
            # Add to dict so the protocol name is not reported as a
            # spelling error. (A discarded spell.known() query that the
            # original code also made here has been removed as dead code.)
            spell.word_frequency.load_words([protocol])
            print('Protocol is: ' + bcolors.BOLD + protocol + bcolors.ENDC)

    # Add a string found in this file.
    def add(self, value):
        self.values.append(value)

    # Whole word is not recognised, but is it 2 words concatenated (without camelcase) ?
    def checkMultiWords(self, word):
        if len(word) < 6:
            return False

        # Don't consider if mixed cases.
        if not (word.islower() or word.isupper()):
            # But make an exception if only the first letter is uppercase..
            if not word == (word[0].upper() + word[1:]):
                return False

        # Try splitting into 2 words recognised at various points.
        # Allow 3-letter words.
        length = len(word)
        for idx in range(3, length - 3):
            word1 = word[0:idx]
            word2 = word[idx:]

            if not spell.unknown([word1, word2]):
                return True

        return self.checkMultiWordsRecursive(word)

    # Return True if word can be split entirely into a sequence of
    # recognised words, each at least 4 characters long.
    def checkMultiWordsRecursive(self, word):
        length = len(word)
        if length < 4:
            return False

        for idx in range(4, length + 1):
            w = word[0:idx]
            if not spell.unknown([w]):
                if idx == length:
                    # Reached the end of the word, so everything matched.
                    return True
                elif self.checkMultiWordsRecursive(word[idx:]):
                    return True

        return False

    # Check the spelling of all the words we have found.
    def spellCheck(self):

        num_values = len(self.values)
        this_value = 0
        for v in self.values:
            if should_exit:
                exit(1)

            this_value += 1

            # Ignore includes.
            if v.endswith('.h'):
                continue

            # Store original (as want to include for context in error report).
            original = str(v)

            # Eliminate common format specifiers FIRST. The original code
            # did this after replacing '%' with a space, so '%u'/'%d'/'%s'
            # could never match and stray letters were left behind.
            v = v.replace('%u', '')
            v = v.replace('%d', '')
            v = v.replace('%s', '')

            # Replace most punctuation with spaces, in a single pass.
            v = v.translate(_PUNCT_TO_SPACE)

            # Split into words.
            value_words = v.split()
            # Further split up any camelCase words.
            words = []
            for w in value_words:
                words += camelCaseSplit(w)

            # Check each word within this string in turn.
            for word in words:
                # Strip trailing digits from word.
                word = word.rstrip('1234567890')

                # Quote marks found in some of the docs...
                word = word.replace('“', '')
                word = word.replace('”', '')

                if len(word) > 4 and spell.unknown([word]) and not self.checkMultiWords(word):
                    print(self.file, this_value, '/', num_values, '"' + original + '"',
                          bcolors.FAIL + word + bcolors.ENDC, ' -> ', '?')
                    # TODO: this can be interesting, but takes too long!
                    # bcolors.OKGREEN + spell.correction(word) + bcolors.ENDC
                    global missing_words
                    missing_words.append(word)


def removeWhitespaceControl(code_string):
    """Replace escaped whitespace sequences (literal \\n, \\r, \\t) with spaces."""
    code_string = code_string.replace('\\n', ' ')
    code_string = code_string.replace('\\r', ' ')
    code_string = code_string.replace('\\t', ' ')
    return code_string


# Remove any contractions from the given string.
def removeContractions(code_string):
    contractions = [ "wireshark’s", "don’t", "let’s", "isn’t", "won’t", "user’s", "hasn’t", "you’re", "o’clock", "you’ll",
                     "you’d", "developer’s", "doesn’t", "what’s", "let’s", "haven’t", "can’t", "you’ve",
                     "shouldn’t", "didn’t", "wouldn’t", "aren’t", "there’s", "packet’s", "couldn’t", "world’s",
                     "needn’t", "graph’s", "table’s", "parent’s", "entity’s", "server’s", "node’s",
                     "querier’s", "sender’s", "receiver’s", "computer’s", "frame’s", "vendor’s", "system’s"]
    for c in contractions:
        # Handle leading capital and both apostrophe styles.
        code_string = code_string.replace(c, "")
        code_string = code_string.replace(c.capitalize(), "")
        code_string = code_string.replace(c.replace('’', "'"), "")
        code_string = code_string.replace(c.capitalize().replace('’', "'"), "")
    return code_string


def removeComments(code_string):
    """Strip C-style comments (and //-comments at the start of the text) from code."""
    code_string = re.sub(re.compile(r"/\*.*?\*/", re.DOTALL), "", code_string)  # C-style comment
    # Remove this for now as can get tripped up if see htpps://www.... within a string!
    # NOTE(review): without re.MULTILINE this only matches at the very start
    # of the string — presumably deliberate given the warning above; confirm.
    code_string = re.sub(re.compile(r"^\s*//.*?\n"), "", code_string)  # C++-style comment
    return code_string


def removeSingleQuotes(code_string):
    """Remove embedded/escaped quote constructs that would confuse string extraction."""
    code_string = code_string.replace('\\\\', " ")  # Separate at \\
    code_string = code_string.replace('\"\\\\\"', "")
    code_string = code_string.replace("\\\"", " ")
    code_string = code_string.replace("'\"'", "")
    code_string = code_string.replace('…', ' ')
    return code_string


def removeHexSpecifiers(code_string):
    """Remove all hex numbers (0x...) from the given string."""
    # Single linear-time substitution; the original searched and globally
    # replaced one distinct hex literal per loop iteration (quadratic on
    # files with many hex constants), with the same end result.
    return re.sub(r'0x[0-9a-fA-F]*', '', code_string)


# Create a File object that knows about all of the strings in the given file.
def findStrings(filename):
    with open(filename, 'r') as f:
        contents = f.read()

    # Remove comments & embedded quotes so as not to trip up RE.
    contents = removeContractions(contents)
    contents = removeWhitespaceControl(contents)
    contents = removeSingleQuotes(contents)
    contents = removeHexSpecifiers(contents)

    # Create file object.
    file = File(filename)

    # What we check depends upon file type.
    if file.code_file:
        contents = removeComments(contents)
        # Code so only checking strings.
        matches = re.finditer(r'\"([^\"]*)\"', contents)
        for m in matches:
            file.add(m.group(1))
    else:
        # A documentation file, so examine all words.
        for w in contents.split():
            file.add(w)

    return file


# Test for whether the given file was automatically generated.
def isGeneratedFile(filename):
    if not filename.endswith('.c'):
        return False

    # The comment to say that it's generated is near the top, so give up
    # once we get a few lines down. Context manager guarantees the file is
    # closed on every exit path (the original closed it by hand three times).
    with open(filename, 'r') as f_read:
        lines_tested = 0
        for line in f_read:
            if lines_tested > 10:
                return False
            if (line.find('Generated automatically') != -1 or
                line.find('Autogenerated from') != -1 or
                line.find('is autogenerated') != -1 or
                line.find('automatically generated by Pidl') != -1 or
                line.find('Created by: The Qt Meta Object Compiler') != -1 or
                line.find('This file was generated') != -1 or
                line.find('This filter was automatically generated') != -1):

                return True
            lines_tested += 1

    # OK, looks like a hand-written file!
    return False


def isAppropriateFile(filename):
    """Return True for source/documentation file types that should be checked."""
    file, extension = os.path.splitext(filename)
    return extension in {'.adoc', '.c', '.cpp', '.pod', '.nsi'} or file.endswith('README')


def findFilesInFolder(folder, recursive=True):
    """Collect appropriate, non-generated files from folder (recursing by default)."""
    files_to_check = []

    if recursive:
        for root, subfolders, files in os.walk(folder):
            for f in files:
                if should_exit:
                    # Return what we have so far rather than None: the
                    # original bare 'return' made the caller iterate over
                    # None and crash on Ctrl-C.
                    return files_to_check
                f = os.path.join(root, f)
                if isAppropriateFile(f) and not isGeneratedFile(f):
                    files_to_check.append(f)
    else:
        for f in sorted(os.listdir(folder)):
            f = os.path.join(folder, f)
            if isAppropriateFile(f) and not isGeneratedFile(f):
                files_to_check.append(f)

    return files_to_check


# Check the given file.
def checkFile(filename):
    # Check file exists - e.g. may have been deleted in a recent commit.
    if not os.path.exists(filename):
        print(filename, 'does not exist!')
        return

    file = findStrings(filename)
    file.spellCheck()



#################################################################
# Main logic.

# command-line args. Controls which files should be checked.
# If no args given, will just scan epan/dissectors folder.
parser = argparse.ArgumentParser(description='Check spellings in specified files')
parser.add_argument('--file', action='store', default='',
                    help='specify individual file to test')
parser.add_argument('--folder', action='store', default='',
                    help='specify folder to test')
parser.add_argument('--no-recurse', action='store_true', default=False,
                    help='do not recurse inside chosen folder')
parser.add_argument('--commits', action='store',
                    help='last N commits to check')
parser.add_argument('--open', action='store_true',
                    help='check open files')

args = parser.parse_args()


# Get files from wherever command-line args indicate.
files = []
if args.file:
    # Add single specified file..
    if not os.path.isfile(args.file):
        print('Chosen file', args.file, 'does not exist.')
        exit(1)
    else:
        files.append(args.file)
elif args.commits:
    # Get files affected by specified number of commits.
    command = ['git', 'diff', '--name-only', 'HEAD~' + args.commits]
    files = [f.decode('utf-8')
             for f in subprocess.check_output(command).splitlines()]
    # Filter files.
    files = list(filter(lambda f: os.path.exists(f) and isAppropriateFile(f) and not isGeneratedFile(f), files))
elif args.open:
    # Unstaged changes.
    command = ['git', 'diff', '--name-only']
    files = [f.decode('utf-8')
             for f in subprocess.check_output(command).splitlines()]
    # Filter files.
    files = list(filter(lambda f: isAppropriateFile(f) and not isGeneratedFile(f), files))
    # Staged changes.
    command = ['git', 'diff', '--staged', '--name-only']
    files_staged = [f.decode('utf-8')
                    for f in subprocess.check_output(command).splitlines()]
    # Filter files.
    files_staged = list(filter(lambda f: isAppropriateFile(f) and not isGeneratedFile(f), files_staged))
    for f in files_staged:
        if f not in files:
            files.append(f)
else:
    # By default, scan dissectors directory
    folder = os.path.join('epan', 'dissectors')
    # But overwrite with any folder entry.
    if args.folder:
        folder = args.folder
        if not os.path.isdir(folder):
            print('Folder', folder, 'not found!')
            exit(1)

    # Find files from folder.
    print('Looking for files in', folder)
    files = findFilesInFolder(folder, not args.no_recurse)


# If scanning a subset of files, list them here.
print('Examining:')
if args.file or args.folder or args.commits or args.open:
    if files:
        print(' '.join(files), '\n')
    else:
        print('No files to check.\n')
else:
    print('All dissector modules\n')


# Now check the chosen files.
for f in files:
    # Jump out if control-C has been pressed.
    if should_exit:
        exit(1)
    checkFile(f)



# Show the most commonly not-recognised words.
print('')
counter = Counter(missing_words).most_common(100)
if len(counter) > 0:
    for c in counter:
        print(c[0], ':', c[1])

# Show error count.
print('\n' + bcolors.BOLD + str(len(missing_words)) + ' issues found' + bcolors.ENDC + '\n')