1#!/usr/local/bin/python3.8
2
"""flawfinder: Find potential security flaws ("hits") in source code.
 Usage:
   flawfinder [options] [source_code_file]+

 See the man page for a description of the options."""

# The string above has to be the first statement of the module for Python
# to store it as the module docstring (__doc__); a __future__ import is
# still legal directly after a module docstring.
from __future__ import division

# Version of this program.
version="1.31"
12
13# The default output is as follows:
14# filename:line_number [risk_level] (type) function_name: message
15#   where "risk_level" goes from 0 to 5. 0=no risk, 5=maximum risk.
16# The final output is sorted by risk level, most risky first.
17# Optionally ":column_number" can be added after the line number.
18#
19# Currently this program can only analyze C/C++ code.
20#
21# Copyright (C) 2001-2014 David A. Wheeler.
22# This is released under the
23# GNU General Public License (GPL) version 2 or later (GPLv2+):
24#
25#    This program is free software; you can redistribute it and/or modify
26#    it under the terms of the GNU General Public License as published by
27#    the Free Software Foundation; either version 2 of the License, or
28#    (at your option) any later version.
29#
30#    This program is distributed in the hope that it will be useful,
31#    but WITHOUT ANY WARRANTY; without even the implied warranty of
32#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
33#    GNU General Public License for more details.
34#
35#    You should have received a copy of the GNU General Public License
36#    along with this program; if not, write to the Free Software
37#    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
38
39# The Python developers did a *terrible* job when they transitioned
40# to Python version 3, as I have documented elsewhere.  What's more,
41# many mechanisms that do exist do not come natively with Python 2, or
42# require version 2.6 or later (yet older versions are still in use).
43# For example, this requires Python version 2.6:
44# from __future__ import print_function
45# As a result, many Python programs (including this one) do not use Python 3.
46# The solution used here is to gradually transition the Python code
47# to Python 2 code that works across 2.2 through 2.7.  A preference is
48# given to code constructs that would ALSO work in version 3.X, but
49# only if they would *already* work in Python 2, and only if they don't
50# make the code too complicated.
51
52# The plan is to eventually switch this code so that it runs unchanged
53# on both 2.X and 3.X, but that is hard to support if the version number
54# is less than 2.6, so we won't do that at this time.  Instead, we'll
55# wait until versions lower than 2.6 are a distant memory, and then
56# start in that direction.  Hopefully by then the developers of Python
57# will begin to make it easy to transition to newer versions of Python.
58
59import sys, re, string, getopt
60import pickle               # To support load/save/diff of hitlist
61import os, glob, operator   # To support filename expansion on Windows
62import os.path
63import time
64# import formatter
65
66# Program Options - these are the default values:
show_context = 0         # If true, also print each hit's context text.
minimum_level = 1        # Hits with a lower risk level are not recorded.
show_immediately = 0     # If true, show each hit as soon as it is recorded.
show_inputs = 0          # Only show inputs?
falsepositive = 0        # Work to remove false positives?
allowlink = 0            # Allow symbolic links?
skipdotdir = 1           # If 1, don't recurse into dirs beginning with "."
                         # Note: This doesn't affect the command line.
num_links_skipped = 0    # Number of links skipped.
num_dotdirs_skipped = 0  # Number of dotdirs skipped.
show_columns = 0         # If true, show the column after the line number.
never_ignore = 0         # If true, NEVER ignore problems, even if directed.
list_rules = 0           # If true, list the rules (helpful for debugging)
patch_file = ""          # File containing (unified) diff output.
# The next three presumably name the files used to load, save, and diff a
# hitlist (see the pickle import above) - TODO confirm against the
# option-processing code.
loadhitlist = None
savehitlist = None
diffhitlist = None
quiet = 0
showheading = 1          # --dataonly turns this off
output_format = 0        # 0 = normal, 1 = html.
single_line = 0          # 1 = singleline (can't be 0 if html)
omit_time = 0            # 1 = omit time-to-run (needed for testing)
required_regex = None    # If non-None, regex that must be met to report
required_regex_compiled = None  # Compiled form; searched in hit warnings.

displayed_header = 0    # Have we displayed the header yet?
num_ignored_hits = 0    # Number of ignored hits (used if never_ignore==0)
94
def error(message):
  """Write message to standard error as one line prefixed with "Error: "."""
  line = "Error: %s\n" % message
  sys.stderr.write(line)
97
98
99# Support routines: find a pattern.
100# To simplify the calling convention, several global variables are used
101# and these support routines are defined, in an attempt to make the
102# actual calls simpler and clearer.
103#
104
filename = ""      # Source filename.
linenumber = 0     # Linenumber from original file.
ignoreline = -1    # Line number to ignore; hits found while linenumber
                   # equals this are counted as ignored, not recorded.
sumlines = 0       # Number of lines (total) examined.
sloc = 0           # Physical SLOC
starttime = time.time()  # Used to determine analyzed lines/second.


# Matches the (empty) position at the beginning of every line of a string.
line_beginning = re.compile( r'(?m)^' )
# Matches a line that consists only of whitespace (at least one character).
blank_line     = re.compile( r'(?m)^\s+$' )
115
116# Send warning message.  This is written this way to work on
117# Python version 2.5 through Python 3.
def print_warning(message):
  """Write "Warning: " + message + newline to standard error, then flush."""
  stream = sys.stderr
  # Written piece by piece (not concatenated) so this works the same way
  # on every supported Python version.
  for piece in ("Warning: ", message, "\n"):
    stream.write(piece)
  stream.flush()
123
124# The following code accepts unified diff format from both subversion (svn)
125# and GNU diff, which aren't well-documented.  It gets filenames from
126# "Index:" if exists, else from the "+++ FILENAME ..." entry.
127# Note that this is different than some tools (which will use "+++" in
128# preference to "Index:"), but subversion's nonstandard format is easier
129# to handle this way.
130# Since they aren't well-documented, here's some info on the diff formats:
131# GNU diff format:
132#    --- OLDFILENAME OLDTIMESTAMP
133#    +++ NEWFILENAME NEWTIMESTAMP
134#    @@ -OLDSTART,OLDLENGTH +NEWSTART,NEWLENGTH @@
#    ... Changes where preceding "+" is add, "-" is remove, " " is unchanged.
136#
137#    ",OLDLENGTH" and ",NEWLENGTH" are optional  (they default to 1).
138#    GNU unified diff format doesn't normally output "Index:"; you use
139#    the "+++/---" to find them (presuming the diff user hasn't used --label
140#    to mess it up).
141#
142# Subversion format:
143#    Index: FILENAME
144#    --- OLDFILENAME (comment)
145#    +++ NEWFILENAME (comment)
146#    @@ -OLDSTART,OLDLENGTH +NEWSTART,NEWLENGTH @@
147#
148#    In subversion, the "Index:" always occurs, and note that paren'ed
149#    comments are in the oldfilename/newfilename, NOT timestamps like
150#    everyone else.
151#
152# Git format:
153#    diff --git a/junk.c b/junk.c
154#    index 03d668d..5b005a1 100644
155#    --- a/junk.c
156#    +++ b/junk.c
157#    @@ -6,4 +6,5 @@ main() {
158#
159# Single Unix Spec version 3 (http://www.unix.org/single_unix_specification/)
160# does not specify unified format at all; it only defines the older
161# (obsolete) context diff format.  That format DOES use "Index:", but
162# only when the filename isn't specified otherwise.
163# We're only supporting unified format directly; if you have an older diff
164# format, use "patch" to apply it, and then use "diff -u" to create a
165# unified format.
166#
167
# Patterns that recognize the individual lines of a unified diff.
# Each filename pattern captures the name in the named group "filename".
diff_index_filename = re.compile( r'^Index:\s+(?P<filename>.*)' )
diff_git_filename = re.compile( r'^diff --git a/.* b/(?P<filename>.*)$' )
diff_newfile = re.compile( r'^\+\+\+\s(?P<filename>.*)$' )
# Hunk header; "linenumber" is the starting line in the NEW file.
# Git appends context (such as the enclosing function) after the closing
# "@@", e.g. "@@ -6,4 +6,5 @@ main() {", so the pattern is deliberately
# not anchored at the end of the line.
diff_hunk = re.compile( r'^@@ -\d+(,\d+)?\s+\+(?P<linenumber>\d+)[, ].*@@' )
diff_line_added = re.compile( r'^\+[^+].*' )
diff_line_del = re.compile( r'^-[^-].*' )
# The "+++" newfile entries have the filename, followed by a timestamp
# or " (comment)" appended.
# Timestamps can be of these forms:
#   2005-04-24 14:21:39.000000000 -0400
#   Mon Mar 10 15:13:12 1997
# Also, "newfile" can have " (comment)" appended.  Find and eliminate this.
# Note that the expression below is Y10K (and Y100K) ready. :-).
# The month name starts with a letter: the class is [A-Za-z], not the
# range "A-z", which would also accept punctuation such as "_" and "[".
diff_findjunk = re.compile( r'^(?P<filename>.*)((\s\d\d\d\d+-\d\d-\d\d\s+\d\d:\d[0-9:.]+Z?(\s+[\-\+0-9A-Z]+)?)|(\s[A-Za-z][a-z]+\s[A-Za-z][a-z]+\s\d+\s\d+:\d[0-9:.]+Z?(\s[\-\+0-9]*)?\s\d\d\d\d+)|(\s\(.*\)))\s*$')
182
def is_svn_diff(sLine):
  """Return True if sLine contains "Index:" anywhere, else False."""
  return 'Index:' in sLine
187
def is_gnu_diff(sLine):
  """Return True if sLine begins with "--- ", else False."""
  return sLine.startswith('--- ')
192
def is_git_diff(sLine):
  """Return True if sLine begins with "diff --git a", else False."""
  return sLine.startswith('diff --git a')
197
def svn_diff_get_filename(sLine):
  """Match sLine against the "Index: FILENAME" pattern.

  Returns the match object (its "filename" group holds the name), or
  None if sLine does not start with an "Index:" entry."""
  index_match = diff_index_filename.match(sLine)
  return index_match
200
def gnu_diff_get_filename(sLine):
  """Extract the new file's name from a "+++ FILENAME ..." diff line.

  Returns the diff_findjunk match object, whose "filename" group is the
  name with the trailing timestamp or "(comment)" removed.  Returns None
  if sLine is not a "+++" line, or if diff_findjunk does not match the
  text after "+++" (it requires a trailing timestamp or comment)."""
  newfile_match = diff_newfile.match(sLine)
  if not newfile_match:
    return None
  # Use the str method: the string.strip() function no longer exists in
  # Python 3, while the method works on every Python version.
  patched_filename = newfile_match.group('filename').strip()
  # Clean up filename - remove trailing timestamp and/or (comment).
  return diff_findjunk.match(patched_filename)
208
# Presumably the separator between the "a/..." and "b/..." paths of a
# "diff --git" line, and its length - TODO confirm where these are used.
git_splitter=' b/'
len_git_splitter=len(git_splitter)
211
def git_diff_get_filename(sLine):
  """Match sLine against the "diff --git a/... b/FILENAME" pattern.

  Returns the match object (its "filename" group holds the name), or
  None if sLine is not a "diff --git" line."""
  git_match = diff_git_filename.match(sLine)
  return git_match
214
215# For each file found in the file patch_file, keep the
216# line numbers of the new file (after patch is applied) which are added.
217# We keep this information in a hash table for a quick access later.
218#
def load_patch_info(patch_file):
  """Read a unified diff and record which lines of the new files it adds.

  patch_file: name of a file containing svn, git, or GNU unified diff
  output.  The format is guessed from the first line only.

  Returns a dictionary keyed by patched filename.  Each value is a
  dictionary whose keys are line numbers in the new file (the file as it
  is after the patch is applied): every added line, plus the line just
  above and the line just below each added line.

  Exits the program with status 1 if the file cannot be opened, if the
  format is not recognized, if a filename occurs more than once, or if a
  hunk header is seen before any filename."""
  patch = {}
  line_counter = 0       # New-file lines seen so far in the current hunk.
  initial_number = 0     # New-file line number where the current hunk starts.
  patched_filename = ""  # Name of new file patched by current hunk.

  try:
    hPatch = open(patch_file, 'r')
  except (IOError, OSError):
    # Written with sys.stdout.write (not a print statement) so that this
    # works on both Python 2 and Python 3; the text is unchanged.
    sys.stdout.write("Error: failed to open %s\n" % h(patch_file))
    sys.exit(1)

  try:
    sLine = hPatch.readline()
    # Heuristic to determine if it's a svn diff, git diff, or a GNU diff.
    if is_svn_diff(sLine):
      fn_get_filename = svn_diff_get_filename
    elif is_git_diff(sLine):
      fn_get_filename = git_diff_get_filename
    elif is_gnu_diff(sLine):
      fn_get_filename = gnu_diff_get_filename
    else:
      sys.stdout.write("Error: Unrecognized patch format\n")
      sys.exit(1)

    while sLine:  # readline() returns '' only at end of file.
      filename_match = fn_get_filename(sLine)
      if filename_match:
        patched_filename = filename_match.group('filename').strip()
        if patched_filename in patch:
          error("filename occurs more than once in the patch: %s" %
                 patched_filename)
          sys.exit(1)
        patch[patched_filename] = {}
      else:
        hunk_match = diff_hunk.match(sLine)
        if hunk_match:
          if patched_filename == "":
            error("wrong type of patch file : we have a line number without having seen a filename")
            sys.exit(1)
          initial_number = int(hunk_match.group('linenumber'))
          line_counter = 0
        elif diff_line_added.match(sLine):
          line_added = line_counter + initial_number
          added_lines = patch[patched_filename]
          added_lines[line_added] = True
          # Let's also warn about the lines above and below this one,
          # so that errors that "leak" into adjacent lines are caught.
          # Besides, if you're creating a patch, you had to at least look
          # at adjacent lines, so you're in a position to fix them.
          added_lines[line_added - 1] = True
          added_lines[line_added + 1] = True
          line_counter += 1
        elif diff_line_del.match(sLine) is None:
          # Anything that is not a deletion also exists in the new file.
          line_counter += 1
      sLine = hPatch.readline()
  finally:
    # Runs on normal completion and on sys.exit(), so the file is
    # always closed.
    hPatch.close()

  return patch
285
286
def htmlize(s):
  """Take s, and return legal HTML text: "&", "<" and ">" are escaped."""
  # "&" has to be replaced first; otherwise the "&" of the "&lt;" and
  # "&gt;" entities produced below would be escaped a second time.
  # str.replace is used because string.replace() was removed in Python 3.
  return s.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
293
def h(s):
  """Return htmlize(s) if HTML output is selected, otherwise s unchanged."""
  if not output_format:
    return s
  return htmlize(s)
298
def print_multi_line(text):
  """Print text on standard output as multiple indented lines.

  The words of text (split on whitespace) are separated by single spaces,
  every line is indented by two spaces, and a new line is started when
  adding the next word would reach column 78.  No newline is written
  after the last line; the caller ends it."""
  # This builds the same characters the former Python 2 print statements
  # produced, but with sys.stdout.write so that it also runs on Python 3.
  width = 78
  prefix = " "
  starting_position = len(prefix) + 1
  pieces = [prefix]
  position = starting_position
  for w in text.split():
    if len(w) + position >= width:
      pieces.append("\n" + prefix)
      position = starting_position
    # Each word is preceded by one space, which is why a line starts at
    # position len(prefix) + 1.
    pieces.append(" " + w)
    position = position + len(w) + 1
  sys.stdout.write("".join(pieces))
315
# This matches references to CWE identifiers, so we can HTMLize them.
# We don't refer to CWEs with one digit, so we'll only match on 2+ digits.
link_cwe_pattern = re.compile(r'(CWE-([1-9][0-9]+))([,()])')

class Hit:
  """
  Each instance of Hit is a warning of some kind in a source code file.
  See the rulesets, which define the conditions for triggering a hit.
  Hit is initialized with a tuple containing the following:
    hook: function to call when function name found.
    level: (default) warning level, 0-5. 0=no problem, 5=very risky.
    warning: warning (text saying what's the problem)
    suggestion: suggestion (text suggesting what to do instead)
    category: One of "buffer" (buffer overflow), "race" (race condition),
              "tmpfile" (temporary file creation), "format" (format string).
              Use "" if you don't have a better category.
    url: URL fragment reference.
    other:  A dictionary with other settings.

  Other settings usually set:

    name: function name
    parameter: the function parameters (0th parameter null)
    input: set to 1 if the function inputs from external sources.
    start: start position (index) of the function name (in text)
    end:  end position of the function name (in text)
    filename: name of file
    line: line number in file
    column: column in line in file
    context_text: text surrounding hit

  Hits compare and sort by risk level (highest first), then filename,
  line, column, and name.  Two hits that agree on all five are equal.
  As a consequence of defining equality, hits are not hashable."""

  # Set default values:
  source_position = 2 # By default, the second parameter is the source.
  format_position = 1 # By default, the first parameter is the format.
  input = 0           # By default, this doesn't read input.
  note = ""          # No additional notes.
  filename = ""      # Empty string is filename.
  extract_lookahead = 0 # Normally don't extract lookahead.

  def __init__(self, data):
    hook, level, warning, suggestion, category, url, other = data
    self.hook, self.level = hook, level
    self.warning, self.suggestion = warning, suggestion
    self.category, self.url = category, url
    # These will be set later, but I set them here so that
    # analysis tools like PyChecker will know about them.
    self.column = 0
    self.line = 0
    self.name = ""
    self.context_text = ""
    # Every entry of "other" becomes an attribute, overriding the defaults.
    for key, value in other.items():
      setattr(self, key, value)

  def _ordering(self, other):
    # Return -1, 0, or 1 as self sorts before, equal to, or after other.
    # The levels are swapped between the two tuples so that the highest
    # risk sorts first; all other fields sort in ascending order.
    mine = (other.level, self.filename, self.line, self.column, self.name)
    theirs = (self.level, other.filename, other.line, other.column,
              other.name)
    return (mine > theirs) - (mine < theirs)

  def __cmp__(self, other):
    # Kept for Python 2 callers.  Python 3 has neither cmp() nor __cmp__,
    # which is why the rich comparison methods below are defined as well.
    return self._ordering(other)

  def __eq__(self, other):
    if not isinstance(other, Hit): return NotImplemented
    return self._ordering(other) == 0

  def __ne__(self, other):
    if not isinstance(other, Hit): return NotImplemented
    return self._ordering(other) != 0

  def __lt__(self, other):
    if not isinstance(other, Hit): return NotImplemented
    return self._ordering(other) < 0

  def __le__(self, other):
    if not isinstance(other, Hit): return NotImplemented
    return self._ordering(other) <= 0

  def __gt__(self, other):
    if not isinstance(other, Hit): return NotImplemented
    return self._ordering(other) > 0

  def __ge__(self, other):
    if not isinstance(other, Hit): return NotImplemented
    return self._ordering(other) >= 0

  def __getitem__(self, X):   # Define this so this works: "%(line)" % hit
    return getattr(self, X)

  def show(self):
    """Print this hit on standard output, honoring the output options.

    The text is the same as the former Python 2 print statements wrote;
    it is assembled and written with sys.stdout.write so that it also
    runs on Python 3."""
    write = sys.stdout.write
    if show_columns: position = ":%(line)s:%(column)s:" % self
    else:            position = ":%(line)s:" % self
    # The filename directly follows "<li>" and is directly followed by
    # the position; every later field is separated by one space.
    heading = h(self.filename) + position
    if output_format: heading = "<li>" + heading
    fields = [heading]
    if output_format: fields.append("<b>")
    # Extra space before risk level in text, makes it easier to find:
    fields.append(" [%(level)s]" % self)
    if output_format: fields.append("</b>")
    fields.append("(%(category)s)" % self)
    if output_format: fields.append("<i>")
    fields.append(h("%(name)s:" % self))
    main_text = h("%(warning)s. " % self)
    if output_format:  # Create HTML link to CWE definitions
      main_text = link_cwe_pattern.sub(
          r'<a href="http://cwe.mitre.org/data/definitions/\2.html">\1</a>\3',
          main_text)
    if single_line:
      fields.append(main_text)
      if self.suggestion: fields.append(h(self.suggestion)+".")
      fields.append(h(self.note))
      if output_format: fields.append("</i>")
      write(" ".join(fields) + "\n")
    else:
      if self.suggestion: main_text = main_text + h(self.suggestion) + ". "
      main_text = main_text + h(self.note)
      write(" ".join(fields) + "\n")
      print_multi_line(main_text)
      # print_multi_line does not end its last line.
      if output_format: write(" </i>")
      write("\n")
    if show_context:
      if output_format: write("<pre>\n")
      write(h(self.context_text) + "\n")
      if output_format: write("</pre>\n")
413
414
415
416# The "hitlist" is the list of all hits (warnings) found so far.
417# Use add_warning to add to it.
418
hitlist = []

def add_warning(hit):
  """Record hit in the global hitlist unless the options filter it out.

  A hit is dropped when only inputs are shown and it is not an input,
  when a required regex is set and does not match the hit's warning,
  or when its level is below minimum_level.  A hit found while
  linenumber equals ignoreline is only counted in num_ignored_hits.
  A recorded hit is shown at once if show_immediately is set."""
  global hitlist, num_ignored_hits
  if show_inputs and not hit.input:
    return
  if required_regex and required_regex_compiled.search(hit.warning) is None:
    return
  if not (hit.level >= minimum_level):
    return
  if linenumber == ignoreline:
    num_ignored_hits += 1
    return
  hitlist.append(hit)
  if show_immediately:
    hit.show()
433
def internal_warn(message):
  """Print message (HTML-escaped if producing HTML) as a line on stdout."""
  # sys.stdout.write instead of a print statement, so that this works on
  # both Python 2 and Python 3.
  sys.stdout.write(h(message) + "\n")
436
437# C Language Specific
438
def _clean_c_parameter(text):
  # Remove trailing whitespace and backslash-newlines, then any remaining
  # surrounding whitespace, from the raw text of one parameter.
  return p_trailingbackslashes.sub('', text).strip()

def extract_c_parameters(text, pos=0):
  """Return a list of the given C function's parameters, starting at text[pos]

  '(a,b)' produces ['', 'a', 'b']: entry 0 is always the empty string, so
  that the first parameter is entry 1.

  Returns [] if the first non-whitespace character at or after pos is
  not "(".  If a ";" is reached before the closing ")", a warning is
  printed and the parameters found so far are returned.  If the text ends
  before the list is closed, a warning is printed and None is returned."""
  i = pos
  # Skip whitespace and find the "("; if there isn't one, return []:
  while i < len(text):
    if text[i] == '(':                 break
    elif text[i] in string.whitespace: i = i + 1
    else:                              return []
  else:  # Never found a reasonable ending.
    return []
  i = i + 1
  parameters = [""]  # Insert 0th entry, so 1st parameter is parameter[1].
  currentstart = i
  parenlevel = 1
  instring = 0  # 1=in double-quote, 2=in single-quote
  incomment = 0
  while i < len(text):
    c = text[i]
    if instring:
      if c == '"' and instring == 1: instring = 0
      elif c == "'" and instring == 2: instring = 0
      # if \, skip next character too.  The C/C++ rules for
      # \ are actually more complex, supporting \ooo octal and
      # \xhh hexadecimal (which can be shortened), but we don't need to
      # parse that deeply, we just need to know we'll stay in string mode:
      elif c == '\\': i = i + 1
    elif incomment:
      if c == '*' and text[i:i+2]=='*/':
        incomment = 0
        i = i + 1
    else:
      if c == '"': instring = 1
      elif c == "'": instring = 2
      elif c == '/' and text[i:i+2]=='/*':
         incomment = 1
         i = i + 1
      elif c == '/' and text[i:i+2]=='//':
         # Skip the rest of a "//" comment, up to the end of the line.
         while i < len(text) and text[i] != "\n":
           i = i + 1
      elif c == '\\' and text[i:i+2]=='\\"': i = i + 1 # Handle exposed '\"'
      elif c == '(': parenlevel = parenlevel + 1
      elif c == ',' and (parenlevel == 1):
        # Only a comma at the outermost level separates parameters.
        parameters.append(_clean_c_parameter(text[currentstart:i]))
        currentstart = i + 1
      elif c == ')':
        parenlevel = parenlevel - 1
        if parenlevel <= 0:
            parameters.append(_clean_c_parameter(text[currentstart:i]))
            return parameters
      elif c == ';':
          internal_warn("Parsing failed to find end of parameter list; "
                        "semicolon terminated it in %s" % text[pos:pos+200])
          return parameters
    i = i + 1
  internal_warn("Parsing failed to find end of parameter list in %s" %
                text[pos:pos+200])
  # Ran off the end of the text (e.g., an unterminated string constant).
  # None is returned on purpose: c_sprintf tests for it to report a
  # "format string parameter problem".
  return None
501
502
# These patterns match gettext() and _() for internationalization.
# This is compiled here, to avoid constant recomputation.
# FIXME: assumes simple function call if it ends with ")",
# so will get confused by patterns like  gettext("hi") + function("bye")
# In practice, this doesn't seem to be a problem; gettext() is usually
# wrapped around the entire parameter.
# The ?s makes it possible to match multi-line strings.
gettext_pattern = re.compile(r'(?s)^\s*' + 'gettext' + r'\s*\((.*)\)\s*$')
undersc_pattern = re.compile(r'(?s)^\s*' + '_(T(EXT)?)?' + r'\s*\((.*)\)\s*$')

def strip_i18n(text):
  """Strip any internationalization function calls surrounding 'text',
  such as gettext() and _().

  If text is a single call of gettext(), _(), _T() or _TEXT(), return
  the text between its parentheses with surrounding whitespace removed;
  otherwise return text unchanged."""
  # The str.strip method is used because the string.strip() function no
  # longer exists in Python 3.
  match = gettext_pattern.search(text)
  if match: return match.group(1).strip()
  match = undersc_pattern.search(text)
  if match: return match.group(3).strip()
  return text
521
# Matches the whitespace and backslash-newline (or backslash-CR) sequences
# at the very end of a string; extract_c_parameters uses it to remove
# them from the end of each parameter.
p_trailingbackslashes = re.compile( r'(\s|\\(\n|\r))*$')
523
# A double-quoted C string (optionally wide, L"...") holding at most one
# character, where an escape sequence counts as a single character.
p_c_singleton_string = re.compile( r'^\s*L?"([^\\]|\\[^0-6]|\\[0-6]+)?"\s*$')

def c_singleton_string(text):
  "Returns 1 if text is a C string with 0 or 1 character, otherwise 0."
  found = p_c_singleton_string.search(text)
  return int(found is not None)
530
# This pattern matches a constant C string (optionally wide, L"...").
p_c_constant_string = re.compile( r'^\s*L?"([^\\]|\\[^0-6]|\\[0-6]+)*"$')

def c_constant_string(text):
  "Returns 1 if text is a constant C string, otherwise 0."
  found = p_c_constant_string.search(text)
  return int(found is not None)
538
539
540# Precompile patterns for speed.
541
542
def c_buffer(hit):
  """Hook for buffer-copying functions: lower the risk for constant sources.

  If the source parameter (hit.source_position) is a string constant of
  at most one character, the level becomes 1; if it is any other constant
  string (possibly wrapped in an i18n call), the level is reduced by 2,
  but not below 1.  The hit is then passed to add_warning."""
  # hit.parameters can be None when the parameter list could not be
  # parsed (c_sprintf checks for the same condition); treat that as
  # having no parameters instead of failing in len().
  parameters = hit.parameters or []
  source_position = hit.source_position
  if source_position <= len(parameters)-1:
    source = parameters[source_position]
    if c_singleton_string(source):
      hit.level = 1
      hit.note = "Risk is low because the source is a constant character."
    elif c_constant_string(strip_i18n(source)):
      hit.level = max( hit.level - 2, 1)
      hit.note = "Risk is low because the source is a constant string."
  add_warning(hit)
554
555
# Matches a length of the form "sizeof(x)" or "sizeof x", optionally
# followed by "- 1".
p_dangerous_strncat = re.compile(r'^\s*sizeof\s*(\(\s*)?[A-Za-z_$0-9]+'  +
                                    r'\s*(\)\s*)?(-\s*1\s*)?$')
# This is a heuristic: constants in C are usually given in all
# upper case letters.  Yes, this need not be true, but it's true often
# enough that it's worth using as a heuristic.
# We check because strncat better not be passed a constant as the length!
p_looks_like_constant = re.compile(r'^\s*[A-Z][A-Z_$0-9]+\s*(-\s*1\s*)?$')

def c_strncat(hit):
  """Hook for strncat-like functions: flag a suspicious length parameter.

  If the third parameter looks like sizeof(...) or like an upper-case
  constant, the hit is reported at level 5; otherwise the hit is handled
  by c_buffer."""
  # hit.parameters can be None when the parameter list could not be
  # parsed (c_sprintf checks for the same condition).
  parameters = hit.parameters or []
  if len(parameters) > 3:
    # A common mistake is to think that when calling strncat(dest,src,len),
    # that "len" means the ENTIRE length of the destination.  This isn't true,
    # it must be the length of the characters TO BE ADDED at most.
    # Which is one reason that strlcat is better than strncat.
    # We'll detect a common case of this error; if the length parameter
    # is of the form "sizeof(dest)", we have this error.
    # Actually, sizeof(dest) is okay if the dest's first character is always \0,
    # but in that case the programmer should use strncpy, NOT strncat.
    # The following heuristic will certainly miss some dangerous cases, but
    # it at least catches the most obvious situation.
    # This particular heuristic is overzealous; it detects ANY sizeof, instead
    # of only the sizeof(dest)  (where dest is given in hit.parameters[1]).
    # However, there aren't many other likely candidates for sizeof; some
    # people use it to capture just the length of the source, but this is
    # just as dangerous, since then it absolutely does NOT take care of
    # the destination maximum length in general.
    # It also detects if a constant is given as a length, if the
    # constant follows common C naming rules.
    length_text = parameters[3]
    if p_dangerous_strncat.search(length_text) or p_looks_like_constant.search(length_text):
      hit.level = 5
      hit.note = ( "Risk is high; the length parameter appears to be a constant, " +
                 "instead of computing the number of characters left.")
      add_warning(hit)
      return
  c_buffer(hit)
592
def c_printf(hit):
  """Hook for printf-like functions: lower the risk for constant formats.

  If the format parameter (hit.format_position) is a constant string,
  snprintf and vsnprintf become a level 1 portability ("port") warning
  and every other function drops to level 0.  The hit is then passed to
  add_warning."""
  # hit.parameters can be None when the parameter list could not be
  # parsed (c_sprintf checks for the same condition).
  parameters = hit.parameters or []
  format_position = hit.format_position
  if format_position <= len(parameters)-1:
    # Assume that translators are trusted to not insert "evil" formats:
    source = strip_i18n(parameters[format_position])
    if c_constant_string(source):
      # Parameter is constant, so there's no risk of format string problems.
      if hit.name in ("snprintf", "vsnprintf"):
        hit.level = 1
        hit.warning = \
          "On some very old systems, snprintf is incorrectly implemented " \
          "and permits buffer overflows; there are also incompatible " \
          "standard definitions of it"
        hit.suggestion = "Check it during installation, or use something else"
        hit.category = "port"
      else:
        # We'll pass it on, just in case it's needed, but at level 0 risk.
        hit.level = 0
        hit.note = "Constant format string, so not considered very risky (there's some residual risk, especially in a loop)."
  add_warning(hit)
613
614
# Matches a "%s" conversion, optionally with "-" and a width (or "*"),
# but without a precision that would bound its length.
p_dangerous_sprintf_format = re.compile(r'%-?([0-9]+|\*)?s')

# sprintf has both buffer and format vulnerabilities.
def c_sprintf(hit):
  """Hook for sprintf-like functions; adjusts the hit, then reports it.

  A missing parameter list (None) or a non-constant format is turned into
  a level 4 "format" warning.  A constant format is a buffer risk only:
  the level becomes 1 for a string of at most one character, or drops by
  2 (not below 1) if the format has no unbounded %s."""
  parameters = hit.parameters
  if parameters is None:
    # Serious parameter problem, e.g., none, or a string constant that
    # never finishes.
    hit.warning = "format string parameter problem"
    hit.suggestion = "Check if required parameters present and quotes close."
    hit.level = 4
    hit.category = "format"
    hit.url = ""
    add_warning(hit)
    return
  source_position = hit.source_position
  if source_position > len(parameters)-1:
    # The format parameter is absent; report the hit unchanged.
    add_warning(hit)
    return
  source = parameters[source_position]
  if c_singleton_string(source):
    hit.level = 1
    hit.note = "Risk is low because the source is a constant character."
    add_warning(hit)
    return
  source = strip_i18n(source)
  if not c_constant_string(source):
    # Ho ho - a nonconstant format string - we have a different problem.
    hit.warning = "Potential format string problem (CWE-134)"
    hit.suggestion = "Make format string constant"
    hit.level = 4
    hit.category = "format"
    hit.url = ""
  elif not p_dangerous_sprintf_format.search(source):
    hit.level = max( hit.level - 2, 1)
    hit.note = "Risk is low because the source has a constant maximum length."
  # otherwise, warn of potential buffer overflow (the default)
  add_warning(hit)
648
# "%s" without a width limit, and "%<digits>s" with one.
p_dangerous_scanf_format = re.compile(r'%s')
p_low_risk_scanf_format = re.compile(r'%[0-9]+s')

def c_scanf(hit):
  """Hook for scanf-like functions: grade the risk by the format string.

  For a constant format: an unlimited %s keeps the default level, a
  width-limited %s becomes level 1, and anything else becomes level 0.
  For a non-constant format only a note is added.  The hit is then passed
  to add_warning."""
  # hit.parameters can be None when the parameter list could not be
  # parsed (c_sprintf checks for the same condition).
  parameters = hit.parameters or []
  format_position = hit.format_position
  if format_position <= len(parameters)-1:
    # Assume that translators are trusted to not insert "evil" formats;
    # it's not clear that translators will be messing with INPUT formats,
    # but it's possible so we'll account for it.
    source = strip_i18n(parameters[format_position])
    if c_constant_string(source):
      if p_dangerous_scanf_format.search(source): pass # Accept default.
      elif p_low_risk_scanf_format.search(source):
        # This is often okay, but sometimes extremely serious.
        hit.level = 1
        hit.warning = "It's unclear if the %s limit in the format string is small enough (CWE-120)"
        hit.suggestion = "Check that the limit is sufficiently small, or use a different input function"
      else:
        # No risky scanf request.
        # We'll pass it on, just in case it's needed, but at level 0 risk.
        hit.level = 0
        hit.note = "No risky scanf format detected."
    else:
        # Format isn't a constant.
        hit.note = "If the scanf format is influenceable by an attacker, it's exploitable."
  add_warning(hit)
675
676
# A size of the form "sizeof(x)" (optionally "- 1"): a size in bytes.
p_dangerous_multi_byte = re.compile(r'^\s*sizeof\s*(\(\s*)?[A-Za-z_$0-9]+'  +
                                    r'\s*(\)\s*)?(-\s*1\s*)?$')
# A size of the form "sizeof(x)/sizeof(x[0])" (optionally "- 1"): a size
# in elements.
p_safe_multi_byte =      re.compile(r'^\s*sizeof\s*(\(\s*)?[A-Za-z_$0-9]+\s*(\)\s*)?' +
                                     r'/\s*sizeof\s*\(\s*?[A-Za-z_$0-9]+\s*' +
                                     r'\[\s*0\s*\]\)\s*(-\s*1\s*)?$')

def c_multi_byte_to_wide_char(hit):
  """Hook that grades the risk by the form of the sixth parameter.

  A plain sizeof(...) raises the level to 5; sizeof(x)/sizeof(x[0])
  lowers it to 1; anything else keeps the default.  The hit is then
  passed to add_warning."""
  # Unfortunately, this doesn't detect bad calls when it's a #define or
  # constant set by a sizeof(), but trying to do so would create
  # FAR too many false positives.
  # hit.parameters can be None when the parameter list could not be
  # parsed (c_sprintf checks for the same condition).
  parameters = hit.parameters or []
  if len(parameters)-1 >= 6:
    num_chars_to_copy = parameters[6]
    if p_dangerous_multi_byte.search(num_chars_to_copy):
      hit.level = 5
      hit.note = ("Risk is high, it appears that the size is given as bytes, but the " +
                 "function requires size as characters.")
    elif p_safe_multi_byte.search(num_chars_to_copy):
      # This isn't really risk-free, since it might not be the destination,
      # or the destination might be a character array (if it's a char pointer,
      # the pattern is actually quite dangerous, but programmers
      # are unlikely to make that error).
      hit.level = 1
      hit.note = "Risk is very low, the length appears to be in characters not bytes."
  add_warning(hit)
701
# Matches a null pointer constant: NULL, 0, or 0x0, with optional spaces.
p_null_text = re.compile(r'^ *(NULL|0|0x0) *$')

def c_hit_if_null(hit):
  """Hook that reports hit only if a given parameter is null.

  hit.check_for_null is the position of the parameter to examine.  The
  hit is reported (once) if that parameter is NULL, 0 or 0x0, and also
  if there are too few parameters to tell; it is dropped otherwise."""
  # hit.parameters can be None when the parameter list could not be
  # parsed (c_sprintf checks for the same condition).
  parameters = hit.parameters or []
  null_position = hit.check_for_null
  if null_position <= len(parameters)-1:
    if not p_null_text.search(parameters[null_position]):
      return  # The parameter is present and is not null: nothing to report.
  # Either the parameter is null, or there is an insufficient # of
  # parameters.  Reporting from this single place ensures that a null
  # parameter is not reported twice.
  add_warning(hit)
713
# Matches text that looks like "type name[" followed by any character
# other than "]", i.e., an array declared with something inside the
# brackets.
p_static_array = re.compile(r'^[A-Za-z_]+\s+[A-Za-z0-9_$,\s\*()]+\[[^]]')

def c_static_array(hit):
  """Hook that reports hit only if hit.lookahead looks like an array
  declaration."""
  # This is cheating, but it does the job for most real code.
  # In some cases it will match something that it shouldn't.
  # We don't match ALL arrays, just those of certain types (e.g., char).
  # In theory, any array can overflow, but in practice it seems that
  # certain types are far more prone to problems, so we just report those.
  if p_static_array.search(hit.lookahead) is None:
    return
  add_warning(hit) # Found a static array, warn about it.
724
def normal(hit):
  """Default hook: pass hit to add_warning without any further analysis."""
  add_warning(hit)
727
728
729# "c_ruleset": the rules for identifying "hits" in C (potential warnings).
730# It's a dictionary, where the key is the function name causing the hit,
731# and the value is a tuple with the following format:
732#  (hook, level, warning, suggestion, category, {other})
733# See the definition for class "Hit".
734# The key can have multiple values separated with "|".
735
c_ruleset = {
  # Each entry maps one or more "|"-separated identifiers to a 7-tuple:
  #   (hook, level, warning, suggestion, category, url, {other})
  # A dictionary literal silently keeps only the LAST value given for a
  # repeated key, so every key string must appear exactly once below.
  "strcpy" :
     (c_buffer, 4,
      "Does not check for buffer overflows when copying to destination (CWE-120)",
      "Consider using strcpy_s, strncpy, or strlcpy (warning, strncpy is easily misused)",
      "buffer", "", {}),
  "lstrcpy|wcscpy|_tcscpy|_mbscpy" :
     (c_buffer, 4,
      "Does not check for buffer overflows when copying to destination (CWE-120)",
      "Consider using a function version that stops copying at the end of the buffer",
      "buffer", "", {}),
  "memcpy|CopyMemory|bcopy" :
     (normal, 2, # I've found this to have a lower risk in practice.
      "Does not check for buffer overflows when copying to destination (CWE-120)",
      "Make sure destination can always hold the source data",
      "buffer", "", {}),
  "strcat" :
     (c_buffer, 4,
      "Does not check for buffer overflows when concatenating to destination (CWE-120)",
      "Consider using strcat_s, strncat, or strlcat (warning, strncat is easily misused)",
      "buffer", "", {}),
  "lstrcat|wcscat|_tcscat|_mbscat" :
     (c_buffer, 4,
      "Does not check for buffer overflows when concatenating to destination (CWE-120)",
      "",
      "buffer", "", {}),
  "strncpy" :
     (c_buffer,
      1, # Low risk level, because this is often used correctly when FIXING security
      # problems, and raising it to a higher risk level would cause many false positives.
      "Easily used incorrectly; doesn't always \\0-terminate or " +
         "check for invalid pointers (CWE-120)",
      "",
      "buffer", "", {}),
  "lstrcpyn|wcsncpy|_tcsncpy|_mbsnbcpy" :
     (c_buffer,
      1, # Low risk level, because this is often used correctly when FIXING security
      # problems, and raising it to a higher risk level would cause many false positives.
      "Easily used incorrectly; doesn't always \\0-terminate or " +
         "check for invalid pointers (CWE-120)",
      "",
      "buffer", "", {}),
  "strncat" :
     (c_strncat,
      1, # Low risk level, because this is often used correctly when
         # FIXING security problems, and raising it to a
         # higher risk level would cause many false positives.
      "Easily used incorrectly (e.g., incorrectly computing the correct maximum size to add) (CWE-120)",
      "Consider strcat_s, strlcat, or automatically resizing strings",
      "buffer", "", {}),
  "lstrcatn|wcsncat|_tcsncat|_mbsnbcat" :
     (c_strncat,
      1, # Low risk level, because this is often used correctly when FIXING security
      # problems, and raising it to a higher risk level would cause many false positives.
      "Easily used incorrectly (e.g., incorrectly computing the correct maximum size to add) (CWE-120)",
      "Consider strcat_s, strlcat, or automatically resizing strings",
      "buffer", "", {}),
  "strccpy|strcadd":
     (normal, 1,
      "Subject to buffer overflow if buffer is not as big as claimed (CWE-120)",
      "Ensure that destination buffer is sufficiently large",
      "buffer", "", {}),
  "char|TCHAR|wchar_t":  # This isn't really a function call, but it works.
     (c_static_array, 2,
      "Statically-sized arrays can be improperly restricted, " +
      "leading to potential overflows or other issues (CWE-119:CWE-120)",
      "Perform bounds checking, use functions that limit length, " +
        "or ensure that the size is larger than the maximum possible length",
      "buffer", "", {'extract_lookahead' : 1}),

  "gets|_getts":
     (normal, 5, "Does not check for buffer overflows (CWE-120, CWE-20)",
      "Use fgets() instead", "buffer", "", {'input' : 1}),

  # The "sprintf" hook will raise "format" issues instead if appropriate:
  "sprintf|vsprintf|swprintf|vswprintf|_stprintf|_vstprintf":
     (c_sprintf, 4,
      "Does not check for buffer overflows (CWE-120)",
      "Use sprintf_s, snprintf, or vsnprintf",
      "buffer", "", {}),

  # Format string is the FIRST parameter for these.
  "printf|vprintf|vwprintf|_vtprintf|wprintf":
     (c_printf, 4,
      "If format strings can be influenced by an attacker, they can be exploited (CWE-134)",
      "Use a constant for the format specification",
      "format", "", {}),

  # Format string is the SECOND parameter (after the stream) for these.
  # vfwprintf(FILE *, const wchar_t *format, va_list) belongs here, not in
  # the printf group above.  "fvwprintf" is not a real function name; it is
  # kept only so existing behavior for that spelling does not change.
  "fprintf|vfprintf|_ftprintf|_vftprintf|fwprintf|vfwprintf|fvwprintf":
     (c_printf, 4,
      "If format strings can be influenced by an attacker, they can be exploited (CWE-134)",
      "Use a constant for the format specification",
      "format", "", { 'format_position' : 2}),

  # The "syslog" hook will raise "format" issues.
  "syslog":
     (c_printf, 4,
      "If syslog's format strings can be influenced by an attacker, " +
      "they can be exploited (CWE-134)",
      "Use a constant format string for syslog",
      "format", "", { 'format_position' : 2} ),

  "snprintf|vsnprintf|_snprintf|_sntprintf|_vsntprintf":
     (c_printf, 4,
      "If format strings can be influenced by an attacker, they can be " +
      "exploited, and note that sprintf variations do not always \\0-terminate (CWE-134)",
      "Use a constant for the format specification",
      "format", "", { 'format_position' : 3}),

  "scanf|vscanf|wscanf|_tscanf|vwscanf":
     (c_scanf, 4,
      "The scanf() family's %s operation, without a limit specification, " +
        "permits buffer overflows (CWE-120, CWE-20)",
      "Specify a limit to %s, or use a different input function",
      "buffer", "", {'input' : 1}),

  "fscanf|sscanf|vsscanf|vfscanf|_ftscanf|fwscanf|vfwscanf|vswscanf":
     (c_scanf, 4,
      "The scanf() family's %s operation, without a limit specification, " +
        "permits buffer overflows (CWE-120, CWE-20)",
      "Specify a limit to %s, or use a different input function",
      "buffer", "", {'input' : 1, 'format_position' : 2}),

  "strlen|wcslen|_tcslen|_mbslen" :
     (normal,
      1, # Often this isn't really a risk, and even when, it usually at worst causes
      # program crash (and nothing worse).
      "Does not handle strings that are not \\0-terminated; " +
      "if given one it may perform an over-read (it could cause a crash " +
         "if unprotected) (CWE-126)",
      "",
      "buffer", "", {}),

  "MultiByteToWideChar" : # Windows
     (c_multi_byte_to_wide_char,
      2, # Only the default - this will be changed in many cases.
      "Requires maximum length in CHARACTERS, not bytes (CWE-120)",
      "",
      "buffer", "", {}),

  "streadd|strecpy":
     (normal, 4,
      "This function does not protect against buffer overflows (CWE-120)",
      "Ensure the destination has 4 times the size of the source, to leave room for expansion",
      "buffer", "dangers-c", {}),

  "strtrns":
     (normal, 3,
      "This function does not protect against buffer overflows (CWE-120)",
      "Ensure that destination is at least as long as the source",
      "buffer", "dangers-c", {}),

  "realpath":
     (normal, 3,
      "This function does not protect against buffer overflows, " +
        "and some implementations can overflow internally (CWE-120/CWE-785)",
      "Ensure that the destination buffer is at least of size MAXPATHLEN, and " +
        "to protect against implementation problems, the input argument should also " +
        "be checked to ensure it is no larger than MAXPATHLEN",
      "buffer", "dangers-c", {}),

  "getopt|getopt_long":
     (normal, 3,
     "Some older implementations do not protect against internal buffer overflows (CWE-120, CWE-20)",
      "Check implementation on installation, or limit the size of all string inputs",
      "buffer", "dangers-c", {'input' : 1}),

  # "getpass" has exactly one entry, further below.  A second entry used to
  # live here; being a repeated key it was silently discarded, so its
  # overflow warning and 'input' flag are now folded into the entry below.

  "getwd":
     (normal, 3,
     "This does not protect against buffer overflows "
     "by itself, so use with caution (CWE-120, CWE-20)",
      "Use getcwd instead",
      "buffer", "dangers-c", {'input' : 1}),

  # fread not included here; in practice I think it's rare to mistake it.
  "getchar|fgetc|getc|read|_gettc":
     (normal, 1,
     "Check buffer boundaries if used in a loop including recursive loops (CWE-120, CWE-20)",
      "",
      "buffer", "dangers-c", {'input' : 1}),

  "access":        # ???: TODO: analyze TOCTOU more carefully.
     (normal, 4,
      "This usually indicates a security flaw.  If an " +
      "attacker can change anything along the path between the " +
      "call to access() and the file's actual use (e.g., by moving " +
      "files), the attacker can exploit the race condition (CWE-362/CWE-367)",
      "Set up the correct permissions (e.g., using setuid()) and " +
      "try to open the file directly",
      "race",
      "avoid-race#atomic-filesystem", {}),
  "chown":
     (normal, 5,
      "This accepts filename arguments; if an attacker " +
      "can move those files, a race condition results. (CWE-362)",
      "Use fchown( ) instead",
      "race", "", {}),
  "chgrp":
     (normal, 5,
      "This accepts filename arguments; if an attacker " +
      "can move those files, a race condition results. (CWE-362)",
      "Use fchgrp( ) instead",
      "race", "", {}),
  "chmod":
     (normal, 5,
      "This accepts filename arguments; if an attacker " +
      "can move those files, a race condition results. (CWE-362)",
      "Use fchmod( ) instead",
      "race", "", {}),
  "vfork":
     (normal, 2,
      "On some old systems, vfork() permits race conditions, and it's " +
      "very difficult to use correctly (CWE-362)",
      "Use fork() instead",
      "race", "", {}),
  "readlink":
     (normal, 5,
      "This accepts filename arguments; if an attacker " +
      "can move those files or change the link content, " +
      "a race condition results.  " +
      "Also, it does not terminate with ASCII NUL. (CWE-362, CWE-20)",
      # This is often just a bad idea, and it's hard to suggest a
      # simple alternative:
      "Reconsider approach",
      "race", "", {'input' : 1}),

  "tmpfile":
     (normal, 2,
      "Function tmpfile() has a security flaw on some systems (e.g., older System V systems) (CWE-377)",
      "",
      "tmpfile", "", {}),
  "tmpnam|tempnam":
     (normal, 3,
      "Temporary file race condition (CWE-377)",
      "",
      "tmpfile", "avoid-race", {}),

  # TODO: Detect GNOME approach to mktemp and ignore it.
  "mktemp":
     (normal, 4,
      "Temporary file race condition (CWE-377)",
      "",
      "tmpfile", "avoid-race", {}),

  "mkstemp":
     (normal, 2,
     "Potential for temporary file vulnerability in some circumstances.  Some older Unix-like systems create temp files with permission to write by all by default, so be sure to set the umask to override this. Also, some older Unix systems might fail to use O_EXCL when opening the file, so make sure that O_EXCL is used by the library (CWE-377)",
      "",
      "tmpfile", "avoid-race", {}),

  "fopen|open":
     (normal, 2,
     "Check when opening files - can an attacker redirect it (via symlinks), force the opening of special file type (e.g., device files), move things around to create a race condition, control its ancestors, or change its contents? (CWE-362)",
      "",
      "misc", "", {}),

  "umask":
     (normal, 1,
      "Ensure that umask is given most restrictive possible setting (e.g., 066 or 077) (CWE-732)",
      "",
      "access", "", {}),

  # Windows.  TODO: Detect correct usage approaches and ignore it.
  "GetTempFileName":
     (normal, 3,
      "Temporary file race condition in certain cases " +
        "(e.g., if run as SYSTEM in many versions of Windows) (CWE-377)",
      "",
      "tmpfile", "avoid-race", {}),

  # TODO: Need to detect varying levels of danger.
  "execl|execlp|execle|execv|execvp|system|popen|WinExec|ShellExecute":
     (normal, 4,
      "This causes a new program to execute and is difficult to use safely (CWE-78)",
      "try using a library call that implements the same functionality " +
      "if available",
      "shell", "", {}),

  # TODO: Be more specific.  The biggest problem involves "first" param NULL,
  # second param with embedded space. Windows.
  "CreateProcessAsUser|CreateProcessWithLogon":
     (normal, 3,
      "This causes a new process to execute and is difficult to use safely (CWE-78)",
      "Especially watch out for embedded spaces",
      "shell", "", {}),

  # TODO: Be more specific.  The biggest problem involves "first" param NULL,
  # second param with embedded space. Windows.
  "CreateProcess":
     (c_hit_if_null, 3,
      "This causes a new process to execute and is difficult to use safely (CWE-78)",
      "Specify the application path in the first argument, NOT as part of the second, " +
        "or embedded spaces could allow an attacker to force a different program to run",
      "shell", "", {'check_for_null' : 1}),

  "atoi|atol|_wtoi|_wtoi64":
     (normal, 2,
      "Unless checked, the resulting number can exceed the expected range " +
      "(CWE-190)",
      "If source untrusted, check both minimum and maximum, even if the" +
      " input had no minus sign (large numbers can roll over into negative" +
      " number; consider saving to an unsigned value if that is intended)",
      "integer", "dangers-c", {}),

  # Random values.  Don't trigger on "initstate", it's too common a term.
  "drand48|erand48|jrand48|lcong48|lrand48|mrand48|nrand48|random|seed48|setstate|srand|strfry|srandom":
     (normal, 3,
      "This function is not sufficiently random for security-related functions such as key and nonce creation (CWE-327)",
      "use a more secure technique for acquiring random values",
      "random", "", {}),

  "crypt":
     (normal, 4,
      "Function crypt is a poor one-way hashing algorithm; since it only accepts passwords of 8 " +
        "characters or less, and only a two-byte salt, it is excessively vulnerable to " +
        "dictionary attacks given today's faster computing equipment (CWE-327)",
      "Use a different algorithm, such as SHA-1, with a larger non-repeating salt",
      "crypto", "", {}),

  # OpenSSL EVP calls to use DES.
  "EVP_des_ecb|EVP_des_cbc|EVP_des_cfb|EVP_des_ofb|EVP_desx_cbc":
     (normal, 4,
      "DES only supports a 56-bit keysize, which is too small given today's computers (CWE-327)",
      "Use a different patent-free encryption algorithm with a larger keysize, " +
         "such as 3DES or AES",
      "crypto", "", {}),

  # Other OpenSSL EVP calls to use small keys.
  "EVP_rc4_40|EVP_rc2_40_cbc|EVP_rc2_64_cbc":
     (normal, 4,
      "These keysizes are too small given today's computers (CWE-327)",
      "Use a different patent-free encryption algorithm with a larger keysize, " +
        "such as 3DES or AES",
      "crypto", "", {}),

  "chroot":
     (normal, 3,
      "chroot can be very helpful, but is hard to use correctly (CWE-250, CWE-22)",
      "Make sure the program immediately chdir(\"/\")," +
      " closes file descriptors," +
      " and drops root privileges, and that all necessary files" +
      " (and no more!) are in the new root",
      "misc", "", {}),

  "getenv|curl_getenv":
     (normal, 3, "Environment variables are untrustable input if they can be" +
                 " set by an attacker.  They can have any content and" +
                 " length, and the same variable can be set more than once (CWE-807, CWE-20)",
      "Check environment variables carefully before using them",
      "buffer", "", {'input' : 1}),

  "g_get_home_dir":
     (normal, 3, "This function is synonymous with 'getenv(\"HOME\")'; " +
                 "it returns untrustable input if the environment can be " +
                 "set by an attacker.  It can have any content and length, " +
                 "and the same variable can be set more than once (CWE-807, CWE-20)",
      "Check environment variables carefully before using them",
      "buffer", "", {'input' : 1}),

  "g_get_tmp_dir":
     (normal, 3, "This function is synonymous with 'getenv(\"TMP\")'; " +
                 "it returns untrustable input if the environment can be " +
                 "set by an attacker.  It can have any content and length, " +
                 "and the same variable can be set more than once (CWE-807, CWE-20)",
      "Check environment variables carefully before using them",
      "buffer", "", {'input' : 1}),


  # These are Windows-unique:

  # TODO: Should have lower risk if the program checks return value.
  "RpcImpersonateClient|ImpersonateLoggedOnUser|CoImpersonateClient|" +
     "ImpersonateNamedPipeClient|ImpersonateDdeClientWindow|ImpersonateSecurityContext|" +
     "SetThreadToken":
     (normal, 4, "If this call fails, the program could fail to drop heightened privileges (CWE-250)",
      "Make sure the return value is checked, and do not continue if a failure is reported",
      "access", "", {}),

  "InitializeCriticalSection":
     (normal, 3, "Exceptions can be thrown in low-memory situations",
      "Use InitializeCriticalSectionAndSpinCount instead",
      "misc", "", {}),

  "EnterCriticalSection":
     (normal, 3, "On some versions of Windows, exceptions can be thrown in low-memory situations",
      "Use InitializeCriticalSectionAndSpinCount instead",
      "misc", "", {}),

  "LoadLibrary|LoadLibraryEx":
     (normal, 3, "Ensure that the full path to the library is specified, or current directory may be used (CWE-829, CWE-20)",
      "Use registry entry or GetWindowsDirectory to find library path, if you aren't already",
      "misc", "", {'input' : 1}),

  "SetSecurityDescriptorDacl":
     (c_hit_if_null, 5,
      "Never create NULL ACLs; an attacker can set it to Everyone (Deny All Access), " +
        "which would even forbid administrator access (CWE-732)",
      "",
      "misc", "", {'check_for_null' : 3}),

  "AddAccessAllowedAce":
     (normal, 3,
      "This doesn't set the inheritance bits in the access control entry (ACE) header (CWE-732)",
      "Make sure that you set inheritance by hand if you wish it to inherit",
      "misc", "", {}),

  "getlogin":
     (normal, 4,
      "It's often easy to fool getlogin.  Sometimes it does not work at all, because some program messed up the utmp file.  Often, it gives only the first 8 characters of the login name. The user currently logged in on the controlling tty of our program need not be the user who started it.  Avoid getlogin() for security-related purposes (CWE-807)",
      "Use getpwuid(geteuid()) and extract the desired information instead",
      "misc", "", {}),

  "cuserid":
     (normal, 4,
      "Exactly what cuserid() does is poorly defined (e.g., some systems use the effective uid, like Linux, while others like System V use the real uid). Thus, you can't trust what it does. It's certainly not portable (The cuserid function was included in the 1988 version of POSIX, but removed from the 1990 version).  Also, if passed a non-null parameter, there's a risk of a buffer overflow if the passed-in buffer is not at least L_cuserid characters long (CWE-120)",
      "Use getpwuid(geteuid()) and extract the desired information instead",
      "misc", "", {}),

  "getpw":
     (normal, 4,
      "This function is dangerous; it may overflow the provided buffer. It extracts data from a 'protected' area, but most systems have many commands to let users modify the protected area, and it's not always clear what their limits are.  Best to avoid using this function altogether (CWE-676, CWE-120)",
      "Use getpwuid() instead",
      "buffer", "", {}),

  # The only "getpass" entry: covers both the obsolescence and the buffer
  # overflow concern, and marks the function as an input source.
  "getpass":
     (normal, 4,
      "This function is obsolete and not portable. It was in SUSv2 but removed by POSIX.2.  What it does exactly varies considerably between systems, particularly in where its prompt is displayed and where it gets its data (e.g., /dev/tty, stdin, stderr, etc.). In addition, some implementations overflow buffers. (CWE-676, CWE-120, CWE-20)",
      "Make the specific calls to do exactly what you want.  If you continue to use it, or write your own, be sure to zero the password as soon as possible to avoid leaving the cleartext password visible in the process' address space",
      "misc", "", {'input' : 1}),

  "gsignal|ssignal":
     (normal, 2,
      "These functions are considered obsolete on most systems, and very non-portable (Linux-based systems handle them radically different, basically if gsignal/ssignal were the same as raise/signal respectively, while System V considers them a separate set and obsolete) (CWE-676)",
      "Switch to raise/signal, or some other signalling approach",
      "obsolete", "", {}),

  "memalign":
     (normal, 1,
     "On some systems (though not Linux-based systems) an attempt to free() results from memalign() may fail. This may, on a few systems, be exploitable.  Also note that memalign() may not check that the boundary parameter is correct (CWE-676)",
      "Use posix_memalign instead (defined in POSIX's 1003.1d).  Don't switch to valloc(); it is marked as obsolete in BSD 4.3, as legacy in SUSv2, and is no longer defined in SUSv3.  In some cases, malloc()'s alignment may be sufficient",
      "free", "", {}),

  "ulimit":
     (normal, 1,
     "This C routine is considered obsolete (as opposed to the shell command by the same name, which is NOT obsolete) (CWE-676)",
      "Use getrlimit(2), setrlimit(2), and sysconf(3) instead",
      "obsolete", "", {}),

  "usleep":
     (normal, 1,
     "This C routine is considered obsolete (as opposed to the shell command by the same name).   The interaction of this function with SIGALRM and other timer functions such as sleep(), alarm(), setitimer(), and nanosleep() is unspecified (CWE-676)",
      "Use nanosleep(2) or setitimer(2) instead",
      "obsolete", "", {}),


   # Input functions, useful for -I
  "recv|recvfrom|recvmsg|fread|readv":
     (normal, 0, "Function accepts input from outside program (CWE-20)",
      "Make sure input data is filtered, especially if an attacker could manipulate it",
      "input", "", {'input' : 1}),


  # TODO: detect C++'s:   cin >> charbuf, where charbuf is a char array; the problem
  #       is that flawfinder doesn't have type information, and ">>" is safe with
  #       many other types.
  # ("send" and friends aren't todo, because they send out.. not input.)
  # TODO: cwd("..") in user's space - TOCTOU vulnerability
  # TODO: There are many more rules to add, esp. for TOCTOU.
  }
1218
# Skeleton to copy when writing a new rule.  Its key, "9", is deliberately
# impossible: no identifier can start with a digit, so it never matches.
template_ruleset = {
  "9": (normal, 2, "", "", "tmpfile", "", {}),
  }
1227
1228
def find_column(text, position):
  """Return the 1-based column of text[position] within its line.

  text is the whole file contents and position is an index into it.
  """
  # Use the str method rather than string.rfind(): the string-module
  # function forms are deprecated and do not exist at all in Python 3.
  newline = text.rfind("\n", 0, position)
  if newline == -1:
    return position + 1    # On the first line: offset from start of text.
  return position - newline  # Distance past the preceding newline.
1236
def get_context(text, position):
  """Return the full line of text that contains text[position].

  The result excludes the terminating newline.  The last line is returned
  up to the end of text if it has no trailing newline.
  """
  # str methods replace string.rfind()/string.find(), which are deprecated
  # and were removed in Python 3.
  # Searching up to position+1 means rfind() yields -1 on the first line,
  # so adding 1 gives a line start of 0.
  linestart = text.rfind("\n", 0, position + 1) + 1
  lineend = text.find("\n", position)
  if lineend == -1:
    lineend = len(text)
  return text[linestart:lineend]
1243
def c_valid_match(text, position):
  """Decide whether the word ending just before text[position] is a real use.

  Returns 1 for a probable function use and 0 for a probable false positive.
  Only the first non-whitespace character at or after position is examined.
  """
  limit = len(text)
  cursor = position
  # Step over any whitespace separating the name from what follows it.
  while cursor < limit and text[cursor] in string.whitespace:
    cursor = cursor + 1
  if cursor >= limit:
    return 0   # Nothing but whitespace before the end of the text.
  follower = text[cursor]
  if follower == '(':
    return 1   # An opening parenthesis follows: treat it as a call.
  # With false-positive filtering on, anything other than "(" is rejected.
  # Otherwise reject only "=", "+" and "-": a name followed by an
  # assignment, an is-equal test or arithmetic is far more likely to be a
  # variable than a library function (or a pointer to one).
  if falsepositive or follower in "=+-":
    return 0
  return 1
1269
def process_directive():
  """Handle a directive comment found on the current line.

  Every directive is currently treated as "ignore".  Hits already recorded
  for the current filename and linenumber are removed from hitlist and
  counted in num_ignored_hits.  If there were none, ignoreline is set to
  the following line instead.  Does nothing when never_ignore is set.
  """
  global ignoreline, num_ignored_hits
  # TODO: This is still a stub; it is adequate only because "ignore" is the
  # one directive that exists so far.
  if never_ignore: return
  survivors = [hit for hit in hitlist
               if not (hit.filename == filename and hit.line == linenumber)]
  dropped = len(hitlist) - len(survivors)
  if dropped:
    # Replace the contents in place so hitlist stays the same list object.
    hitlist[:] = survivors
    num_ignored_hits = num_ignored_hits + dropped
  else:
    ignoreline = linenumber + 1  # No hit on this line - ignore the next line.
1287
# Characters that can appear in a numeric literal,
# e.g. 0x4, 4.4e4, etc.
numberset=string.hexdigits+"_x.Ee"

# Patterns for various circumstances:
# Horizontal whitespace only; newlines are deliberately not included.
p_whitespace = re.compile( r'[ \t\v\f]+' )
# A complete #include directive with its <...> or "..." operand.
p_include = re.compile( r'#\s*include\s+(<.*?>|".*?")' )
p_digits  = re.compile( r'[0-9]' )
p_alphaunder = re.compile( r'[A-Za-z_]' )  # Alpha chars and underline.
# A "word" in C.  Note that "$" is permitted -- it's not permitted by the
# C standard in identifiers, but gcc supports it as an extension.
p_c_word = re.compile( r'[A-Za-z_][A-Za-z_0-9$]*' )
# We'll recognize ITS4 and RATS ignore directives, as well as our own,
# for compatibility's sake.  Matching is case-insensitive; group 2 captures
# the text after the colon, up to (not including) the next "*".
p_directive = re.compile( r'(?i)\s*(ITS4|Flawfinder|RATS):\s*([^\*]*)' )

# Maximum number of characters, counted from the start of a hit, that are
# copied into hit.lookahead.
max_lookahead=500  # Lookahead limit for c_static_array.
1305
1306def process_c_file(f, patch_infos):
1307  global filename, linenumber, ignoreline, sumlines, num_links_skipped
1308  global sloc
1309  filename=f
1310  linenumber = 1
1311  ignoreline = -1
1312
1313  incomment = 0
1314  instring = 0
1315  linebegin = 1
1316  codeinline = 0 # 1 when we see some code (so increment sloc at newline)
1317
1318  if ((patch_infos != None) and (not (f in patch_infos))):
1319    # This file isn't in the patch list, so don't bother analyzing it.
1320    if not quiet:
1321      if output_format:
1322        print "Skipping unpatched file ", h(f), "<br>"
1323      else:
1324        print "Skipping unpatched file", f
1325      sys.stdout.flush()
1326    return
1327
1328  if f == "-":
1329   input = sys.stdin
1330  else:
1331    # Symlinks should never get here, but just in case...
1332    if ((not allowlink) and os.path.islink(f)):
1333      print "BUG! Somehow got a symlink in process_c_file!"
1334      num_links_skipped = num_links_skipped + 1
1335      return
1336    try:
1337      input = open(f, "r")
1338    except:
1339      print "Error: failed to open", h(f)
1340      sys.exit(1)
1341
1342  # Read ENTIRE file into memory.  Use readlines() to convert \n if necessary.
1343  # This turns out to be very fast in Python, even on large files, and it
1344  # eliminates lots of range checking later, making the result faster.
1345  # We're examining source files, and today, it would be EXTREMELY bad practice
1346  # to create source files larger than main memory space.
1347  # Better to load it all in, and get the increased speed and reduced
1348  # development time that results.
1349
1350  if not quiet:
1351    if output_format:
1352      print "Examining", h(f), "<br>"
1353    else:
1354      print "Examining", f
1355    sys.stdout.flush()
1356
1357  text = string.join(input.readlines(),"")
1358
1359  i = 0
1360  while i < len(text):
1361    # This is a trivial tokenizer that just tries to find "words", which
1362    # match [A-Za-z_][A-Za-z0-9_]*.  It skips comments & strings.
1363    # It also skips "#include <...>", which must be handled specially
1364    # because "<" and ">" aren't usually delimiters.
1365    # It doesn't bother to tokenize anything else, since it's not used.
1366    # The following is a state machine with 3 states: incomment, instring,
1367    # and "normal", and a separate state "linebegin" if at BOL.
1368
1369    # Skip any whitespace
1370    m = p_whitespace.match(text,i)
1371    if m:
1372      i = m.end(0)
1373
1374    if i >= len(text):
1375      c = "\n" # Last line with no newline, we're done
1376    else:
1377      c = text[i]
1378    if linebegin:  # If at beginning of line, see if #include is there.
1379       linebegin = 0
1380       if c == "#": codeinline = 1  # A directive, count as code.
1381       m = p_include.match(text,i)
1382       if m:  # Found #include, skip it.  Otherwise: #include <stdio.h>
1383         i = m.end(0)
1384         continue
1385    if c == "\n":
1386      linenumber = linenumber + 1
1387      sumlines = sumlines + 1
1388      linebegin = 1
1389      if codeinline: sloc = sloc + 1
1390      codeinline = 0
1391      i = i +1
1392      continue
1393    i = i + 1   # From here on, text[i] points to next character.
1394    if i < len(text): nextc = text[i]
1395    else:             nextc = ''
1396    if incomment:
1397       if c=='*' and nextc=='/':
1398           i = i + 1
1399           incomment = 0
1400    elif instring:
1401       if c == '\\' and (nextc != "\n"): i = i + 1
1402       elif c == '"' and instring == 1: instring = 0
1403       elif c == "'" and instring == 2: instring = 0
1404    else:
1405      if c=='/' and nextc=='*':
1406          m = p_directive.match(text, i+1)  # Is there a directive here?
1407          if m:
1408            process_directive()
1409          i = i + 1
1410          incomment = 1
1411      elif c=='/' and nextc=='/':  # "//" comments - skip to EOL.
1412          m = p_directive.match(text, i+1)  # Is there a directive here?
1413          if m:
1414            process_directive()
1415          while i<len(text) and text[i] != "\n":
1416            i = i + 1
1417      elif c=='"':
1418          instring = 1
1419          codeinline = 1
1420      elif c=="'":
1421          instring = 2
1422          codeinline = 1
1423      else:
1424          codeinline = 1  # It's not whitespace, comment, or string.
1425          m = p_c_word.match(text, i-1)
1426          if m:                        # Do we have a word?
1427            startpos=i-1
1428            endpos = m.end(0)
1429            i = endpos
1430            word = text[startpos:endpos]
1431            # print "Word is:", text[startpos:endpos]
1432            if (word in c_ruleset) and c_valid_match(text, endpos):
1433              if ((patch_infos == None) or ((patch_infos != None) and
1434                                            (linenumber in patch_infos[f]))):
1435                # FOUND A MATCH, setup & call hook.
1436                # print "HIT: #%s#\n" % word
                # Don't use the tuple assignment form, e.g., a,b=c,d
                # because Python (at least version 2.2.2) executes it more
                # slowly (presumably because it creates & destroys temporary
                # tuples).
1440                hit = Hit(c_ruleset[word])
1441                hit.name = word
1442                hit.start = startpos
1443                hit.end = endpos
1444                hit.line = linenumber
1445                hit.column = find_column(text, startpos)
1446                hit.filename=filename
1447                hit.context_text = get_context(text, startpos)
1448                hit.parameters = extract_c_parameters(text, endpos)
1449                if hit.extract_lookahead:
1450                  hit.lookahead = text[startpos:startpos+max_lookahead]
1451                hit.hook(hit)
1452          elif p_digits.match(c):
1453            while i<len(text) and p_digits.match(text[i]): # Process a number.
1454              i = i + 1
1455          # else some other character, which we ignore.
1456  # End of loop through text. Wrap up.
1457  if codeinline: sloc = sloc + 1
1458  if incomment: error("File ended while in comment.")
1459  if instring: error("File ended while in string.")
1460
def expand_ruleset(ruleset):
  # Rulesets can have compressed sets of rules
  # (multiple function names separated by "|").
  # Expand the given ruleset in place: every key containing "|" is
  # replaced by one key per name, each mapped to the original value.
  # If an expanded name is already a key of the ruleset, print an error
  # and exit with status 1.
  #
  # The loop body adds and deletes entries, so iterate over a snapshot
  # of the keys rather than over the dictionary itself.
  for rule in list(ruleset.keys()):
    if "|" not in rule:
      continue
    # We found a rule to expand.
    for newrule in rule.split("|"):
      if newrule in ruleset:
        print("Error: Rule %s, when expanded, overlaps %s" % (rule, newrule))
        sys.exit(1)
      ruleset[newrule] = ruleset[rule]
    del ruleset[rule]
  # To print out the set of keys in the expanded ruleset, run:
  #   print(repr(list(ruleset.keys())))
1476
def display_ruleset(ruleset):
  # Print one line per rule, ordered by function name.  Each line holds
  # the function name, the default level, and the default warning,
  # separated by tabs.
  names = list(ruleset.keys())
  names.sort()
  for name in names:
    entry = ruleset[name]
    print("\t".join([name, str(entry[1]), entry[2]]))
1484
def initialize_ruleset():
  # Expand the C/C++ ruleset and, when headings are enabled, report how
  # many rules it holds.  If a rule listing was requested, print the
  # rules and exit with status 0.
  expand_ruleset(c_ruleset)
  if showheading:
    print("Number of rules (primarily dangerous function names) in C/C++ ruleset: %d"
          % len(c_ruleset))
    if output_format:
      print("<p>")
  if not list_rules:
    return
  display_ruleset(c_ruleset)
  sys.exit(0)
1493
1494
# Print the program banner, at most once per run.
def display_header():
  # Does nothing when headings are disabled or the banner has already
  # been shown.  HTML output gets a full document preamble; plain output
  # gets a single version line.
  global displayed_header
  if not showheading or displayed_header:
    return
  if output_format:
    banner = (
      '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" '
      '"http://www.w3.org/TR/html4/loose.dtd">',
      "<html>",
      "<head>",
      '<meta http-equiv="Content-type" content="text/html; charset=utf8">',
      "<title>Flawfinder Results</title>",
      '<meta name="author" content="David A. Wheeler">',
      '<meta name="keywords" lang="en" content="flawfinder results, security scan">',
      "</head>",
      "<body>",
      "<h1>Flawfinder Results</h1>",
      "Here are the security scan results from",
      '<a href="http://www.dwheeler.com/flawfinder">Flawfinder version %s</a>,' % version,
      '(C) 2001-2014 <a href="http://www.dwheeler.com">David A. Wheeler</a>.',
    )
  else:
    banner = ("Flawfinder version %s, (C) 2001-2014 David A. Wheeler." % version,)
  for line in banner:
    print(line)
  displayed_header = 1
1518
1519
# Filename extensions treated as C/C++ source code.  Every extension maps
# to 1; the code visible here only tests membership.
c_extensions = dict.fromkeys((
    '.c', '.h',
    '.ec', '.ecp',                        # Informix embedded C.
    '.pgc',                               # Postgres embedded C.
    '.C', '.cpp', '.CPP', '.cxx', '.cc',  # C++
    '.CC', '.c++',                        # C++.
    '.pcc',                               # Oracle C++
    '.hpp', '.H',                         # .h - usually C++.
  ), 1)
1528
1529
def maybe_process_file(f, patch_infos):
  # Process f, but only if (1) it's a directory (so we recurse), or
  # (2) it's source code in a language we can handle.
  # Currently, for files that means only C/C++, and we check if the filename
  # has a known C/C++ filename extension.  If it doesn't, we ignore the file.
  # We accept symlinks only if allowlink is true.
  #
  # f is the path to examine.  patch_infos is None, or a container keyed
  # by filename; when it is not None, only files present in it are
  # processed.
  global num_links_skipped, num_dotdirs_skipped
  if os.path.isdir(f):
    if (not allowlink) and os.path.islink(f):
      if not quiet: print_warning("Skipping symbolic link directory " + h(f))
      num_links_skipped = num_links_skipped + 1
      return
    base_filename = os.path.basename(f)
    if (skipdotdir and len(base_filename) > 1 and ("." == base_filename[0])):
      if not quiet: print_warning("Skipping directory with initial dot " + h(f))
      num_dotdirs_skipped = num_dotdirs_skipped + 1
      return
    for entry in os.listdir(f):
      maybe_process_file(os.path.join(f, entry), patch_infos)
    # The directory has been fully handled by the recursion above.  Without
    # this return, a directory whose own name ends in a C/C++ extension
    # (e.g. "lib.c") fell through to the file checks below and drew a
    # spurious "Skipping non-regular file" warning.
    return
  # Now we will FIRST check if the file appears to be a C/C++ file, and
  # THEN check if it's a regular file or symlink.  This is more complicated,
  # but I do it this way so that there won't be a lot of pointless
  # warnings about skipping files we wouldn't have used anyway.
  dotposition = f.rfind(".")
  if dotposition > 1:
    extension = f[dotposition:]
    if extension in c_extensions:
      # Its name appears to be a C/C++ source code file.
      if (not allowlink) and os.path.islink(f):
        if not quiet: print_warning("Skipping symbolic link file " + h(f))
        num_links_skipped = num_links_skipped + 1
      elif not os.path.isfile(f):
        # Skip anything not a normal file.  This is so that
        # device files, etc. won't cause trouble.
        if not quiet: print_warning("Skipping non-regular file " + h(f))
      else:
        # We want to know the difference only with files found in the patch.
        if (patch_infos is None) or (f in patch_infos):
          process_c_file(f, patch_infos)
1570
1571
def process_file_args(files, patch_infos):
  # Process the list of "files", some of which may be directories,
  # which were given on the command line.
  # This is handled differently than anything not found on the command line
  # (i.e. through recursing into a directory) because flawfinder
  # ALWAYS processes normal files given on the command line.
  # This is done to give users control over what's processed;
  # if a user really, really wants to analyze a file, name it!
  # If user wants to process "this directory and down", just say ".".
  # We handle symlinks specially, handle normal files and directories,
  # and skip the rest to prevent security problems. "-" is stdin.
  #
  # patch_infos is None, or a container keyed by filename; when it is not
  # None, a named file is processed only if it appears in it.
  global num_links_skipped
  for f in files:
    if (not allowlink) and os.path.islink(f):
      if not quiet: print_warning("Skipping symbolic link " + h(f))
      num_links_skipped = num_links_skipped + 1
    elif os.path.isfile(f) or f == "-":
      # If on the command line, FORCE processing of it.
      # Currently, we only process C/C++.
      # Check if we only want to review a patch.  (This test used to look
      # up an undefined name "k" instead of the current file "f".)
      if (patch_infos is None) or (f in patch_infos):
        process_c_file(f, patch_infos)
    elif os.path.isdir(f):
      # At one time flawfinder used os.path.walk, but that Python
      # built-in doesn't give us enough control over symbolic links.
      # So, we'll walk the filesystem hierarchy ourselves:
      maybe_process_file(f, patch_infos)
    elif not os.path.exists(f):
      if not quiet:
        # "\342\210\222" is a three-byte UTF-8 sequence; a name starting
        # with it is most likely an option pasted with a non-ASCII dash.
        if h(f).startswith("\342\210\222"):
          print_warning("Skipping non-existent filename starting with UTF-8 long dash " + h(f))
        else:
          print_warning("Skipping non-existent file " + h(f))
    else:
      if not quiet: print_warning("Skipping non-regular file " + h(f))
1607
def usage():
  # Print the command-line help text to standard output.
  # The synopsis lists every option that the details section describes,
  # including --regex / -e.
  print("""
flawfinder [--help | -h] [--version] [--listrules]
  [--allowlink] [--followdotdir] [--nolink]
           [--patch filename | -P filename]
  [--inputs | -I] [--minlevel X | -m X]
           [--falsepositive | -F] [--neverignore | -n]
           [--regex PATTERN | -e PATTERN]
  [--context | -c] [--columns | -C] [--dataonly | -D]
           [--html | -H] [--immediate | -i] [--singleline | -S]
           [--omittime] [--quiet | -Q]
  [--loadhitlist F] [--savehitlist F] [--diffhitlist F]
  [--] [source code file or source root directory]+

  The options cover various aspects of flawfinder as follows.

  Documentation:
  --help | -h Show this usage help.
  --version   Show version number.
  --listrules List the rules in the ruleset (rule database).

  Selecting Input Data:
  --allowlink Allow symbolic links.
  --followdotdir
              Follow directories whose names begin with ".".
              Normally they are ignored.
  --nolink    Skip symbolic links (ignored).
  --patch F | -P F
              Display information related to the patch F
              (patch must be already applied).

  Selecting Hits to Display:
  --inputs | -I
              Show only functions that obtain data from outside the program;
              this also sets minlevel to 0.
  -m X | --minlevel=X
              Set minimum risk level to X for inclusion in hitlist.  This
              can be from 0 (``no risk'')  to  5  (``maximum  risk'');  the
              default is 1.
  --falsepositive | -F
              Do not include hits that are likely to be false  positives.
              Currently,  this  means  that function names are ignored if
              they're not followed by "(", and that declarations of char-
              acter  arrays  aren't noted.  Thus, if you use a vari-
              able named "access" everywhere, this will eliminate  refer-
              ences  to  this ordinary variable.  This isn't the default,
              because this  also  increases  the  likelihood  of  missing
              important  hits;  in  particular, function names in #define
              clauses and calls through function pointers will be missed.
  --neverignore | -n
              Never ignore security issues, even if they have an ``ignore''
              directive in a comment.
  --regex PATTERN | -e PATTERN
              Only report hits that match the regular expression PATTERN.

  Selecting Output Format:
  --columns | -C
              Show  the  column  number  (as well as the file name and
              line number) of each hit; this is shown after the line number
              by adding a colon and the column number in the line (the first
              character in a line is column number 1).
  --context | -c
              Show context (the line having the "hit"/potential flaw)
  --dataonly | -D
              Don't display the headers and footers of the analysis;
              use this along with --quiet to get just the results.
  --html | -H
              Display as HTML output.
  --immediate | -i
              Immediately display hits (don't just wait until the end).
  --singleline | -S
              Single-line output.
  --omittime  Omit time to run.
  --quiet | -Q
              Don't display status information (i.e., which files are being
              examined) while the analysis is going on.

  Hitlist Management:
  --savehitlist=F
              Save all hits (the "hitlist") to F.
  --loadhitlist=F
              Load hits from F instead of analyzing source programs.
  --diffhitlist=F
              Show only hits (loaded or analyzed) not in F.


  For more information, please consult the manpage or available
  documentation.
""")
1696
1697def process_options():
1698  global show_context, show_inputs, allowlink, skipdotdir, omit_time
1699  global output_format, minimum_level, show_immediately, single_line
1700  global required_regex, required_regex_compiled
1701  global falsepositive
1702  global show_columns, never_ignore, quiet, showheading, list_rules
1703  global loadhitlist, savehitlist, diffhitlist
1704  global patch_file
1705  try:
1706    # Note - as a side-effect, this sets sys.argv[].
1707    optlist, args = getopt.getopt(sys.argv[1:], "ce:m:nih?CSDQHIFP:",
1708                    ["context", "minlevel=", "immediate", "inputs", "input",
1709                     "nolink", "falsepositive", "falsepositives",
1710                     "columns", "listrules", "omittime", "allowlink", "patch=",
1711                     "followdotdir",
1712                     "neverignore", "regex=",
1713                     "quiet", "dataonly", "html", "singleline",
1714                     "loadhitlist=", "savehitlist=", "diffhitlist=",
1715                     "version", "help" ])
1716    for (opt,value) in optlist:
1717      if   opt == "--context" or opt == "-c":
1718        show_context = 1
1719      elif opt == "--columns" or opt == "-C":
1720        show_columns = 1
1721      elif opt == "--quiet" or opt == "-Q":
1722        quiet = 1
1723      elif opt == "--dataonly" or opt == "-D":
1724        showheading = 0
1725      elif opt == "--inputs" or opt == "--input" or opt == "-I":
1726        show_inputs = 1
1727        minimum_level = 0
1728      elif opt == "--falsepositive" or opt == "falsepositives" or opt == "-F":
1729        falsepositive = 1
1730      elif opt == "--nolink":
1731        allowlink = 0
1732      elif opt == "--omittime":
1733        omit_time = 1
1734      elif opt == "--allowlink":
1735        allowlink = 1
1736      elif opt == "--followdotdir":
1737        skipdotdir = 0
1738      elif opt == "--listrules":
1739        list_rules = 1
1740      elif opt == "--html" or opt == "-H":
1741        output_format = 1
1742        single_line = 0
1743      elif opt == "--minlevel" or opt == "-m":
1744        minimum_level = string.atoi(value)
1745      elif opt == "--singleline" or opt == "-S":
1746        single_line = 1
1747      elif opt == "--immediate" or opt == "-i":
1748        show_immediately = 1
1749      elif opt == "-n" or opt == "--neverignore":
1750        never_ignore = 1
1751      elif opt == "-e" or opt == "--regex":
1752        required_regex = value
1753        # This will raise an exception if it can't be compiled as a regex:
1754        required_regex_compiled = re.compile(required_regex)
1755      elif opt == "-P" or opt == "--patch":
1756        # Note: This is -P, so that a future -p1 option can strip away
1757        # pathname prefixes (with the same option name as "patch").
1758        patch_file = value
1759        # If we consider ignore comments we may change a line which was
1760        # previously ignored but which will raise now a valid warning without
1761        # noticing it now.  So, set never_ignore.
1762        never_ignore = 1
1763      elif opt == "--loadhitlist":
1764        loadhitlist = value
1765        display_header()
1766        if showheading: print "Loading hits from", value
1767      elif opt == "--savehitlist":
1768        savehitlist = value
1769        display_header()
1770        if showheading: print "Saving hitlist to", value
1771      elif opt == "--diffhitlist":
1772        diffhitlist = value
1773        display_header()
1774        if showheading: print "Showing hits not in", value
1775      elif opt == "--version":
1776        print version
1777        sys.exit(0)
1778      elif opt in [ '-h', '-?', '--help' ]:
1779        # We accept "-?" but do not document it.  On Unix-like systems the
1780        # question mark in "-?" should be escaped, and many forget that.
1781        usage()
1782        sys.exit(0)
1783    # For DOS/Windows, expand filenames; for Unix, DON'T expand them
1784    # (the shell will expand them for us).  Some sloppy Python programs
1785    # always call "glob", but that's WRONG -- on Unix-like systems that
1786    # will expand twice.  Python doesn't have a clean way to detect
1787    # "has globbing occurred", so this is the best I've found:
1788    if os.name == "windows" or os.name == "nt" or os.name == "dos":
1789       sys.argv[1:] = reduce(operator.add, map(glob.glob, args))
1790    else:
1791       sys.argv[1:] = args
1792  # In Python 2 the convention is "getopt.GetoptError", but we
1793  # use "getopt.error" here so it's compatible with both
1794  # Python 1.5 and Python 2.
1795  except getopt.error, text:
1796    print "*** getopt error:", text
1797    usage()
1798    sys.exit(1)
1799
1800
1801
def process_files():
  # Obtain the hitlist, either by loading a saved one (--loadhitlist) or
  # by analyzing the files and directories named in sys.argv[1:].
  # Returns 1 when there are results to report, or None when no input
  # files were given (after printing a message).
  global hitlist
  if loadhitlist:
    # NOTE: unpickling can execute arbitrary code; only load hitlists
    # that come from a trusted source.
    f = open(loadhitlist)
    try:
      hitlist = pickle.load(f)
    finally:
      f.close()
    # Report success so the caller displays (and optionally saves) the
    # loaded hits; falling off the end here returned None, which made
    # the caller skip the final results entirely.
    return 1
  patch_infos = None
  if (patch_file != ""):
    patch_infos = load_patch_info(patch_file)
  files = sys.argv[1:]
  if not files:
    print("*** No input files")
    return None
  process_file_args(files, patch_infos)
  return 1
1817
1818
1819def show_final_results():
1820  global hitlist
1821  count = 0
1822  count_per_level = {}
1823  count_per_level_and_up = {}
1824  for i in range(0,6):  # Initialize count_per_level
1825    count_per_level[i] = 0
1826  for i in range(0,6):  # Initialize count_per_level
1827    count_per_level_and_up[i] = 0
1828  if show_immediately or not quiet:   # Separate the final results.
1829    print
1830    if showheading:
1831      if output_format:
1832        print "<h2>Final Results</h2>"
1833      else:
1834        print "FINAL RESULTS:"
1835        print
1836  hitlist.sort()
1837  # Display results.  The HTML format now uses
1838  # <ul> so that the format differentiates each entry.
1839  # I'm not using <ol>, because its numbers might be confused with
1840  # the risk levels or line numbers.
1841  if diffhitlist:
1842    diff_file = open(diffhitlist)
1843    diff_hitlist = pickle.load(diff_file)
1844    if output_format: print "<ul>"
1845    for h in hitlist:
1846      if h not in diff_hitlist:
1847        h.show()
1848        count_per_level[h.level] = count_per_level[h.level] + 1
1849        count = count + 1
1850    if output_format: print "</ul>"
1851    diff_file.close()
1852  else:
1853    if output_format: print "<ul>"
1854    for h in hitlist:
1855      h.show()
1856      count_per_level[h.level] = count_per_level[h.level] + 1
1857    if output_format: print "</ul>"
1858    count = len(hitlist)
1859  # Done with list, show the post-hitlist summary.
1860  if showheading:
1861    if output_format:
1862      print "<h2>Analysis Summary</h2>"
1863    else:
1864      print
1865      print "ANALYSIS SUMMARY:"
1866    if output_format:
1867      print "<p>"
1868    else:
1869      print
1870    if count > 0:
1871      print "Hits =", count
1872    else:
1873      print "No hits found."
1874    if output_format:
1875      print "<br>"
1876    # Compute the amount of time spent, and lines analyzed/second.
1877    # By computing time here, we also include the time for
1878    # producing the list of hits, which is reasonable.
1879    time_analyzing = time.time() - starttime
1880    if required_regex:
1881      print "Hits limited to regular expression " + required_regex
1882    print "Lines analyzed = %d" % sumlines,
1883    if time_analyzing > 0 and not omit_time:  # Avoid divide-by-zero.
1884      print "in approximately %.2f seconds (%.0f lines/second)" % (
1885             time_analyzing,
1886             (sumlines / time_analyzing) )
1887    else:
1888      print
1889    if output_format: print "<br>"
1890    print "Physical Source Lines of Code (SLOC) = %d" % sloc
1891    if output_format: print "<br>"
1892    # Output hits@each level.
1893    print "Hits@level =",
1894    for i in range(0,6):
1895      print "[%d] %3d" % (i, count_per_level[i]),
1896    if output_format:
1897      print "<br>"
1898    else:
1899      print
1900    # Compute hits at "level x or higher"
1901    print "Hits@level+ =",
1902    for i in range(0,6):
1903      for j in range(i,6):
1904        count_per_level_and_up[i] = count_per_level_and_up[i] + count_per_level[j]
1905    # Display hits at "level x or higher"
1906    for i in range(0,6):
1907      print "[%d+] %3d" % (i, count_per_level_and_up[i]),
1908    if output_format:
1909      print "<br>"
1910    else:
1911      print
1912    if (sloc > 0):
1913      print "Hits/KSLOC@level+ =",
1914      for i in range(0,6):
1915        print "[%d+] %3g" % (i, count_per_level_and_up[i]*1000.0/sloc),
1916    if output_format:
1917      print "<br>"
1918    else:
1919      print
1920    #
1921    if num_links_skipped:
1922      print "Symlinks skipped =", num_links_skipped, "(--allowlink overrides but see doc for security issue)"
1923      if output_format:
1924        print "<br>"
1925    if num_dotdirs_skipped:
1926      print "Dot directories skipped =", num_dotdirs_skipped, "(--followdotdir overrides)"
1927      if output_format:
1928        print "<br>"
1929    if num_ignored_hits > 0:
1930      print "Suppressed hits =", num_ignored_hits, "(use --neverignore to show them)"
1931      if output_format:
1932        print "<br>"
1933    print "Minimum risk level = %d" % minimum_level
1934    if output_format: print "<br>"
1935    if count > 0:
1936       print "Not every hit is necessarily a security vulnerability."
1937       if output_format:
1938         print "<br>"
1939    print "There may be other security vulnerabilities; review your code!"
1940    if output_format:
1941      print "<br>"
1942      print "See '<a href=\"http://www.dwheeler.com/secure-programs\">Secure Programming for Linux and Unix HOWTO</a>'"
1943      print "(<a href=\"http://www.dwheeler.com/secure-programs\">http://www.dwheeler.com/secure-programs</a>) for more information."
1944    else:
1945      print "See 'Secure Programming for Linux and Unix HOWTO'"
1946      print "(http://www.dwheeler.com/secure-programs) for more information."
1947    if output_format:
1948      print "</body>"
1949      print "</html>"
1950
1951
def save_if_desired():
  # If --savehitlist was given, pickle the hitlist to that file.
  # We'll save entire hitlist, even if only differences displayed.
  if not savehitlist:
    return
  # This is status information, so --quiet suppresses it; otherwise it
  # would end up in the middle of "--dataonly --quiet" results.
  if not quiet:
    print("Saving hitlist to %s" % savehitlist)
  f = open(savehitlist, "w")
  try:
    pickle.dump(hitlist, f)
  finally:
    # Close the file even if pickling fails.
    f.close()
1959
def flawfind():
  # Top-level driver: parse the options, show the header, set up the
  # ruleset, then process the input.  The final results are shown (and
  # the hitlist optionally saved) only when process_files() returns a
  # true value.
  process_options()
  display_header()
  initialize_ruleset()
  if not process_files():
    return
  show_final_results()
  save_if_desired()
1967
if __name__ == "__main__":
  # Run the scanner when executed as a script.  An interrupt (Ctrl-C)
  # is reported with a short message instead of a traceback.
  try:
    flawfind()
  except KeyboardInterrupt:
    print("*** Flawfinder interrupted")
1973
1974