1#!/usr/bin/python2
2#
3# Copyright 2016 The Chromium Authors. All rights reserved.
4# Use of this source code is governed by a BSD-style license that can be
5# found in the LICENSE file.
6
7"""Generate a dictionary for libFuzzer or AFL-based fuzzer.
8
9Invoked manually using a fuzzer binary and target format/protocol specification.
Works better for text formats or protocols; may be useless for binary ones.
11"""
12
13import argparse
14import HTMLParser
15import io
16import logging
17import os
18import re
19import shutil
20import string
21import subprocess
22import sys
23import tempfile
24
25
# Encodings tried one after another by ExtractWordsFromBinary when decoding the
# data read from the binary.
ENCODING_TYPES = ['ascii', 'utf_16_be', 'utf_16_le', 'utf_32_be', 'utf_32_le']
# Default minimum length of a character run that ExtractWordsFromBinary treats
# as a string.
MIN_STRING_LENGTH = 4
28
29
30def DecodeHTML(html_data):
31  """HTML-decoding of the data."""
32  html_parser = HTMLParser.HTMLParser()
33  data = html_parser.unescape(html_data.decode('ascii', 'ignore'))
34  return data.encode('ascii', 'ignore')
35
36
def EscapeDictionaryElement(element):
  r"""Escape an element for use inside a double-quoted dictionary entry.

  Backslash becomes \\, double quote becomes \" and every character outside
  of the printable ASCII range (0x20-0x7e) becomes a \xNN hex escape. All
  other characters are kept as is; in particular single quote is not escaped
  because an escaped single quote breaks libFuzzer.

  Tab, newline and carriage return are emitted as \x09, \x0a and \x0d rather
  than \t, \n and \r: the dictionary syntax documented for libFuzzer and AFL
  only lists the \\, \" and \xNN escape sequences.

  Args:
    element: string to be written into the dictionary.

  Returns:
    The escaped string, without the surrounding double quotes.
  """
  escaped_chars = []
  for char in element:
    if char == '\\' or char == '"':
      escaped_chars.append('\\' + char)
    elif ' ' <= char <= '~':
      # Printable ASCII (including single quote) needs no escaping.
      escaped_chars.append(char)
    else:
      escaped_chars.append('\\x%02x' % ord(char))

  return ''.join(escaped_chars)
45
46
def ExtractWordsFromBinary(filepath, min_length=MIN_STRING_LENGTH):
  """Extract words from the strings embedded in a binary executable file.

  The data returned by PreprocessAndReadRodata is decoded with each of
  ENCODING_TYPES in turn. In every decoded variant, runs of at least
  |min_length| printable ASCII characters are located and split on whitespace.

  Args:
    filepath: path to the binary.
    min_length: minimum length of a character run to be considered a string.

  Returns:
    A set of the extracted words.
  """
  binary_data = PreprocessAndReadRodata(filepath)
  printable_run_re = re.compile(r'[^\x00-\x1F\x7F-\xFF]{%d,}' % min_length)

  unique_words = set()
  # The same bytes are reinterpreted under every supported encoding.
  for codec in ENCODING_TYPES:
    ascii_data = binary_data.decode(codec, 'ignore').encode('ascii', 'ignore')
    for printable_run in printable_run_re.finditer(ascii_data):
      unique_words.update(printable_run.group().split())

  return unique_words
61
62
def ExtractWordsFromLines(lines):
  """Return the set of whitespace-separated words found in given strings."""
  unique_words = set()
  for text in lines:
    unique_words.update(text.split())

  return unique_words
71
72
def ExtractWordsFromSpec(filepath, is_html):
  """Return the set of whitespace-separated words found in a specification.

  The specification is read with ReadSpecification, so HTML entities are
  decoded first when |is_html| is set.
  """
  return set(ReadSpecification(filepath, is_html).split())
78
79
def FindIndentedText(text):
  """Find space-indented text blocks, e.g. code or data samples in RFCs.

  A block starts at a line that has more leading whitespace than the previous
  non-empty line and continues while the following non-empty lines keep
  exactly the same indentation. It ends at the first less indented line or at
  the end of the text. If an even deeper indented line follows, the block
  collected so far is dropped and a new block is started from that line.

  Args:
    text: contents of a specification as a single string.

  Returns:
    A list of blocks in order of appearance. Each block is a string made of
    the block's lines with their indentation removed, joined by '\\n'.
  """
  indented_blocks = []
  current_block = ''
  previous_number_of_spaces = 0

  # Go through every line and concatenate space-indented blocks into lines.
  for line in text.split('\n'):
    if not line:
      # Ignore empty lines.
      continue

    # Space-indented text blocks have more leading spaces than regular text.
    n = FindNumberOfLeadingSpaces(line)

    if n > previous_number_of_spaces:
      # Beginning of a space-indented text block, start concatenation.
      current_block = line[n:]
    elif n == previous_number_of_spaces and current_block:
      # Or continuation of a space-indented text block, concatenate lines.
      current_block += '\n' + line[n:]
    elif n < previous_number_of_spaces and current_block:
      # Current line is less indented, save previously concatenated lines.
      indented_blocks.append(current_block)
      current_block = ''

    previous_number_of_spaces = n

  # The text may end while a block is still being collected; without this the
  # last block would be lost whenever no less indented line follows it.
  if current_block:
    indented_blocks.append(current_block)

  return indented_blocks
111
112
def FindNumberOfLeadingSpaces(line):
  """Return how many whitespace characters the string starts with."""
  # Whatever lstrip() removes is exactly the leading whitespace.
  return len(line) - len(line.lstrip())
120
121
def GenerateDictionary(path_to_binary, path_to_spec, strategy, is_html=False):
  """Build a set of dictionary words from a fuzzer binary and a specification.

  Args:
    path_to_binary: path to the fuzzer binary.
    path_to_spec: path to the specification file.
    strategy: string of strategy letters; any combination of 'i', 'q' and 'u'.
    is_html: whether HTML entities in the specification should be decoded.

  Returns:
    A set of words selected by the requested strategies.

  If either of the paths doesn't exist, an error is logged and the process
  exits with status 1.
  """
  for required_path in (path_to_binary, path_to_spec):
    if os.path.exists(required_path):
      continue
    logging.error('%s doesn\'t exist. Exit.', required_path)
    sys.exit(1)

  binary_words = ExtractWordsFromBinary(path_to_binary)
  spec_words = ExtractWordsFromSpec(path_to_spec, is_html)

  if 'i' in strategy:
    # Strategy i: words present both in the binary and in the specification.
    selected_words = binary_words.intersection(spec_words)
  else:
    selected_words = set()

  if 'q' in strategy:
    # Strategy q: words from indented (quoted) text blocks of the specification.
    # TODO(mmoroz): experimental and very noisy. Not recommended to use.
    spec_text = ReadSpecification(path_to_spec, is_html)
    indented_blocks = FindIndentedText(spec_text)
    selected_words |= ExtractWordsFromLines(indented_blocks)

  if 'u' in strategy:
    # Strategy u: every uppercase word of the specification.
    selected_words |= set(word for word in spec_words if word.isupper())

  return selected_words
152
153
def _RunTool(command, stderr=None):
  """Run an external tool; return its exit code, or 1 if it can't be started."""
  try:
    return subprocess.call(command, stderr=stderr)
  except OSError as error:
    logging.warning('Failed to run %s: %s', command[0], error)
    return 1


def PreprocessAndReadRodata(filepath):
  """Create a stripped copy of the binary and extract .rodata section.

  Both steps are best effort. If 'strip' fails, the original binary is used
  instead of the stripped copy. If 'objcopy' fails, the whole (stripped or
  original) file is returned instead of its .rodata section. A tool that is
  not installed is treated the same way as a tool that failed.

  Args:
    filepath: path to the binary.

  Returns:
    Raw contents of the extracted section, or of the whole file, as bytes.
  """
  # All intermediate files live in a private directory that is removed as a
  # whole. Cleanup therefore doesn't depend on whether the tools replaced or
  # removed the files they were given, and it also happens on errors.
  temp_dir = tempfile.mkdtemp(prefix='.rodata_')
  try:
    stripped_filepath = os.path.join(temp_dir, 'stripped')
    shutil.copyfile(filepath, stripped_filepath)

    # Strip all symbols to reduce amount of redundant strings.
    if _RunTool(['strip', '--strip-all', stripped_filepath]):
      logging.warning('Failed to strip the binary. Using the original version.')
      stripped_filepath = filepath

    # Extract .rodata section to reduce amount of redundant strings.
    rodata_filepath = os.path.join(temp_dir, 'rodata')
    objcopy_cmd = ['objcopy', '-j', '.rodata', stripped_filepath,
                   rodata_filepath]

    # Hide output from stderr since objcopy prints a warning.
    with open(os.devnull, 'w') as devnull:
      result = _RunTool(objcopy_cmd, stderr=devnull)

    if result:
      logging.warning(
          'Failed to extract .rodata section. Using the whole file.')
      rodata_filepath = stripped_filepath

    # The data is binary, so it must not go through any text translation.
    with open(rodata_filepath, 'rb') as file_handle:
      return file_handle.read()
  finally:
    shutil.rmtree(temp_dir, ignore_errors=True)
187
188
def ReadSpecification(filepath, is_html):
  """Return contents of a specification file.

  When |is_html| is set, the contents are passed through DecodeHTML first.
  """
  with open(filepath, 'r') as spec_file:
    contents = spec_file.read()

  return DecodeHTML(contents) if is_html else contents
198
199
def WriteDictionary(dictionary_path, dictionary):
  """Write the words of a dictionary to a file, one quoted entry per line.

  The file starts with a comment line. Empty words are skipped; every other
  word is escaped with EscapeDictionaryElement and wrapped in double quotes.
  """
  with open(dictionary_path, 'wb') as output_file:
    output_file.write('# This is an automatically generated dictionary.\n')
    output_file.writelines(
        '"%s"\n' % EscapeDictionaryElement(entry)
        for entry in dictionary if entry)
209
210
def main():
  """Parse command line arguments, generate a dictionary and write it out."""
  parser = argparse.ArgumentParser(description="Generate fuzzer dictionary.")
  parser.add_argument('--fuzzer', required=True,
                      help='Path to a fuzzer binary executable. It is '
                      'recommended to use a binary built with '
                      '"use_libfuzzer=false is_asan=false" to get a better '
                      'dictionary with fewer number of redundant elements.')
  parser.add_argument('--spec', required=True,
                      help='Path to a target specification (in textual form).')
  # The value has to be parsed as an integer: a raw command line value is a
  # string, and bool('0') is True, which would enable decoding for "--html 0".
  parser.add_argument('--html', default=0, type=int, choices=[0, 1],
                      help='Decode HTML [01] (0 is default value): '
                      '1 - if specification has HTML entities to be decoded.')
  parser.add_argument('--out', required=True,
                      help='Path to a file to write a dictionary into.')
  parser.add_argument('--strategy', default='iu',
                      help='Generation strategy [iqu] ("iu" is default value): '
                      'i - intersection, q - quoted, u - uppercase.')
  args = parser.parse_args()

  dictionary = GenerateDictionary(args.fuzzer, args.spec, args.strategy,
                                  is_html=bool(args.html))
  WriteDictionary(args.out, dictionary)


if __name__ == '__main__':
  main()
237