1#!/usr/bin/env bash
2#
3# american fuzzy lop - corpus minimization tool
4# ---------------------------------------------
5#
6# Written and maintained by Michal Zalewski <lcamtuf@google.com>
7#
8# Copyright 2014, 2015 Google Inc. All rights reserved.
9#
10# Licensed under the Apache License, Version 2.0 (the "License");
11# you may not use this file except in compliance with the License.
12# You may obtain a copy of the License at:
13#
14#   http://www.apache.org/licenses/LICENSE-2.0
15#
16# This tool tries to find the smallest subset of files in the input directory
17# that still trigger the full range of instrumentation data points seen in
18# the starting corpus. This has two uses:
19#
20#   - Screening large corpora of input files before using them as a seed for
21#     afl-fuzz. The tool will remove functionally redundant files and likely
22#     leave you with a much smaller set.
23#
24#     (In this case, you probably also want to consider running afl-tmin on
25#     the individual files later on to reduce their size.)
26#
27#   - Minimizing the corpus generated organically by afl-fuzz, perhaps when
28#     planning to feed it to more resource-intensive tools. The tool achieves
29#     this by removing all entries that used to trigger unique behaviors in the
30#     past, but have been made obsolete by later finds.
31#
32# Note that the tool doesn't modify the files themselves. For that, you want
33# afl-tmin.
34#
35# This script must use bash because other shells may have hardcoded limits on
36# array sizes.
37#
38
# Print the tool banner followed by a blank line.
printf '%s\n\n' "corpus minimization tool for afl-fuzz by <lcamtuf@google.com>"
41
#########
# SETUP #
#########

# Defaults: 100 MB memory limit, no run-time limit.
MEM_LIMIT=100
TIMEOUT=none

# Start from a clean slate so inherited environment values can't leak in.
unset IN_DIR OUT_DIR STDIN_FILE EXTRA_PAR MEM_LIMIT_GIVEN \
  AFL_CMIN_CRASHES_ONLY AFL_CMIN_ALLOW_ANY QEMU_MODE

# Process command-line options...
while getopts "+i:o:f:m:t:eQC" opt; do

  case $opt in
    i)  IN_DIR="$OPTARG" ;;
    o)  OUT_DIR="$OPTARG" ;;
    f)  STDIN_FILE="$OPTARG" ;;
    m)  MEM_LIMIT="$OPTARG"
        MEM_LIMIT_GIVEN=1 ;;
    t)  TIMEOUT="$OPTARG" ;;
    e)  EXTRA_PAR="$EXTRA_PAR -e" ;;
    C)  export AFL_CMIN_CRASHES_ONLY=1 ;;
    Q)  EXTRA_PAR="$EXTRA_PAR -Q"
        # QEMU mode needs more headroom; bump the default unless -m was given.
        test "$MEM_LIMIT_GIVEN" = "" && MEM_LIMIT=250
        QEMU_MODE=1 ;;
    \?) exit 1 ;;
  esac

done
93
shift $((OPTIND-1))

TARGET_BIN="$1"

# Bail out with usage information unless all three mandatory parameters
# (target binary, -i, -o) were supplied. The obsolescent `[ ... -o ... ]`
# form is replaced with separate tests joined by ||, which POSIX recommends.
if [ "$TARGET_BIN" = "" ] || [ "$IN_DIR" = "" ] || [ "$OUT_DIR" = "" ]; then

  cat 1>&2 <<_EOF_
Usage: $0 [ options ] -- /path/to/target_app [ ... ]

Required parameters:

  -i dir        - input directory with the starting corpus
  -o dir        - output directory for minimized files

Execution control settings:

  -f file       - location read by the fuzzed program (stdin)
  -m megs       - memory limit for child process ($MEM_LIMIT MB)
  -t msec       - run time limit for child process (none)
  -Q            - use binary-only instrumentation (QEMU mode)

Minimization settings:

  -C            - keep crashing inputs, reject everything else
  -e            - solve for edge coverage only, ignore hit counts

For additional tips, please consult docs/README.

_EOF_
  exit 1
fi
125
# Do a sanity check to discourage the use of /tmp, since we can't really
# handle this safely from a shell script.

if [ "$AFL_ALLOW_TMP" = "" ]; then

  USES_TMP=""

  # Reject the run if any of the relevant paths lives under /tmp or /var/tmp.
  for check_path in "$IN_DIR" "$TARGET_BIN" "$OUT_DIR" "$STDIN_FILE" "$PWD"; do

    if echo "$check_path" | grep -qE '^(/var)?/tmp/'; then
      USES_TMP=1
    fi

  done

  if [ "$USES_TMP" = "1" ]; then
    echo "[-] Error: do not use this script in /tmp or /var/tmp." 1>&2
    exit 1
  fi

fi
152
# If @@ is specified, but there's no -f, let's come up with a temporary input
# file name.

TRACE_DIR="$OUT_DIR/.traces"

if [ -z "$STDIN_FILE" ] && echo "$*" | grep -qF '@@'; then
  STDIN_FILE="$TRACE_DIR/.cur_input"
fi
165
# Check for obvious errors.

# Refuse memory limits below 5 MB ('none' disables the check entirely).
if [ "$MEM_LIMIT" != "none" ] && [ "$MEM_LIMIT" -lt "5" ]; then
  echo "[-] Error: dangerously low memory limit." 1>&2
  exit 1
fi

# Refuse timeouts below 10 ms ('none' disables the check entirely).
if [ "$TIMEOUT" != "none" ] && [ "$TIMEOUT" -lt "10" ]; then
  echo "[-] Error: dangerously low timeout." 1>&2
  exit 1
fi
185
# Resolve the target binary: if the supplied path is not a regular executable
# file, fall back to a $PATH lookup. We keep `which` here on purpose —
# `command -v` would also resolve shell builtins and functions, which would
# change behavior for targets named e.g. 'test'.
if [ ! -f "$TARGET_BIN" ] || [ ! -x "$TARGET_BIN" ]; then

  TNEW="$(which "$TARGET_BIN" 2>/dev/null)"

  if [ ! -f "$TNEW" ] || [ ! -x "$TNEW" ]; then
    echo "[-] Error: binary '$TARGET_BIN' not found or not executable." 1>&2
    exit 1
  fi

  TARGET_BIN="$TNEW"

fi
198
# Unless the user opted out (AFL_SKIP_BIN_CHECK) or we're in QEMU mode,
# make sure the target binary actually carries afl instrumentation by
# scanning it for the __AFL_SHM_ID marker. The obsolescent `[ ... -a ... ]`
# form is replaced with two tests joined by &&.
if [ "$AFL_SKIP_BIN_CHECK" = "" ] && [ "$QEMU_MODE" = "" ]; then

  if ! grep -qF "__AFL_SHM_ID" "$TARGET_BIN"; then
    echo "[-] Error: binary '$TARGET_BIN' doesn't appear to be instrumented." 1>&2
    exit 1
  fi

fi
207
# Validate the input directory and prepare a pristine output directory
# (plus the .traces scratch space underneath it).

if [ ! -d "$IN_DIR" ]; then
  echo "[-] Error: directory '$IN_DIR' not found." 1>&2
  exit 1
fi

# If the user pointed us at an afl-fuzz output dir, descend into queue/.
test -d "$IN_DIR/queue" && IN_DIR="$IN_DIR/queue"

# Remove stale afl-cmin output (id:... / id_... files) from a previous run.
# Note: -maxdepth is a global option and must precede tests like -name,
# otherwise GNU find emits a warning.
find "$OUT_DIR" -maxdepth 1 -name 'id[:_]*' -exec rm -- {} \; 2>/dev/null
rm -rf "$TRACE_DIR" 2>/dev/null

# rmdir only succeeds on an empty directory; anything left means the user
# has unrelated files in there and we refuse to clobber them.
rmdir "$OUT_DIR" 2>/dev/null

if [ -d "$OUT_DIR" ]; then
  echo "[-] Error: directory '$OUT_DIR' exists and is not empty - delete it first." 1>&2
  exit 1
fi

mkdir -m 700 -p "$TRACE_DIR" || exit 1

# Pre-create the stdin staging file so later copies can't fail mid-run.
if [ ! "$STDIN_FILE" = "" ]; then
  rm -f "$STDIN_FILE" || exit 1
  touch "$STDIN_FILE" || exit 1
fi
231
# Locate afl-showmap: under $AFL_PATH when set, otherwise next to this script.
if [ -n "$AFL_PATH" ]; then
  SHOWMAP="$AFL_PATH/afl-showmap"
else
  SHOWMAP="${0%/afl-cmin}/afl-showmap"
fi

if [ ! -x "$SHOWMAP" ]; then
  echo "[-] Error: can't find 'afl-showmap' - please set AFL_PATH." 1>&2
  rm -rf "$TRACE_DIR"
  exit 1
fi
243
# Count the corpus files; the arithmetic expansion strips any whitespace
# padding that some wc implementations emit.
IN_COUNT=$(( $(ls -- "$IN_DIR" 2>/dev/null | wc -l) ))

if [ "$IN_COUNT" = "0" ]; then
  echo "[+] Hmm, no inputs in the target directory. Nothing to be done."
  rm -rf "$TRACE_DIR"
  exit 1
fi

FIRST_FILE=$(ls "$IN_DIR" | head -1)

# Make sure that we're not dealing with a directory.

if [ -d "$IN_DIR/$FIRST_FILE" ]; then
  echo "[-] Error: The target directory contains subdirectories - please fix." 1>&2
  rm -rf "$TRACE_DIR"
  exit 1
fi

# Hard links are cheaper than copies; probe once whether the filesystem
# allows linking from the corpus into the trace dir, and remember the tool.
if ln "$IN_DIR/$FIRST_FILE" "$TRACE_DIR/.link_test" 2>/dev/null; then
  CP_TOOL=ln
else
  CP_TOOL=cp
fi
269
# Make sure that we can actually get anything out of afl-showmap before we
# waste too much time.

echo "[*] Testing the target binary..."

if [ "$STDIN_FILE" = "" ]; then

  # Stdin mode: feed the first corpus file straight into the target.
  # AFL_CMIN_ALLOW_ANY=1 is exported only for this one invocation.
  # $EXTRA_PAR is intentionally unquoted so it word-splits into separate
  # -e / -Q flags (it is empty or starts with a space).
  AFL_CMIN_ALLOW_ANY=1 "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/.run_test" -Z $EXTRA_PAR -- "$@" <"$IN_DIR/$FIRST_FILE"

else

  # File mode (-f or @@): stage the input at $STDIN_FILE and pass it via -A.
  # NOTE(review): -A presumably tells afl-showmap where the staged input
  # lives — confirm against the afl-showmap usage text.
  cp "$IN_DIR/$FIRST_FILE" "$STDIN_FILE"
  AFL_CMIN_ALLOW_ANY=1 "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/.run_test" -Z $EXTRA_PAR -A "$STDIN_FILE" -- "$@" </dev/null

fi

# Count non-empty lines in the captured trace; zero means the target
# produced no instrumentation output at all.
FIRST_COUNT=$((`grep -c . "$TRACE_DIR/.run_test"`))

if [ "$FIRST_COUNT" -gt "0" ]; then

  echo "[+] OK, $FIRST_COUNT tuples recorded."

else

  echo "[-] Error: no instrumentation output detected (perhaps crash or timeout)." 1>&2
  test "$AFL_KEEP_TRACES" = "" && rm -rf "$TRACE_DIR"
  exit 1

fi
299
# Let's roll!

#############################
# STEP 1: COLLECTING TRACES #
#############################

echo "[*] Obtaining traces for input files in '$IN_DIR'..."

(

  # Subshell keeps the progress counter (and nothing else) contained.
  CUR=0

  if [ "$STDIN_FILE" = "" ]; then

    # Stdin mode: pipe each corpus file into the target via afl-showmap,
    # writing one trace file per input, named after the input itself.
    while read -r fn; do

      CUR=$((CUR+1))
      printf "\\r    Processing file $CUR/$IN_COUNT... "

      # $EXTRA_PAR intentionally unquoted (splits into -e / -Q flags).
      "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/$fn" -Z $EXTRA_PAR -- "$@" <"$IN_DIR/$fn"

    done < <(ls "$IN_DIR")

  else

    # File mode: stage each input at $STDIN_FILE and point afl-showmap
    # at it with -A, keeping the target's stdin empty.
    while read -r fn; do

      CUR=$((CUR+1))
      printf "\\r    Processing file $CUR/$IN_COUNT... "

      cp "$IN_DIR/$fn" "$STDIN_FILE"

      "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/$fn" -Z $EXTRA_PAR -A "$STDIN_FILE" -- "$@" </dev/null

    done < <(ls "$IN_DIR")


  fi

)

echo
342
##########################
# STEP 2: SORTING TUPLES #
##########################

# With this out of the way, we sort all tuples by popularity across all
# datasets. The reasoning here is that we won't be able to avoid the files
# that trigger unique tuples anyway, so we will want to start with them and
# see what's left.

echo "[*] Sorting trace sets (this may take a while)..."

# Concatenate every per-file trace (file names are NUL-delimited to survive
# whitespace), then count how many traces contain each tuple. Letting xargs
# batch arguments to a single cat per invocation is much faster than the
# previous 'xargs -0 -n 1 cat' (one fork+exec per corpus file) and produces
# byte-identical concatenated output.
ls "$IN_DIR" | sed "s#^#$TRACE_DIR/#" | tr '\n' '\0' | xargs -0 cat | \
  sort | uniq -c | sort -n >"$TRACE_DIR/.all_uniq"

# Arithmetic expansion normalizes wc/grep whitespace padding to a bare int.
TUPLE_COUNT=$(( $(grep -c . "$TRACE_DIR/.all_uniq") ))

echo "[+] Found $TUPLE_COUNT unique tuples across $IN_COUNT files."
360
#####################################
# STEP 3: SELECTING CANDIDATE FILES #
#####################################

# The next step is to find the best candidate for each tuple. The "best"
# part is understood simply as the smallest input that includes a particular
# tuple in its trace. Empirical evidence suggests that this produces smaller
# datasets than more involved algorithms that could be still pulled off in
# a shell script.

echo "[*] Finding best candidates for each tuple..."

CUR=0

# Walk inputs from smallest to largest (ls -S sorts by size descending,
# -r reverses to ascending) and emit "tuple fname" pairs; the first
# occurrence of any tuple therefore belongs to the smallest file.
while read -r fn; do
  CUR=$((CUR+1))
  printf '\r    Processing file %s/%s... ' "$CUR" "$IN_COUNT"
  sed "s#\$# $fn#" "$TRACE_DIR/$fn" >>"$TRACE_DIR/.candidate_list"
done < <(ls -rS "$IN_DIR")

echo
385
##############################
# STEP 4: LOADING CANDIDATES #
##############################

# At this point, we have a file of tuple-file pairs, sorted by file size
# in ascending order (as a consequence of ls -rS). By doing sort keyed
# only by tuple (-k 1,1) and configured to output only the first line for
# every key (-s -u), we end up with the smallest file for each tuple.

echo "[*] Sorting candidate list (be patient)..."

# -s keeps the sort stable so ties preserve the ascending-size ordering;
# the sed program wraps each "tuple fname" pair into a bash array
# assignment of the form BEST_FILE[tuple]="fname".
sort -k1,1 -s -u "$TRACE_DIR/.candidate_list" | \
  sed 's/^/BEST_FILE[/;s/ /]="/;s/$/"/' >"$TRACE_DIR/.candidate_script"

if [ ! -s "$TRACE_DIR/.candidate_script" ]; then
  echo "[-] Error: no traces obtained from test cases, check syntax!" 1>&2
  test "$AFL_KEEP_TRACES" = "" && rm -rf "$TRACE_DIR"
  exit 1
fi

# The sed command converted the sorted list to a shell script that populates
# BEST_FILE[tuple]="fname". Let's load that!
#
# NOTE(review): sourcing generated shell code is safe here only because both
# the tuples (afl-showmap output) and the corpus file names are local,
# trusted data; a file name containing '"' or '$' would break or inject
# into this script — worth confirming corpus names are sane.

. "$TRACE_DIR/.candidate_script"
410
##########################
# STEP 5: WRITING OUTPUT #
##########################

# The final trick is to grab the top pick for each tuple, unless said tuple is
# already set due to the inclusion of an earlier candidate; and then put all
# tuples associated with the newly-added file to the "already have" list. The
# loop works from least popular tuples and toward the most common ones.

echo "[*] Processing candidates and writing output files..."

CUR=0

touch "$TRACE_DIR/.already_have"

# .all_uniq lines are "<count> <tuple>"; cnt (the popularity count from
# uniq -c) is read but deliberately unused — only the sort order matters.
while read -r cnt tuple; do

  CUR=$((CUR+1))
  printf "\\r    Processing tuple $CUR/$TUPLE_COUNT... "

  # If we already have this tuple, skip it.
  # The tuple is interpolated into a grep regex and into an arithmetic
  # array subscript below; this assumes afl-showmap -Z emits plain decimal
  # tuple values (no regex metacharacters, no leading zeros) — TODO confirm.

  grep -q "^$tuple\$" "$TRACE_DIR/.already_have" && continue

  # Smallest file covering this tuple, as loaded from .candidate_script.
  FN=${BEST_FILE[tuple]}

  # CP_TOOL is 'ln' (hard link) when the filesystem allows it, else 'cp'.
  $CP_TOOL "$IN_DIR/$FN" "$OUT_DIR/$FN"

  # Merge this file's tuples into the "already have" set. A full sort -u
  # on every iteration would be slow, so we only compact every 5th pick
  # and cheaply append (allowing temporary duplicates) otherwise.
  if [ "$((CUR % 5))" = "0" ]; then
    sort -u "$TRACE_DIR/$FN" "$TRACE_DIR/.already_have" >"$TRACE_DIR/.tmp"
    mv -f "$TRACE_DIR/.tmp" "$TRACE_DIR/.already_have"
  else
    cat "$TRACE_DIR/$FN" >>"$TRACE_DIR/.already_have"
  fi

done <"$TRACE_DIR/.all_uniq"

echo
447
echo

# Report the result; a single survivor almost always means the traces were
# all identical (e.g. the command line was wrong).
OUT_COUNT=$(ls -- "$OUT_DIR" | wc -l)

if [ "$OUT_COUNT" = "1" ]; then
  echo "[!] WARNING: All test cases had the same traces, check syntax!"
fi

echo "[+] Narrowed down to $OUT_COUNT files, saved in '$OUT_DIR'."
echo

# Scratch traces are removed unless the user asked to keep them.
if [ "$AFL_KEEP_TRACES" = "" ]; then
  rm -rf "$TRACE_DIR"
fi

exit 0
462