#!/usr/bin/env bash
#
# american fuzzy lop - corpus minimization tool
# ---------------------------------------------
#
# Written and maintained by Michal Zalewski <lcamtuf@google.com>
#
# Copyright 2014, 2015 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# This tool tries to find the smallest subset of files in the input directory
# that still trigger the full range of instrumentation data points seen in
# the starting corpus. This has two uses:
#
#   - Screening large corpora of input files before using them as a seed for
#     afl-fuzz. The tool will remove functionally redundant files and likely
#     leave you with a much smaller set.
#
#     (In this case, you probably also want to consider running afl-tmin on
#     the individual files later on to reduce their size.)
#
#   - Minimizing the corpus generated organically by afl-fuzz, perhaps when
#     planning to feed it to more resource-intensive tools. The tool achieves
#     this by removing all entries that used to trigger unique behaviors in the
#     past, but have been made obsolete by later finds.
#
# Note that the tool doesn't modify the files themselves. For that, you want
# afl-tmin.
#
# This script must use bash because other shells may have hardcoded limits on
# array sizes.
#

# Print the banner before any argument validation, so the user sees it even
# when the script bails out with a usage message.
echo "corpus minimization tool for afl-fuzz by <lcamtuf@google.com>"
echo

#########
# SETUP #
#########

# Process command-line options...
# Default resource limits for the child process; -m and -t override these.
MEM_LIMIT=100
TIMEOUT=none

# Start from a clean slate so stale environment values can't leak into the
# option-parsing logic below.
unset IN_DIR OUT_DIR STDIN_FILE EXTRA_PAR MEM_LIMIT_GIVEN \
  AFL_CMIN_CRASHES_ONLY AFL_CMIN_ALLOW_ANY QEMU_MODE

while getopts "+i:o:f:m:t:eQC" opt; do

  case "$opt" in

    "i")
         IN_DIR="$OPTARG"
         ;;

    "o")
         OUT_DIR="$OPTARG"
         ;;
    "f")
         STDIN_FILE="$OPTARG"
         ;;
    "m")
         MEM_LIMIT="$OPTARG"
         # Remember that the user picked a limit explicitly, so -Q below
         # doesn't clobber it with its own default.
         MEM_LIMIT_GIVEN=1
         ;;
    "t")
         TIMEOUT="$OPTARG"
         ;;
    "e")
         # Forwarded verbatim to afl-showmap: edge coverage only.
         EXTRA_PAR="$EXTRA_PAR -e"
         ;;
    "C")
         # Exported (not just set) so that the afl-showmap child processes
         # see it and keep only crashing inputs.
         export AFL_CMIN_CRASHES_ONLY=1
         ;;
    "Q")
         EXTRA_PAR="$EXTRA_PAR -Q"
         # QEMU mode needs a higher memory ceiling unless the user already
         # chose one via -m.
         test "$MEM_LIMIT_GIVEN" = "" && MEM_LIMIT=250
         QEMU_MODE=1
         ;;
    "?")
         exit 1
         ;;

   esac

done

shift $((OPTIND-1))

TARGET_BIN="$1"

if [ "$TARGET_BIN" = "" -o "$IN_DIR" = "" -o "$OUT_DIR" = "" ]; then

  cat 1>&2 <<_EOF_
Usage: $0 [ options ] -- /path/to/target_app [ ... ]

Required parameters:

  -i dir        - input directory with the starting corpus
  -o dir        - output directory for minimized files

Execution control settings:

  -f file       - location read by the fuzzed program (stdin)
  -m megs       - memory limit for child process ($MEM_LIMIT MB)
  -t msec       - run time limit for child process (none)
  -Q            - use binary-only instrumentation (QEMU mode)

Minimization settings:

  -C            - keep crashing inputs, reject everything else
  -e            - solve for edge coverage only, ignore hit counts

For additional tips, please consult docs/README.

_EOF_
  exit 1
fi

# Do a sanity check to discourage the use of /tmp, since we can't really
# handle this safely from a shell script.

if [ "$AFL_ALLOW_TMP" = "" ]; then

  # grep -q exits 0 on a match, so Tn = 0 means "this path is under /tmp
  # or /var/tmp".
  echo "$IN_DIR" | grep -qE '^(/var)?/tmp/'
  T1="$?"

  echo "$TARGET_BIN" | grep -qE '^(/var)?/tmp/'
  T2="$?"

  echo "$OUT_DIR" | grep -qE '^(/var)?/tmp/'
  T3="$?"

  echo "$STDIN_FILE" | grep -qE '^(/var)?/tmp/'
  T4="$?"

  echo "$PWD" | grep -qE '^(/var)?/tmp/'
  T5="$?"

  if [ "$T1" = "0" -o "$T2" = "0" -o "$T3" = "0" -o "$T4" = "0" -o "$T5" = "0" ]; then
    echo "[-] Error: do not use this script in /tmp or /var/tmp." 1>&2
    exit 1
  fi

fi

# If @@ is specified, but there's no -f, let's come up with a temporary input
# file name.

TRACE_DIR="$OUT_DIR/.traces"

if [ "$STDIN_FILE" = "" ]; then

  if echo "$*" | grep -qF '@@'; then
    STDIN_FILE="$TRACE_DIR/.cur_input"
  fi

fi

# Check for obvious errors.

if [ ! "$MEM_LIMIT" = "none" ]; then

  if [ "$MEM_LIMIT" -lt "5" ]; then
    echo "[-] Error: dangerously low memory limit." 1>&2
    exit 1
  fi

fi

if [ ! "$TIMEOUT" = "none" ]; then

  if [ "$TIMEOUT" -lt "10" ]; then
    echo "[-] Error: dangerously low timeout." 1>&2
    exit 1
  fi

fi

if [ ! -f "$TARGET_BIN" -o ! -x "$TARGET_BIN" ]; then

  # Not usable as a direct path - fall back to a $PATH lookup.
  TNEW="`which "$TARGET_BIN" 2>/dev/null`"

  if [ ! -f "$TNEW" -o ! -x "$TNEW" ]; then
    echo "[-] Error: binary '$TARGET_BIN' not found or not executable." 1>&2
    exit 1
  fi

  TARGET_BIN="$TNEW"

fi

# Instrumented binaries contain the literal string __AFL_SHM_ID; its absence
# suggests the target was not built with the afl compiler wrappers. Skipped
# in QEMU mode, where instrumentation is injected at run time.
if [ "$AFL_SKIP_BIN_CHECK" = "" -a "$QEMU_MODE" = "" ]; then

  if ! grep -qF "__AFL_SHM_ID" "$TARGET_BIN"; then
    echo "[-] Error: binary '$TARGET_BIN' doesn't appear to be instrumented." 1>&2
    exit 1
  fi

fi

if [ ! -d "$IN_DIR" ]; then
  echo "[-] Error: directory '$IN_DIR' not found." 1>&2
  exit 1
fi

# Allow pointing -i directly at an afl-fuzz output directory.
test -d "$IN_DIR/queue" && IN_DIR="$IN_DIR/queue"

# Clean up leftovers from a previous run (output files are named id:... or
# id_...), then insist that the output directory is otherwise empty.
find "$OUT_DIR" -name 'id[:_]*' -maxdepth 1 -exec rm -- {} \; 2>/dev/null
rm -rf "$TRACE_DIR" 2>/dev/null

rmdir "$OUT_DIR" 2>/dev/null

if [ -d "$OUT_DIR" ]; then
  echo "[-] Error: directory '$OUT_DIR' exists and is not empty - delete it first." 1>&2
  exit 1
fi

# -m 700: traces may reveal input contents; keep them private.
mkdir -m 700 -p "$TRACE_DIR" || exit 1

if [ ! "$STDIN_FILE" = "" ]; then
  rm -f "$STDIN_FILE" || exit 1
  touch "$STDIN_FILE" || exit 1
fi

# Locate afl-showmap: next to this script by default, or under $AFL_PATH.
if [ "$AFL_PATH" = "" ]; then
  SHOWMAP="${0%/afl-cmin}/afl-showmap"
else
  SHOWMAP="$AFL_PATH/afl-showmap"
fi

if [ ! -x "$SHOWMAP" ]; then
  echo "[-] Error: can't find 'afl-showmap' - please set AFL_PATH." 1>&2
  rm -rf "$TRACE_DIR"
  exit 1
fi

IN_COUNT=$((`ls -- "$IN_DIR" 2>/dev/null | wc -l`))

if [ "$IN_COUNT" = "0" ]; then
  echo "[+] Hmm, no inputs in the target directory. Nothing to be done."
  rm -rf "$TRACE_DIR"
  exit 1
fi

FIRST_FILE=`ls "$IN_DIR" | head -1`

# Make sure that we're not dealing with a directory.

if [ -d "$IN_DIR/$FIRST_FILE" ]; then
  echo "[-] Error: The target directory contains subdirectories - please fix." 1>&2
  rm -rf "$TRACE_DIR"
  exit 1
fi

# Check for the more efficient way to copy files...
# Prefer hard links over copies when input and output live on the same
# filesystem; the probe link is left in .traces, which gets deleted later.

if ln "$IN_DIR/$FIRST_FILE" "$TRACE_DIR/.link_test" 2>/dev/null; then
  CP_TOOL=ln
else
  CP_TOOL=cp
fi

# Make sure that we can actually get anything out of afl-showmap before we
# waste too much time.

echo "[*] Testing the target binary..."

# NOTE(review): -Z and AFL_CMIN_ALLOW_ANY appear to be afl-showmap's
# cmin-support switches (per-line tuple output / accept any exit condition
# for this probe run) - confirm against the afl-showmap docs for your AFL
# version.
if [ "$STDIN_FILE" = "" ]; then

  AFL_CMIN_ALLOW_ANY=1 "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/.run_test" -Z $EXTRA_PAR -- "$@" <"$IN_DIR/$FIRST_FILE"

else

  cp "$IN_DIR/$FIRST_FILE" "$STDIN_FILE"
  AFL_CMIN_ALLOW_ANY=1 "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/.run_test" -Z $EXTRA_PAR -A "$STDIN_FILE" -- "$@" </dev/null

fi

# grep -c . counts non-empty lines, i.e. the number of tuples recorded.
FIRST_COUNT=$((`grep -c . "$TRACE_DIR/.run_test"`))

if [ "$FIRST_COUNT" -gt "0" ]; then

  echo "[+] OK, $FIRST_COUNT tuples recorded."

else

  echo "[-] Error: no instrumentation output detected (perhaps crash or timeout)." 1>&2
  test "$AFL_KEEP_TRACES" = "" && rm -rf "$TRACE_DIR"
  exit 1

fi

# Let's roll!
301 302############################# 303# STEP 1: COLLECTING TRACES # 304############################# 305 306echo "[*] Obtaining traces for input files in '$IN_DIR'..." 307 308( 309 310 CUR=0 311 312 if [ "$STDIN_FILE" = "" ]; then 313 314 while read -r fn; do 315 316 CUR=$((CUR+1)) 317 printf "\\r Processing file $CUR/$IN_COUNT... " 318 319 "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/$fn" -Z $EXTRA_PAR -- "$@" <"$IN_DIR/$fn" 320 321 done < <(ls "$IN_DIR") 322 323 else 324 325 while read -r fn; do 326 327 CUR=$((CUR+1)) 328 printf "\\r Processing file $CUR/$IN_COUNT... " 329 330 cp "$IN_DIR/$fn" "$STDIN_FILE" 331 332 "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/$fn" -Z $EXTRA_PAR -A "$STDIN_FILE" -- "$@" </dev/null 333 334 done < <(ls "$IN_DIR") 335 336 337 fi 338 339) 340 341echo 342 343########################## 344# STEP 2: SORTING TUPLES # 345########################## 346 347# With this out of the way, we sort all tuples by popularity across all 348# datasets. The reasoning here is that we won't be able to avoid the files 349# that trigger unique tuples anyway, so we will want to start with them and 350# see what's left. 351 352echo "[*] Sorting trace sets (this may take a while)..." 353 354ls "$IN_DIR" | sed "s#^#$TRACE_DIR/#" | tr '\n' '\0' | xargs -0 -n 1 cat | \ 355 sort | uniq -c | sort -n >"$TRACE_DIR/.all_uniq" 356 357TUPLE_COUNT=$((`grep -c . "$TRACE_DIR/.all_uniq"`)) 358 359echo "[+] Found $TUPLE_COUNT unique tuples across $IN_COUNT files." 360 361##################################### 362# STEP 3: SELECTING CANDIDATE FILES # 363##################################### 364 365# The next step is to find the best candidate for each tuple. The "best" 366# part is understood simply as the smallest input that includes a particular 367# tuple in its trace. Empirical evidence suggests that this produces smaller 368# datasets than more involved algorithms that could be still pulled off in 369# a shell script. 
echo "[*] Finding best candidates for each tuple..."

CUR=0

# Walk the inputs from largest to smallest (ls -rS) and emit "tuple fname"
# pairs; later entries for a tuple therefore always come from smaller files.
while read -r fn; do

  CUR=$((CUR+1))
  printf "\\r    Processing file $CUR/$IN_COUNT... "

  sed "s#\$# $fn#" "$TRACE_DIR/$fn" >>"$TRACE_DIR/.candidate_list"

done < <(ls -rS "$IN_DIR")

echo

##############################
# STEP 4: LOADING CANDIDATES #
##############################

# At this point, we have a file of tuple-file pairs, sorted by file size
# in ascending order (as a consequence of ls -rS). By doing sort keyed
# only by tuple (-k 1,1) and configured to output only the first line for
# every key (-s -u), we end up with the smallest file for each tuple.

echo "[*] Sorting candidate list (be patient)..."

sort -k1,1 -s -u "$TRACE_DIR/.candidate_list" | \
  sed 's/^/BEST_FILE[/;s/ /]="/;s/$/"/' >"$TRACE_DIR/.candidate_script"

if [ ! -s "$TRACE_DIR/.candidate_script" ]; then
  echo "[-] Error: no traces obtained from test cases, check syntax!" 1>&2
  test "$AFL_KEEP_TRACES" = "" && rm -rf "$TRACE_DIR"
  exit 1
fi

# The sed command converted the sorted list to a shell script that populates
# BEST_FILE[tuple]="fname". Let's load that!

. "$TRACE_DIR/.candidate_script"

##########################
# STEP 5: WRITING OUTPUT #
##########################

# The final trick is to grab the top pick for each tuple, unless said tuple is
# already set due to the inclusion of an earlier candidate; and then put all
# tuples associated with the newly-added file to the "already have" list. The
# loop works from least popular tuples and toward the most common ones.

echo "[*] Processing candidates and writing output files..."

CUR=0

touch "$TRACE_DIR/.already_have"

# .all_uniq lines are "<count> <tuple>", rarest first (see step 2).
while read -r cnt tuple; do

  CUR=$((CUR+1))
  printf "\\r    Processing tuple $CUR/$TUPLE_COUNT... "

  # If we already have this tuple, skip it.

  grep -q "^$tuple\$" "$TRACE_DIR/.already_have" && continue

  # Bash evaluates indexed-array subscripts arithmetically, so this is
  # equivalent to ${BEST_FILE[$tuple]} - presumably tuple values are purely
  # numeric; verify against afl-showmap's output format.
  FN=${BEST_FILE[tuple]}

  $CP_TOOL "$IN_DIR/$FN" "$OUT_DIR/$FN"

  # Fold the newly covered tuples into .already_have. Every fifth pick we
  # re-sort and deduplicate the list to keep the grep above from scanning
  # an ever-growing file; in between, a plain append is cheaper.
  if [ "$((CUR % 5))" = "0" ]; then
    sort -u "$TRACE_DIR/$FN" "$TRACE_DIR/.already_have" >"$TRACE_DIR/.tmp"
    mv -f "$TRACE_DIR/.tmp" "$TRACE_DIR/.already_have"
  else
    cat "$TRACE_DIR/$FN" >>"$TRACE_DIR/.already_have"
  fi

done <"$TRACE_DIR/.all_uniq"

echo

OUT_COUNT=`ls -- "$OUT_DIR" | wc -l`

if [ "$OUT_COUNT" = "1" ]; then
  echo "[!] WARNING: All test cases had the same traces, check syntax!"
fi

echo "[+] Narrowed down to $OUT_COUNT files, saved in '$OUT_DIR'."
echo

# Keep the .traces scratch directory only if the user asked for it.
test "$AFL_KEEP_TRACES" = "" && rm -rf "$TRACE_DIR"

exit 0