1 //===- FuzzedDataProvider.h - Utility header for fuzz targets ---*- C++ -* ===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 // A single header library providing an utility class to break up an array of
9 // bytes. Whenever run on the same input, provides the same output, as long as
10 // its methods are called in the same order, with the same arguments.
11 //===----------------------------------------------------------------------===//
12 
13 #ifndef LLVM_FUZZER_FUZZED_DATA_PROVIDER_H_
14 #define LLVM_FUZZER_FUZZED_DATA_PROVIDER_H_
15 
16 #include <algorithm>
17 #include <climits>
18 #include <cstddef>
19 #include <cstdint>
20 #include <cstring>
21 #include <initializer_list>
22 #include <string>
23 #include <type_traits>
24 #include <utility>
25 #include <vector>
26 
27 // In addition to the comments below, the API is also briefly documented at
28 // https://github.com/google/fuzzing/blob/master/docs/split-inputs.md#fuzzed-data-provider
29 class FuzzedDataProvider
30 {
31 public:
32     // |data| is an array of length |size| that the FuzzedDataProvider wraps to
33     // provide more granular access. |data| must outlive the FuzzedDataProvider.
FuzzedDataProvider(const uint8_t * data,size_t size)34     FuzzedDataProvider(const uint8_t *data, size_t size) : data_ptr_(data), remaining_bytes_(size) { }
35     ~FuzzedDataProvider() = default;
36 
37     // See the implementation below (after the class definition) for more verbose
38     // comments for each of the methods.
39 
40     // Methods returning std::vector of bytes. These are the most popular choice
41     // when splitting fuzzing input into pieces, as every piece is put into a
42     // separate buffer (i.e. ASan would catch any under-/overflow) and the memory
43     // will be released automatically.
44     template<typename T>
45     std::vector<T> ConsumeBytes(size_t num_bytes);
46     template<typename T>
47     std::vector<T> ConsumeBytesWithTerminator(size_t num_bytes, T terminator = 0);
48     template<typename T>
49     std::vector<T> ConsumeRemainingBytes();
50 
51     // Methods returning strings. Use only when you need a std::string or a null
52     // terminated C-string. Otherwise, prefer the methods returning std::vector.
53     std::string ConsumeBytesAsString(size_t num_bytes);
54     std::string ConsumeRandomLengthString(size_t max_length);
55     std::string ConsumeRandomLengthString();
56     std::string ConsumeRemainingBytesAsString();
57 
58     // Methods returning integer values.
59     template<typename T>
60     T ConsumeIntegral();
61     template<typename T>
62     T ConsumeIntegralInRange(T min, T max);
63 
64     // Methods returning floating point values.
65     template<typename T>
66     T ConsumeFloatingPoint();
67     template<typename T>
68     T ConsumeFloatingPointInRange(T min, T max);
69 
70     // 0 <= return value <= 1.
71     template<typename T>
72     T ConsumeProbability();
73 
74     bool ConsumeBool();
75 
76     // Returns a value chosen from the given enum.
77     template<typename T>
78     T ConsumeEnum();
79 
80     // Returns a value from the given array.
81     template<typename T, size_t size>
82     T PickValueInArray(const T (&array)[size]);
83     template<typename T>
84     T PickValueInArray(std::initializer_list<const T> list);
85 
86     // Writes data to the given destination and returns number of bytes written.
87     size_t ConsumeData(void *destination, size_t num_bytes);
88 
89     // Reports the remaining bytes available for fuzzed input.
remaining_bytes()90     size_t remaining_bytes() { return remaining_bytes_; }
91 
92 private:
93     FuzzedDataProvider(const FuzzedDataProvider &) = delete;
94     FuzzedDataProvider &operator=(const FuzzedDataProvider &) = delete;
95 
96     void CopyAndAdvance(void *destination, size_t num_bytes);
97 
98     void Advance(size_t num_bytes);
99 
100     template<typename T>
101     std::vector<T> ConsumeBytes(size_t size, size_t num_bytes);
102 
103     template<typename TS, typename TU>
104     TS ConvertUnsignedToSigned(TU value);
105 
106     const uint8_t *data_ptr_;
107     size_t remaining_bytes_;
108 };
109 
110 // Returns a std::vector containing |num_bytes| of input data. If fewer than
111 // |num_bytes| of data remain, returns a shorter std::vector containing all
112 // of the data that's left. Can be used with any byte sized type, such as
113 // char, unsigned char, uint8_t, etc.
114 template<typename T>
ConsumeBytes(size_t num_bytes)115 std::vector<T> FuzzedDataProvider::ConsumeBytes(size_t num_bytes)
116 {
117     num_bytes = std::min(num_bytes, remaining_bytes_);
118     return ConsumeBytes<T>(num_bytes, num_bytes);
119 }
120 
121 // Similar to |ConsumeBytes|, but also appends the terminator value at the end
122 // of the resulting vector. Useful, when a mutable null-terminated C-string is
123 // needed, for example. But that is a rare case. Better avoid it, if possible,
124 // and prefer using |ConsumeBytes| or |ConsumeBytesAsString| methods.
125 template<typename T>
ConsumeBytesWithTerminator(size_t num_bytes,T terminator)126 std::vector<T> FuzzedDataProvider::ConsumeBytesWithTerminator(size_t num_bytes, T terminator)
127 {
128     num_bytes = std::min(num_bytes, remaining_bytes_);
129     std::vector<T> result = ConsumeBytes<T>(num_bytes + 1, num_bytes);
130     result.back() = terminator;
131     return result;
132 }
133 
134 // Returns a std::vector containing all remaining bytes of the input data.
135 template<typename T>
ConsumeRemainingBytes()136 std::vector<T> FuzzedDataProvider::ConsumeRemainingBytes()
137 {
138     return ConsumeBytes<T>(remaining_bytes_);
139 }
140 
141 // Returns a std::string containing |num_bytes| of input data. Using this and
142 // |.c_str()| on the resulting string is the best way to get an immutable
143 // null-terminated C string. If fewer than |num_bytes| of data remain, returns
144 // a shorter std::string containing all of the data that's left.
ConsumeBytesAsString(size_t num_bytes)145 inline std::string FuzzedDataProvider::ConsumeBytesAsString(size_t num_bytes)
146 {
147     static_assert(sizeof(std::string::value_type) == sizeof(uint8_t), "ConsumeBytesAsString cannot convert the data to a string.");
148 
149     num_bytes = std::min(num_bytes, remaining_bytes_);
150     std::string result(reinterpret_cast<const std::string::value_type *>(data_ptr_), num_bytes);
151     Advance(num_bytes);
152     return result;
153 }
154 
155 // Returns a std::string of length from 0 to |max_length|. When it runs out of
156 // input data, returns what remains of the input. Designed to be more stable
157 // with respect to a fuzzer inserting characters than just picking a random
158 // length and then consuming that many bytes with |ConsumeBytes|.
ConsumeRandomLengthString(size_t max_length)159 inline std::string FuzzedDataProvider::ConsumeRandomLengthString(size_t max_length)
160 {
161     // Reads bytes from the start of |data_ptr_|. Maps "\\" to "\", and maps "\"
162     // followed by anything else to the end of the string. As a result of this
163     // logic, a fuzzer can insert characters into the string, and the string
164     // will be lengthened to include those new characters, resulting in a more
165     // stable fuzzer than picking the length of a string independently from
166     // picking its contents.
167     std::string result;
168 
169     // Reserve the anticipated capaticity to prevent several reallocations.
170     result.reserve(std::min(max_length, remaining_bytes_));
171     for (size_t i = 0; i < max_length && remaining_bytes_ != 0; ++i) {
172         char next = ConvertUnsignedToSigned<char>(data_ptr_[0]);
173         Advance(1);
174         if (next == '\\' && remaining_bytes_ != 0) {
175             next = ConvertUnsignedToSigned<char>(data_ptr_[0]);
176             Advance(1);
177             if (next != '\\')
178                 break;
179         }
180         result += next;
181     }
182 
183     result.shrink_to_fit();
184     return result;
185 }
186 
187 // Returns a std::string of length from 0 to |remaining_bytes_|.
ConsumeRandomLengthString()188 inline std::string FuzzedDataProvider::ConsumeRandomLengthString()
189 {
190     return ConsumeRandomLengthString(remaining_bytes_);
191 }
192 
193 // Returns a std::string containing all remaining bytes of the input data.
194 // Prefer using |ConsumeRemainingBytes| unless you actually need a std::string
195 // object.
ConsumeRemainingBytesAsString()196 inline std::string FuzzedDataProvider::ConsumeRemainingBytesAsString()
197 {
198     return ConsumeBytesAsString(remaining_bytes_);
199 }
200 
201 // Returns a number in the range [Type's min, Type's max]. The value might
202 // not be uniformly distributed in the given range. If there's no input data
203 // left, always returns |min|.
204 template<typename T>
ConsumeIntegral()205 T FuzzedDataProvider::ConsumeIntegral()
206 {
207     return ConsumeIntegralInRange(std::numeric_limits<T>::min(), std::numeric_limits<T>::max());
208 }
209 
210 // Returns a number in the range [min, max] by consuming bytes from the
211 // input data. The value might not be uniformly distributed in the given
212 // range. If there's no input data left, always returns |min|. |min| must
213 // be less than or equal to |max|.
214 template<typename T>
ConsumeIntegralInRange(T min,T max)215 T FuzzedDataProvider::ConsumeIntegralInRange(T min, T max)
216 {
217     static_assert(std::is_integral<T>::value, "An integral type is required.");
218     static_assert(sizeof(T) <= sizeof(uint64_t), "Unsupported integral type.");
219 
220     if (min > max)
221         abort();
222 
223     // Use the biggest type possible to hold the range and the result.
224     uint64_t range = static_cast<uint64_t>(max) - min;
225     uint64_t result = 0;
226     size_t offset = 0;
227 
228     while (offset < sizeof(T) * CHAR_BIT && (range >> offset) > 0 && remaining_bytes_ != 0) {
229         // Pull bytes off the end of the seed data. Experimentally, this seems to
230         // allow the fuzzer to more easily explore the input space. This makes
231         // sense, since it works by modifying inputs that caused new code to run,
232         // and this data is often used to encode length of data read by
233         // |ConsumeBytes|. Separating out read lengths makes it easier modify the
234         // contents of the data that is actually read.
235         --remaining_bytes_;
236         result = (result << CHAR_BIT) | data_ptr_[remaining_bytes_];
237         offset += CHAR_BIT;
238     }
239 
240     // Avoid division by 0, in case |range + 1| results in overflow.
241     if (range != std::numeric_limits<decltype(range)>::max())
242         result = result % (range + 1);
243 
244     return static_cast<T>(min + result);
245 }
246 
247 // Returns a floating point value in the range [Type's lowest, Type's max] by
248 // consuming bytes from the input data. If there's no input data left, always
249 // returns approximately 0.
250 template<typename T>
ConsumeFloatingPoint()251 T FuzzedDataProvider::ConsumeFloatingPoint()
252 {
253     return ConsumeFloatingPointInRange<T>(std::numeric_limits<T>::lowest(), std::numeric_limits<T>::max());
254 }
255 
256 // Returns a floating point value in the given range by consuming bytes from
257 // the input data. If there's no input data left, returns |min|. Note that
258 // |min| must be less than or equal to |max|.
259 template<typename T>
ConsumeFloatingPointInRange(T min,T max)260 T FuzzedDataProvider::ConsumeFloatingPointInRange(T min, T max)
261 {
262     if (min > max)
263         abort();
264 
265     T range = .0;
266     T result = min;
267     constexpr T zero(.0);
268     if (max > zero && min < zero && max > min + std::numeric_limits<T>::max()) {
269         // The diff |max - min| would overflow the given floating point type. Use
270         // the half of the diff as the range and consume a bool to decide whether
271         // the result is in the first of the second part of the diff.
272         range = (max / 2.0) - (min / 2.0);
273         if (ConsumeBool()) {
274             result += range;
275         }
276     } else {
277         range = max - min;
278     }
279 
280     return result + range * ConsumeProbability<T>();
281 }
282 
283 // Returns a floating point number in the range [0.0, 1.0]. If there's no
284 // input data left, always returns 0.
285 template<typename T>
ConsumeProbability()286 T FuzzedDataProvider::ConsumeProbability()
287 {
288     static_assert(std::is_floating_point<T>::value, "A floating point type is required.");
289 
290     // Use different integral types for different floating point types in order
291     // to provide better density of the resulting values.
292     using IntegralType = typename std::conditional<(sizeof(T) <= sizeof(uint32_t)), uint32_t, uint64_t>::type;
293 
294     T result = static_cast<T>(ConsumeIntegral<IntegralType>());
295     result /= static_cast<T>(std::numeric_limits<IntegralType>::max());
296     return result;
297 }
298 
299 // Reads one byte and returns a bool, or false when no data remains.
ConsumeBool()300 inline bool FuzzedDataProvider::ConsumeBool()
301 {
302     return 1 & ConsumeIntegral<uint8_t>();
303 }
304 
305 // Returns an enum value. The enum must start at 0 and be contiguous. It must
306 // also contain |kMaxValue| aliased to its largest (inclusive) value. Such as:
307 // enum class Foo { SomeValue, OtherValue, kMaxValue = OtherValue };
308 template<typename T>
ConsumeEnum()309 T FuzzedDataProvider::ConsumeEnum()
310 {
311     static_assert(std::is_enum<T>::value, "|T| must be an enum type.");
312     return static_cast<T>(ConsumeIntegralInRange<uint32_t>(0, static_cast<uint32_t>(T::kMaxValue)));
313 }
314 
315 // Returns a copy of the value selected from the given fixed-size |array|.
316 template<typename T, size_t size>
PickValueInArray(const T (& array)[size])317 T FuzzedDataProvider::PickValueInArray(const T (&array)[size])
318 {
319     static_assert(size > 0, "The array must be non empty.");
320     return array[ConsumeIntegralInRange<size_t>(0, size - 1)];
321 }
322 
323 template<typename T>
PickValueInArray(std::initializer_list<const T> list)324 T FuzzedDataProvider::PickValueInArray(std::initializer_list<const T> list)
325 {
326     // TODO(Dor1s): switch to static_assert once C++14 is allowed.
327     if (!list.size())
328         abort();
329 
330     return *(list.begin() + ConsumeIntegralInRange<size_t>(0, list.size() - 1));
331 }
332 
333 // Writes |num_bytes| of input data to the given destination pointer. If there
334 // is not enough data left, writes all remaining bytes. Return value is the
335 // number of bytes written.
336 // In general, it's better to avoid using this function, but it may be useful
337 // in cases when it's necessary to fill a certain buffer or object with
338 // fuzzing data.
ConsumeData(void * destination,size_t num_bytes)339 inline size_t FuzzedDataProvider::ConsumeData(void *destination, size_t num_bytes)
340 {
341     num_bytes = std::min(num_bytes, remaining_bytes_);
342     CopyAndAdvance(destination, num_bytes);
343     return num_bytes;
344 }
345 
346 // Private methods.
CopyAndAdvance(void * destination,size_t num_bytes)347 inline void FuzzedDataProvider::CopyAndAdvance(void *destination, size_t num_bytes)
348 {
349     std::memcpy(destination, data_ptr_, num_bytes);
350     Advance(num_bytes);
351 }
352 
Advance(size_t num_bytes)353 inline void FuzzedDataProvider::Advance(size_t num_bytes)
354 {
355     if (num_bytes > remaining_bytes_)
356         abort();
357 
358     data_ptr_ += num_bytes;
359     remaining_bytes_ -= num_bytes;
360 }
361 
362 template<typename T>
ConsumeBytes(size_t size,size_t num_bytes)363 std::vector<T> FuzzedDataProvider::ConsumeBytes(size_t size, size_t num_bytes)
364 {
365     static_assert(sizeof(T) == sizeof(uint8_t), "Incompatible data type.");
366 
367     // The point of using the size-based constructor below is to increase the
368     // odds of having a vector object with capacity being equal to the length.
369     // That part is always implementation specific, but at least both libc++ and
370     // libstdc++ allocate the requested number of bytes in that constructor,
371     // which seems to be a natural choice for other implementations as well.
372     // To increase the odds even more, we also call |shrink_to_fit| below.
373     std::vector<T> result(size);
374     if (size == 0) {
375         if (num_bytes != 0)
376             abort();
377         return result;
378     }
379 
380     CopyAndAdvance(result.data(), num_bytes);
381 
382     // Even though |shrink_to_fit| is also implementation specific, we expect it
383     // to provide an additional assurance in case vector's constructor allocated
384     // a buffer which is larger than the actual amount of data we put inside it.
385     result.shrink_to_fit();
386     return result;
387 }
388 
389 template<typename TS, typename TU>
ConvertUnsignedToSigned(TU value)390 TS FuzzedDataProvider::ConvertUnsignedToSigned(TU value)
391 {
392     static_assert(sizeof(TS) == sizeof(TU), "Incompatible data types.");
393     static_assert(!std::numeric_limits<TU>::is_signed, "Source type must be unsigned.");
394 
395     // TODO(Dor1s): change to `if constexpr` once C++17 becomes mainstream.
396     if (std::numeric_limits<TS>::is_modulo)
397         return static_cast<TS>(value);
398 
399     // Avoid using implementation-defined unsigned to signed conversions.
400     // To learn more, see https://stackoverflow.com/questions/13150449.
401     if (value <= std::numeric_limits<TS>::max()) {
402         return static_cast<TS>(value);
403     } else {
404         constexpr auto TS_min = std::numeric_limits<TS>::min();
405         return TS_min + static_cast<char>(value - TS_min);
406     }
407 }
408 
409 #endif // LLVM_FUZZER_FUZZED_DATA_PROVIDER_H_
410