1 /* A fuzz test for CPython.
2 
3   The only exposed function is LLVMFuzzerTestOneInput, which is called by
4   fuzzers and by the _fuzz module for smoke tests.
5 
6   To build exactly one fuzz test, as when running in oss-fuzz etc.,
7   build with -D _Py_FUZZ_ONE and -D _Py_FUZZ_<test_name>. e.g. to build
8   LLVMFuzzerTestOneInput to only run "fuzz_builtin_float", build this file with
9       -D _Py_FUZZ_ONE -D _Py_FUZZ_fuzz_builtin_float.
10 
11   See the source code for LLVMFuzzerTestOneInput for details. */
12 
#include <Python.h>
#include <inttypes.h>
#include <stdlib.h>
#include <string.h>
16 
17 /*  Fuzz PyFloat_FromString as a proxy for float(str). */
fuzz_builtin_float(const char * data,size_t size)18 static int fuzz_builtin_float(const char* data, size_t size) {
19     PyObject* s = PyBytes_FromStringAndSize(data, size);
20     if (s == NULL) return 0;
21     PyObject* f = PyFloat_FromString(s);
22     if (PyErr_Occurred() && PyErr_ExceptionMatches(PyExc_ValueError)) {
23         PyErr_Clear();
24     }
25 
26     Py_XDECREF(f);
27     Py_DECREF(s);
28     return 0;
29 }
30 
31 #define MAX_INT_TEST_SIZE 0x10000
32 
33 /* Fuzz PyLong_FromUnicodeObject as a proxy for int(str). */
fuzz_builtin_int(const char * data,size_t size)34 static int fuzz_builtin_int(const char* data, size_t size) {
35     /* Ignore test cases with very long ints to avoid timeouts
36        int("9" * 1000000) is not a very interesting test caase */
37     if (size > MAX_INT_TEST_SIZE) {
38         return 0;
39     }
40     /* Pick a random valid base. (When the fuzzed function takes extra
41        parameters, it's somewhat normal to hash the input to generate those
42        parameters. We want to exercise all code paths, so we do so here.) */
43     int base = _Py_HashBytes(data, size) % 37;
44     if (base == 1) {
45         // 1 is the only number between 0 and 36 that is not a valid base.
46         base = 0;
47     }
48     if (base == -1) {
49         return 0;  // An error occurred, bail early.
50     }
51     if (base < 0) {
52         base = -base;
53     }
54 
55     PyObject* s = PyUnicode_FromStringAndSize(data, size);
56     if (s == NULL) {
57         if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
58             PyErr_Clear();
59         }
60         return 0;
61     }
62     PyObject* l = PyLong_FromUnicodeObject(s, base);
63     if (l == NULL && PyErr_ExceptionMatches(PyExc_ValueError)) {
64         PyErr_Clear();
65     }
66     PyErr_Clear();
67     Py_XDECREF(l);
68     Py_DECREF(s);
69     return 0;
70 }
71 
72 /* Fuzz PyUnicode_FromStringAndSize as a proxy for unicode(str). */
fuzz_builtin_unicode(const char * data,size_t size)73 static int fuzz_builtin_unicode(const char* data, size_t size) {
74     PyObject* s = PyUnicode_FromStringAndSize(data, size);
75     if (s == NULL && PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
76         PyErr_Clear();
77     }
78     Py_XDECREF(s);
79     return 0;
80 }
81 
82 #define MAX_JSON_TEST_SIZE 0x10000
83 
84 PyObject* json_loads_method = NULL;
85 /* Called by LLVMFuzzerTestOneInput for initialization */
init_json_loads()86 static int init_json_loads() {
87     /* Import json.loads */
88     PyObject* json_module = PyImport_ImportModule("json");
89     if (json_module == NULL) {
90         return 0;
91     }
92     json_loads_method = PyObject_GetAttrString(json_module, "loads");
93     return json_loads_method != NULL;
94 }
95 /* Fuzz json.loads(x) */
fuzz_json_loads(const char * data,size_t size)96 static int fuzz_json_loads(const char* data, size_t size) {
97     /* Since python supports arbitrarily large ints in JSON,
98        long inputs can lead to timeouts on boring inputs like
99        `json.loads("9" * 100000)` */
100     if (size > MAX_JSON_TEST_SIZE) {
101         return 0;
102     }
103     PyObject* input_bytes = PyBytes_FromStringAndSize(data, size);
104     if (input_bytes == NULL) {
105         return 0;
106     }
107     PyObject* parsed = PyObject_CallFunctionObjArgs(json_loads_method, input_bytes, NULL);
108     if (parsed == NULL) {
109         /* Ignore ValueError as the fuzzer will more than likely
110            generate some invalid json and values */
111         if (PyErr_ExceptionMatches(PyExc_ValueError) ||
112         /* Ignore RecursionError as the fuzzer generates long sequences of
113            arrays such as `[[[...` */
114             PyErr_ExceptionMatches(PyExc_RecursionError) ||
115         /* Ignore unicode errors, invalid byte sequences are common */
116             PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)
117         ) {
118             PyErr_Clear();
119         }
120     }
121     Py_DECREF(input_bytes);
122     Py_XDECREF(parsed);
123     return 0;
124 }
125 
126 #define MAX_RE_TEST_SIZE 0x10000
127 
128 PyObject* sre_compile_method = NULL;
129 PyObject* sre_error_exception = NULL;
130 int SRE_FLAG_DEBUG = 0;
131 /* Called by LLVMFuzzerTestOneInput for initialization */
init_sre_compile()132 static int init_sre_compile() {
133     /* Import sre_compile.compile and sre.error */
134     PyObject* sre_compile_module = PyImport_ImportModule("sre_compile");
135     if (sre_compile_module == NULL) {
136         return 0;
137     }
138     sre_compile_method = PyObject_GetAttrString(sre_compile_module, "compile");
139     if (sre_compile_method == NULL) {
140         return 0;
141     }
142 
143     PyObject* sre_constants = PyImport_ImportModule("sre_constants");
144     if (sre_constants == NULL) {
145         return 0;
146     }
147     sre_error_exception = PyObject_GetAttrString(sre_constants, "error");
148     if (sre_error_exception == NULL) {
149         return 0;
150     }
151     PyObject* debug_flag = PyObject_GetAttrString(sre_constants, "SRE_FLAG_DEBUG");
152     if (debug_flag == NULL) {
153         return 0;
154     }
155     SRE_FLAG_DEBUG = PyLong_AsLong(debug_flag);
156     return 1;
157 }
158 /* Fuzz _sre.compile(x) */
fuzz_sre_compile(const char * data,size_t size)159 static int fuzz_sre_compile(const char* data, size_t size) {
160     /* Ignore really long regex patterns that will timeout the fuzzer */
161     if (size > MAX_RE_TEST_SIZE) {
162         return 0;
163     }
164     /* We treat the first 2 bytes of the input as a number for the flags */
165     if (size < 2) {
166         return 0;
167     }
168     uint16_t flags = ((uint16_t*) data)[0];
169     /* We remove the SRE_FLAG_DEBUG if present. This is because it
170        prints to stdout which greatly decreases fuzzing speed */
171     flags &= ~SRE_FLAG_DEBUG;
172 
173     /* Pull the pattern from the remaining bytes */
174     PyObject* pattern_bytes = PyBytes_FromStringAndSize(data + 2, size - 2);
175     if (pattern_bytes == NULL) {
176         return 0;
177     }
178     PyObject* flags_obj = PyLong_FromUnsignedLong(flags);
179     if (flags_obj == NULL) {
180         Py_DECREF(pattern_bytes);
181         return 0;
182     }
183 
184     /* compiled = _sre.compile(data[2:], data[0:2] */
185     PyObject* compiled = PyObject_CallFunctionObjArgs(
186         sre_compile_method, pattern_bytes, flags_obj, NULL);
187     /* Ignore ValueError as the fuzzer will more than likely
188        generate some invalid combination of flags */
189     if (compiled == NULL && PyErr_ExceptionMatches(PyExc_ValueError)) {
190         PyErr_Clear();
191     }
192     /* Ignore some common errors thrown by sre_parse:
193        Overflow, Assertion and Index */
194     if (compiled == NULL && (PyErr_ExceptionMatches(PyExc_OverflowError) ||
195                              PyErr_ExceptionMatches(PyExc_AssertionError) ||
196                              PyErr_ExceptionMatches(PyExc_IndexError))
197     ) {
198         PyErr_Clear();
199     }
200     /* Ignore re.error */
201     if (compiled == NULL && PyErr_ExceptionMatches(sre_error_exception)) {
202         PyErr_Clear();
203     }
204 
205     Py_DECREF(pattern_bytes);
206     Py_DECREF(flags_obj);
207     Py_XDECREF(compiled);
208     return 0;
209 }
210 
211 /* Some random patterns used to test re.match.
212    Be careful not to add catostraphically slow regexes here, we want to
213    exercise the matching code without causing timeouts.*/
214 static const char* regex_patterns[] = {
215     ".", "^", "abc", "abc|def", "^xxx$", "\\b", "()", "[a-zA-Z0-9]",
216     "abc+", "[^A-Z]", "[x]", "(?=)", "a{z}", "a+b", "a*?", "a??", "a+?",
217     "{}", "a{,}", "{", "}", "^\\(*\\d{3}\\)*( |-)*\\d{3}( |-)*\\d{4}$",
218     "(?:a*)*", "a{1,2}?"
219 };
220 const size_t NUM_PATTERNS = sizeof(regex_patterns) / sizeof(regex_patterns[0]);
221 PyObject** compiled_patterns = NULL;
222 /* Called by LLVMFuzzerTestOneInput for initialization */
init_sre_match()223 static int init_sre_match() {
224     PyObject* re_module = PyImport_ImportModule("re");
225     if (re_module == NULL) {
226         return 0;
227     }
228     compiled_patterns = (PyObject**) PyMem_RawMalloc(
229         sizeof(PyObject*) * NUM_PATTERNS);
230     if (compiled_patterns == NULL) {
231         PyErr_NoMemory();
232         return 0;
233     }
234 
235     /* Precompile all the regex patterns on the first run for faster fuzzing */
236     for (size_t i = 0; i < NUM_PATTERNS; i++) {
237         PyObject* compiled = PyObject_CallMethod(
238             re_module, "compile", "y", regex_patterns[i]);
239         /* Bail if any of the patterns fail to compile */
240         if (compiled == NULL) {
241             return 0;
242         }
243         compiled_patterns[i] = compiled;
244     }
245     return 1;
246 }
247 /* Fuzz re.match(x) */
fuzz_sre_match(const char * data,size_t size)248 static int fuzz_sre_match(const char* data, size_t size) {
249     if (size < 1 || size > MAX_RE_TEST_SIZE) {
250         return 0;
251     }
252     /* Use the first byte as a uint8_t specifying the index of the
253        regex to use */
254     unsigned char idx = (unsigned char) data[0];
255     idx = idx % NUM_PATTERNS;
256 
257     /* Pull the string to match from the remaining bytes */
258     PyObject* to_match = PyBytes_FromStringAndSize(data + 1, size - 1);
259     if (to_match == NULL) {
260         return 0;
261     }
262 
263     PyObject* pattern = compiled_patterns[idx];
264     PyObject* match_callable = PyObject_GetAttrString(pattern, "match");
265 
266     PyObject* matches = PyObject_CallFunctionObjArgs(match_callable, to_match, NULL);
267 
268     Py_XDECREF(matches);
269     Py_DECREF(match_callable);
270     Py_DECREF(to_match);
271     return 0;
272 }
273 
274 #define MAX_CSV_TEST_SIZE 0x10000
275 PyObject* csv_module = NULL;
276 PyObject* csv_error = NULL;
277 /* Called by LLVMFuzzerTestOneInput for initialization */
init_csv_reader()278 static int init_csv_reader() {
279     /* Import csv and csv.Error */
280     csv_module = PyImport_ImportModule("csv");
281     if (csv_module == NULL) {
282         return 0;
283     }
284     csv_error = PyObject_GetAttrString(csv_module, "Error");
285     return csv_error != NULL;
286 }
287 /* Fuzz csv.reader([x]) */
fuzz_csv_reader(const char * data,size_t size)288 static int fuzz_csv_reader(const char* data, size_t size) {
289     if (size < 1 || size > MAX_CSV_TEST_SIZE) {
290         return 0;
291     }
292     /* Ignore non null-terminated strings since _csv can't handle
293        embeded nulls */
294     if (memchr(data, '\0', size) == NULL) {
295         return 0;
296     }
297 
298     PyObject* s = PyUnicode_FromString(data);
299     /* Ignore exceptions until we have a valid string */
300     if (s == NULL) {
301         PyErr_Clear();
302         return 0;
303     }
304 
305     /* Split on \n so we can test multiple lines */
306     PyObject* lines = PyObject_CallMethod(s, "split", "s", "\n");
307     if (lines == NULL) {
308         Py_DECREF(s);
309         return 0;
310     }
311 
312     PyObject* reader = PyObject_CallMethod(csv_module, "reader", "N", lines);
313     if (reader) {
314         /* Consume all of the reader as an iterator */
315         PyObject* parsed_line;
316         while ((parsed_line = PyIter_Next(reader))) {
317             Py_DECREF(parsed_line);
318         }
319     }
320 
321     /* Ignore csv.Error because we're probably going to generate
322        some bad files (embeded new-lines, unterminated quotes etc) */
323     if (PyErr_ExceptionMatches(csv_error)) {
324         PyErr_Clear();
325     }
326 
327     Py_XDECREF(reader);
328     Py_DECREF(s);
329     return 0;
330 }
331 
/* Run one fuzz test and abort the process if it leaves a Python
   exception set.

   data/size: the fuzzer-provided buffer, forwarded as `const char*`.
   fuzzer:    the fuzz test to invoke.
   Returns whatever the fuzz test returned. */
static int _run_fuzz(const uint8_t *data, size_t size, int(*fuzzer)(const char* , size_t)) {
    const char* bytes = (const char*) data;
    int status = fuzzer(bytes, size);

    /* Fuzz tests should handle expected errors for themselves.
       This is a last-ditch check in case they didn't. */
    if (PyErr_Occurred()) {
        PyErr_Print();
        abort();
    }

    /* Someday the return value might mean something, propagate it. */
    return status;
}
344 
/* CPython generates a lot of leak warnings for whatever reason.
   Unconditionally returns 1. */
int __lsan_is_turned_off(void)
{
    return 1;
}
347 
348 
/* Fuzzer start-up entry point.

   Decodes the first entry of the argument vector with Py_DecodeLocale and
   registers the result as the Python program name.  The decoded string is
   not freed here.

   argc: unused.
   argv: pointer to the argument vector; only (*argv)[0] is read.
   Returns 0 in all cases. */
int LLVMFuzzerInitialize(int *argc, char ***argv) {
    const char* program_path = (*argv)[0];
    wchar_t* wide_program_name = Py_DecodeLocale(program_path, NULL);

    Py_SetProgramName(wide_program_name);
    return 0;
}
354 
355 /* Fuzz test interface.
356    This returns the bitwise or of all fuzz test's return values.
357 
358    All fuzz tests must return 0, as all nonzero return codes are reserved for
359    future use -- we propagate the return values for that future case.
360    (And we bitwise or when running multiple tests to verify that normally we
361    only return 0.) */
LLVMFuzzerTestOneInput(const uint8_t * data,size_t size)362 int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
363     if (!Py_IsInitialized()) {
364         /* LLVMFuzzerTestOneInput is called repeatedly from the same process,
365            with no separate initialization phase, sadly, so we need to
366            initialize CPython ourselves on the first run. */
367         Py_InitializeEx(0);
368     }
369 
370     int rv = 0;
371 
372 #if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_builtin_float)
373     rv |= _run_fuzz(data, size, fuzz_builtin_float);
374 #endif
375 #if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_builtin_int)
376     rv |= _run_fuzz(data, size, fuzz_builtin_int);
377 #endif
378 #if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_builtin_unicode)
379     rv |= _run_fuzz(data, size, fuzz_builtin_unicode);
380 #endif
381 #if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_json_loads)
382     static int JSON_LOADS_INITIALIZED = 0;
383     if (!JSON_LOADS_INITIALIZED && !init_json_loads()) {
384         PyErr_Print();
385         abort();
386     } else {
387         JSON_LOADS_INITIALIZED = 1;
388     }
389 
390     rv |= _run_fuzz(data, size, fuzz_json_loads);
391 #endif
392 #if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_sre_compile)
393     static int SRE_COMPILE_INITIALIZED = 0;
394     if (!SRE_COMPILE_INITIALIZED && !init_sre_compile()) {
395         PyErr_Print();
396         abort();
397     } else {
398         SRE_COMPILE_INITIALIZED = 1;
399     }
400 
401     rv |= _run_fuzz(data, size, fuzz_sre_compile);
402 #endif
403 #if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_sre_match)
404     static int SRE_MATCH_INITIALIZED = 0;
405     if (!SRE_MATCH_INITIALIZED && !init_sre_match()) {
406         PyErr_Print();
407         abort();
408     } else {
409         SRE_MATCH_INITIALIZED = 1;
410     }
411 
412     rv |= _run_fuzz(data, size, fuzz_sre_match);
413 #endif
414 #if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_csv_reader)
415     static int CSV_READER_INITIALIZED = 0;
416     if (!CSV_READER_INITIALIZED && !init_csv_reader()) {
417         PyErr_Print();
418         abort();
419     } else {
420         CSV_READER_INITIALIZED = 1;
421     }
422 
423     rv |= _run_fuzz(data, size, fuzz_csv_reader);
424 #endif
425   return rv;
426 }
427