1 /* A fuzz test for CPython.
2
3 The only exposed function is LLVMFuzzerTestOneInput, which is called by
4 fuzzers and by the _fuzz module for smoke tests.
5
6 To build exactly one fuzz test, as when running in oss-fuzz etc.,
7 build with -D _Py_FUZZ_ONE and -D _Py_FUZZ_<test_name>. e.g. to build
8 LLVMFuzzerTestOneInput to only run "fuzz_builtin_float", build this file with
9 -D _Py_FUZZ_ONE -D _Py_FUZZ_fuzz_builtin_float.
10
11 See the source code for LLVMFuzzerTestOneInput for details. */
12
13 #include <Python.h>
14 #include <stdlib.h>
15 #include <inttypes.h>
16
17 /* Fuzz PyFloat_FromString as a proxy for float(str). */
fuzz_builtin_float(const char * data,size_t size)18 static int fuzz_builtin_float(const char* data, size_t size) {
19 PyObject* s = PyBytes_FromStringAndSize(data, size);
20 if (s == NULL) return 0;
21 PyObject* f = PyFloat_FromString(s);
22 if (PyErr_Occurred() && PyErr_ExceptionMatches(PyExc_ValueError)) {
23 PyErr_Clear();
24 }
25
26 Py_XDECREF(f);
27 Py_DECREF(s);
28 return 0;
29 }
30
31 #define MAX_INT_TEST_SIZE 0x10000
32
33 /* Fuzz PyLong_FromUnicodeObject as a proxy for int(str). */
fuzz_builtin_int(const char * data,size_t size)34 static int fuzz_builtin_int(const char* data, size_t size) {
35 /* Ignore test cases with very long ints to avoid timeouts
36 int("9" * 1000000) is not a very interesting test caase */
37 if (size > MAX_INT_TEST_SIZE) {
38 return 0;
39 }
40 /* Pick a random valid base. (When the fuzzed function takes extra
41 parameters, it's somewhat normal to hash the input to generate those
42 parameters. We want to exercise all code paths, so we do so here.) */
43 int base = _Py_HashBytes(data, size) % 37;
44 if (base == 1) {
45 // 1 is the only number between 0 and 36 that is not a valid base.
46 base = 0;
47 }
48 if (base == -1) {
49 return 0; // An error occurred, bail early.
50 }
51 if (base < 0) {
52 base = -base;
53 }
54
55 PyObject* s = PyUnicode_FromStringAndSize(data, size);
56 if (s == NULL) {
57 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
58 PyErr_Clear();
59 }
60 return 0;
61 }
62 PyObject* l = PyLong_FromUnicodeObject(s, base);
63 if (l == NULL && PyErr_ExceptionMatches(PyExc_ValueError)) {
64 PyErr_Clear();
65 }
66 PyErr_Clear();
67 Py_XDECREF(l);
68 Py_DECREF(s);
69 return 0;
70 }
71
72 /* Fuzz PyUnicode_FromStringAndSize as a proxy for unicode(str). */
fuzz_builtin_unicode(const char * data,size_t size)73 static int fuzz_builtin_unicode(const char* data, size_t size) {
74 PyObject* s = PyUnicode_FromStringAndSize(data, size);
75 if (s == NULL && PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
76 PyErr_Clear();
77 }
78 Py_XDECREF(s);
79 return 0;
80 }
81
82 #define MAX_JSON_TEST_SIZE 0x10000
83
84 PyObject* json_loads_method = NULL;
85 /* Called by LLVMFuzzerTestOneInput for initialization */
init_json_loads()86 static int init_json_loads() {
87 /* Import json.loads */
88 PyObject* json_module = PyImport_ImportModule("json");
89 if (json_module == NULL) {
90 return 0;
91 }
92 json_loads_method = PyObject_GetAttrString(json_module, "loads");
93 return json_loads_method != NULL;
94 }
95 /* Fuzz json.loads(x) */
fuzz_json_loads(const char * data,size_t size)96 static int fuzz_json_loads(const char* data, size_t size) {
97 /* Since python supports arbitrarily large ints in JSON,
98 long inputs can lead to timeouts on boring inputs like
99 `json.loads("9" * 100000)` */
100 if (size > MAX_JSON_TEST_SIZE) {
101 return 0;
102 }
103 PyObject* input_bytes = PyBytes_FromStringAndSize(data, size);
104 if (input_bytes == NULL) {
105 return 0;
106 }
107 PyObject* parsed = PyObject_CallFunctionObjArgs(json_loads_method, input_bytes, NULL);
108 if (parsed == NULL) {
109 /* Ignore ValueError as the fuzzer will more than likely
110 generate some invalid json and values */
111 if (PyErr_ExceptionMatches(PyExc_ValueError) ||
112 /* Ignore RecursionError as the fuzzer generates long sequences of
113 arrays such as `[[[...` */
114 PyErr_ExceptionMatches(PyExc_RecursionError) ||
115 /* Ignore unicode errors, invalid byte sequences are common */
116 PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)
117 ) {
118 PyErr_Clear();
119 }
120 }
121 Py_DECREF(input_bytes);
122 Py_XDECREF(parsed);
123 return 0;
124 }
125
126 #define MAX_RE_TEST_SIZE 0x10000
127
128 PyObject* sre_compile_method = NULL;
129 PyObject* sre_error_exception = NULL;
130 int SRE_FLAG_DEBUG = 0;
131 /* Called by LLVMFuzzerTestOneInput for initialization */
init_sre_compile()132 static int init_sre_compile() {
133 /* Import sre_compile.compile and sre.error */
134 PyObject* sre_compile_module = PyImport_ImportModule("sre_compile");
135 if (sre_compile_module == NULL) {
136 return 0;
137 }
138 sre_compile_method = PyObject_GetAttrString(sre_compile_module, "compile");
139 if (sre_compile_method == NULL) {
140 return 0;
141 }
142
143 PyObject* sre_constants = PyImport_ImportModule("sre_constants");
144 if (sre_constants == NULL) {
145 return 0;
146 }
147 sre_error_exception = PyObject_GetAttrString(sre_constants, "error");
148 if (sre_error_exception == NULL) {
149 return 0;
150 }
151 PyObject* debug_flag = PyObject_GetAttrString(sre_constants, "SRE_FLAG_DEBUG");
152 if (debug_flag == NULL) {
153 return 0;
154 }
155 SRE_FLAG_DEBUG = PyLong_AsLong(debug_flag);
156 return 1;
157 }
158 /* Fuzz _sre.compile(x) */
fuzz_sre_compile(const char * data,size_t size)159 static int fuzz_sre_compile(const char* data, size_t size) {
160 /* Ignore really long regex patterns that will timeout the fuzzer */
161 if (size > MAX_RE_TEST_SIZE) {
162 return 0;
163 }
164 /* We treat the first 2 bytes of the input as a number for the flags */
165 if (size < 2) {
166 return 0;
167 }
168 uint16_t flags = ((uint16_t*) data)[0];
169 /* We remove the SRE_FLAG_DEBUG if present. This is because it
170 prints to stdout which greatly decreases fuzzing speed */
171 flags &= ~SRE_FLAG_DEBUG;
172
173 /* Pull the pattern from the remaining bytes */
174 PyObject* pattern_bytes = PyBytes_FromStringAndSize(data + 2, size - 2);
175 if (pattern_bytes == NULL) {
176 return 0;
177 }
178 PyObject* flags_obj = PyLong_FromUnsignedLong(flags);
179 if (flags_obj == NULL) {
180 Py_DECREF(pattern_bytes);
181 return 0;
182 }
183
184 /* compiled = _sre.compile(data[2:], data[0:2] */
185 PyObject* compiled = PyObject_CallFunctionObjArgs(
186 sre_compile_method, pattern_bytes, flags_obj, NULL);
187 /* Ignore ValueError as the fuzzer will more than likely
188 generate some invalid combination of flags */
189 if (compiled == NULL && PyErr_ExceptionMatches(PyExc_ValueError)) {
190 PyErr_Clear();
191 }
192 /* Ignore some common errors thrown by sre_parse:
193 Overflow, Assertion and Index */
194 if (compiled == NULL && (PyErr_ExceptionMatches(PyExc_OverflowError) ||
195 PyErr_ExceptionMatches(PyExc_AssertionError) ||
196 PyErr_ExceptionMatches(PyExc_IndexError))
197 ) {
198 PyErr_Clear();
199 }
200 /* Ignore re.error */
201 if (compiled == NULL && PyErr_ExceptionMatches(sre_error_exception)) {
202 PyErr_Clear();
203 }
204
205 Py_DECREF(pattern_bytes);
206 Py_DECREF(flags_obj);
207 Py_XDECREF(compiled);
208 return 0;
209 }
210
/* Some random patterns used to test re.match.
   Be careful not to add catastrophically slow regexes here, we want to
   exercise the matching code without causing timeouts. */
static const char* regex_patterns[] = {
    ".", "^", "abc", "abc|def", "^xxx$", "\\b", "()", "[a-zA-Z0-9]",
    "abc+", "[^A-Z]", "[x]", "(?=)", "a{z}", "a+b", "a*?", "a??", "a+?",
    "{}", "a{,}", "{", "}", "^\\(*\\d{3}\\)*( |-)*\\d{3}( |-)*\\d{4}$",
    "(?:a*)*", "a{1,2}?"
};
/* Number of entries in regex_patterns. */
const size_t NUM_PATTERNS = sizeof(regex_patterns) / sizeof(regex_patterns[0]);
/* Array of NUM_PATTERNS compiled pattern objects, one per regex_patterns
   entry; allocated and filled in by init_sre_match(). */
PyObject** compiled_patterns = NULL;
222 /* Called by LLVMFuzzerTestOneInput for initialization */
init_sre_match()223 static int init_sre_match() {
224 PyObject* re_module = PyImport_ImportModule("re");
225 if (re_module == NULL) {
226 return 0;
227 }
228 compiled_patterns = (PyObject**) PyMem_RawMalloc(
229 sizeof(PyObject*) * NUM_PATTERNS);
230 if (compiled_patterns == NULL) {
231 PyErr_NoMemory();
232 return 0;
233 }
234
235 /* Precompile all the regex patterns on the first run for faster fuzzing */
236 for (size_t i = 0; i < NUM_PATTERNS; i++) {
237 PyObject* compiled = PyObject_CallMethod(
238 re_module, "compile", "y", regex_patterns[i]);
239 /* Bail if any of the patterns fail to compile */
240 if (compiled == NULL) {
241 return 0;
242 }
243 compiled_patterns[i] = compiled;
244 }
245 return 1;
246 }
247 /* Fuzz re.match(x) */
fuzz_sre_match(const char * data,size_t size)248 static int fuzz_sre_match(const char* data, size_t size) {
249 if (size < 1 || size > MAX_RE_TEST_SIZE) {
250 return 0;
251 }
252 /* Use the first byte as a uint8_t specifying the index of the
253 regex to use */
254 unsigned char idx = (unsigned char) data[0];
255 idx = idx % NUM_PATTERNS;
256
257 /* Pull the string to match from the remaining bytes */
258 PyObject* to_match = PyBytes_FromStringAndSize(data + 1, size - 1);
259 if (to_match == NULL) {
260 return 0;
261 }
262
263 PyObject* pattern = compiled_patterns[idx];
264 PyObject* match_callable = PyObject_GetAttrString(pattern, "match");
265
266 PyObject* matches = PyObject_CallFunctionObjArgs(match_callable, to_match, NULL);
267
268 Py_XDECREF(matches);
269 Py_DECREF(match_callable);
270 Py_DECREF(to_match);
271 return 0;
272 }
273
274 #define MAX_CSV_TEST_SIZE 0x10000
275 PyObject* csv_module = NULL;
276 PyObject* csv_error = NULL;
277 /* Called by LLVMFuzzerTestOneInput for initialization */
init_csv_reader()278 static int init_csv_reader() {
279 /* Import csv and csv.Error */
280 csv_module = PyImport_ImportModule("csv");
281 if (csv_module == NULL) {
282 return 0;
283 }
284 csv_error = PyObject_GetAttrString(csv_module, "Error");
285 return csv_error != NULL;
286 }
287 /* Fuzz csv.reader([x]) */
fuzz_csv_reader(const char * data,size_t size)288 static int fuzz_csv_reader(const char* data, size_t size) {
289 if (size < 1 || size > MAX_CSV_TEST_SIZE) {
290 return 0;
291 }
292 /* Ignore non null-terminated strings since _csv can't handle
293 embeded nulls */
294 if (memchr(data, '\0', size) == NULL) {
295 return 0;
296 }
297
298 PyObject* s = PyUnicode_FromString(data);
299 /* Ignore exceptions until we have a valid string */
300 if (s == NULL) {
301 PyErr_Clear();
302 return 0;
303 }
304
305 /* Split on \n so we can test multiple lines */
306 PyObject* lines = PyObject_CallMethod(s, "split", "s", "\n");
307 if (lines == NULL) {
308 Py_DECREF(s);
309 return 0;
310 }
311
312 PyObject* reader = PyObject_CallMethod(csv_module, "reader", "N", lines);
313 if (reader) {
314 /* Consume all of the reader as an iterator */
315 PyObject* parsed_line;
316 while ((parsed_line = PyIter_Next(reader))) {
317 Py_DECREF(parsed_line);
318 }
319 }
320
321 /* Ignore csv.Error because we're probably going to generate
322 some bad files (embeded new-lines, unterminated quotes etc) */
323 if (PyErr_ExceptionMatches(csv_error)) {
324 PyErr_Clear();
325 }
326
327 Py_XDECREF(reader);
328 Py_DECREF(s);
329 return 0;
330 }
331
/* Run one fuzz test on the given input and abort the process if it leaves
   a Python exception set.

   data/size: the fuzzer-provided input, passed through as (const char*).
   fuzzer:    the fuzz test to invoke.
   Returns whatever the fuzz test returned. */
static int _run_fuzz(const uint8_t *data, size_t size, int(*fuzzer)(const char* , size_t)) {
    int outcome = fuzzer((const char*) data, size);

    /* Fuzz tests should handle expected errors for themselves.
       This is a last-ditch check in case they didn't. */
    if (PyErr_Occurred() != NULL) {
        PyErr_Print();
        abort();
    }

    /* Someday the return value might mean something, propagate it. */
    return outcome;
}
344
/* LeakSanitizer hook: returning nonzero asks LeakSanitizer to skip leak
   checking.  It is switched off because CPython generates a lot of leak
   warnings for whatever reason. */
int __lsan_is_turned_off(void)
{
    return 1;
}
347
348
/* One-time fuzzer start-up hook: registers the executable's name
   ((*argv)[0]) with CPython via Py_SetProgramName.

   The decoded wide string is not freed in this function;
   Py_SetProgramName's documentation requires the string to stay valid for
   the duration of the program.  Always returns 0. */
int LLVMFuzzerInitialize(int *argc, char ***argv) {
    char *program_path = (*argv)[0];
    wchar_t *decoded_name = Py_DecodeLocale(program_path, NULL);
    Py_SetProgramName(decoded_name);
    return 0;
}
354
355 /* Fuzz test interface.
356 This returns the bitwise or of all fuzz test's return values.
357
358 All fuzz tests must return 0, as all nonzero return codes are reserved for
359 future use -- we propagate the return values for that future case.
360 (And we bitwise or when running multiple tests to verify that normally we
361 only return 0.) */
LLVMFuzzerTestOneInput(const uint8_t * data,size_t size)362 int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
363 if (!Py_IsInitialized()) {
364 /* LLVMFuzzerTestOneInput is called repeatedly from the same process,
365 with no separate initialization phase, sadly, so we need to
366 initialize CPython ourselves on the first run. */
367 Py_InitializeEx(0);
368 }
369
370 int rv = 0;
371
372 #if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_builtin_float)
373 rv |= _run_fuzz(data, size, fuzz_builtin_float);
374 #endif
375 #if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_builtin_int)
376 rv |= _run_fuzz(data, size, fuzz_builtin_int);
377 #endif
378 #if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_builtin_unicode)
379 rv |= _run_fuzz(data, size, fuzz_builtin_unicode);
380 #endif
381 #if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_json_loads)
382 static int JSON_LOADS_INITIALIZED = 0;
383 if (!JSON_LOADS_INITIALIZED && !init_json_loads()) {
384 PyErr_Print();
385 abort();
386 } else {
387 JSON_LOADS_INITIALIZED = 1;
388 }
389
390 rv |= _run_fuzz(data, size, fuzz_json_loads);
391 #endif
392 #if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_sre_compile)
393 static int SRE_COMPILE_INITIALIZED = 0;
394 if (!SRE_COMPILE_INITIALIZED && !init_sre_compile()) {
395 PyErr_Print();
396 abort();
397 } else {
398 SRE_COMPILE_INITIALIZED = 1;
399 }
400
401 rv |= _run_fuzz(data, size, fuzz_sre_compile);
402 #endif
403 #if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_sre_match)
404 static int SRE_MATCH_INITIALIZED = 0;
405 if (!SRE_MATCH_INITIALIZED && !init_sre_match()) {
406 PyErr_Print();
407 abort();
408 } else {
409 SRE_MATCH_INITIALIZED = 1;
410 }
411
412 rv |= _run_fuzz(data, size, fuzz_sre_match);
413 #endif
414 #if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_csv_reader)
415 static int CSV_READER_INITIALIZED = 0;
416 if (!CSV_READER_INITIALIZED && !init_csv_reader()) {
417 PyErr_Print();
418 abort();
419 } else {
420 CSV_READER_INITIALIZED = 1;
421 }
422
423 rv |= _run_fuzz(data, size, fuzz_csv_reader);
424 #endif
425 return rv;
426 }
427