1 /*
2     This is part of pyahocorasick Python module.
3 
4     Helpers functions.
5     This file is included directly.
6 
7     Author    : Wojciech Muła, wojciech_mula@poczta.onet.pl
8     WWW       : http://0x80.pl
9     License   : public domain
10 */
11 
12 //#define MEMORY_DEBUG
13 #ifdef MEMORY_DEBUG
#ifndef MEMORY_DUMP_PATH
#   define MEMORY_DUMP_PATH "memory.dump"
#endif

/* Where the allocation log goes; debug_file is set by initialize_memory_debug(). */
const char* debug_path = MEMORY_DUMP_PATH;
FILE* debug_file;
int memory_dump = 1;            // non-zero: log every call to debug_file

/* memory_alloc() bookkeeping */
int alloc_num = 0;              // id of the next allocation
int alloc_fail = -1;            // id of the allocation that is forced to fail
int alloc_trap_on_fail = 0;     // rather than failing, execute a trap (for gdb use)

/* memory_realloc() bookkeeping */
int realloc_num = 0;            // id of the next reallocation
int realloc_fail = -1;          // id of the reallocation that is forced to fail
int realloc_trap_on_fail = 0;   // rather than failing, execute a trap (for gdb use)
26 
/*
    Read an integer from the environment variable `name`.

    Returns `def` when the variable is not set, when its value does not
    start with a decimal number, or when the number does not fit in an
    int. Text following the number is ignored (as atoi() did).
*/
static int
env_getint(const char* name, int def) {
    const char* val = getenv(name);
    char* end = NULL;
    long parsed;

    if (val == NULL) {
        return def;
    }

    // strtol, unlike atoi, reports "no digits found" and has defined
    // behaviour for out-of-range input
    parsed = strtol(val, &end, 10);
    if (end == val) {
        return def;     // nothing was converted, e.g. "" or "abc"
    }

    if ((long)(int)parsed != parsed) {
        return def;     // value does not survive the narrowing to int
    }

    return (int)parsed;
}
35 
/*
    Tell whether the environment variable `name` is set.

    Returns 1 when getenv() finds the variable (even if its value is
    empty) and 0 otherwise.
*/
static int
env_exists(const char* name) {
    const char* value = getenv(name);

    if (value == NULL) {
        return 0;
    }

    return 1;
}
40 
41 static
initialize_memory_debug(void)42 void initialize_memory_debug(void) {
43     if (env_exists("ALLOC_NODUMP")) {
44         memory_dump = 0;
45     }
46 
47     alloc_fail = env_getint("ALLOC_FAIL", alloc_fail);
48     realloc_fail = env_getint("REALLOC_FAIL", realloc_fail);
49 
50     alloc_trap_on_fail = env_exists("ALLOC_TRAP");
51     realloc_trap_on_fail = env_exists("REALLOC_TRAP");
52 
53     if (memory_dump) {
54         debug_file = fopen(debug_path, "wt");
55         if (debug_file == NULL) {
56             PyErr_WarnEx(PyExc_RuntimeWarning, "Cannot open file, logging on stderr", 1);
57             debug_file = stderr;
58         }
59     }
60 }
61 #endif
62 
/*
    Allocate `size` bytes with PyMem_Malloc and return the result
    (NULL on failure).

    With MEMORY_DEBUG defined the call is numbered: the allocation whose
    id equals `alloc_fail` is not performed (NULL is returned, or a trap
    is executed when `alloc_trap_on_fail` is set), and every performed
    allocation is logged to `debug_file` when `memory_dump` is set.
*/
void* memory_alloc(ssize_t size) {
#ifdef MEMORY_DEBUG
    if (alloc_num == alloc_fail) {
        if (alloc_trap_on_fail) {
            __builtin_trap();
        }

        printf("DEBUG: allocation #%d failed\n", alloc_num);
        alloc_num += 1;
        return NULL;
    }
#endif
    void* res = PyMem_Malloc(size);

#ifdef MEMORY_DEBUG
    alloc_num += 1;
    if (memory_dump) {
        // ssize_t needs %zd; %ld is a mismatch wherever long and
        // ssize_t differ in width
        fprintf(debug_file, "A %d %p %zd\n", alloc_num, res, size);
    }
#endif

    return res;
}
85 
86 
/*
    Resize the block `ptr` to `size` bytes with PyMem_Realloc and return
    the result (NULL on failure).

    With MEMORY_DEBUG defined the call is numbered: the reallocation
    whose id equals `realloc_fail` is not performed (NULL is returned, or
    a trap is executed when `realloc_trap_on_fail` is set), and every
    performed reallocation is logged to `debug_file` when `memory_dump`
    is set.
*/
void* memory_realloc(void* ptr, size_t size) {
#ifdef MEMORY_DEBUG
    if (realloc_num == realloc_fail) {
        if (realloc_trap_on_fail) {
            __builtin_trap();
        }

        printf("DEBUG: reallocation #%d failed\n", realloc_num);
        realloc_num += 1;
        return NULL;
    }
#endif
    void* res = PyMem_Realloc(ptr, size);

#ifdef MEMORY_DEBUG
    realloc_num += 1;
    if (memory_dump) {
        // size_t needs %zu; %ld is a mismatch in both signedness and,
        // on some platforms, width
        fprintf(debug_file, "R %d %p %p %zu\n", realloc_num, ptr, res, size);
    }
#endif

    return res;
}
110 
111 
/*
    Release a block with PyMem_Free.

    With MEMORY_DEBUG defined and `memory_dump` set, the pointer is
    logged to `debug_file` before it is released.
*/
void memory_free(void* ptr) {
#ifdef MEMORY_DEBUG
    if (memory_dump != 0) {
        fprintf(debug_file, "F %p\n", ptr);
    }
#endif
    PyMem_Free(ptr);
}
119 
120 
/*
    Like memory_free(), but does nothing at all for a NULL pointer.
*/
void memory_safefree(void* ptr) {
    if (ptr == NULL) {
        return;
    }

    memory_free(ptr);
}
126 
127 
#if !defined(PY3K) || !defined(AHOCORASICK_UNICODE)
//  defined when pymod_get_string makes a copy of string
#   define INPUT_KEEPS_COPY
#endif

/*
    Release helpers used by destroy_input:

    maybe_free(flag, word)   - frees the letters buffer if it is owned
    maybe_decref(flag, ref)  - drops the reference to the Python object
                               if one is held

    `flag` is the is_copy value reported by pymod_get_string; it is only
    examined in the PEP393_UNICODE variant. The macros expand to complete
    statements, callers invoke them without a trailing semicolon.
    Arguments are parenthesized where they appear inside a condition so
    that compound expressions can be passed safely.
*/
#if defined INPUT_KEEPS_COPY
#    define maybe_free(flag, word) memory_free(word);
#    define maybe_decref(flag, ref)
#elif defined PEP393_UNICODE
#    define maybe_free(flag, word) if (flag) { memory_free(word); }
#    define maybe_decref(flag, ref) if ((ref) && !(flag)) { Py_DECREF(ref); }
#else
#    define maybe_free(flag, word)
#    define maybe_decref(flag, ref) if (ref) { Py_DECREF(ref); }
#endif
143 
144 /* returns bytes or unicode internal buffer */
145 static PyObject*
pymod_get_string(PyObject * obj,TRIE_LETTER_TYPE ** word,ssize_t * wordlen,bool * is_copy)146 pymod_get_string(PyObject* obj, TRIE_LETTER_TYPE** word, ssize_t* wordlen, bool* is_copy) {
147 
148 #ifdef INPUT_KEEPS_COPY
149     ssize_t i;
150     char* bytes;
151 #endif
152 
153 #if defined PEP393_UNICODE
154     if (F(PyUnicode_Check)(obj)) {
155     PyUnicode_READY(obj);
156     if (PyUnicode_KIND(obj) == PyUnicode_4BYTE_KIND) {
157             *word = (TRIE_LETTER_TYPE*)(PyUnicode_4BYTE_DATA(obj));
158             *wordlen = PyUnicode_GET_LENGTH(obj);
159             *is_copy = false;
160             Py_INCREF(obj);
161 
162         return obj;
163     } else {
164         *word = PyUnicode_AsUCS4Copy(obj);
165         *wordlen = PyUnicode_GET_LENGTH(obj);
166         *is_copy = true;
167         // No INCREF - we have our copy
168         return obj;
169     }
170     }
171     else {
172     PyErr_SetString(PyExc_TypeError, "string expected");
173     return NULL;
174     }
175 #elif defined PY3K
176 #   ifdef AHOCORASICK_UNICODE
177         if (F(PyUnicode_Check)(obj)) {
178             *word = (TRIE_LETTER_TYPE*)(PyUnicode_AS_UNICODE(obj));
179             *wordlen = PyUnicode_GET_SIZE(obj);
180             Py_INCREF(obj);
181             return obj;
182         }
183         else {
184             PyErr_SetString(PyExc_TypeError, "string expected");
185             return NULL;
186         }
187 #   else
188 #       ifndef INPUT_KEEPS_COPY
189 #           error "defines inconsistency"
190 #       endif
191         if (F(PyBytes_Check)(obj)) {
192             *wordlen = PyBytes_GET_SIZE(obj);
193             *word    = (TRIE_LETTER_TYPE*)memory_alloc(*wordlen * TRIE_LETTER_SIZE);
194             if (*word == NULL) {
195                 PyErr_NoMemory();
196                 return NULL;
197             }
198 
199             bytes = PyBytes_AS_STRING(obj);
200             for (i=0; i < *wordlen; i++) {
201                 (*word)[i] = bytes[i];
202             }
203             // Note: there is no INCREF
204             return obj;
205         }
206         else {
207             PyErr_SetString(PyExc_TypeError, "bytes expected");
208             return NULL;
209         }
210 #   endif
211 #else // PY_MAJOR_VERSION == 3
212 #       ifndef INPUT_KEEPS_COPY
213 #           error "defines inconsistency"
214 #       endif
215     if (F(PyString_Check)(obj)) {
216         *wordlen = PyString_GET_SIZE(obj);
217         *word    = (TRIE_LETTER_TYPE*)memory_alloc(*wordlen * TRIE_LETTER_SIZE);
218         if (*word == NULL) {
219             PyErr_NoMemory();
220             return NULL;
221         }
222 
223 
224         bytes = PyString_AS_STRING(obj);
225         for (i=0; i < *wordlen; i++) {
226             (*word)[i] = bytes[i];
227         };
228 
229         Py_INCREF(obj);
230         return obj;
231     } else {
232         PyErr_SetString(PyExc_TypeError, "string required");
233         return NULL;
234     }
235 #endif
236 }
237 
238 static bool
__read_sequence__from_tuple(PyObject * obj,TRIE_LETTER_TYPE ** word,ssize_t * wordlen)239 __read_sequence__from_tuple(PyObject* obj, TRIE_LETTER_TYPE** word, ssize_t* wordlen) {
240     Py_ssize_t i;
241     Py_ssize_t size = PyTuple_GET_SIZE(obj);
242     TRIE_LETTER_TYPE* tmpword;
243 
244     tmpword = (TRIE_LETTER_TYPE*)memory_alloc(size * TRIE_LETTER_SIZE);
245     if (UNLIKELY(tmpword == NULL)) {
246         PyErr_NoMemory();
247         return false;
248     }
249 
250     for (i=0; i < size; i++) {
251         Py_ssize_t value = F(PyNumber_AsSsize_t)(F(PyTuple_GetItem)(obj, i), PyExc_ValueError);
252         if (value == -1 && PyErr_Occurred()) {
253             PyErr_Format(PyExc_ValueError, "item #%zd is not a number", i);
254             memory_free(tmpword);
255             return false;
256         }
257 
258 
259         // TODO: both min and max values should be configured
260 #if TRIE_LETTER_SIZE == 4
261     #define MAX_VAL 4294967295l
262 #else
263     #define MAX_VAL 65535ul
264 #endif
265         if (value < 0 || value > MAX_VAL) {
266             PyErr_Format(PyExc_ValueError, "item #%zd: value %zd outside range [%d..%lu]", i, value, 0, MAX_VAL);
267             memory_free(tmpword);
268             return false;
269         }
270 
271         tmpword[i] = (TRIE_LETTER_TYPE)value;
272     }
273 
274     *word = tmpword;
275     *wordlen = size;
276 
277     return true;
278 }
279 
280 
281 static bool
pymod_get_sequence(PyObject * obj,TRIE_LETTER_TYPE ** word,ssize_t * wordlen)282 pymod_get_sequence(PyObject* obj, TRIE_LETTER_TYPE** word, ssize_t* wordlen) {
283     if (LIKELY(F(PyTuple_Check)(obj))) {
284         return __read_sequence__from_tuple(obj, word, wordlen);
285     } else {
286         PyErr_Format(PyExc_TypeError, "argument is not a supported sequence type");
287         return false;
288     }
289 }
290 
291 
292 /* parse optional indexes used in few functions [start, [end]] */
293 static int
pymod_parse_start_end(PyObject * args,int idx_start,int idx_end,const ssize_t min,const ssize_t max,ssize_t * Start,ssize_t * End)294 pymod_parse_start_end(
295     PyObject* args,
296     int idx_start, int idx_end,
297     const ssize_t min, const ssize_t max,
298     ssize_t* Start, ssize_t* End
299 ) {
300     PyObject* obj;
301 #define start (*Start)
302 #define end (*End)
303 
304     start   = min;
305     end     = max;
306 
307     // first argument
308     obj = F(PyTuple_GetItem)(args, idx_start);
309     if (obj == NULL) {
310         PyErr_Clear();
311         return 0;
312     }
313 
314     obj = F(PyNumber_Index)(obj);
315     if (obj == NULL)
316         return -1;
317 
318     start = F(PyNumber_AsSsize_t)(obj, PyExc_IndexError);
319     Py_DECREF(obj);
320     if (start == -1 and PyErr_Occurred())
321         return -1;
322 
323     if (start < 0)
324         start = max + start;
325 
326     if (start < min or start >= max) {
327         PyErr_Format(PyExc_IndexError, "start index not in range %zd..%zd", min, max);
328         return -1;
329     }
330 
331     // second argument
332     obj = F(PyTuple_GetItem)(args, idx_end);
333     if (obj == NULL) {
334         PyErr_Clear();
335         return 0;
336     }
337 
338     obj = F(PyNumber_Index)(obj);
339     if (obj == NULL)
340         return -1;
341 
342     end = F(PyNumber_AsSsize_t)(obj, PyExc_IndexError);
343     Py_DECREF(obj);
344     if (end == -1 and PyErr_Occurred())
345         return -1;
346 
347     if (end < 0)
348         end = max - 1 + end;
349 
350     if (end < min or end > max) {
351         PyErr_Format(PyExc_IndexError, "end index not in range %zd..%zd", min, max);
352         return -1;
353     }
354 
355     return 0;
356 
357 #undef start
358 #undef end
359 }
360 
361 
init_input(struct Input * input)362 void init_input(struct Input* input) {
363     input->word = NULL;
364     input->py_word = NULL;
365 }
366 
367 
/*
    Fill `input` from the Python object `tuple`, according to the key
    type of the automaton `self`.

    For KEY_STRING the object is read with pymod_get_string, otherwise
    with pymod_get_sequence (the letters are then always a copy and no
    Python object is kept). Returns true on success; false means the
    helper has failed.
*/
bool prepare_input(PyObject* self, PyObject* tuple, struct Input* input) {
    const Automaton* owner = (Automaton*)self;

    if (owner->key_type != KEY_STRING) {
        input->is_copy = true; // we always create a copy of sequence
        input->py_word = NULL;
        return pymod_get_sequence(tuple, &input->word, &input->wordlen);
    }

    input->py_word = pymod_get_string(tuple, &input->word, &input->wordlen, &input->is_copy);
    if (input->py_word == NULL) {
        return false;
    }

    return true;
}
385 
386 
/*
    Take the item number `index` from the tuple `args` and pass it to
    prepare_input. Returns false when the item cannot be fetched,
    otherwise whatever prepare_input returns.
*/
bool prepare_input_from_tuple(PyObject* self, PyObject* args, int index, struct Input* input) {
    PyObject* item = F(PyTuple_GetItem)(args, index);

    if (item == NULL) {
        return false;
    }

    return prepare_input(self, item, input);
}
396 
397 
destroy_input(struct Input * input)398 void destroy_input(struct Input* input) {
399     maybe_decref(input->is_copy, input->py_word)
400     maybe_free(input->is_copy, input->word)
401 }
402 
403 
assign_input(struct Input * dst,struct Input * src)404 void assign_input(struct Input* dst, struct Input* src) {
405 
406     dst->wordlen    = src->wordlen;
407     dst->word       = src->word;
408     dst->py_word    = src->py_word; // Note: there is no INCREF
409 }
410