1 /*
2 This is part of pyahocorasick Python module.
3
Helper functions.
5 This file is included directly.
6
7 Author : Wojciech Muła, wojciech_mula@poczta.onet.pl
8 WWW : http://0x80.pl
9 License : public domain
10 */
11
12 //#define MEMORY_DEBUG
13 #ifdef MEMORY_DEBUG
/* Path of the allocation log; may be predefined by the build. */
#ifndef MEMORY_DUMP_PATH
#   define MEMORY_DUMP_PATH "memory.dump"
#endif

/* Allocation log: target path, stream, and on/off switch. */
const char *debug_path = MEMORY_DUMP_PATH;
FILE *debug_file;
int memory_dump = 1;            /* non-zero: write the log to debug_file */

/* Failure injection for memory_alloc(). */
int alloc_num = 0;              /* running id of memory_alloc() calls */
int alloc_fail = -1;            /* id of the allocation that is made to fail */
int alloc_trap_on_fail = 0;     /* rather than failing, execute a trap (for gdb use) */

/* Failure injection for memory_realloc(). */
int realloc_num = 0;            /* running id of memory_realloc() calls */
int realloc_fail = -1;          /* id of the reallocation that is made to fail */
int realloc_trap_on_fail = 0;   /* rather than failing, execute a trap (for gdb use) */
26
/*
 * Reads the environment variable `name` as a decimal integer.
 *
 * Returns `def` when the variable is not set, or when its value is not a
 * complete decimal number representable as an int.
 */
static int
env_getint(const char* name, int def) {
	const char* val = getenv(name);
	char* stop;
	long parsed;

	if (val == NULL) {
		return def;
	}

	// strtol, unlike atoi, tells where parsing stopped; this lets an empty
	// or malformed value be told apart from a genuine 0
	parsed = strtol(val, &stop, 10);
	if (stop == val || *stop != '\0') {
		return def;
	}

	// reject numbers that would not survive narrowing to int
	if (parsed != (long)(int)parsed) {
		return def;
	}

	return (int)parsed;
}
35
/* Returns 1 when the environment variable `name` is set (even to an
 * empty string), 0 otherwise. */
static int
env_exists(const char* name) {
	const char* value = getenv(name);

	return value ? 1 : 0;
}
40
41 static
initialize_memory_debug(void)42 void initialize_memory_debug(void) {
43 if (env_exists("ALLOC_NODUMP")) {
44 memory_dump = 0;
45 }
46
47 alloc_fail = env_getint("ALLOC_FAIL", alloc_fail);
48 realloc_fail = env_getint("REALLOC_FAIL", realloc_fail);
49
50 alloc_trap_on_fail = env_exists("ALLOC_TRAP");
51 realloc_trap_on_fail = env_exists("REALLOC_TRAP");
52
53 if (memory_dump) {
54 debug_file = fopen(debug_path, "wt");
55 if (debug_file == NULL) {
56 PyErr_WarnEx(PyExc_RuntimeWarning, "Cannot open file, logging on stderr", 1);
57 debug_file = stderr;
58 }
59 }
60 }
61 #endif
62
/*
 * Allocates `size` bytes with PyMem_Malloc and returns the result
 * (NULL on failure).
 *
 * With MEMORY_DEBUG defined:
 *  - the call whose id equals alloc_fail does not allocate anything and
 *    returns NULL (or executes a trap if alloc_trap_on_fail is set);
 *  - when memory_dump is set, each performed allocation is logged to
 *    debug_file as "A <id> <pointer> <size>".
 */
void* memory_alloc(ssize_t size) {
#ifdef MEMORY_DEBUG
	if (alloc_num == alloc_fail) {
		if (alloc_trap_on_fail) {
			__builtin_trap();
		}

		printf("DEBUG: allocation #%d failed\n", alloc_num);
		alloc_num += 1;
		return NULL;
	}
#endif
	void* res = PyMem_Malloc(size);

#ifdef MEMORY_DEBUG
	alloc_num += 1;
	if (memory_dump) {
		// %zd is the conversion matching ssize_t; %ld is wrong wherever
		// long and ssize_t differ in width
		fprintf(debug_file, "A %d %p %zd\n", alloc_num, res, size);
	}
#endif

	return res;
}
85
86
/*
 * Resizes the block `ptr` to `size` bytes with PyMem_Realloc and returns
 * the result (NULL on failure).
 *
 * With MEMORY_DEBUG defined:
 *  - the call whose id equals realloc_fail does not touch `ptr` and
 *    returns NULL (or executes a trap if realloc_trap_on_fail is set);
 *  - when memory_dump is set, each performed reallocation is logged to
 *    debug_file as "R <id> <old pointer> <new pointer> <size>".
 */
void* memory_realloc(void* ptr, size_t size) {
#ifdef MEMORY_DEBUG
	if (realloc_num == realloc_fail) {
		if (realloc_trap_on_fail) {
			__builtin_trap();
		}

		printf("DEBUG: reallocation #%d failed\n", realloc_num);
		realloc_num += 1;
		return NULL;
	}
#endif
	void* res = PyMem_Realloc(ptr, size);

#ifdef MEMORY_DEBUG
	realloc_num += 1;
	if (memory_dump) {
		// %zu is the conversion matching size_t; %ld expects a signed long
		fprintf(debug_file, "R %d %p %p %zu\n", realloc_num, ptr, res, size);
	}
#endif

	return res;
}
110
111
/*
 * Releases `ptr` with PyMem_Free.
 *
 * With MEMORY_DEBUG defined and memory_dump set, the pointer is first
 * logged to debug_file as "F <pointer>".
 */
void memory_free(void* ptr) {
#ifdef MEMORY_DEBUG
	if (memory_dump) {
		fprintf(debug_file, "F %p\n", ptr);
	}
#endif

	PyMem_Free(ptr);
}
119
120
/* Like memory_free, but does nothing at all (no call, no log entry)
 * when `ptr` is NULL. */
void memory_safefree(void* ptr) {
	if (ptr == NULL) {
		return;
	}

	memory_free(ptr);
}
126
127
#if !defined(PY3K) || !defined(AHOCORASICK_UNICODE)
// defined when pymod_get_string makes a copy of string
#	define INPUT_KEEPS_COPY
#endif

/*
 * maybe_free(flag, word)   - releases the letter buffer `word` if this build
 *                            (and, for PEP393_UNICODE, `flag`) says it is a copy
 * maybe_decref(flag, ref)  - drops the reference `ref` if this build
 *                            (and, for PEP393_UNICODE, `flag`) says one is held
 *
 * Every non-empty expansion is a brace-enclosed block with parenthesised
 * arguments: it can be written with or without a trailing semicolon and
 * it cannot capture an `else` that follows the invocation.
 */
#if defined INPUT_KEEPS_COPY
#	define maybe_free(flag, word)	{ memory_free(word); }
#	define maybe_decref(flag, ref)
#elif defined PEP393_UNICODE
#	define maybe_free(flag, word)	{ if (flag) { memory_free(word); } }
#	define maybe_decref(flag, ref)	{ if ((ref) && !(flag)) { Py_DECREF(ref); } }
#else
#	define maybe_free(flag, word)
#	define maybe_decref(flag, ref)	{ if (ref) { Py_DECREF(ref); } }
#endif
143
144 /* returns bytes or unicode internal buffer */
145 static PyObject*
pymod_get_string(PyObject * obj,TRIE_LETTER_TYPE ** word,ssize_t * wordlen,bool * is_copy)146 pymod_get_string(PyObject* obj, TRIE_LETTER_TYPE** word, ssize_t* wordlen, bool* is_copy) {
147
148 #ifdef INPUT_KEEPS_COPY
149 ssize_t i;
150 char* bytes;
151 #endif
152
153 #if defined PEP393_UNICODE
154 if (F(PyUnicode_Check)(obj)) {
155 PyUnicode_READY(obj);
156 if (PyUnicode_KIND(obj) == PyUnicode_4BYTE_KIND) {
157 *word = (TRIE_LETTER_TYPE*)(PyUnicode_4BYTE_DATA(obj));
158 *wordlen = PyUnicode_GET_LENGTH(obj);
159 *is_copy = false;
160 Py_INCREF(obj);
161
162 return obj;
163 } else {
164 *word = PyUnicode_AsUCS4Copy(obj);
165 *wordlen = PyUnicode_GET_LENGTH(obj);
166 *is_copy = true;
167 // No INCREF - we have our copy
168 return obj;
169 }
170 }
171 else {
172 PyErr_SetString(PyExc_TypeError, "string expected");
173 return NULL;
174 }
175 #elif defined PY3K
176 # ifdef AHOCORASICK_UNICODE
177 if (F(PyUnicode_Check)(obj)) {
178 *word = (TRIE_LETTER_TYPE*)(PyUnicode_AS_UNICODE(obj));
179 *wordlen = PyUnicode_GET_SIZE(obj);
180 Py_INCREF(obj);
181 return obj;
182 }
183 else {
184 PyErr_SetString(PyExc_TypeError, "string expected");
185 return NULL;
186 }
187 # else
188 # ifndef INPUT_KEEPS_COPY
189 # error "defines inconsistency"
190 # endif
191 if (F(PyBytes_Check)(obj)) {
192 *wordlen = PyBytes_GET_SIZE(obj);
193 *word = (TRIE_LETTER_TYPE*)memory_alloc(*wordlen * TRIE_LETTER_SIZE);
194 if (*word == NULL) {
195 PyErr_NoMemory();
196 return NULL;
197 }
198
199 bytes = PyBytes_AS_STRING(obj);
200 for (i=0; i < *wordlen; i++) {
201 (*word)[i] = bytes[i];
202 }
203 // Note: there is no INCREF
204 return obj;
205 }
206 else {
207 PyErr_SetString(PyExc_TypeError, "bytes expected");
208 return NULL;
209 }
210 # endif
211 #else // PY_MAJOR_VERSION == 3
212 # ifndef INPUT_KEEPS_COPY
213 # error "defines inconsistency"
214 # endif
215 if (F(PyString_Check)(obj)) {
216 *wordlen = PyString_GET_SIZE(obj);
217 *word = (TRIE_LETTER_TYPE*)memory_alloc(*wordlen * TRIE_LETTER_SIZE);
218 if (*word == NULL) {
219 PyErr_NoMemory();
220 return NULL;
221 }
222
223
224 bytes = PyString_AS_STRING(obj);
225 for (i=0; i < *wordlen; i++) {
226 (*word)[i] = bytes[i];
227 };
228
229 Py_INCREF(obj);
230 return obj;
231 } else {
232 PyErr_SetString(PyExc_TypeError, "string required");
233 return NULL;
234 }
235 #endif
236 }
237
238 static bool
__read_sequence__from_tuple(PyObject * obj,TRIE_LETTER_TYPE ** word,ssize_t * wordlen)239 __read_sequence__from_tuple(PyObject* obj, TRIE_LETTER_TYPE** word, ssize_t* wordlen) {
240 Py_ssize_t i;
241 Py_ssize_t size = PyTuple_GET_SIZE(obj);
242 TRIE_LETTER_TYPE* tmpword;
243
244 tmpword = (TRIE_LETTER_TYPE*)memory_alloc(size * TRIE_LETTER_SIZE);
245 if (UNLIKELY(tmpword == NULL)) {
246 PyErr_NoMemory();
247 return false;
248 }
249
250 for (i=0; i < size; i++) {
251 Py_ssize_t value = F(PyNumber_AsSsize_t)(F(PyTuple_GetItem)(obj, i), PyExc_ValueError);
252 if (value == -1 && PyErr_Occurred()) {
253 PyErr_Format(PyExc_ValueError, "item #%zd is not a number", i);
254 memory_free(tmpword);
255 return false;
256 }
257
258
259 // TODO: both min and max values should be configured
260 #if TRIE_LETTER_SIZE == 4
261 #define MAX_VAL 4294967295l
262 #else
263 #define MAX_VAL 65535ul
264 #endif
265 if (value < 0 || value > MAX_VAL) {
266 PyErr_Format(PyExc_ValueError, "item #%zd: value %zd outside range [%d..%lu]", i, value, 0, MAX_VAL);
267 memory_free(tmpword);
268 return false;
269 }
270
271 tmpword[i] = (TRIE_LETTER_TYPE)value;
272 }
273
274 *word = tmpword;
275 *wordlen = size;
276
277 return true;
278 }
279
280
281 static bool
pymod_get_sequence(PyObject * obj,TRIE_LETTER_TYPE ** word,ssize_t * wordlen)282 pymod_get_sequence(PyObject* obj, TRIE_LETTER_TYPE** word, ssize_t* wordlen) {
283 if (LIKELY(F(PyTuple_Check)(obj))) {
284 return __read_sequence__from_tuple(obj, word, wordlen);
285 } else {
286 PyErr_Format(PyExc_TypeError, "argument is not a supported sequence type");
287 return false;
288 }
289 }
290
291
292 /* parse optional indexes used in few functions [start, [end]] */
293 static int
pymod_parse_start_end(PyObject * args,int idx_start,int idx_end,const ssize_t min,const ssize_t max,ssize_t * Start,ssize_t * End)294 pymod_parse_start_end(
295 PyObject* args,
296 int idx_start, int idx_end,
297 const ssize_t min, const ssize_t max,
298 ssize_t* Start, ssize_t* End
299 ) {
300 PyObject* obj;
301 #define start (*Start)
302 #define end (*End)
303
304 start = min;
305 end = max;
306
307 // first argument
308 obj = F(PyTuple_GetItem)(args, idx_start);
309 if (obj == NULL) {
310 PyErr_Clear();
311 return 0;
312 }
313
314 obj = F(PyNumber_Index)(obj);
315 if (obj == NULL)
316 return -1;
317
318 start = F(PyNumber_AsSsize_t)(obj, PyExc_IndexError);
319 Py_DECREF(obj);
320 if (start == -1 and PyErr_Occurred())
321 return -1;
322
323 if (start < 0)
324 start = max + start;
325
326 if (start < min or start >= max) {
327 PyErr_Format(PyExc_IndexError, "start index not in range %zd..%zd", min, max);
328 return -1;
329 }
330
331 // second argument
332 obj = F(PyTuple_GetItem)(args, idx_end);
333 if (obj == NULL) {
334 PyErr_Clear();
335 return 0;
336 }
337
338 obj = F(PyNumber_Index)(obj);
339 if (obj == NULL)
340 return -1;
341
342 end = F(PyNumber_AsSsize_t)(obj, PyExc_IndexError);
343 Py_DECREF(obj);
344 if (end == -1 and PyErr_Occurred())
345 return -1;
346
347 if (end < 0)
348 end = max - 1 + end;
349
350 if (end < min or end > max) {
351 PyErr_Format(PyExc_IndexError, "end index not in range %zd..%zd", min, max);
352 return -1;
353 }
354
355 return 0;
356
357 #undef start
358 #undef end
359 }
360
361
init_input(struct Input * input)362 void init_input(struct Input* input) {
363 input->word = NULL;
364 input->py_word = NULL;
365 }
366
367
/*
 * Fills `input` with the letters of `tuple`, read according to the key
 * type of the automaton `self`:
 *  - KEY_STRING: via pymod_get_string, whose result is kept in
 *    input->py_word;
 *  - any other key type: via pymod_get_sequence; input->py_word is set
 *    to NULL and input->is_copy to true.
 *
 * Returns true on success, false when the helper reported an error.
 */
bool prepare_input(PyObject* self, PyObject* tuple, struct Input* input) {
	Automaton* automaton = (Automaton*)self;

	if (automaton->key_type != KEY_STRING) {
		input->py_word = NULL;
		input->is_copy = true; // we always create a copy of sequence
		return pymod_get_sequence(tuple, &input->word, &input->wordlen);
	}

	input->py_word = pymod_get_string(tuple, &input->word, &input->wordlen, &input->is_copy);
	return input->py_word != NULL;
}
385
386
/*
 * Takes the item number `index` from the tuple `args` and passes it to
 * prepare_input.
 *
 * Returns false when the item cannot be fetched, otherwise the result of
 * prepare_input.
 */
bool prepare_input_from_tuple(PyObject* self, PyObject* args, int index, struct Input* input) {
	PyObject* item = F(PyTuple_GetItem)(args, index);

	if (item == NULL) {
		return false;
	}

	return prepare_input(self, item, input);
}
396
397
destroy_input(struct Input * input)398 void destroy_input(struct Input* input) {
399 maybe_decref(input->is_copy, input->py_word)
400 maybe_free(input->is_copy, input->word)
401 }
402
403
assign_input(struct Input * dst,struct Input * src)404 void assign_input(struct Input* dst, struct Input* src) {
405
406 dst->wordlen = src->wordlen;
407 dst->word = src->word;
408 dst->py_word = src->py_word; // Note: there is no INCREF
409 }
410