1 /* stringlib: split implementation */
2 
3 #ifndef STRINGLIB_FASTSEARCH_H
4 #error must include "stringlib/fastsearch.h" before including this module
5 #endif
6 
7 /* Overallocate the initial list to reduce the number of reallocs for small
8    split sizes.  Eg, "A A A A A A A A A A".split() (10 elements) has three
9    resizes, to sizes 4, 8, then 16.  Most observed string splits are for human
10    text (roughly 11 words per line) and field delimited data (usually 1-10
11    fields).  For large strings the split algorithms are bandwidth limited
12    so increasing the preallocation likely will not improve things.*/
13 
/* Upper bound on the number of list slots allocated up front. */
#define MAX_PREALLOC 12

/* Initial list size for a split limited to `maxsplit` splits: one slot per
   possible element (5 splits gives 6 elements), capped at MAX_PREALLOC.
   The argument is parenthesized so that an expression argument (for example
   `a & b` or `c ? x : y`) expands with the intended precedence.  It may
   still be evaluated twice, so it must not have side effects. */
#define PREALLOC_SIZE(maxsplit) \
    ((maxsplit) >= MAX_PREALLOC ? MAX_PREALLOC : (maxsplit) + 1)
19 
/* Create the substring data[left:right] with STRINGLIB_NEW and append it to
   `list`.  Relies on the enclosing function providing the variables `sub`
   and `list` and the label `onError`, which is jumped to when either the
   substring cannot be created or the append fails.  The local reference to
   the substring is always dropped after the append attempt.

   The body is wrapped in do { } while (0) so the macro behaves as exactly
   one statement (e.g. under an unbraced `if`); invoke it with a trailing
   semicolon. */
#define SPLIT_APPEND(data, left, right)                     \
    do {                                                    \
        int split_append_failed_;                           \
        sub = STRINGLIB_NEW((data) + (left),                \
                            (right) - (left));              \
        if (sub == NULL)                                    \
            goto onError;                                   \
        split_append_failed_ = PyList_Append(list, sub);    \
        Py_DECREF(sub);                                     \
        if (split_append_failed_)                           \
            goto onError;                                   \
    } while (0)
31 
/* Create the substring data[left:right] with STRINGLIB_NEW and store it as
   element number `count` of `list`, then increment `count`.

   While `count` is below MAX_PREALLOC the item is written directly into the
   slot with PyList_SET_ITEM and no Py_DECREF follows (the slot keeps the
   reference); from then on the item is appended with PyList_Append and the
   local reference is dropped.  Relies on the enclosing function providing
   the variables `sub`, `list`, `count` and the label `onError`, which is
   jumped to on any failure.

   The body is wrapped in do { } while (0) so that
   `if (x) SPLIT_ADD(...); else ...` parses correctly; invoke it with a
   trailing semicolon. */
#define SPLIT_ADD(data, left, right)                            \
    do {                                                        \
        sub = STRINGLIB_NEW((data) + (left),                    \
                            (right) - (left));                  \
        if (sub == NULL)                                        \
            goto onError;                                       \
        if (count < MAX_PREALLOC) {                             \
            PyList_SET_ITEM(list, count, sub);                  \
        }                                                       \
        else {                                                  \
            int split_add_failed_ = PyList_Append(list, sub);   \
            Py_DECREF(sub);                                     \
            if (split_add_failed_)                              \
                goto onError;                                   \
        }                                                       \
        count++;                                                \
    } while (0)
48 
49 
/* Set the recorded size of `list` to `count`, the number of items the
   enclosing function actually stored, regardless of how many slots were
   preallocated. */
#define FIX_PREALLOC_SIZE(list) \
    (Py_SIZE(list) = count)
52 
53 Py_LOCAL_INLINE(PyObject *)
STRINGLIB(split_whitespace)54 STRINGLIB(split_whitespace)(PyObject* str_obj,
55                            const STRINGLIB_CHAR* str, Py_ssize_t str_len,
56                            Py_ssize_t maxcount)
57 {
58     Py_ssize_t i, j, count=0;
59     PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
60     PyObject *sub;
61 
62     if (list == NULL)
63         return NULL;
64 
65     i = j = 0;
66     while (maxcount-- > 0) {
67         while (i < str_len && STRINGLIB_ISSPACE(str[i]))
68             i++;
69         if (i == str_len) break;
70         j = i; i++;
71         while (i < str_len && !STRINGLIB_ISSPACE(str[i]))
72             i++;
73 #ifndef STRINGLIB_MUTABLE
74         if (j == 0 && i == str_len && STRINGLIB_CHECK_EXACT(str_obj)) {
75             /* No whitespace in str_obj, so just use it as list[0] */
76             Py_INCREF(str_obj);
77             PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
78             count++;
79             break;
80         }
81 #endif
82         SPLIT_ADD(str, j, i);
83     }
84 
85     if (i < str_len) {
86         /* Only occurs when maxcount was reached */
87         /* Skip any remaining whitespace and copy to end of string */
88         while (i < str_len && STRINGLIB_ISSPACE(str[i]))
89             i++;
90         if (i != str_len)
91             SPLIT_ADD(str, i, str_len);
92     }
93     FIX_PREALLOC_SIZE(list);
94     return list;
95 
96   onError:
97     Py_DECREF(list);
98     return NULL;
99 }
100 
101 Py_LOCAL_INLINE(PyObject *)
STRINGLIB(split_char)102 STRINGLIB(split_char)(PyObject* str_obj,
103                      const STRINGLIB_CHAR* str, Py_ssize_t str_len,
104                      const STRINGLIB_CHAR ch,
105                      Py_ssize_t maxcount)
106 {
107     Py_ssize_t i, j, count=0;
108     PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
109     PyObject *sub;
110 
111     if (list == NULL)
112         return NULL;
113 
114     i = j = 0;
115     while ((j < str_len) && (maxcount-- > 0)) {
116         for(; j < str_len; j++) {
117             /* I found that using memchr makes no difference */
118             if (str[j] == ch) {
119                 SPLIT_ADD(str, i, j);
120                 i = j = j + 1;
121                 break;
122             }
123         }
124     }
125 #ifndef STRINGLIB_MUTABLE
126     if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
127         /* ch not in str_obj, so just use str_obj as list[0] */
128         Py_INCREF(str_obj);
129         PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
130         count++;
131     } else
132 #endif
133     if (i <= str_len) {
134         SPLIT_ADD(str, i, str_len);
135     }
136     FIX_PREALLOC_SIZE(list);
137     return list;
138 
139   onError:
140     Py_DECREF(list);
141     return NULL;
142 }
143 
144 Py_LOCAL_INLINE(PyObject *)
STRINGLIB(split)145 STRINGLIB(split)(PyObject* str_obj,
146                 const STRINGLIB_CHAR* str, Py_ssize_t str_len,
147                 const STRINGLIB_CHAR* sep, Py_ssize_t sep_len,
148                 Py_ssize_t maxcount)
149 {
150     Py_ssize_t i, j, pos, count=0;
151     PyObject *list, *sub;
152 
153     if (sep_len == 0) {
154         PyErr_SetString(PyExc_ValueError, "empty separator");
155         return NULL;
156     }
157     else if (sep_len == 1)
158         return STRINGLIB(split_char)(str_obj, str, str_len, sep[0], maxcount);
159 
160     list = PyList_New(PREALLOC_SIZE(maxcount));
161     if (list == NULL)
162         return NULL;
163 
164     i = j = 0;
165     while (maxcount-- > 0) {
166         pos = FASTSEARCH(str+i, str_len-i, sep, sep_len, -1, FAST_SEARCH);
167         if (pos < 0)
168             break;
169         j = i + pos;
170         SPLIT_ADD(str, i, j);
171         i = j + sep_len;
172     }
173 #ifndef STRINGLIB_MUTABLE
174     if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
175         /* No match in str_obj, so just use it as list[0] */
176         Py_INCREF(str_obj);
177         PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
178         count++;
179     } else
180 #endif
181     {
182         SPLIT_ADD(str, i, str_len);
183     }
184     FIX_PREALLOC_SIZE(list);
185     return list;
186 
187   onError:
188     Py_DECREF(list);
189     return NULL;
190 }
191 
192 Py_LOCAL_INLINE(PyObject *)
STRINGLIB(rsplit_whitespace)193 STRINGLIB(rsplit_whitespace)(PyObject* str_obj,
194                             const STRINGLIB_CHAR* str, Py_ssize_t str_len,
195                             Py_ssize_t maxcount)
196 {
197     Py_ssize_t i, j, count=0;
198     PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
199     PyObject *sub;
200 
201     if (list == NULL)
202         return NULL;
203 
204     i = j = str_len - 1;
205     while (maxcount-- > 0) {
206         while (i >= 0 && STRINGLIB_ISSPACE(str[i]))
207             i--;
208         if (i < 0) break;
209         j = i; i--;
210         while (i >= 0 && !STRINGLIB_ISSPACE(str[i]))
211             i--;
212 #ifndef STRINGLIB_MUTABLE
213         if (j == str_len - 1 && i < 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
214             /* No whitespace in str_obj, so just use it as list[0] */
215             Py_INCREF(str_obj);
216             PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
217             count++;
218             break;
219         }
220 #endif
221         SPLIT_ADD(str, i + 1, j + 1);
222     }
223 
224     if (i >= 0) {
225         /* Only occurs when maxcount was reached */
226         /* Skip any remaining whitespace and copy to beginning of string */
227         while (i >= 0 && STRINGLIB_ISSPACE(str[i]))
228             i--;
229         if (i >= 0)
230             SPLIT_ADD(str, 0, i + 1);
231     }
232     FIX_PREALLOC_SIZE(list);
233     if (PyList_Reverse(list) < 0)
234         goto onError;
235     return list;
236 
237   onError:
238     Py_DECREF(list);
239     return NULL;
240 }
241 
242 Py_LOCAL_INLINE(PyObject *)
STRINGLIB(rsplit_char)243 STRINGLIB(rsplit_char)(PyObject* str_obj,
244                       const STRINGLIB_CHAR* str, Py_ssize_t str_len,
245                       const STRINGLIB_CHAR ch,
246                       Py_ssize_t maxcount)
247 {
248     Py_ssize_t i, j, count=0;
249     PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
250     PyObject *sub;
251 
252     if (list == NULL)
253         return NULL;
254 
255     i = j = str_len - 1;
256     while ((i >= 0) && (maxcount-- > 0)) {
257         for(; i >= 0; i--) {
258             if (str[i] == ch) {
259                 SPLIT_ADD(str, i + 1, j + 1);
260                 j = i = i - 1;
261                 break;
262             }
263         }
264     }
265 #ifndef STRINGLIB_MUTABLE
266     if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
267         /* ch not in str_obj, so just use str_obj as list[0] */
268         Py_INCREF(str_obj);
269         PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
270         count++;
271     } else
272 #endif
273     if (j >= -1) {
274         SPLIT_ADD(str, 0, j + 1);
275     }
276     FIX_PREALLOC_SIZE(list);
277     if (PyList_Reverse(list) < 0)
278         goto onError;
279     return list;
280 
281   onError:
282     Py_DECREF(list);
283     return NULL;
284 }
285 
286 Py_LOCAL_INLINE(PyObject *)
STRINGLIB(rsplit)287 STRINGLIB(rsplit)(PyObject* str_obj,
288                  const STRINGLIB_CHAR* str, Py_ssize_t str_len,
289                  const STRINGLIB_CHAR* sep, Py_ssize_t sep_len,
290                  Py_ssize_t maxcount)
291 {
292     Py_ssize_t j, pos, count=0;
293     PyObject *list, *sub;
294 
295     if (sep_len == 0) {
296         PyErr_SetString(PyExc_ValueError, "empty separator");
297         return NULL;
298     }
299     else if (sep_len == 1)
300         return STRINGLIB(rsplit_char)(str_obj, str, str_len, sep[0], maxcount);
301 
302     list = PyList_New(PREALLOC_SIZE(maxcount));
303     if (list == NULL)
304         return NULL;
305 
306     j = str_len;
307     while (maxcount-- > 0) {
308         pos = FASTSEARCH(str, j, sep, sep_len, -1, FAST_RSEARCH);
309         if (pos < 0)
310             break;
311         SPLIT_ADD(str, pos + sep_len, j);
312         j = pos;
313     }
314 #ifndef STRINGLIB_MUTABLE
315     if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
316         /* No match in str_obj, so just use it as list[0] */
317         Py_INCREF(str_obj);
318         PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
319         count++;
320     } else
321 #endif
322     {
323         SPLIT_ADD(str, 0, j);
324     }
325     FIX_PREALLOC_SIZE(list);
326     if (PyList_Reverse(list) < 0)
327         goto onError;
328     return list;
329 
330   onError:
331     Py_DECREF(list);
332     return NULL;
333 }
334 
335 Py_LOCAL_INLINE(PyObject *)
STRINGLIB(splitlines)336 STRINGLIB(splitlines)(PyObject* str_obj,
337                      const STRINGLIB_CHAR* str, Py_ssize_t str_len,
338                      int keepends)
339 {
340     /* This does not use the preallocated list because splitlines is
341        usually run with hundreds of newlines.  The overhead of
342        switching between PyList_SET_ITEM and append causes about a
343        2-3% slowdown for that common case.  A smarter implementation
344        could move the if check out, so the SET_ITEMs are done first
345        and the appends only done when the prealloc buffer is full.
346        That's too much work for little gain.*/
347 
348     Py_ssize_t i;
349     Py_ssize_t j;
350     PyObject *list = PyList_New(0);
351     PyObject *sub;
352 
353     if (list == NULL)
354         return NULL;
355 
356     for (i = j = 0; i < str_len; ) {
357         Py_ssize_t eol;
358 
359         /* Find a line and append it */
360         while (i < str_len && !STRINGLIB_ISLINEBREAK(str[i]))
361             i++;
362 
363         /* Skip the line break reading CRLF as one line break */
364         eol = i;
365         if (i < str_len) {
366             if (str[i] == '\r' && i + 1 < str_len && str[i+1] == '\n')
367                 i += 2;
368             else
369                 i++;
370             if (keepends)
371                 eol = i;
372         }
373 #ifndef STRINGLIB_MUTABLE
374         if (j == 0 && eol == str_len && STRINGLIB_CHECK_EXACT(str_obj)) {
375             /* No linebreak in str_obj, so just use it as list[0] */
376             if (PyList_Append(list, str_obj))
377                 goto onError;
378             break;
379         }
380 #endif
381         SPLIT_APPEND(str, j, eol);
382         j = i;
383     }
384     return list;
385 
386   onError:
387     Py_DECREF(list);
388     return NULL;
389 }
390 
391