1 #if STRINGLIB_IS_UNICODE
2 # error "transmogrify.h only compatible with byte-wise strings"
3 #endif
4 
5 /* the more complicated methods.  parts of these should be pulled out into the
6    shared code in bytes_methods.c to cut down on duplicate code bloat.  */
7 
8 /*[clinic input]
9 class B "PyObject *" "&PyType_Type"
10 [clinic start generated code]*/
11 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=2935558188d97c76]*/
12 
13 #include "clinic/transmogrify.h.h"
14 
15 static inline PyObject *
return_self(PyObject * self)16 return_self(PyObject *self)
17 {
18 #if !STRINGLIB_MUTABLE
19     if (STRINGLIB_CHECK_EXACT(self)) {
20         Py_INCREF(self);
21         return self;
22     }
23 #endif
24     return STRINGLIB_NEW(STRINGLIB_STR(self), STRINGLIB_LEN(self));
25 }
26 
27 /*[clinic input]
28 B.expandtabs as stringlib_expandtabs
29 
30     tabsize: int = 8
31 
32 Return a copy where all tab characters are expanded using spaces.
33 
34 If tabsize is not given, a tab size of 8 characters is assumed.
35 [clinic start generated code]*/
36 
37 static PyObject *
stringlib_expandtabs_impl(PyObject * self,int tabsize)38 stringlib_expandtabs_impl(PyObject *self, int tabsize)
39 /*[clinic end generated code: output=069cb7fae72e4c2b input=3c6d3b12aa3ccbea]*/
40 {
41     const char *e, *p;
42     char *q;
43     Py_ssize_t i, j;
44     PyObject *u;
45 
46     /* First pass: determine size of output string */
47     i = j = 0;
48     e = STRINGLIB_STR(self) + STRINGLIB_LEN(self);
49     for (p = STRINGLIB_STR(self); p < e; p++) {
50         if (*p == '\t') {
51             if (tabsize > 0) {
52                 Py_ssize_t incr = tabsize - (j % tabsize);
53                 if (j > PY_SSIZE_T_MAX - incr)
54                     goto overflow;
55                 j += incr;
56             }
57         }
58         else {
59             if (j > PY_SSIZE_T_MAX - 1)
60                 goto overflow;
61             j++;
62             if (*p == '\n' || *p == '\r') {
63                 if (i > PY_SSIZE_T_MAX - j)
64                     goto overflow;
65                 i += j;
66                 j = 0;
67             }
68         }
69     }
70 
71     if (i > PY_SSIZE_T_MAX - j)
72         goto overflow;
73 
74     /* Second pass: create output string and fill it */
75     u = STRINGLIB_NEW(NULL, i + j);
76     if (!u)
77         return NULL;
78 
79     j = 0;
80     q = STRINGLIB_STR(u);
81 
82     for (p = STRINGLIB_STR(self); p < e; p++) {
83         if (*p == '\t') {
84             if (tabsize > 0) {
85                 i = tabsize - (j % tabsize);
86                 j += i;
87                 while (i--)
88                     *q++ = ' ';
89             }
90         }
91         else {
92             j++;
93             *q++ = *p;
94             if (*p == '\n' || *p == '\r')
95                 j = 0;
96         }
97     }
98 
99     return u;
100   overflow:
101     PyErr_SetString(PyExc_OverflowError, "result too long");
102     return NULL;
103 }
104 
105 static inline PyObject *
pad(PyObject * self,Py_ssize_t left,Py_ssize_t right,char fill)106 pad(PyObject *self, Py_ssize_t left, Py_ssize_t right, char fill)
107 {
108     PyObject *u;
109 
110     if (left < 0)
111         left = 0;
112     if (right < 0)
113         right = 0;
114 
115     if (left == 0 && right == 0) {
116         return return_self(self);
117     }
118 
119     u = STRINGLIB_NEW(NULL, left + STRINGLIB_LEN(self) + right);
120     if (u) {
121         if (left)
122             memset(STRINGLIB_STR(u), fill, left);
123         memcpy(STRINGLIB_STR(u) + left,
124                STRINGLIB_STR(self),
125                STRINGLIB_LEN(self));
126         if (right)
127             memset(STRINGLIB_STR(u) + left + STRINGLIB_LEN(self),
128                    fill, right);
129     }
130 
131     return u;
132 }
133 
134 /*[clinic input]
135 B.ljust as stringlib_ljust
136 
137     width: Py_ssize_t
138     fillchar: char = b' '
139     /
140 
141 Return a left-justified string of length width.
142 
143 Padding is done using the specified fill character.
144 [clinic start generated code]*/
145 
146 static PyObject *
stringlib_ljust_impl(PyObject * self,Py_ssize_t width,char fillchar)147 stringlib_ljust_impl(PyObject *self, Py_ssize_t width, char fillchar)
148 /*[clinic end generated code: output=c79ca173c5ff8337 input=eff2d014bc7d80df]*/
149 {
150     if (STRINGLIB_LEN(self) >= width) {
151         return return_self(self);
152     }
153 
154     return pad(self, 0, width - STRINGLIB_LEN(self), fillchar);
155 }
156 
157 
158 /*[clinic input]
159 B.rjust as stringlib_rjust
160 
161     width: Py_ssize_t
162     fillchar: char = b' '
163     /
164 
165 Return a right-justified string of length width.
166 
167 Padding is done using the specified fill character.
168 [clinic start generated code]*/
169 
170 static PyObject *
stringlib_rjust_impl(PyObject * self,Py_ssize_t width,char fillchar)171 stringlib_rjust_impl(PyObject *self, Py_ssize_t width, char fillchar)
172 /*[clinic end generated code: output=7df5d728a5439570 input=218b0bd31308955d]*/
173 {
174     if (STRINGLIB_LEN(self) >= width) {
175         return return_self(self);
176     }
177 
178     return pad(self, width - STRINGLIB_LEN(self), 0, fillchar);
179 }
180 
181 
182 /*[clinic input]
183 B.center as stringlib_center
184 
185     width: Py_ssize_t
186     fillchar: char = b' '
187     /
188 
189 Return a centered string of length width.
190 
191 Padding is done using the specified fill character.
192 [clinic start generated code]*/
193 
194 static PyObject *
stringlib_center_impl(PyObject * self,Py_ssize_t width,char fillchar)195 stringlib_center_impl(PyObject *self, Py_ssize_t width, char fillchar)
196 /*[clinic end generated code: output=d8da2e055288b4c2 input=3776fd278765d89b]*/
197 {
198     Py_ssize_t marg, left;
199 
200     if (STRINGLIB_LEN(self) >= width) {
201         return return_self(self);
202     }
203 
204     marg = width - STRINGLIB_LEN(self);
205     left = marg / 2 + (marg & width & 1);
206 
207     return pad(self, left, marg - left, fillchar);
208 }
209 
210 /*[clinic input]
211 B.zfill as stringlib_zfill
212 
213     width: Py_ssize_t
214     /
215 
216 Pad a numeric string with zeros on the left, to fill a field of the given width.
217 
218 The original string is never truncated.
219 [clinic start generated code]*/
220 
221 static PyObject *
stringlib_zfill_impl(PyObject * self,Py_ssize_t width)222 stringlib_zfill_impl(PyObject *self, Py_ssize_t width)
223 /*[clinic end generated code: output=0b3c684a7f1b2319 input=2da6d7b8e9bcb19a]*/
224 {
225     Py_ssize_t fill;
226     PyObject *s;
227     char *p;
228 
229     if (STRINGLIB_LEN(self) >= width) {
230         return return_self(self);
231     }
232 
233     fill = width - STRINGLIB_LEN(self);
234 
235     s = pad(self, fill, 0, '0');
236 
237     if (s == NULL)
238         return NULL;
239 
240     p = STRINGLIB_STR(s);
241     if (p[fill] == '+' || p[fill] == '-') {
242         /* move sign to beginning of string */
243         p[0] = p[fill];
244         p[fill] = '0';
245     }
246 
247     return s;
248 }
249 
250 
251 /* find and count characters and substrings */
252 
/* Locate the first byte equal to `ch` in the first `buf_len` bytes of `buf`
   using memchr(); evaluates to a char * to the match, or NULL if absent. */
#define findchar(buf, buf_len, ch)                              \
  ((char *)memchr((const void *)(buf), (ch), (buf_len)))
255 
256 
257 static Py_ssize_t
countchar(const char * target,Py_ssize_t target_len,char c,Py_ssize_t maxcount)258 countchar(const char *target, Py_ssize_t target_len, char c,
259           Py_ssize_t maxcount)
260 {
261     Py_ssize_t count = 0;
262     const char *start = target;
263     const char *end = target + target_len;
264 
265     while ((start = findchar(start, end - start, c)) != NULL) {
266         count++;
267         if (count >= maxcount)
268             break;
269         start += 1;
270     }
271     return count;
272 }
273 
274 
275 /* Algorithms for different cases of string replacement */
276 
277 /* len(self)>=1, from="", len(to)>=1, maxcount>=1 */
278 static PyObject *
stringlib_replace_interleave(PyObject * self,const char * to_s,Py_ssize_t to_len,Py_ssize_t maxcount)279 stringlib_replace_interleave(PyObject *self,
280                              const char *to_s, Py_ssize_t to_len,
281                              Py_ssize_t maxcount)
282 {
283     const char *self_s;
284     char *result_s;
285     Py_ssize_t self_len, result_len;
286     Py_ssize_t count, i;
287     PyObject *result;
288 
289     self_len = STRINGLIB_LEN(self);
290 
291     /* 1 at the end plus 1 after every character;
292        count = min(maxcount, self_len + 1) */
293     if (maxcount <= self_len) {
294         count = maxcount;
295     }
296     else {
297         /* Can't overflow: self_len + 1 <= maxcount <= PY_SSIZE_T_MAX. */
298         count = self_len + 1;
299     }
300 
301     /* Check for overflow */
302     /*   result_len = count * to_len + self_len; */
303     assert(count > 0);
304     if (to_len > (PY_SSIZE_T_MAX - self_len) / count) {
305         PyErr_SetString(PyExc_OverflowError,
306                         "replace bytes is too long");
307         return NULL;
308     }
309     result_len = count * to_len + self_len;
310     result = STRINGLIB_NEW(NULL, result_len);
311     if (result == NULL) {
312         return NULL;
313     }
314 
315     self_s = STRINGLIB_STR(self);
316     result_s = STRINGLIB_STR(result);
317 
318     if (to_len > 1) {
319         /* Lay the first one down (guaranteed this will occur) */
320         memcpy(result_s, to_s, to_len);
321         result_s += to_len;
322         count -= 1;
323 
324         for (i = 0; i < count; i++) {
325             *result_s++ = *self_s++;
326             memcpy(result_s, to_s, to_len);
327             result_s += to_len;
328         }
329     }
330     else {
331         result_s[0] = to_s[0];
332         result_s += to_len;
333         count -= 1;
334         for (i = 0; i < count; i++) {
335             *result_s++ = *self_s++;
336             result_s[0] = to_s[0];
337             result_s += to_len;
338         }
339     }
340 
341     /* Copy the rest of the original string */
342     memcpy(result_s, self_s, self_len - i);
343 
344     return result;
345 }
346 
347 /* Special case for deleting a single character */
348 /* len(self)>=1, len(from)==1, to="", maxcount>=1 */
349 static PyObject *
stringlib_replace_delete_single_character(PyObject * self,char from_c,Py_ssize_t maxcount)350 stringlib_replace_delete_single_character(PyObject *self,
351                                           char from_c, Py_ssize_t maxcount)
352 {
353     const char *self_s, *start, *next, *end;
354     char *result_s;
355     Py_ssize_t self_len, result_len;
356     Py_ssize_t count;
357     PyObject *result;
358 
359     self_len = STRINGLIB_LEN(self);
360     self_s = STRINGLIB_STR(self);
361 
362     count = countchar(self_s, self_len, from_c, maxcount);
363     if (count == 0) {
364         return return_self(self);
365     }
366 
367     result_len = self_len - count;  /* from_len == 1 */
368     assert(result_len>=0);
369 
370     result = STRINGLIB_NEW(NULL, result_len);
371     if (result == NULL) {
372         return NULL;
373     }
374     result_s = STRINGLIB_STR(result);
375 
376     start = self_s;
377     end = self_s + self_len;
378     while (count-- > 0) {
379         next = findchar(start, end - start, from_c);
380         if (next == NULL)
381             break;
382         memcpy(result_s, start, next - start);
383         result_s += (next - start);
384         start = next + 1;
385     }
386     memcpy(result_s, start, end - start);
387 
388     return result;
389 }
390 
391 /* len(self)>=1, len(from)>=2, to="", maxcount>=1 */
392 
393 static PyObject *
stringlib_replace_delete_substring(PyObject * self,const char * from_s,Py_ssize_t from_len,Py_ssize_t maxcount)394 stringlib_replace_delete_substring(PyObject *self,
395                                    const char *from_s, Py_ssize_t from_len,
396                                    Py_ssize_t maxcount)
397 {
398     const char *self_s, *start, *next, *end;
399     char *result_s;
400     Py_ssize_t self_len, result_len;
401     Py_ssize_t count, offset;
402     PyObject *result;
403 
404     self_len = STRINGLIB_LEN(self);
405     self_s = STRINGLIB_STR(self);
406 
407     count = stringlib_count(self_s, self_len,
408                             from_s, from_len,
409                             maxcount);
410 
411     if (count == 0) {
412         /* no matches */
413         return return_self(self);
414     }
415 
416     result_len = self_len - (count * from_len);
417     assert (result_len>=0);
418 
419     result = STRINGLIB_NEW(NULL, result_len);
420     if (result == NULL) {
421         return NULL;
422     }
423     result_s = STRINGLIB_STR(result);
424 
425     start = self_s;
426     end = self_s + self_len;
427     while (count-- > 0) {
428         offset = stringlib_find(start, end - start,
429                                 from_s, from_len,
430                                 0);
431         if (offset == -1)
432             break;
433         next = start + offset;
434 
435         memcpy(result_s, start, next - start);
436 
437         result_s += (next - start);
438         start = next + from_len;
439     }
440     memcpy(result_s, start, end - start);
441     return result;
442 }
443 
444 /* len(self)>=1, len(from)==len(to)==1, maxcount>=1 */
445 static PyObject *
stringlib_replace_single_character_in_place(PyObject * self,char from_c,char to_c,Py_ssize_t maxcount)446 stringlib_replace_single_character_in_place(PyObject *self,
447                                             char from_c, char to_c,
448                                             Py_ssize_t maxcount)
449 {
450     const char *self_s, *end;
451     char *result_s, *start, *next;
452     Py_ssize_t self_len;
453     PyObject *result;
454 
455     /* The result string will be the same size */
456     self_s = STRINGLIB_STR(self);
457     self_len = STRINGLIB_LEN(self);
458 
459     next = findchar(self_s, self_len, from_c);
460 
461     if (next == NULL) {
462         /* No matches; return the original bytes */
463         return return_self(self);
464     }
465 
466     /* Need to make a new bytes */
467     result = STRINGLIB_NEW(NULL, self_len);
468     if (result == NULL) {
469         return NULL;
470     }
471     result_s = STRINGLIB_STR(result);
472     memcpy(result_s, self_s, self_len);
473 
474     /* change everything in-place, starting with this one */
475     start =  result_s + (next - self_s);
476     *start = to_c;
477     start++;
478     end = result_s + self_len;
479 
480     while (--maxcount > 0) {
481         next = findchar(start, end - start, from_c);
482         if (next == NULL)
483             break;
484         *next = to_c;
485         start = next + 1;
486     }
487 
488     return result;
489 }
490 
491 /* len(self)>=1, len(from)==len(to)>=2, maxcount>=1 */
492 static PyObject *
stringlib_replace_substring_in_place(PyObject * self,const char * from_s,Py_ssize_t from_len,const char * to_s,Py_ssize_t to_len,Py_ssize_t maxcount)493 stringlib_replace_substring_in_place(PyObject *self,
494                                      const char *from_s, Py_ssize_t from_len,
495                                      const char *to_s, Py_ssize_t to_len,
496                                      Py_ssize_t maxcount)
497 {
498     const char *self_s, *end;
499     char *result_s, *start;
500     Py_ssize_t self_len, offset;
501     PyObject *result;
502 
503     /* The result bytes will be the same size */
504 
505     self_s = STRINGLIB_STR(self);
506     self_len = STRINGLIB_LEN(self);
507 
508     offset = stringlib_find(self_s, self_len,
509                             from_s, from_len,
510                             0);
511     if (offset == -1) {
512         /* No matches; return the original bytes */
513         return return_self(self);
514     }
515 
516     /* Need to make a new bytes */
517     result = STRINGLIB_NEW(NULL, self_len);
518     if (result == NULL) {
519         return NULL;
520     }
521     result_s = STRINGLIB_STR(result);
522     memcpy(result_s, self_s, self_len);
523 
524     /* change everything in-place, starting with this one */
525     start =  result_s + offset;
526     memcpy(start, to_s, from_len);
527     start += from_len;
528     end = result_s + self_len;
529 
530     while ( --maxcount > 0) {
531         offset = stringlib_find(start, end - start,
532                                 from_s, from_len,
533                                 0);
534         if (offset == -1)
535             break;
536         memcpy(start + offset, to_s, from_len);
537         start += offset + from_len;
538     }
539 
540     return result;
541 }
542 
543 /* len(self)>=1, len(from)==1, len(to)>=2, maxcount>=1 */
544 static PyObject *
stringlib_replace_single_character(PyObject * self,char from_c,const char * to_s,Py_ssize_t to_len,Py_ssize_t maxcount)545 stringlib_replace_single_character(PyObject *self,
546                                    char from_c,
547                                    const char *to_s, Py_ssize_t to_len,
548                                    Py_ssize_t maxcount)
549 {
550     const char *self_s, *start, *next, *end;
551     char *result_s;
552     Py_ssize_t self_len, result_len;
553     Py_ssize_t count;
554     PyObject *result;
555 
556     self_s = STRINGLIB_STR(self);
557     self_len = STRINGLIB_LEN(self);
558 
559     count = countchar(self_s, self_len, from_c, maxcount);
560     if (count == 0) {
561         /* no matches, return unchanged */
562         return return_self(self);
563     }
564 
565     /* use the difference between current and new, hence the "-1" */
566     /*   result_len = self_len + count * (to_len-1)  */
567     assert(count > 0);
568     if (to_len - 1 > (PY_SSIZE_T_MAX - self_len) / count) {
569         PyErr_SetString(PyExc_OverflowError, "replace bytes is too long");
570         return NULL;
571     }
572     result_len = self_len + count * (to_len - 1);
573 
574     result = STRINGLIB_NEW(NULL, result_len);
575     if (result == NULL) {
576         return NULL;
577     }
578     result_s = STRINGLIB_STR(result);
579 
580     start = self_s;
581     end = self_s + self_len;
582     while (count-- > 0) {
583         next = findchar(start, end - start, from_c);
584         if (next == NULL)
585             break;
586 
587         if (next == start) {
588             /* replace with the 'to' */
589             memcpy(result_s, to_s, to_len);
590             result_s += to_len;
591             start += 1;
592         } else {
593             /* copy the unchanged old then the 'to' */
594             memcpy(result_s, start, next - start);
595             result_s += (next - start);
596             memcpy(result_s, to_s, to_len);
597             result_s += to_len;
598             start = next + 1;
599         }
600     }
601     /* Copy the remainder of the remaining bytes */
602     memcpy(result_s, start, end - start);
603 
604     return result;
605 }
606 
607 /* len(self)>=1, len(from)>=2, len(to)>=2, maxcount>=1 */
608 static PyObject *
stringlib_replace_substring(PyObject * self,const char * from_s,Py_ssize_t from_len,const char * to_s,Py_ssize_t to_len,Py_ssize_t maxcount)609 stringlib_replace_substring(PyObject *self,
610                             const char *from_s, Py_ssize_t from_len,
611                             const char *to_s, Py_ssize_t to_len,
612                             Py_ssize_t maxcount)
613 {
614     const char *self_s, *start, *next, *end;
615     char *result_s;
616     Py_ssize_t self_len, result_len;
617     Py_ssize_t count, offset;
618     PyObject *result;
619 
620     self_s = STRINGLIB_STR(self);
621     self_len = STRINGLIB_LEN(self);
622 
623     count = stringlib_count(self_s, self_len,
624                             from_s, from_len,
625                             maxcount);
626 
627     if (count == 0) {
628         /* no matches, return unchanged */
629         return return_self(self);
630     }
631 
632     /* Check for overflow */
633     /*    result_len = self_len + count * (to_len-from_len) */
634     assert(count > 0);
635     if (to_len - from_len > (PY_SSIZE_T_MAX - self_len) / count) {
636         PyErr_SetString(PyExc_OverflowError, "replace bytes is too long");
637         return NULL;
638     }
639     result_len = self_len + count * (to_len - from_len);
640 
641     result = STRINGLIB_NEW(NULL, result_len);
642     if (result == NULL) {
643         return NULL;
644     }
645     result_s = STRINGLIB_STR(result);
646 
647     start = self_s;
648     end = self_s + self_len;
649     while (count-- > 0) {
650         offset = stringlib_find(start, end - start,
651                                 from_s, from_len,
652                                 0);
653         if (offset == -1)
654             break;
655         next = start + offset;
656         if (next == start) {
657             /* replace with the 'to' */
658             memcpy(result_s, to_s, to_len);
659             result_s += to_len;
660             start += from_len;
661         } else {
662             /* copy the unchanged old then the 'to' */
663             memcpy(result_s, start, next - start);
664             result_s += (next - start);
665             memcpy(result_s, to_s, to_len);
666             result_s += to_len;
667             start = next + from_len;
668         }
669     }
670     /* Copy the remainder of the remaining bytes */
671     memcpy(result_s, start, end - start);
672 
673     return result;
674 }
675 
676 
677 static PyObject *
stringlib_replace(PyObject * self,const char * from_s,Py_ssize_t from_len,const char * to_s,Py_ssize_t to_len,Py_ssize_t maxcount)678 stringlib_replace(PyObject *self,
679                   const char *from_s, Py_ssize_t from_len,
680                   const char *to_s, Py_ssize_t to_len,
681                   Py_ssize_t maxcount)
682 {
683     if (maxcount < 0) {
684         maxcount = PY_SSIZE_T_MAX;
685     } else if (maxcount == 0 || STRINGLIB_LEN(self) == 0) {
686         /* nothing to do; return the original bytes */
687         return return_self(self);
688     }
689 
690     /* Handle zero-length special cases */
691     if (from_len == 0) {
692         if (to_len == 0) {
693             /* nothing to do; return the original bytes */
694             return return_self(self);
695         }
696         /* insert the 'to' bytes everywhere.    */
697         /*    >>> b"Python".replace(b"", b".")  */
698         /*    b'.P.y.t.h.o.n.'                  */
699         return stringlib_replace_interleave(self, to_s, to_len, maxcount);
700     }
701 
702     /* Except for b"".replace(b"", b"A") == b"A" there is no way beyond this */
703     /* point for an empty self bytes to generate a non-empty bytes */
704     /* Special case so the remaining code always gets a non-empty bytes */
705     if (STRINGLIB_LEN(self) == 0) {
706         return return_self(self);
707     }
708 
709     if (to_len == 0) {
710         /* delete all occurrences of 'from' bytes */
711         if (from_len == 1) {
712             return stringlib_replace_delete_single_character(
713                 self, from_s[0], maxcount);
714         } else {
715             return stringlib_replace_delete_substring(
716                 self, from_s, from_len, maxcount);
717         }
718     }
719 
720     /* Handle special case where both bytes have the same length */
721 
722     if (from_len == to_len) {
723         if (from_len == 1) {
724             return stringlib_replace_single_character_in_place(
725                 self, from_s[0], to_s[0], maxcount);
726         } else {
727             return stringlib_replace_substring_in_place(
728                 self, from_s, from_len, to_s, to_len, maxcount);
729         }
730     }
731 
732     /* Otherwise use the more generic algorithms */
733     if (from_len == 1) {
734         return stringlib_replace_single_character(
735             self, from_s[0], to_s, to_len, maxcount);
736     } else {
737         /* len('from')>=2, len('to')>=1 */
738         return stringlib_replace_substring(
739             self, from_s, from_len, to_s, to_len, maxcount);
740     }
741 }
742 
743 #undef findchar
744