1
2# Various microbenchmarks comparing unicode and byte string performance
3# Please keep this file both 2.x and 3.x compatible!
4
5import timeit
6import itertools
7import operator
8import re
9import sys
10import datetime
11import optparse
12
13VERSION = '2.0'
14
def p(*args):
    """Write the arguments to stdout, space-separated, ending with a newline.

    Each argument is converted with ``str()``.  A plain function is used
    instead of ``print`` so the file works on both Python 2.x and 3.x.
    """
    line = ' '.join(map(str, args))
    sys.stdout.write(line + '\n')
17
# Select the byte-string and unicode-string constructors for the running
# interpreter.  Each converts a native ``str`` literal to the target type.
# Helpers in this file compare their STR argument against BYTES / UNICODE
# by identity, so these two callables must remain distinct objects.
if sys.version_info >= (3,):
    # Python 3: native str is unicode; bytes need an explicit ASCII encode.
    BYTES = bytes_from_str = lambda x: x.encode('ascii')
    UNICODE = unicode_from_str = lambda x: x
else:
    # Python 2: native str is already bytes; unicode needs an ASCII decode.
    BYTES = bytes_from_str = lambda x: x
    UNICODE = unicode_from_str = lambda x: x.decode('ascii')
24
class UnsupportedType(TypeError):
    """Raised when a benchmark cannot run for the requested string type."""
27
28
# Banner: benchmark version, interpreter version, and the current time.
p('stringbench v%s' % VERSION)
p(sys.version)
p(datetime.datetime.now())
32
# Candidate REPEAT settings.  The last uncommented assignment wins, so the
# effective value is 3; the earlier "1" is immediately overwritten.
# NOTE(review): REPEAT is presumably the number of timing repetitions --
# its use is not visible in this part of the file; confirm.
REPEAT = 1
REPEAT = 3
#REPEAT = 7
36
# This file is a script only: importing it as a module aborts immediately.
if __name__ != "__main__":
    raise SystemExit("Must run as main program")
39
# Command-line options: each is a boolean flag (unset -> None, set -> True).
parser = optparse.OptionParser()
parser.add_option(
    "-R", "--skip-re",
    action="store_true", dest="skip_re",
    help="skip regular expression tests")
parser.add_option(
    "-8", "--8-bit",
    action="store_true", dest="bytes_only",
    help="only do 8-bit string benchmarks")
parser.add_option(
    "-u", "--unicode",
    action="store_true", dest="unicode_only",
    help="only do Unicode string benchmarks")
50
51
# Pre-built lists that the benchmark bodies iterate over to repeat the
# measured statement 1000, 100 or 10 times.
_RANGE_1000 = list(range(1000))
_RANGE_100 = list(range(100))
_RANGE_10 = list(range(10))
55
# Names of every function registered through bench(); used to reject clashes.
dups = {}
def bench(s, group, repeat_count):
    """Return a decorator that tags a function as a benchmark.

    The decorated function gets these attributes and is returned unchanged:
    ``comment`` (s), ``group``, ``repeat_count`` and ``is_bench`` (True).

    Raises AssertionError if a function with the same ``__name__`` has
    already been registered.
    """
    def register(func):
        name = func.__name__
        if name in dups:
            raise AssertionError("Multiple functions with same name: %r" %
                                 (name,))
        dups[name] = 1
        func.is_bench = True
        func.comment = s
        func.group = group
        func.repeat_count = repeat_count
        return func
    return register
69
def uses_re(f):
    """Decorator marking a benchmark as depending on the ``re`` module.

    Sets ``f.uses_re = True`` and returns ``f``.  Returning the function is
    essential: a decorator that returns nothing rebinds the decorated name
    to None, so the benchmark would silently disappear.
    """
    f.uses_re = True
    return f
72
73####### 'in' comparisons
74
@bench('"A" in "A"*1000', "early match, single character", 1000)
def in_test_quick_match_single_character(STR):
    """Time ``in`` with a one-character needle that matches at offset 0."""
    haystack = STR("A" * 1000)
    needle = STR("A")
    for _ in _RANGE_1000:
        needle in haystack
81
@bench('"B" in "A"*1000', "no match, single character", 1000)
def in_test_no_match_single_character(STR):
    """Time ``in`` with a one-character needle that never occurs."""
    haystack = STR("A" * 1000)
    needle = STR("B")
    for _ in _RANGE_1000:
        needle in haystack
88
89
@bench('"AB" in "AB"*1000', "early match, two characters", 1000)
def in_test_quick_match_two_characters(STR):
    """Time ``in`` with a two-character needle that matches at offset 0."""
    haystack = STR("AB" * 1000)
    needle = STR("AB")
    for _ in _RANGE_1000:
        needle in haystack
96
@bench('"BC" in "AB"*1000', "no match, two characters", 1000)
def in_test_no_match_two_character(STR):
    """Time ``in`` with a two-character needle that never occurs."""
    haystack = STR("AB" * 1000)
    needle = STR("BC")
    for _ in _RANGE_1000:
        needle in haystack
103
@bench('"BC" in ("AB"*300+"C")', "late match, two characters", 1000)
def in_test_slow_match_two_characters(STR):
    """Time ``in`` with a two-character needle found only at the very end."""
    haystack = STR("AB" * 300 + "C")
    needle = STR("BC")
    for _ in _RANGE_1000:
        needle in haystack
110
@bench('s="ABC"*33; (s+"E") in ((s+"D")*300+s+"E")',
       "late match, 100 characters", 100)
def in_test_slow_match_100_characters(STR):
    """Time ``in`` with a 100-character needle found only at the very end."""
    base = STR("ABC" * 33)
    tail_d = STR("D")
    tail_e = STR("E")
    # 300 near-misses (base+"D") followed by the single real match.
    haystack = (base + tail_d) * 300 + base + tail_e
    needle = base + tail_e
    for _ in _RANGE_100:
        needle in haystack
121
122# Try with regex
@uses_re
@bench('s="ABC"*33; re.compile(s+"E").search((s+"D")*300+s+"E")',
       "late match, 100 characters", 100)
def re_test_slow_match_100_characters(STR):
    """Time a compiled-regex search for a 100-character late match.

    Mirrors the ``in`` benchmark of the same shape: the text is 300
    near-misses (s+"D") followed by the pattern s+"E".  The label above
    names the pattern that is actually compiled (s+"E").
    """
    m = STR("ABC"*33)
    d = STR("D")
    e = STR("E")
    s1 = (m+d)*300 + m+e
    s2 = m+e
    # Compile once and bind the method so the loop times only the search.
    pat = re.compile(s2)
    search = pat.search
    for x in _RANGE_100:
        search(s1)
136
137
138#### same tests as 'in' but use 'find'
139
@bench('("A"*1000).find("A")', "early match, single character", 1000)
def find_test_quick_match_single_character(STR):
    """Time ``find`` with a one-character needle that matches at offset 0."""
    haystack = STR("A" * 1000)
    needle = STR("A")
    find = haystack.find
    for _ in _RANGE_1000:
        find(needle)
147
@bench('("A"*1000).find("B")', "no match, single character", 1000)
def find_test_no_match_single_character(STR):
    """Time ``find`` with a one-character needle that never occurs."""
    haystack = STR("A" * 1000)
    needle = STR("B")
    find = haystack.find
    for _ in _RANGE_1000:
        find(needle)
155
156
@bench('("AB"*1000).find("AB")', "early match, two characters", 1000)
def find_test_quick_match_two_characters(STR):
    """Time ``find`` with a two-character needle that matches at offset 0."""
    haystack = STR("AB" * 1000)
    needle = STR("AB")
    find = haystack.find
    for _ in _RANGE_1000:
        find(needle)
164
@bench('("AB"*1000).find("BC")', "no match, two characters", 1000)
def find_test_no_match_two_character(STR):
    """Time ``find`` for "BC", which never occurs in the "AB" run."""
    haystack = STR("AB" * 1000)
    needle = STR("BC")
    find = haystack.find
    for _ in _RANGE_1000:
        find(needle)
172
@bench('("AB"*1000).find("CA")', "no match, two characters", 1000)
def find_test_no_match_two_character_bis(STR):
    """Time ``find`` for "CA", which never occurs in the "AB" run."""
    haystack = STR("AB" * 1000)
    needle = STR("CA")
    find = haystack.find
    for _ in _RANGE_1000:
        find(needle)
180
@bench('("AB"*300+"C").find("BC")', "late match, two characters", 1000)
def find_test_slow_match_two_characters(STR):
    """Time ``find`` for "BC", which occurs only at the very end."""
    haystack = STR("AB" * 300 + "C")
    needle = STR("BC")
    find = haystack.find
    for _ in _RANGE_1000:
        find(needle)
188
@bench('("AB"*300+"CA").find("CA")', "late match, two characters", 1000)
def find_test_slow_match_two_characters_bis(STR):
    """Time ``find`` for "CA", which occurs only at the very end."""
    haystack = STR("AB" * 300 + "CA")
    needle = STR("CA")
    find = haystack.find
    for _ in _RANGE_1000:
        find(needle)
196
@bench('s="ABC"*33; ((s+"D")*500+s+"E").find(s+"E")',
       "late match, 100 characters", 100)
def find_test_slow_match_100_characters(STR):
    """Time ``find`` with a 100-character needle that differs in its last char."""
    base = STR("ABC" * 33)
    tail_d = STR("D")
    tail_e = STR("E")
    # 500 near-misses (base+"D") followed by the single real match.
    haystack = (base + tail_d) * 500 + base + tail_e
    needle = base + tail_e
    find = haystack.find
    for _ in _RANGE_100:
        find(needle)
208
@bench('s="ABC"*33; ((s+"D")*500+"E"+s).find("E"+s)',
       "late match, 100 characters", 100)
def find_test_slow_match_100_characters_bis(STR):
    """Time ``find`` with a 100-character needle that differs in its first char."""
    base = STR("ABC" * 33)
    tail_d = STR("D")
    lead_e = STR("E")
    haystack = (base + tail_d) * 500 + lead_e + base
    needle = lead_e + base
    find = haystack.find
    for _ in _RANGE_100:
        find(needle)
220
221
222#### Same tests for 'rfind'
223
@bench('("A"*1000).rfind("A")', "early match, single character", 1000)
def rfind_test_quick_match_single_character(STR):
    """Time ``rfind`` with a one-character needle that matches at the end."""
    haystack = STR("A" * 1000)
    needle = STR("A")
    rfind = haystack.rfind
    for _ in _RANGE_1000:
        rfind(needle)
231
@bench('("A"*1000).rfind("B")', "no match, single character", 1000)
def rfind_test_no_match_single_character(STR):
    """Time ``rfind`` with a one-character needle that never occurs."""
    haystack = STR("A" * 1000)
    needle = STR("B")
    rfind = haystack.rfind
    for _ in _RANGE_1000:
        rfind(needle)
239
240
@bench('("AB"*1000).rfind("AB")', "early match, two characters", 1000)
def rfind_test_quick_match_two_characters(STR):
    """Time ``rfind`` with a two-character needle that matches at the end."""
    haystack = STR("AB" * 1000)
    needle = STR("AB")
    rfind = haystack.rfind
    for _ in _RANGE_1000:
        rfind(needle)
248
@bench('("AB"*1000).rfind("BC")', "no match, two characters", 1000)
def rfind_test_no_match_two_character(STR):
    """Time ``rfind`` for "BC", which never occurs in the "AB" run."""
    haystack = STR("AB" * 1000)
    needle = STR("BC")
    rfind = haystack.rfind
    for _ in _RANGE_1000:
        rfind(needle)
256
@bench('("AB"*1000).rfind("CA")', "no match, two characters", 1000)
def rfind_test_no_match_two_character_bis(STR):
    """Time ``rfind`` for "CA", which never occurs in the "AB" run."""
    haystack = STR("AB" * 1000)
    needle = STR("CA")
    rfind = haystack.rfind
    for _ in _RANGE_1000:
        rfind(needle)
264
@bench('("C"+"AB"*300).rfind("CA")', "late match, two characters", 1000)
def rfind_test_slow_match_two_characters(STR):
    """Time ``rfind`` for "CA", which occurs only at the very start."""
    haystack = STR("C" + "AB" * 300)
    needle = STR("CA")
    rfind = haystack.rfind
    for _ in _RANGE_1000:
        rfind(needle)
272
@bench('("BC"+"AB"*300).rfind("BC")', "late match, two characters", 1000)
def rfind_test_slow_match_two_characters_bis(STR):
    """Time ``rfind`` for "BC", which occurs only at the very start."""
    haystack = STR("BC" + "AB" * 300)
    needle = STR("BC")
    rfind = haystack.rfind
    for _ in _RANGE_1000:
        rfind(needle)
280
@bench('s="ABC"*33; ("E"+s+("D"+s)*500).rfind("E"+s)',
       "late match, 100 characters", 100)
def rfind_test_slow_match_100_characters(STR):
    """Time ``rfind`` with a 100-character needle found only at the start."""
    base = STR("ABC" * 33)
    lead_d = STR("D")
    lead_e = STR("E")
    # The real match first, then 500 near-misses ("D"+base).
    haystack = lead_e + base + (lead_d + base) * 500
    needle = lead_e + base
    rfind = haystack.rfind
    for _ in _RANGE_100:
        rfind(needle)
292
@bench('s="ABC"*33; (s+"E"+("D"+s)*500).rfind(s+"E")',
       "late match, 100 characters", 100)
def rfind_test_slow_match_100_characters_bis(STR):
    """Time ``rfind`` for base+"E", which occurs only at the start."""
    base = STR("ABC" * 33)
    lead_d = STR("D")
    tail_e = STR("E")
    haystack = base + tail_e + (lead_d + base) * 500
    needle = base + tail_e
    rfind = haystack.rfind
    for _ in _RANGE_100:
        rfind(needle)
304
305
306#### Now with index.
307# Skip the ones which fail because that would include exception overhead.
308
@bench('("A"*1000).index("A")', "early match, single character", 1000)
def index_test_quick_match_single_character(STR):
    """Time ``index`` with a one-character needle that matches at offset 0."""
    haystack = STR("A" * 1000)
    needle = STR("A")
    index = haystack.index
    for _ in _RANGE_1000:
        index(needle)
316
@bench('("AB"*1000).index("AB")', "early match, two characters", 1000)
def index_test_quick_match_two_characters(STR):
    """Time ``index`` with a two-character needle that matches at offset 0."""
    haystack = STR("AB" * 1000)
    needle = STR("AB")
    index = haystack.index
    for _ in _RANGE_1000:
        index(needle)
324
@bench('("AB"*300+"C").index("BC")', "late match, two characters", 1000)
def index_test_slow_match_two_characters(STR):
    """Time ``index`` for "BC", which occurs only at the very end."""
    haystack = STR("AB" * 300 + "C")
    needle = STR("BC")
    index = haystack.index
    for _ in _RANGE_1000:
        index(needle)
332
@bench('s="ABC"*33; ((s+"D")*500+s+"E").index(s+"E")',
       "late match, 100 characters", 100)
def index_test_slow_match_100_characters(STR):
    """Time ``index`` with a 100-character needle found only at the end."""
    base = STR("ABC" * 33)
    tail_d = STR("D")
    tail_e = STR("E")
    haystack = (base + tail_d) * 500 + base + tail_e
    needle = base + tail_e
    index = haystack.index
    for _ in _RANGE_100:
        index(needle)
344
345
346#### Same for rindex
347
@bench('("A"*1000).rindex("A")', "early match, single character", 1000)
def rindex_test_quick_match_single_character(STR):
    """Time ``rindex`` with a one-character needle that matches at the end."""
    haystack = STR("A" * 1000)
    needle = STR("A")
    rindex = haystack.rindex
    for _ in _RANGE_1000:
        rindex(needle)
355
@bench('("AB"*1000).rindex("AB")', "early match, two characters", 1000)
def rindex_test_quick_match_two_characters(STR):
    """Time ``rindex`` with a two-character needle that matches at the end."""
    haystack = STR("AB" * 1000)
    needle = STR("AB")
    rindex = haystack.rindex
    for _ in _RANGE_1000:
        rindex(needle)
363
@bench('("C"+"AB"*300).rindex("CA")', "late match, two characters", 1000)
def rindex_test_slow_match_two_characters(STR):
    """Time ``rindex`` for "CA", which occurs only at the very start."""
    haystack = STR("C" + "AB" * 300)
    needle = STR("CA")
    rindex = haystack.rindex
    for _ in _RANGE_1000:
        rindex(needle)
371
@bench('s="ABC"*33; ("E"+s+("D"+s)*500).rindex("E"+s)',
       "late match, 100 characters", 100)
def rindex_test_slow_match_100_characters(STR):
    """Time ``rindex`` with a 100-character needle found only at the start."""
    base = STR("ABC" * 33)
    lead_d = STR("D")
    lead_e = STR("E")
    haystack = lead_e + base + (lead_d + base) * 500
    needle = lead_e + base
    rindex = haystack.rindex
    for _ in _RANGE_100:
        rindex(needle)
383
384
385#### Same for partition
386
@bench('("A"*1000).partition("A")', "early match, single character", 1000)
def partition_test_quick_match_single_character(STR):
    """Time ``partition`` with a one-character separator found at offset 0."""
    text = STR("A" * 1000)
    sep = STR("A")
    partition = text.partition
    for _ in _RANGE_1000:
        partition(sep)
394
@bench('("A"*1000).partition("B")', "no match, single character", 1000)
def partition_test_no_match_single_character(STR):
    """Time ``partition`` with a one-character separator that never occurs."""
    text = STR("A" * 1000)
    sep = STR("B")
    partition = text.partition
    for _ in _RANGE_1000:
        partition(sep)
402
403
@bench('("AB"*1000).partition("AB")', "early match, two characters", 1000)
def partition_test_quick_match_two_characters(STR):
    """Time ``partition`` with a two-character separator found at offset 0."""
    text = STR("AB" * 1000)
    sep = STR("AB")
    partition = text.partition
    for _ in _RANGE_1000:
        partition(sep)
411
@bench('("AB"*1000).partition("BC")', "no match, two characters", 1000)
def partition_test_no_match_two_character(STR):
    """Time ``partition`` with a two-character separator that never occurs."""
    text = STR("AB" * 1000)
    sep = STR("BC")
    partition = text.partition
    for _ in _RANGE_1000:
        partition(sep)
419
@bench('("AB"*300+"C").partition("BC")', "late match, two characters", 1000)
def partition_test_slow_match_two_characters(STR):
    """Time ``partition`` for "BC", which occurs only at the very end."""
    text = STR("AB" * 300 + "C")
    sep = STR("BC")
    partition = text.partition
    for _ in _RANGE_1000:
        partition(sep)
427
@bench('s="ABC"*33; ((s+"D")*500+s+"E").partition(s+"E")',
       "late match, 100 characters", 100)
def partition_test_slow_match_100_characters(STR):
    """Time ``partition`` with a 100-character separator found only at the end."""
    base = STR("ABC" * 33)
    tail_d = STR("D")
    tail_e = STR("E")
    text = (base + tail_d) * 500 + base + tail_e
    sep = base + tail_e
    partition = text.partition
    for _ in _RANGE_100:
        partition(sep)
439
440
441#### Same for rpartition
442
@bench('("A"*1000).rpartition("A")', "early match, single character", 1000)
def rpartition_test_quick_match_single_character(STR):
    """Time ``rpartition`` with a one-character separator found at the end."""
    text = STR("A" * 1000)
    sep = STR("A")
    rpartition = text.rpartition
    for _ in _RANGE_1000:
        rpartition(sep)
450
@bench('("A"*1000).rpartition("B")', "no match, single character", 1000)
def rpartition_test_no_match_single_character(STR):
    """Time ``rpartition`` with a one-character separator that never occurs."""
    text = STR("A" * 1000)
    sep = STR("B")
    rpartition = text.rpartition
    for _ in _RANGE_1000:
        rpartition(sep)
458
459
@bench('("AB"*1000).rpartition("AB")', "early match, two characters", 1000)
def rpartition_test_quick_match_two_characters(STR):
    """Time ``rpartition`` with a two-character separator found at the end."""
    text = STR("AB" * 1000)
    sep = STR("AB")
    rpartition = text.rpartition
    for _ in _RANGE_1000:
        rpartition(sep)
467
@bench('("AB"*1000).rpartition("BC")', "no match, two characters", 1000)
def rpartition_test_no_match_two_character(STR):
    """Time ``rpartition`` with a two-character separator that never occurs."""
    text = STR("AB" * 1000)
    sep = STR("BC")
    rpartition = text.rpartition
    for _ in _RANGE_1000:
        rpartition(sep)
475
@bench('("C"+"AB"*300).rpartition("CA")', "late match, two characters", 1000)
def rpartition_test_slow_match_two_characters(STR):
    """Time ``rpartition`` for "CA", which occurs only at the very start."""
    text = STR("C" + "AB" * 300)
    sep = STR("CA")
    rpartition = text.rpartition
    for _ in _RANGE_1000:
        rpartition(sep)
483
@bench('s="ABC"*33; ("E"+s+("D"+s)*500).rpartition("E"+s)',
       "late match, 100 characters", 100)
def rpartition_test_slow_match_100_characters(STR):
    """Time ``rpartition`` with a 100-character separator found only at the start."""
    base = STR("ABC" * 33)
    lead_d = STR("D")
    lead_e = STR("E")
    text = lead_e + base + (lead_d + base) * 500
    sep = lead_e + base
    rpartition = text.rpartition
    for _ in _RANGE_100:
        rpartition(sep)
495
496
497#### Same for split(s, 1)
498
@bench('("A"*1000).split("A", 1)', "early match, single character", 1000)
def split_test_quick_match_single_character(STR):
    """Time ``split(sep, 1)`` with a one-character separator at offset 0."""
    text = STR("A" * 1000)
    sep = STR("A")
    split = text.split
    for _ in _RANGE_1000:
        split(sep, 1)
506
@bench('("A"*1000).split("B", 1)', "no match, single character", 1000)
def split_test_no_match_single_character(STR):
    """Time ``split(sep, 1)`` with a one-character separator that never occurs."""
    text = STR("A" * 1000)
    sep = STR("B")
    split = text.split
    for _ in _RANGE_1000:
        split(sep, 1)
514
515
@bench('("AB"*1000).split("AB", 1)', "early match, two characters", 1000)
def split_test_quick_match_two_characters(STR):
    """Time ``split(sep, 1)`` with a two-character separator at offset 0."""
    text = STR("AB" * 1000)
    sep = STR("AB")
    split = text.split
    for _ in _RANGE_1000:
        split(sep, 1)
523
@bench('("AB"*1000).split("BC", 1)', "no match, two characters", 1000)
def split_test_no_match_two_character(STR):
    """Time ``split(sep, 1)`` with a two-character separator that never occurs."""
    text = STR("AB" * 1000)
    sep = STR("BC")
    split = text.split
    for _ in _RANGE_1000:
        split(sep, 1)
531
@bench('("AB"*300+"C").split("BC", 1)', "late match, two characters", 1000)
def split_test_slow_match_two_characters(STR):
    """Time ``split(sep, 1)`` for "BC", which occurs only at the very end."""
    text = STR("AB" * 300 + "C")
    sep = STR("BC")
    split = text.split
    for _ in _RANGE_1000:
        split(sep, 1)
539
@bench('s="ABC"*33; ((s+"D")*500+s+"E").split(s+"E", 1)',
       "late match, 100 characters", 100)
def split_test_slow_match_100_characters(STR):
    """Time ``split(sep, 1)`` with a 100-character separator found only at the end."""
    base = STR("ABC" * 33)
    tail_d = STR("D")
    tail_e = STR("E")
    text = (base + tail_d) * 500 + base + tail_e
    sep = base + tail_e
    split = text.split
    for _ in _RANGE_100:
        split(sep, 1)
551
552
553#### Same for rsplit(s, 1)
554
@bench('("A"*1000).rsplit("A", 1)', "early match, single character", 1000)
def rsplit_test_quick_match_single_character(STR):
    """Time ``rsplit(sep, 1)`` with a one-character separator at the end."""
    text = STR("A" * 1000)
    sep = STR("A")
    rsplit = text.rsplit
    for _ in _RANGE_1000:
        rsplit(sep, 1)
562
@bench('("A"*1000).rsplit("B", 1)', "no match, single character", 1000)
def rsplit_test_no_match_single_character(STR):
    """Time ``rsplit(sep, 1)`` with a one-character separator that never occurs."""
    text = STR("A" * 1000)
    sep = STR("B")
    rsplit = text.rsplit
    for _ in _RANGE_1000:
        rsplit(sep, 1)
570
571
@bench('("AB"*1000).rsplit("AB", 1)', "early match, two characters", 1000)
def rsplit_test_quick_match_two_characters(STR):
    """Time ``rsplit(sep, 1)`` with a two-character separator at the end."""
    text = STR("AB" * 1000)
    sep = STR("AB")
    rsplit = text.rsplit
    for _ in _RANGE_1000:
        rsplit(sep, 1)
579
@bench('("AB"*1000).rsplit("BC", 1)', "no match, two characters", 1000)
def rsplit_test_no_match_two_character(STR):
    """Time ``rsplit(sep, 1)`` with a two-character separator that never occurs."""
    text = STR("AB" * 1000)
    sep = STR("BC")
    rsplit = text.rsplit
    for _ in _RANGE_1000:
        rsplit(sep, 1)
587
@bench('("C"+"AB"*300).rsplit("CA", 1)', "late match, two characters", 1000)
def rsplit_test_slow_match_two_characters(STR):
    """Time ``rsplit(sep, 1)`` for "CA", which occurs only at the very start."""
    text = STR("C" + "AB" * 300)
    sep = STR("CA")
    rsplit = text.rsplit
    for _ in _RANGE_1000:
        rsplit(sep, 1)
595
@bench('s="ABC"*33; ("E"+s+("D"+s)*500).rsplit("E"+s, 1)',
       "late match, 100 characters", 100)
def rsplit_test_slow_match_100_characters(STR):
    """Time ``rsplit(sep, 1)`` with a 100-character separator found only at the start."""
    base = STR("ABC" * 33)
    lead_d = STR("D")
    lead_e = STR("E")
    text = lead_e + base + (lead_d + base) * 500
    sep = lead_e + base
    rsplit = text.rsplit
    for _ in _RANGE_100:
        rsplit(sep, 1)
607
608
609#### Benchmark the operator-based methods
610
@bench('"A"*10', "repeat 1 character 10 times", 1000)
def repeat_single_10_times(STR):
    """Time repeating a one-character string 10 times with ``*``."""
    unit = STR("A")
    for _ in _RANGE_1000:
        unit * 10
616
@bench('"A"*1000', "repeat 1 character 1000 times", 1000)
def repeat_single_1000_times(STR):
    """Time repeating a one-character string 1000 times with ``*``."""
    unit = STR("A")
    for _ in _RANGE_1000:
        unit * 1000
622
@bench('"ABCDE"*10', "repeat 5 characters 10 times", 1000)
def repeat_5_10_times(STR):
    """Time repeating a five-character string 10 times with ``*``."""
    unit = STR("ABCDE")
    for _ in _RANGE_1000:
        unit * 10
628
@bench('"ABCDE"*1000', "repeat 5 characters 1000 times", 1000)
def repeat_5_1000_times(STR):
    """Time repeating a five-character string 1000 times with ``*``."""
    unit = STR("ABCDE")
    for _ in _RANGE_1000:
        unit * 1000
634
635# + for concat
636
@bench('"Andrew"+"Dalke"', "concat two strings", 1000)
def concat_two_strings(STR):
    """Time concatenating two short strings with ``+``."""
    left = STR("Andrew")
    right = STR("Dalke")
    for _ in _RANGE_1000:
        left + right
643
@bench('s1+s2+s3+s4+...+s20', "concat 20 strings of words length 4 to 15",
       1000)
def concat_many_strings(STR):
    """Time one chained ``+`` expression over 20 separate string locals."""
    w1 = STR('TIXSGYNREDCVBHJ')
    w2 = STR('PUMTLXBZVDO')
    w3 = STR('FVZNJ')
    w4 = STR('OGDXUW')
    w5 = STR('WEIMRNCOYVGHKB')
    w6 = STR('FCQTNMXPUZH')
    w7 = STR('TICZJYRLBNVUEAK')
    w8 = STR('REYB')
    w9 = STR('PWUOQ')
    w10 = STR('EQHCMKBS')
    w11 = STR('AEVDFOH')
    w12 = STR('IFHVD')
    w13 = STR('JGTCNLXWOHQ')
    w14 = STR('ITSKEPYLROZAWXF')
    w15 = STR('THEK')
    w16 = STR('GHPZFBUYCKMNJIT')
    w17 = STR('JMUZ')
    w18 = STR('WLZQMTB')
    w19 = STR('KPADCBW')
    w20 = STR('TNJHZQAGBU')
    for _ in _RANGE_1000:
        (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 +
         w11 + w12 + w13 + w14 + w15 + w16 + w17 + w18 + w19 + w20)
670
671
672#### Benchmark join
673
def get_bytes_yielding_seq(STR, arg):
    """Return ``STR(arg)`` for use as the iterable argument of ``join``.

    Raises UnsupportedType when STR is BYTES on Python 3.x.
    """
    py3_bytes = sys.version_info >= (3,) and STR is BYTES
    if not py3_bytes:
        return STR(arg)
    raise UnsupportedType
678
@bench('"A".join("")',
       "join empty string, with 1 character sep", 100)
def join_empty_single(STR):
    """Time joining an empty string with a one-character separator."""
    separator = STR("A")
    items = get_bytes_yielding_seq(STR, "")
    join = separator.join
    for _ in _RANGE_100:
        join(items)
687
@bench('"ABCDE".join("")',
       "join empty string, with 5 character sep", 100)
def join_empty_5(STR):
    """Time joining an empty string with a five-character separator."""
    separator = STR("ABCDE")
    items = get_bytes_yielding_seq(STR, "")
    join = separator.join
    for _ in _RANGE_100:
        join(items)
696
@bench('"A".join("ABC..Z")',
       "join string with 26 characters, with 1 character sep", 1000)
def join_alphabet_single(STR):
    """Time joining the characters of a 26-character string, one-character sep."""
    separator = STR("A")
    items = get_bytes_yielding_seq(STR, "ABCDEFGHIJKLMnOPQRSTUVWXYZ")
    join = separator.join
    for _ in _RANGE_1000:
        join(items)
705
@bench('"ABCDE".join("ABC..Z")',
       "join string with 26 characters, with 5 character sep", 1000)
def join_alphabet_5(STR):
    """Time joining the characters of a 26-character string, five-character sep."""
    separator = STR("ABCDE")
    items = get_bytes_yielding_seq(STR, "ABCDEFGHIJKLMnOPQRSTUVWXYZ")
    join = separator.join
    for _ in _RANGE_1000:
        join(items)
714
@bench('"A".join(list("ABC..Z"))',
       "join list of 26 characters, with 1 character sep", 1000)
def join_alphabet_list_single(STR):
    """Time joining a list of 26 one-character strings, one-character sep."""
    separator = STR("A")
    items = [STR(ch) for ch in "ABCDEFGHIJKLMnOPQRSTUVWXYZ"]
    join = separator.join
    for _ in _RANGE_1000:
        join(items)
723
@bench('"ABCDE".join(list("ABC..Z"))',
       "join list of 26 characters, with 5 character sep", 1000)
def join_alphabet_list_five(STR):
    """Time joining a list of 26 one-character strings, five-character sep."""
    separator = STR("ABCDE")
    items = [STR(ch) for ch in "ABCDEFGHIJKLMnOPQRSTUVWXYZ"]
    join = separator.join
    for _ in _RANGE_1000:
        join(items)
732
@bench('"A".join(["Bob"]*100)',
       "join list of 100 words, with 1 character sep", 1000)
def join_100_words_single(STR):
    """Time joining a list of 100 identical words, one-character sep."""
    separator = STR("A")
    items = [STR("Bob")] * 100
    join = separator.join
    for _ in _RANGE_1000:
        join(items)
741
@bench('"ABCDE".join(["Bob"]*100)',
       "join list of 100 words, with 5 character sep", 1000)
def join_100_words_5(STR):
    """Time joining a list of 100 identical words, five-character sep.

    The label passed to ``bench`` is the expression being modelled; it is
    kept a balanced, valid expression like the labels of the sibling
    join benchmarks.
    """
    sep = STR("ABCDE")
    s2 = [STR("Bob")]*100
    # Bind the method once so the loop times only the join call.
    sep_join = sep.join
    for x in _RANGE_1000:
        sep_join(s2)
750
751#### split tests
752
@bench('("Here are some words. "*2).split()', "split whitespace (small)", 1000)
def whitespace_split(STR):
    """Time argument-less ``split()`` on a short sentence."""
    text = STR("Here are some words. " * 2)
    split = text.split
    for _ in _RANGE_1000:
        split()
759
@bench('("Here are some words. "*2).rsplit()', "split whitespace (small)", 1000)
def whitespace_rsplit(STR):
    """Time argument-less ``rsplit()`` on a short sentence."""
    text = STR("Here are some words. " * 2)
    rsplit = text.rsplit
    for _ in _RANGE_1000:
        rsplit()
766
@bench('("Here are some words. "*2).split(None, 1)',
       "split 1 whitespace", 1000)
def whitespace_split_1(STR):
    """Time ``split(None, 1)``: one whitespace split from the left."""
    text = STR("Here are some words. " * 2)
    split = text.split
    # None is held in a local so the loop passes it via a local-variable load.
    no_sep = None
    for _ in _RANGE_1000:
        split(no_sep, 1)
775
@bench('("Here are some words. "*2).rsplit(None, 1)',
       "split 1 whitespace", 1000)
def whitespace_rsplit_1(STR):
    """Time ``rsplit(None, 1)``: one whitespace split from the right."""
    text = STR("Here are some words. " * 2)
    rsplit = text.rsplit
    # None is held in a local so the loop passes it via a local-variable load.
    no_sep = None
    for _ in _RANGE_1000:
        rsplit(no_sep, 1)
784
@bench('("Here are some words. "*2).partition(" ")',
       "split 1 whitespace", 1000)
def whitespace_partition(STR):
    """Time ``partition(" ")`` on a short sentence."""
    space = STR(" ")
    text = STR("Here are some words. " * 2)
    partition = text.partition
    for _ in _RANGE_1000:
        partition(space)
793
@bench('("Here are some words. "*2).rpartition(" ")',
       "split 1 whitespace", 1000)
def whitespace_rpartition(STR):
    """Time ``rpartition(" ")`` on a short sentence."""
    space = STR(" ")
    text = STR("Here are some words. " * 2)
    rpartition = text.rpartition
    for _ in _RANGE_1000:
        rpartition(space)
802
# English prose sample, repeated 25 times, for the "huge" whitespace-split
# benchmarks.  It is converted once here to both string types so the
# benchmarks do not pay for the conversion.
human_text = """\
Python is a dynamic object-oriented programming language that can be
used for many kinds of software development. It offers strong support
for integration with other languages and tools, comes with extensive
standard libraries, and can be learned in a few days. Many Python
programmers report substantial productivity gains and feel the language
encourages the development of higher quality, more maintainable code.

Python runs on Windows, Linux/Unix, Mac OS X, Amiga, Palm
Handhelds, and Nokia mobile phones. Python has also been ported to the
Java and .NET virtual machines.

Python is distributed under an OSI-approved open source license that
makes it free to use, even for commercial products.
"""*25
human_text_bytes = bytes_from_str(human_text)
human_text_unicode = unicode_from_str(human_text)
def _get_human_text(STR):
    """Return the pre-converted human text matching the STR constructor.

    STR must be the BYTES or UNICODE object itself (identity comparison);
    anything else raises AssertionError.
    """
    if STR is BYTES:
        return human_text_bytes
    elif STR is UNICODE:
        return human_text_unicode
    else:
        raise AssertionError
826
@bench('human_text.split()', "split whitespace (huge)", 10)
def whitespace_split_huge(STR):
    """Time argument-less ``split()`` on the large human-text sample."""
    text = _get_human_text(STR)
    split = text.split
    for _ in _RANGE_10:
        split()
833
@bench('human_text.rsplit()', "split whitespace (huge)", 10)
def whitespace_rsplit_huge(STR):
    """Time argument-less ``rsplit()`` on the large human-text sample."""
    text = _get_human_text(STR)
    rsplit = text.rsplit
    for _ in _RANGE_10:
        rsplit()
840
841
842
@bench('"this\\nis\\na\\ntest\\n".split("\\n")', "split newlines", 1000)
def newlines_split(STR):
    """Time ``split("\\n")`` on a short four-line text."""
    text = STR("this\nis\na\ntest\n")
    split = text.split
    newline = STR("\n")
    for _ in _RANGE_1000:
        split(newline)
850
851
@bench('"this\\nis\\na\\ntest\\n".rsplit("\\n")', "split newlines", 1000)
def newlines_rsplit(STR):
    """Time ``rsplit("\\n")`` on a short four-line text."""
    text = STR("this\nis\na\ntest\n")
    rsplit = text.rsplit
    newline = STR("\n")
    for _ in _RANGE_1000:
        rsplit(newline)
859
@bench('"this\\nis\\na\\ntest\\n".splitlines()', "split newlines", 1000)
def newlines_splitlines(STR):
    """Time ``splitlines()`` on a short four-line text."""
    text = STR("this\nis\na\ntest\n")
    splitlines = text.splitlines
    for _ in _RANGE_1000:
        splitlines()
866
867## split text with 2000 newlines
868
869def _make_2000_lines():
870    import random
871    r = random.Random(100)
872    chars = list(map(chr, range(32, 128)))
873    i = 0
874    while i < len(chars):
875        chars[i] = " "
876        i += r.randrange(9)
877    s = "".join(chars)
878    s = s*4
879    words = []
880    for i in range(2000):
881        start = r.randrange(96)
882        n = r.randint(5, 65)
883        words.append(s[start:start+n])
884    return "\n".join(words)+"\n"
885
# Generate the 2000-line text once and keep it in both string types so the
# benchmarks do not pay for generation or conversion.
_text_with_2000_lines = _make_2000_lines()
_text_with_2000_lines_bytes = bytes_from_str(_text_with_2000_lines)
_text_with_2000_lines_unicode = unicode_from_str(_text_with_2000_lines)
def _get_2000_lines(STR):
    """Return the pre-converted 2000-line text matching the STR constructor.

    STR must be the BYTES or UNICODE object itself (identity comparison);
    anything else raises AssertionError.
    """
    if STR is BYTES:
        return _text_with_2000_lines_bytes
    elif STR is UNICODE:
        return _text_with_2000_lines_unicode
    else:
        raise AssertionError
895
896
@bench('"...text...".split("\\n")', "split 2000 newlines", 10)
def newlines_split_2000(STR):
    """Time ``split("\\n")`` on the 2000-line text."""
    text = _get_2000_lines(STR)
    split = text.split
    newline = STR("\n")
    for _ in _RANGE_10:
        split(newline)
904
@bench('"...text...".rsplit("\\n")', "split 2000 newlines", 10)
def newlines_rsplit_2000(STR):
    """Time ``rsplit("\\n")`` on the 2000-line text."""
    text = _get_2000_lines(STR)
    rsplit = text.rsplit
    newline = STR("\n")
    for _ in _RANGE_10:
        rsplit(newline)
912
@bench('"...text...".splitlines()', "split 2000 newlines", 10)
def newlines_splitlines_2000(STR):
    """Time ``splitlines()`` on the 2000-line text."""
    text = _get_2000_lines(STR)
    splitlines = text.splitlines
    for _ in _RANGE_10:
        splitlines()
919
920
921## split text on "--" characters
@bench(
    '"this--is--a--test--of--the--emergency--broadcast--system".split("--")',
    "split on multicharacter separator (small)", 1000)
def split_multichar_sep_small(STR):
    """Time ``split("--")`` on a short text with eight separators."""
    text = STR("this--is--a--test--of--the--emergency--broadcast--system")
    split = text.split
    dashes = STR("--")
    for _ in _RANGE_1000:
        split(dashes)
@bench(
    '"this--is--a--test--of--the--emergency--broadcast--system".rsplit("--")',
    "split on multicharacter separator (small)", 1000)
def rsplit_multichar_sep_small(STR):
    """Time ``rsplit("--")`` on a short text with eight separators."""
    text = STR("this--is--a--test--of--the--emergency--broadcast--system")
    rsplit = text.rsplit
    dashes = STR("--")
    for _ in _RANGE_1000:
        rsplit(dashes)
940
941## split dna text on "ACTAT" characters
@bench('dna.split("ACTAT")',
       "split on multicharacter separator (dna)", 10)
def split_multichar_sep_dna(STR):
    """Time ``split("ACTAT")`` on the text returned by ``_get_dna``."""
    text = _get_dna(STR)
    split = text.split
    motif = STR("ACTAT")
    for _ in _RANGE_10:
        split(motif)
950
@bench('dna.rsplit("ACTAT")',
       "split on multicharacter separator (dna)", 10)
def rsplit_multichar_sep_dna(STR):
    """Time ``rsplit("ACTAT")`` on the text returned by ``_get_dna``."""
    text = _get_dna(STR)
    rsplit = text.rsplit
    motif = STR("ACTAT")
    for _ in _RANGE_10:
        rsplit(motif)
959
960
961
962## split with limits
963
# One tab-separated record with nine fields (hence eight tabs), used by the
# tab-split benchmarks; a split limit of 8 therefore covers every separator.
GFF3_example = "\t".join([
    "I", "Genomic_canonical", "region", "357208", "396183", ".", "+", ".",
    "ID=Sequence:R119;note=Clone R119%3B Genbank AF063007;Name=R119"])
967
@bench('GFF3_example.split("\\t")', "tab split", 1000)
def tab_split_no_limit(STR):
    """Time ``split("\\t")`` with no limit on the GFF3 sample record."""
    tab = STR("\t")
    record = STR(GFF3_example)
    split = record.split
    for _ in _RANGE_1000:
        split(tab)
975
@bench('GFF3_example.split("\\t", 8)', "tab split", 1000)
def tab_split_limit(STR):
    """Time ``split("\\t", 8)`` on the GFF3 sample record."""
    tab = STR("\t")
    record = STR(GFF3_example)
    split = record.split
    for _ in _RANGE_1000:
        split(tab, 8)
983
@bench('GFF3_example.rsplit("\\t")', "tab split", 1000)
def tab_rsplit_no_limit(STR):
    """Time ``rsplit("\\t")`` with no limit on the GFF3 sample record."""
    tab = STR("\t")
    record = STR(GFF3_example)
    rsplit = record.rsplit
    for _ in _RANGE_1000:
        rsplit(tab)
991
@bench('GFF3_example.rsplit("\\t", 8)', "tab split", 1000)
def tab_rsplit_limit(STR):
    """Time ``rsplit("\\t", 8)`` on the GFF3 sample record."""
    tab = STR("\t")
    record = STR(GFF3_example)
    rsplit = record.rsplit
    for _ in _RANGE_1000:
        rsplit(tab, 8)
999
1000#### Count characters
1001
@bench('...text.with.2000.newlines.count("\\n")',
       "count newlines", 10)
def count_newlines(STR):
    """Time ``count("\\n")`` on the 2000-line text."""
    text = _get_2000_lines(STR)
    count = text.count
    newline = STR("\n")
    for _ in _RANGE_10:
        count(newline)
1010
1011# Orchid sequences concatenated, from Biopython
1012_dna = """
1013CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGATCACATAATAATTGATCGGGTT
1014AATCTGGAGGATCTGTTTACTTTGGTCACCCATGAGCATTTGCTGTTGAAGTGACCTAGAATTGCCATCG
1015AGCCTCCTTGGGAGCTTTCTTGTTGGCGAGATCTAAACCCTTGCCCGGCGCAGTTTTGCTCCAAGTCGTT
1016TGACACATAATTGGTGAAGGGGGTGGCATCCTTCCCTGACCCTCCCCCAACTATTTTTTTAACAACTCTC
1017AGCAACGGAGACTCAGTCTTCGGCAAATGCGATAAATGGTGTGAATTGCAGAATCCCGTGCACCATCGAG
1018TCTTTGAACGCAAGTTGCGCCCGAGGCCATCAGGCCAAGGGCACGCCTGCCTGGGCATTGCGAGTCATAT
1019CTCTCCCTTAACGAGGCTGTCCATACATACTGTTCAGCCGGTGCGGATGTGAGTTTGGCCCCTTGTTCTT
1020TGGTACGGGGGGTCTAAGAGCTGCATGGGCTTTTGATGGTCCTAAATACGGCAAGAGGTGGACGAACTAT
1021GCTACAACAAAATTGTTGTGCAGAGGCCCCGGGTTGTCGTATTAGATGGGCCACCGTAATCTGAAGACCC
1022TTTTGAACCCCATTGGAGGCCCATCAACCCATGATCAGTTGATGGCCATTTGGTTGCGACCCCAGGTCAG
1023GTGAGCAACAGCTGTCGTAACAAGGTTTCCGTAGGGTGAACTGCGGAAGGATCATTGTTGAGATCACATA
1024ATAATTGATCGAGTTAATCTGGAGGATCTGTTTACTTGGGTCACCCATGGGCATTTGCTGTTGAAGTGAC
1025CTAGATTTGCCATCGAGCCTCCTTGGGAGCATCCTTGTTGGCGATATCTAAACCCTCAATTTTTCCCCCA
1026ATCAAATTACACAAAATTGGTGGAGGGGGTGGCATTCTTCCCTTACCCTCCCCCAAATATTTTTTTAACA
1027ACTCTCAGCAACGGATATCTCAGCTCTTGCATCGATGAAGAACCCACCGAAATGCGATAAATGGTGTGAA
1028TTGCAGAATCCCGTGAACCATCGAGTCTTTGAACGCAAGTTGCGCCCGAGGCCATCAGGCCAAGGGCACG
1029CCTGCCTGGGCATTGCGAGTCATATCTCTCCCTTAACGAGGCTGTCCATACATACTGTTCAGCCGGTGCG
1030GATGTGAGTTTGGCCCCTTGTTCTTTGGTACGGGGGGTCTAAGAGATGCATGGGCTTTTGATGGTCCTAA
1031ATACGGCAAGAGGTGGACGAACTATGCTACAACAAAATTGTTGTGCAAAGGCCCCGGGTTGTCGTATAAG
1032ATGGGCCACCGATATCTGAAGACCCTTTTGGACCCCATTGGAGCCCATCAACCCATGTCAGTTGATGGCC
1033ATTCGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGATCACATAATAATTGATCGA
1034GTTAATCTGGAGGATCTGTTTACTTGGGTCACCCATGGGCATTTGCTGTTGAAGTGACCTAGATTTGCCA
1035TCGAGCCTCCTTGGGAGCTTTCTTGTTGGCGATATCTAAACCCTTGCCCGGCAGAGTTTTGGGAATCCCG
1036TGAACCATCGAGTCTTTGAACGCAAGTTGCGCCCGAGGCCATCAGGCCAAGGGCACGCCTGCCTGGGCAT
1037TGCGAGTCATATCTCTCCCTTAACGAGGCTGTCCATACACACCTGTTCAGCCGGTGCGGATGTGAGTTTG
1038GCCCCTTGTTCTTTGGTACGGGGGGTCTAAGAGCTGCATGGGCTTTTGATGGTCCTAAATACGGCAAGAG
1039GTGGACGAACTATGCTACAACAAAATTGTTGTGCAAAGGCCCCGGGTTGTCGTATTAGATGGGCCACCAT
1040AATCTGAAGACCCTTTTGAACCCCATTGGAGGCCCATCAACCCATGATCAGTTGATGGCCATTTGGTTGC
1041GACCCAGTCAGGTGAGGGTAGGTGAACCTGCGGAAGGATCATTGTTGAGATCACATAATAATTGATCGAG
1042TTAATCTGGAGGATCTGTTTACTTTGGTCACCCATGGGCATTTGCTGTTGAAGTGACCTAGATTTGCCAT
1043CGAGCCTCCTTGGGAGCTTTCTTGTTGGCGAGATCTAAACCCTTGCCCGGCGGAGTTTGGCGCCAAGTCA
1044TATGACACATAATTGGTGAAGGGGGTGGCATCCTGCCCTGACCCTCCCCAAATTATTTTTTTAACAACTC
1045TCAGCAACGGATATCTCGGCTCTTGCATCGATGAAGAACGCAGCGAAATGCGATAAATGGTGTGAATTGC
1046AGAATCCCGTGAACCATCGAGTCTTTGGAACGCAAGTTGCGCCCGAGGCCATCAGGCCAAGGGCACGCCT
1047GCCTGGGCATTGGGAATCATATCTCTCCCCTAACGAGGCTATCCAAACATACTGTTCATCCGGTGCGGAT
1048GTGAGTTTGGCCCCTTGTTCTTTGGTACCGGGGGTCTAAGAGCTGCATGGGCATTTGATGGTCCTCAAAA
1049CGGCAAGAGGTGGACGAACTATGCCACAACAAAATTGTTGTCCCAAGGCCCCGGGTTGTCGTATTAGATG
1050GGCCACCGTAACCTGAAGACCCTTTTGAACCCCATTGGAGGCCCATCAACCCATGATCAGTTGATGACCA
1051TTTGTTGCGACCCCAGTCAGCTGAGCAACCCGCTGAGTGGAAGGTCATTGCCGATATCACATAATAATTG
1052ATCGAGTTAATCTGGAGGATCTGTTTACTTGGTCACCCATGAGCATTTGCTGTTGAAGTGACCTAGATTT
1053GCCATCGAGCCTCCTTGGGAGTTTTCTTGTTGGCGAGATCTAAACCCTTGCCCGGCGGAGTTGTGCGCCA
1054AGTCATATGACACATAATTGGTGAAGGGGGTGGCATCCTGCCCTGACCCTCCCCAAATTATTTTTTTAAC
1055AACTCTCAGCAACGGATATCTCGGCTCTTGCATCGATGAAGAACGCAGCGAAATGCGATAAATGGTGTGA
1056ATTGCAGAATCCCGTGAACCATCGAGTCTTTGAACGCAAGTTGCGCCCGAGGCCATCAGGCCAAGGGCAC
1057GCCTGCCTGGGCATTGCGAGTCATATCTCTCCCTTAACGAGGCTGTCCATACATACTGTTCATCCGGTGC
1058GGATGTGAGTTTGGCCCCTTGTTCTTTGGTACGGGGGGTCTAAGAGCTGCATGGGCATTTGATGGTCCTC
1059AAAACGGCAAGAGGTGGACGAACTATGCTACAACCAAATTGTTGTCCCAAGGCCCCGGGTTGTCGTATTA
1060GATGGGCCACCGTAACCTGAAGACCCTTTTGAACCCCATTGGAGGCCCATCAACCCATGATCAGTTGATG
1061ACCATGTGTTGCGACCCCAGTCAGCTGAGCAACGCGCTGAGCGTAACAAGGTTTCCGTAGGTGGACCTCC
1062GGGAGGATCATTGTTGAGATCACATAATAATTGATCGAGGTAATCTGGAGGATCTGCATATTTTGGTCAC
1063"""
# Join the lines of the literal above into one string and repeat it 25 times
# so the count/replace benchmarks get a longer target.
_dna = "".join(_dna.splitlines()) * 25
# Pre-built copy of the sequence for each string type under test.
_dna_unicode = unicode_from_str(_dna)
_dna_bytes = bytes_from_str(_dna)
1068
def _get_dna(STR):
    """Return the prebuilt DNA sequence matching the string type STR.

    STR is compared by identity against the BYTES and UNICODE converters;
    any other value raises AssertionError.
    """
    if STR is BYTES:
        return _dna_bytes
    if STR is UNICODE:
        return _dna_unicode
    raise AssertionError
1075
@bench('dna.count("AACT")', "count AACT substrings in DNA example", 10)
def count_aact(STR):
    """Time count() of the substring "AACT" in the DNA sequence (10 calls)."""
    pattern = STR("AACT")
    dna = _get_dna(STR)
    count = dna.count
    for _ in _RANGE_10:
        count(pattern)
1083
1084##### startswith and endswith
1085
@bench('"Andrew".startswith("A")', 'startswith single character', 1000)
def startswith_single(STR):
    """Time startswith() with a matching one-character prefix (1000 calls)."""
    prefix = STR("A")
    word = STR("Andrew")
    startswith = word.startswith
    for _ in _RANGE_1000:
        startswith(prefix)
1093
@bench('"Andrew".startswith("Andrew")', 'startswith multiple characters',
       1000)
def startswith_multiple(STR):
    """Time startswith() where the prefix equals the whole string."""
    prefix = STR("Andrew")
    word = STR("Andrew")
    startswith = word.startswith
    for _ in _RANGE_1000:
        startswith(prefix)
1102
@bench('"Andrew".startswith("Anders")',
       'startswith multiple characters - not!', 1000)
def startswith_multiple_not(STR):
    """Time startswith() with a same-length prefix that does not match."""
    prefix = STR("Anders")
    word = STR("Andrew")
    startswith = word.startswith
    for _ in _RANGE_1000:
        startswith(prefix)
1111
1112
1113# endswith
1114
@bench('"Andrew".endswith("w")', 'endswith single character', 1000)
def endswith_single(STR):
    """Time endswith() with a matching one-character suffix (1000 calls)."""
    suffix = STR("w")
    word = STR("Andrew")
    endswith = word.endswith
    for _ in _RANGE_1000:
        endswith(suffix)
1122
@bench('"Andrew".endswith("Andrew")', 'endswith multiple characters', 1000)
def endswith_multiple(STR):
    """Time endswith() where the suffix equals the whole string."""
    suffix = STR("Andrew")
    word = STR("Andrew")
    endswith = word.endswith
    for _ in _RANGE_1000:
        endswith(suffix)
1130
@bench('"Andrew".endswith("Anders")',
       'endswith multiple characters - not!', 1000)
def endswith_multiple_not(STR):
    """Time endswith() with a same-length suffix that does not match."""
    suffix = STR("Anders")
    word = STR("Andrew")
    endswith = word.endswith
    for _ in _RANGE_1000:
        endswith(suffix)
1139
1140#### Strip
1141
@bench('"Hello!\\n".strip()', 'strip terminal newline', 1000)
def terminal_newline_strip_right(STR):
    """Time strip() on a string whose only whitespace is a trailing newline."""
    text = STR("Hello!\n")
    strip = text.strip
    for _ in _RANGE_1000:
        strip()
1148
@bench('"Hello!\\n".rstrip()', 'strip terminal newline', 1000)
def terminal_newline_rstrip(STR):
    """Time rstrip() on a string ending with a single newline."""
    text = STR("Hello!\n")
    rstrip = text.rstrip
    for _ in _RANGE_1000:
        rstrip()
1155
@bench('"\\nHello!".strip()', 'strip terminal newline', 1000)
def terminal_newline_strip_left(STR):
    """Time strip() on a string whose only whitespace is a leading newline."""
    text = STR("\nHello!")
    strip = text.strip
    for _ in _RANGE_1000:
        strip()
1162
@bench('"\\nHello!\\n".strip()', 'strip terminal newline', 1000)
def terminal_newline_strip_both(STR):
    """Time strip() on a string with a newline at both ends."""
    text = STR("\nHello!\n")
    strip = text.strip
    for _ in _RANGE_1000:
        strip()
1169
# The label now names lstrip(), which is the method actually timed below
# (it previously said rstrip()).
@bench('"\\nHello!".lstrip()', 'strip terminal newline', 1000)
def terminal_newline_lstrip(STR):
    """Time lstrip() on a string starting with a single newline."""
    s = STR("\nHello!")
    # Bind the method once so the loop times only the call itself.
    s_lstrip = s.lstrip
    for x in _RANGE_1000:
        s_lstrip()
1176
@bench('s="Hello!\\n"; s[:-1] if s[-1:]=="\\n" else s',
       'strip terminal newline', 1000)
def terminal_newline_if_else(STR):
    """Time removing a trailing newline with a conditional slice.

    The last character is taken with a slice (s[-1:]) rather than an index:
    indexing a Python 3 bytes object yields an int, which never compares
    equal to the one-byte NL, so the bytes run would skip the s[:-1] branch
    that the unicode run takes.
    """
    s = STR("Hello!\n")
    NL = STR("\n")
    for x in _RANGE_1000:
        s[:-1] if (s[-1:] == NL) else s
1184
1185
1186# Strip multiple spaces or tabs
1187
@bench('"Hello\\t   \\t".strip()', 'strip terminal spaces and tabs', 1000)
def terminal_space_strip(STR):
    """Time strip() on a string that ends with a run of tabs and spaces."""
    # The whitespace run must be at the very end of the string; with the "!"
    # after it (as before) strip() had nothing to remove.
    s = STR("Hello!\t   \t")
    # Bind the method once so the loop times only the call itself.
    s_strip = s.strip
    for x in _RANGE_1000:
        s_strip()
1194
@bench('"Hello\\t   \\t".rstrip()', 'strip terminal spaces and tabs', 1000)
def terminal_space_rstrip(STR):
    """Time rstrip() on a string that ends with a run of tabs and spaces."""
    text = STR("Hello!\t   \t")
    rstrip = text.rstrip
    for _ in _RANGE_1000:
        rstrip()
1201
# The label now names lstrip(), which is the method actually timed below
# (it previously said rstrip()).
@bench('"\\t   \\tHello".lstrip()', 'strip terminal spaces and tabs', 1000)
def terminal_space_lstrip(STR):
    """Time lstrip() on a string that starts with a run of tabs and spaces."""
    s = STR("\t   \tHello!")
    # Bind the method once so the loop times only the call itself.
    s_lstrip = s.lstrip
    for x in _RANGE_1000:
        s_lstrip()
1208
1209
1210#### replace
@bench('"This is a test".replace(" ", "\\t")', 'replace single character',
       1000)
def replace_single_character(STR):
    """Time replace() of every space with a tab in a short string."""
    old = STR(" ")
    new = STR("\t")
    text = STR("This is a test!")
    replace = text.replace
    for _ in _RANGE_1000:
        replace(old, new)
1220
@bench('re.sub(" ", "\\t", "This is a test")', 'replace single character',
       1000)
def replace_single_character_re(STR):
    """Time a precompiled regex sub() of every space with a tab."""
    s = STR("This is a test!")
    pat = re.compile(STR(" "))
    to_str = STR("\t")
    # Bind the method once so the loop times only the call itself.
    pat_sub = pat.sub
    for x in _RANGE_1000:
        pat_sub(to_str, s)
# uses_re() only tags the function and returns None, so it is called here
# rather than applied as a decorator: decorating would rebind this name to
# None and the benchmark would never be collected.
uses_re(replace_single_character_re)
1231
@bench('"...text.with.2000.lines...replace("\\n", " ")',
       'replace single character, big string', 10)
def replace_single_character_big(STR):
    """Time replace() of newlines with spaces in the _get_2000_lines text."""
    old = STR("\n")
    new = STR(" ")
    text = _get_2000_lines(STR)
    replace = text.replace
    for _ in _RANGE_10:
        replace(old, new)
1241
@bench('re.sub("\\n", " ", "...text.with.2000.lines...")',
       'replace single character, big string', 10)
def replace_single_character_big_re(STR):
    """Time a precompiled regex sub() of newlines in the 2000-line text."""
    s = _get_2000_lines(STR)
    pat = re.compile(STR("\n"))
    to_str = STR(" ")
    # Bind the method once so the loop times only the call itself.
    pat_sub = pat.sub
    for x in _RANGE_10:
        pat_sub(to_str, s)
# uses_re() only tags the function and returns None, so it is called here
# rather than applied as a decorator: decorating would rebind this name to
# None and the benchmark would never be collected.
uses_re(replace_single_character_big_re)
1252
1253
@bench('dna.replace("ATC", "ATT")',
       'replace multiple characters, dna', 10)
def replace_multiple_characters_dna(STR):
    """Time a same-length three-character replace() over the DNA sequence."""
    old = STR("ATC")
    new = STR("ATT")
    dna = _get_dna(STR)
    replace = dna.replace
    for _ in _RANGE_10:
        replace(old, new)
1263
1264# This increases the character count
@bench('"...text.with.2000.newlines...replace("\\n", "\\r\\n")',
       'replace and expand multiple characters, big string', 10)
def replace_multiple_character_big(STR):
    """Time replace() of each "\\n" with the longer "\\r\\n"."""
    old = STR("\n")
    new = STR("\r\n")
    text = _get_2000_lines(STR)
    replace = text.replace
    for _ in _RANGE_10:
        replace(old, new)
1274
1275
1276# This decreases the character count
@bench('"When shall we three meet again?".replace("ee", "")',
       'replace/remove multiple characters', 1000)
def replace_multiple_character_remove(STR):
    """Time replace() that deletes every "ee" from a short sentence."""
    old = STR("ee")
    new = STR("")
    sentence = STR("When shall we three meet again?")
    replace = sentence.replace
    for _ in _RANGE_1000:
        replace(old, new)
1286
1287
# One "A" followed by 128 * 1024 "Z" characters, pre-built for each string
# type under test.
big_s = "A" + "Z" * (128 * 1024)
big_s_unicode = unicode_from_str(big_s)
big_s_bytes = bytes_from_str(big_s)
def _get_big_s(STR):
    """Return the prebuilt big string matching the string type STR.

    STR is compared by identity against the BYTES and UNICODE converters;
    any other value raises AssertionError.
    """
    if STR is BYTES:
        return big_s_bytes
    if STR is UNICODE:
        return big_s_unicode
    raise AssertionError
1295
1296# The older replace implementation counted all matches in
1297# the string even when it only needed to make one replacement.
@bench('("A" + ("Z"*128*1024)).replace("A", "BB", 1)',
       'quick replace single character match', 10)
def quick_replace_single_match(STR):
    """Time a count=1 replace() whose only match is the first character."""
    old = STR("A")
    new = STR("BB")
    haystack = _get_big_s(STR)
    replace = haystack.replace
    for _ in _RANGE_10:
        replace(old, new, 1)
1307
@bench('("A" + ("Z"*128*1024)).replace("AZZ", "BBZZ", 1)',
       'quick replace multiple character match', 10)
def quick_replace_multiple_match(STR):
    """Time a count=1 replace() of a three-character match at the start."""
    old = STR("AZZ")
    new = STR("BBZZ")
    haystack = _get_big_s(STR)
    replace = haystack.replace
    for _ in _RANGE_10:
        replace(old, new, 1)
1317
1318
1319####
1320
1321# CCP does a lot of this, for internationalisation of ingame messages.
# Template and substitution mapping for the %-formatting benchmark, plus a
# converted copy of each for both string types.
_format = "The %(thing)s is %(place)s the %(location)s."
_format_dict = {
    "thing": "THING",
    "place": "PLACE",
    "location": "LOCATION",
}
_format_bytes = bytes_from_str(_format)
_format_unicode = unicode_from_str(_format)
_format_dict_bytes = dict(
    (bytes_from_str(key), bytes_from_str(value))
    for key, value in _format_dict.items())
_format_dict_unicode = dict(
    (unicode_from_str(key), unicode_from_str(value))
    for key, value in _format_dict.items())
1328
def _get_format(STR):
    """Return the format template matching the string type STR.

    Raises UnsupportedType for BYTES when running on Python 3 or later, and
    AssertionError when STR is neither BYTES nor UNICODE.
    """
    if STR is BYTES:
        if sys.version_info >= (3,):
            raise UnsupportedType
        return _format_bytes
    if STR is UNICODE:
        return _format_unicode
    raise AssertionError
1337
def _get_format_dict(STR):
    """Return the substitution mapping matching the string type STR.

    Raises UnsupportedType for BYTES when running on Python 3 or later, and
    AssertionError when STR is neither BYTES nor UNICODE.
    """
    if STR is BYTES:
        if sys.version_info >= (3,):
            raise UnsupportedType
        return _format_dict_bytes
    if STR is UNICODE:
        return _format_dict_unicode
    raise AssertionError
1346
1347# Formatting.
@bench('"The %(k1)s is %(k2)s the %(k3)s."%{"k1":"x","k2":"y","k3":"z",}',
       'formatting a string type with a dict', 1000)
def format_with_dict(STR):
    """Time %-formatting of a three-key template with a dict (1000 times).

    UnsupportedType from the fixture getters propagates to the caller.
    """
    template = _get_format(STR)
    mapping = _get_format_dict(STR)
    for _ in _RANGE_1000:
        template % mapping
1355
1356
1357#### Upper- and lower- case conversion
1358
@bench('("Where in the world is Carmen San Deigo?"*10).lower()',
       "case conversion -- rare", 1000)
def lower_conversion_rare(STR):
    """Time lower() on text that is already mostly lowercase."""
    text = STR("Where in the world is Carmen San Deigo?"*10)
    lower = text.lower
    for _ in _RANGE_1000:
        lower()
1366
@bench('("WHERE IN THE WORLD IS CARMEN SAN DEIGO?"*10).lower()',
       "case conversion -- dense", 1000)
def lower_conversion_dense(STR):
    """Time lower() on text whose letters are all uppercase."""
    text = STR("WHERE IN THE WORLD IS CARMEN SAN DEIGO?"*10)
    lower = text.lower
    for _ in _RANGE_1000:
        lower()
1374
1375
@bench('("wHERE IN THE WORLD IS cARMEN sAN dEIGO?"*10).upper()',
       "case conversion -- rare", 1000)
def upper_conversion_rare(STR):
    """Time upper() on text that is already mostly uppercase."""
    # Use the mostly-uppercase string shown in the label: only a few letters
    # need converting. The mostly-lowercase string used before made this a
    # second "dense" case.
    s = STR("wHERE IN THE WORLD IS cARMEN sAN dEIGO?"*10)
    # Bind the method once so the loop times only the call itself.
    s_upper = s.upper
    for x in _RANGE_1000:
        s_upper()
1383
@bench('("where in the world is carmen san deigo?"*10).upper()',
       "case conversion -- dense", 1000)
def upper_conversion_dense(STR):
    """Time upper() on text whose letters are all lowercase."""
    text = STR("where in the world is carmen san deigo?"*10)
    upper = text.upper
    for _ in _RANGE_1000:
        upper()
1391
1392
1393# end of benchmarks
1394
1395#################
1396
class BenchTimer(timeit.Timer):
    """timeit.Timer that picks its own loop count and reports the best run."""

    def best(self, repeat=1):
        """Return the smallest per-iteration time over `repeat` runs.

        The loop count starts at 10 and is multiplied by 10 until one run
        takes more than 0.02 seconds, stopping at 10**9 regardless.  That
        calibration run counts as the first sample; `repeat - 1` further
        runs use the same loop count.
        """
        exponent = 1
        while True:
            number = 10 ** exponent
            first = self.timeit(number)
            if first > 0.02 or exponent == 9:
                break
            exponent += 1
        samples = [first]
        for _ in range(repeat - 1):
            samples.append(self.timeit(number))
        return min(samples) / number
1408
def main():
    """Run the selected benchmarks and print a timing table.

    Positional command-line arguments select benchmarks: one is kept when
    any argument is a substring of its group name.  Each benchmark is timed
    for bytes and for unicode unless an option limits the run to one type.
    Output is one row per benchmark, grouped by group name, then a TOTAL
    row.
    """
    opts, selectors = parser.parse_args()
    if opts.bytes_only and opts.unicode_only:
        raise SystemExit("Only one of --8-bit and --unicode are allowed")

    # Collect every module-level object tagged by @bench.
    selected = []
    for name, obj in globals().items():
        if not hasattr(obj, "is_bench"):
            continue
        if selectors and not any(sel in obj.group for sel in selectors):
            # Not selected, ignore
            continue
        if opts.skip_re and hasattr(obj, "uses_re"):
            continue
        selected.append((obj.group, name, obj))
    # groupby() below needs the entries ordered by group.
    selected.sort()

    p("bytes\tunicode")
    p("(in ms)\t(in ms)\t%\tcomment")

    bytes_total = 0.0
    uni_total = 0.0
    group_of = operator.itemgetter(0)

    for title, members in itertools.groupby(selected, group_of):
        # Flush buffer before each group
        sys.stdout.flush()
        p("=" * 10, title)
        for _group, name, func in members:
            if not hasattr(func, "is_bench"):
                continue

            bytes_time, bytes_cell = 0.0, " - "
            if not opts.unicode_only:
                try:
                    bytes_time = BenchTimer(
                        "__main__.%s(__main__.BYTES)" % (name,),
                        "import __main__").best(REPEAT)
                except UnsupportedType:
                    bytes_cell = "N/A"
                else:
                    bytes_cell = "%.2f" % (1000 * bytes_time)
                    bytes_total += bytes_time

            uni_time, uni_cell = 0.0, " - "
            if not opts.bytes_only:
                try:
                    uni_time = BenchTimer(
                        "__main__.%s(__main__.UNICODE)" % (name,),
                        "import __main__").best(REPEAT)
                except UnsupportedType:
                    uni_cell = "N/A"
                else:
                    uni_cell = "%.2f" % (1000 * uni_time)
                    uni_total += uni_time

            # bytes/unicode ratio; 0.0 when the unicode time is missing.
            try:
                relative = bytes_time / uni_time
            except (TypeError, ZeroDivisionError):
                relative = 0.0
            p("%s\t%s\t%.1f\t%s (*%d)" % (
                bytes_cell, uni_cell, 100. * relative,
                func.comment, func.repeat_count))

    if bytes_total == uni_total == 0.0:
        p("That was zippy!")
        return

    try:
        ratio = bytes_total / uni_total
    except ZeroDivisionError:
        ratio = 0.0
    p("%.2f\t%.2f\t%.1f\t%s" % (
        1000 * bytes_total, 1000 * uni_total, 100. * ratio,
        "TOTAL"))
1480
# Run the benchmark suite when this file is executed as a script.
if __name__ == "__main__":
    main()
1483