1 /* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; tab-width: 4 -*- */
2 /* vi: set expandtab shiftwidth=4 tabstop=4: */
3 
4 #include "modp_ascii.h"
5 #include <stdio.h>
6 #include <time.h>
7 #include <stdint.h>
8 #include "modp_ascii_data.h"
9 #include <ctype.h>
10 
11 //extern const char gsToUpperMap[256];
12 /**
13  * This is standard clib implementation of uppercasing a string.
14  * It has an unfair advantage since it's inside the test file
15  * so the optimizer can inline it.
16  */
toupper_copy1(char * dest,const char * str,int len)17 void toupper_copy1(char* dest, const char* str, int len)
18 {
19     int i;
20     for (i = 0; i < len; ++i) {
21         // toupper is defined in <ctype.h>
22         *dest++ = toupper(str[i]);
23     }
24     *dest = 0;
25 }
26 
27 
28 /**
29  * Skipping ctype, and doing the compare directly
30  *
31  */
toupper_copy2(char * dest,const char * str,int len)32 void toupper_copy2(char* dest, const char* str, int len)
33 {
34     int i;
35     char c;
36     for (i = 0; i < len; ++i) {
37         c = str[i];
38         *dest++ = (c >= 'a' && c <= 'z') ? c : (c -32);
39     }
40     *dest = 0;
41 }
42 
43 /**
44  * Sequential table lookup
45  */
toupper_copy3(char * dest,const char * str,int len)46 void toupper_copy3(char* dest, const char* str, int len)
47 {
48     int i;
49     unsigned char c;
50     for (i = 0; i < len; ++i) {
51         c = str[i];
52         *dest++ = gsToUpperMap[c];
53     }
54     *dest = 0;
55 }
56 
57 /** \brief toupper Version 4 -- parallel table lookup
58  *
59  *
60  */
toupper_copy4(char * dest,const char * str,int len)61 void toupper_copy4(char* dest, const char* str, int len)
62 {
63     /*
64      *  int i;
65      *  for (i = 0; i < len; ++i) {
66      *      char c = str[i];
67      *      *dest++ = (c >= 'a' && c <= 'z') ? c - 32 : c;
68      *  }
69      */
70 
71     int i;
72     uint8_t c1,c2,c3,c4;
73 
74     const int leftover = len % 4;
75     const int imax = len - leftover;
76     const uint8_t* s = (const uint8_t*) str;
77     for (i = 0; i != imax ; i+=4) {
78         /*
79          *  it's important to make these variables
80          *  it helps the optimizer to figure out what to do
81          */
82         c1 = s[i], c2=s[i+1], c3=s[i+2], c4=s[i+3];
83         dest[0] = gsToUpperMap[c1];
84         dest[1] = gsToUpperMap[c2];
85         dest[2] = gsToUpperMap[c3];
86         dest[3] = gsToUpperMap[c4];
87         dest += 4;
88     }
89 
90     switch (leftover) {
91         case 3: *dest++ = gsToUpperMap[s[i++]];
92         case 2: *dest++ = gsToUpperMap[s[i++]];
93         case 1: *dest++ = gsToUpperMap[s[i]];
94         case 0: *dest = '\0';
95     }
96 }
97 
98 /** \brief toupper Versions 5 -- hsieh alternate
99  * Based code from Paul Hsieh
100  * http://www.azillionmonkeys.com/qed/asmexample.html
101  *
102  * This was his "improved" version, but it appears to either run just
103  * as fast, or a bit slower than his original version
104  */
toupper_copy5(char * dest,const char * str,int len)105 void toupper_copy5(char* dest, const char* str, int len)
106 {
107     int i;
108     uint32_t eax,ebx,ecx,edx;
109     const uint8_t* ustr = (const uint8_t*) str;
110     const int leftover = len % 4;
111     const int imax = len / 4;
112     const uint32_t* s = (const uint32_t*) str;
113     uint32_t* d = (uint32_t*) dest;
114     for (i = 0; i != imax; ++i) {
115         eax = s[i];
116         ebx = 0x80808080ul | eax;
117         ecx = ebx - 0x61616161ul;
118         edx = ~(ebx - 0x7b7b7b7bul);
119         ebx = (ecx & edx) & (~eax & 0x80808080ul);
120         *d++ = eax - (ebx >> 2);
121     }
122 
123     i = imax*4;
124     dest = (char*) d;
125     switch (leftover) {
126         case 3: *dest++ = gsToUpperMap[ustr[i++]];
127         case 2: *dest++ = gsToUpperMap[ustr[i++]];
128         case 1: *dest++ = gsToUpperMap[ustr[i]];
129         case 0: *dest = '\0';
130     }
131 }
132 
133 /** \brief ToUpper Version 6 -- Hsieh original, ASM style
134  * Based code from Paul Hsieh
135  * http://www.azillionmonkeys.com/qed/asmexample.html
136  *
137  * This is almost a direct port of the original ASM code, on some
138  * platforms/compilers it does run faster then the "de-asm'ed" version
139  * used in the modp library.
140  *
141  */
toupper_copy6(char * dest,const char * str,int len)142 void toupper_copy6(char* dest, const char* str, int len)
143 {
144     int i=0;
145     uint32_t eax,ebx,ecx,edx;
146     const uint8_t* ustr = (const uint8_t*) str;
147     const int leftover = len % 4;
148     const int imax = len / 4;
149     const uint32_t* s = (const uint32_t*) str;
150     uint32_t* d = (uint32_t*) dest;
151     for (i = 0; i != imax; ++i) {
152 #if 1
153         /*
154          * as close to original asm code as possible
155          */
156         eax = s[i];
157         ebx = 0x7f7f7f7f;
158         edx = 0x7f7f7f7f;
159         ebx = ebx & eax;
160         ebx = ebx + 0x05050505;
161         ecx = eax;
162         ecx = ~ecx;
163         ebx = ebx & edx;
164         ebx = ebx + 0x1a1a1a1a;
165         ebx = ebx & ecx;
166         ebx = ebx >> 2;
167         ebx = ebx & 0x20202020;
168         eax = eax - ebx;
169         *d++ = eax;
170 #else
171         /*
172          * "de-asm'ed" version, this is what is used in the modp library
173          */
174         eax = s[i];
175         ebx = (0x7f7f7f7ful & eax) + 0x05050505ul;
176         ebx = (0x7f7f7f7ful & ebx) + 0x1a1a1a1aul;
177         ebx = ((ebx & ~eax) >> 2 ) & 0x20202020ul;
178         *d++ = eax - ebx;
179 #endif
180     }
181 
182     i = imax*4;
183     dest = (char*) d;
184     switch (leftover) {
185         case 3: *dest++ = gsToUpperMap[ustr[i++]];
186         case 2: *dest++ = gsToUpperMap[ustr[i++]];
187         case 1: *dest++ = gsToUpperMap[ustr[i]];
188         case 0: *dest = '\0';
189     }
190 }
191 
modp_toupper_copy_a2(char * dest,const char * str,int len)192 void modp_toupper_copy_a2(char* dest, const char* str, int len)
193 {
194     int i=0;
195     uint32_t eax, ebx;
196     const uint8_t* ustr = (const uint8_t*) str;
197     const int leftover = len % 4;
198     const int imax = len / 4;
199     const uint32_t* s = (const uint32_t*) str;
200     uint32_t* d = (uint32_t*) dest;
201     for (i = 0; i != imax; ++i) {
202         eax = s[i];
203 
204         ebx = (0x7f7f7f7ful & eax) + 0x05050505ul;
205         ebx = (0x7f7f7f7ful & ebx) + 0x1a1a1a1aul;
206         ebx = ((ebx & ~eax) >> 2 ) & 0x20202020ul;
207         *d++ = eax - ebx;
208     }
209 
210     i = imax*4;
211     dest = (char*) d;
212     switch (leftover) {
213         case 3: *dest++ = gsToUpperMap[ustr[i++]];
214         case 2: *dest++ = gsToUpperMap[ustr[i++]];
215         case 1: *dest++ = gsToUpperMap[ustr[i]];
216         case 0: *dest = '\0';
217     }
218 }
219 
main()220 int main()
221 {
222     double last = 0.0;
223     size_t i = 0;
224     char buf[256];
225     char obuf[300];
226 
227     for (i = 0; i < 256; ++i) {
228         buf[i] = (char)i;
229     }
230 
231     uint32_t max = 1000000;
232     clock_t t0, t1;
233     printf("%s", "type\tclib\tdirect\tmap\tpara\thsieh1\thsieh2\tFinal\timprovement\n");
234 
235     printf("toupper\t");
236     fflush(stdout);
237 
238     /**
239      ** V1
240      **/
241     t0 = clock();
242     for (i = 0; i < max; ++i) {
243         toupper_copy1(obuf, buf, sizeof(buf));
244     }
245     t1 = clock();
246     last = t1 -t0;
247     printf("%lu\t", (unsigned long)(t1-t0));
248     fflush(stdout);
249 
250     /**
251      ** V2
252      **/
253     t0 = clock();
254     for (i = 0; i < max; ++i) {
255         toupper_copy2(obuf, buf, sizeof(buf));
256     }
257     t1 = clock();
258     printf("%lu\t", (unsigned long)(t1-t0));
259     fflush(stdout);
260 
261     /**
262      ** V3
263      **/
264     t0 = clock();
265     for (i = 0; i < max; ++i) {
266         toupper_copy3(obuf, buf, sizeof(buf));
267     }
268     t1 = clock();
269     printf("%lu\t", (unsigned long)(t1-t0));
270     fflush(stdout);
271 
272     /**
273      ** V4 -- Parallel Table Lookup
274      **/
275     t0 = clock();
276     for (i = 0; i < max; ++i) {
277         toupper_copy4(obuf, buf, sizeof(buf));
278     }
279     t1 = clock();
280     printf("%lu\t", (unsigned long)(t1-t0));
281     fflush(stdout);
282 
283 
284     /**
285      ** V5 -- Hsieh Alternate
286      **/
287     t0 = clock();
288     for (i = 0; i < max; ++i) {
289         toupper_copy5(obuf, buf, sizeof(buf));
290     }
291     t1 = clock();
292     printf("%lu\t", (unsigned long)(t1-t0));
293     fflush(stdout);
294 
295     /**
296      ** HSEIH -- asm style
297      **/
298     t0 = clock();
299     for (i = 0; i < max; ++i) {
300         toupper_copy6(obuf, buf, sizeof(buf));
301     }
302     t1 = clock();
303     printf("%lu\t", (unsigned long)(t1-t0));
304     fflush(stdout);
305 
306     /**
307      ** MODP FINAL
308      **/
309     t0 = clock();
310     for (i = 0; i < max; ++i) {
311         modp_toupper_copy(obuf, buf, sizeof(buf));
312     }
313     t1 = clock();
314 
315     printf("%lu\t", (unsigned long)(t1-t0));
316     fflush(stdout);
317 
318     printf("%.1fx\n", last/(t1-t0));
319     fflush(stdout);
320 
321     return 0;
322 }
323