1 #include <string.h>
2 #include <vips/vips.h>
3
4 #ifdef __SSE2__
5 #include "xmmintrin.h"
6 #endif //__SSE2__
7
8 #define BENCHMARK
9 #include "../rt/rtengine/StopWatch.h"
10
11 float mrgb[3][3];
12
malloc_aligned(size_t size)13 void* malloc_aligned(size_t size)
14 {
15 void *mem = malloc(size+15+sizeof(void*));
16 if (!mem) return mem;
17 void *ptr = (void*)( ((uintptr_t) mem + 15 + sizeof(void*)) / 16 * 16 );
18 uintptr_t* tptr = (uintptr_t*)ptr;
19 tptr -= 1;
20 *tptr = (uintptr_t)mem;
21 //((void**)ptr)[-1] = mem;
22 return ptr;
23 }
24
free_aligned(void * ptr)25 void free_aligned(void* ptr)
26 {
27 if (ptr) free(((void**)ptr)[-1]);
28 }
29
30
process(float * buf,float * obuf,size_t size,bool in_place,bool interleaved)31 void process( float* buf, float* obuf, size_t size, bool in_place, bool interleaved )
32 {
33 size_t chunk_size = 64;
34 size_t nchunks = size / chunk_size;
35
36 long minTime = 0;
37 long maxTime = 0;
38
39 for(int i = 0; i < 20; i++) {
40 //BENCHFUN;
41 MyTime startTime;
42 MyTime stopTime;
43 startTime.set();
44 float* ptr = in_place ? obuf : buf;
45 //float r, g, b;
46 float* optr = obuf;
47
48 if(in_place) memcpy(obuf, buf, sizeof(float)*4*size);
49
50 if(interleaved) {
51
52 // RGBARGBARGBARGBA... channel arrangement
53
54 for( int ci = 0; ci < nchunks; ci++ ) {
55 //printf("processing chunk %d\n", ci);
56 for( int pi = 0; pi < chunk_size; pi++ ) {
57 if(true) {
58 float r = ptr[0]; float g = ptr[1]; float b = ptr[2];
59 optr[0] = mrgb[0][0] * r + mrgb[0][1] * g + mrgb[0][2] * b;
60 optr[1] = mrgb[1][0] * r + mrgb[1][1] * g + mrgb[1][2] * b;
61 optr[2] = mrgb[2][0] * r + mrgb[2][1] * g + mrgb[2][2] * b;
62 } else {
63 optr[0] = mrgb[0][0] * ptr[0] + mrgb[0][1] * ptr[1] + mrgb[0][2] * ptr[2];
64 optr[1] = mrgb[1][0] * ptr[0] + mrgb[1][1] * ptr[1] + mrgb[1][2] * ptr[2];
65 optr[2] = mrgb[2][0] * ptr[0] + mrgb[2][1] * ptr[1] + mrgb[2][2] * ptr[2];
66 }
67 ptr += 4; optr += 4;
68 }
69 }
70 } else {
71
72 // RRRR...GGGG...BBBB... channel arrangement
73
74 float* rp = ptr;
75 float* gp = &(ptr[size]);
76 float* bp = &(ptr[size*2]);
77 float* orp = optr;
78 float* ogp = &(optr[size]);
79 float* obp = &(optr[size*2]);
80 for( int ci = 0; ci < nchunks; ci++ ) {
81 //printf("processing chunk %d\n", ci);
82 for( int pi = 0; pi < chunk_size; pi++ ) {
83 if( true ) {
84 float r = *rp; float g = *gp; float b = *bp;
85 *orp = mrgb[0][0] * r + mrgb[0][1] * g + mrgb[0][2] * b;
86 *ogp = mrgb[1][0] * r + mrgb[1][1] * g + mrgb[1][2] * b;
87 *obp = mrgb[2][0] * r + mrgb[2][1] * g + mrgb[2][2] * b;
88 } else {
89 *orp = mrgb[0][0] * (*rp) + mrgb[0][1] * (*gp) + mrgb[0][2] * (*bp);
90 *ogp = mrgb[1][0] * (*rp) + mrgb[1][1] * (*gp) + mrgb[1][2] * (*bp);
91 *obp = mrgb[2][0] * (*rp) + mrgb[2][1] * (*gp) + mrgb[2][2] * (*bp);
92 }
93 rp+=1; gp+=1; bp+=1;
94 orp+=1; ogp+=1; obp+=1;
95 }
96 }
97 }
98 stopTime.set();
99 long elapsedTime = stopTime.etime(startTime) / 1000;
100 //std::cout<<"process: elpased time = "<<elapsedTime<<std::endl;
101 if( minTime==0 ) minTime = elapsedTime;
102 else {
103 if( minTime > elapsedTime ) minTime = elapsedTime;
104 }
105 if( maxTime==0 ) maxTime = elapsedTime;
106 else {
107 if( maxTime < elapsedTime ) maxTime = elapsedTime;
108 }
109 }
110 std::cout<<"process: in_place="<<in_place<<" interleaved="<<interleaved<<std::endl;
111 std::cout<<" minimum time = "<<minTime<<std::endl;
112 std::cout<<" maximum time = "<<maxTime<<std::endl;
113 }
114
115
process_sse2(float * buf,float * obuf,size_t size,bool in_place,bool interleaved)116 void process_sse2( float* buf, float* obuf, size_t size, bool in_place, bool interleaved )
117 {
118 #ifdef __SSE2__
119 size_t chunk_size = 64;
120 size_t nchunks = size / chunk_size;
121
122 long minTime = 0;
123 long maxTime = 0;
124
125 __m128 mrgbv[3][3];
126 for(int i = 0; i < 3; i++) {
127 for(int j = 0; j < 3; j++) {
128 mrgbv[i][j] = _mm_set1_ps(mrgb[i][j]);
129 }
130 }
131
132 for(int i = 0; i < 20; i++) {
133 //BENCHFUN;
134 MyTime startTime;
135 MyTime stopTime;
136 startTime.set();
137 float* ptr = in_place ? obuf : buf;
138 float* optr = obuf;
139
140 if(in_place) memcpy(obuf, buf, sizeof(float)*4*size);
141
142 if(interleaved) {
143
144 // RGBARGBARGBARGBA... channel arrangement
145
146 float rin[4] __attribute__ ((aligned (16)));
147 float gin[4] __attribute__ ((aligned (16)));
148 float bin[4] __attribute__ ((aligned (16)));
149 float rout[4] __attribute__ ((aligned (16)));
150 float gout[4] __attribute__ ((aligned (16)));
151 float bout[4] __attribute__ ((aligned (16)));
152
153 for( int ci = 0; ci < nchunks; ci++ ) {
154 //printf("processing chunk %d\n", ci);
155 for( int pi = 0; pi < chunk_size-3; pi+=4 ) {
156 rin[0] = ptr[0]; gin[0] = ptr[1]; bin[0] = ptr[2];
157 rin[1] = ptr[3]; gin[1] = ptr[4]; bin[1] = ptr[5];
158 rin[2] = ptr[6]; gin[2] = ptr[7]; bin[2] = ptr[8];
159 rin[3] = ptr[9]; gin[3] = ptr[10]; bin[3] = ptr[11];
160
161 __m128 rv = _mm_loadu_ps(&rin[0]);
162 __m128 gv = _mm_loadu_ps(&gin[0]);
163 __m128 bv = _mm_loadu_ps(&bin[0]);
164
165 _mm_storeu_ps(&rout[0], mrgbv[0][0]*rv + mrgbv[0][1]*gv + mrgbv[0][2]*bv);
166 _mm_storeu_ps(&gout[0], mrgbv[1][0]*rv + mrgbv[1][1]*gv + mrgbv[1][2]*bv);
167 _mm_storeu_ps(&bout[0], mrgbv[2][0]*rv + mrgbv[2][1]*gv + mrgbv[2][2]*bv);
168
169 optr[0] = rout[0]; optr[1] = gout[0]; optr[2] = bout[0];
170 optr[3] = rout[1]; optr[4] = gout[1]; optr[5] = bout[1];
171 optr[6] = rout[2]; optr[7] = gout[2]; optr[8] = bout[2];
172 optr[9] = rout[3]; optr[10] = gout[3]; optr[11] = bout[3];
173
174 ptr += 16; optr += 16;
175 }
176 }
177 } else {
178
179 // RRRR...GGGG...BBBB... channel arrangement
180
181 float* rp = ptr;
182 float* gp = &(ptr[size]);
183 float* bp = &(ptr[size*2]);
184 float* orp = optr;
185 float* ogp = &(optr[size]);
186 float* obp = &(optr[size*2]);
187
188 for( int ci = 0; ci < nchunks; ci++ ) {
189 //printf("processing chunk %d\n", ci);
190 for( int pi = 0; pi < chunk_size-3; pi+=4 ) {
191 __m128 rv = _mm_loadu_ps(&rp[0]);
192 __m128 gv = _mm_loadu_ps(&gp[0]);
193 __m128 bv = _mm_loadu_ps(&bp[0]);
194
195 _mm_storeu_ps(&orp[0], mrgbv[0][0]*rv + mrgbv[0][1]*gv + mrgbv[0][2]*bv);
196 _mm_storeu_ps(&ogp[0], mrgbv[1][0]*rv + mrgbv[1][1]*gv + mrgbv[1][2]*bv);
197 _mm_storeu_ps(&obp[0], mrgbv[2][0]*rv + mrgbv[2][1]*gv + mrgbv[2][2]*bv);
198
199 rp+=4; gp+=4; bp+=4;
200 orp+=4; ogp+=4; obp+=4;
201 }
202 }
203 }
204 stopTime.set();
205 long elapsedTime = stopTime.etime(startTime) / 1000;
206 //std::cout<<"process_sse2: elpased time = "<<elapsedTime<<std::endl;
207 if( minTime==0 ) minTime = elapsedTime;
208 else {
209 if( minTime > elapsedTime ) minTime = elapsedTime;
210 }
211 if( maxTime==0 ) maxTime = elapsedTime;
212 else {
213 if( maxTime < elapsedTime ) maxTime = elapsedTime;
214 }
215 }
216 std::cout<<"process_sse2: in_place="<<in_place<<" interleaved="<<interleaved<<std::endl;
217 std::cout<<" minimum time = "<<minTime<<std::endl;
218 std::cout<<" maximum time = "<<maxTime<<std::endl;
219 #endif // __SSE2__
220 }
221
222
process_sse2_shuffle(float * buf,float * obuf,size_t size,bool in_place,bool interleaved)223 void process_sse2_shuffle( float* buf, float* obuf, size_t size, bool in_place, bool interleaved )
224 {
225 #ifdef __SSE2__
226 size_t chunk_size = 64;
227 size_t nchunks = size / chunk_size;
228
229 long minTime = 0;
230 long maxTime = 0;
231
232 __m128 mrgbv[3][3];
233 for(int i = 0; i < 3; i++) {
234 for(int j = 0; j < 3; j++) {
235 mrgbv[i][j] = _mm_set1_ps(mrgb[i][j]);
236 }
237 }
238
239 for(int i = 0; i < 20; i++) {
240 //BENCHFUN;
241 MyTime startTime;
242 MyTime stopTime;
243 startTime.set();
244
245 float* ptr = in_place ? obuf : buf;
246 float* optr = obuf;
247
248 if(in_place) memcpy(obuf, buf, sizeof(float)*4*size);
249
250 for( int ci = 0; ci < nchunks; ci++ ) {
251 //printf("processing chunk %d\n", ci);
252 for( int pi = 0; pi < chunk_size-3; pi+=4 ) {
253 __m128 rgbv[4];
254 __m128 a0;
255 __m128 a1;
256 __m128 a2;
257 __m128 a3;
258 rgbv[0] = _mm_loadu_ps(&ptr[0]);
259 rgbv[1] = _mm_loadu_ps(&ptr[4]);
260 rgbv[2] = _mm_loadu_ps(&ptr[8]);
261 rgbv[3] = _mm_loadu_ps(&ptr[12]);
262
263 /**/
264 a0 = _mm_unpacklo_ps(rgbv[0], rgbv[2]);
265 a1 = _mm_unpacklo_ps(rgbv[1], rgbv[3]);
266 a2 = _mm_unpackhi_ps(rgbv[0], rgbv[2]);
267 a3 = _mm_unpackhi_ps(rgbv[1], rgbv[3]);
268
269 rgbv[0] = _mm_unpacklo_ps(a0, a1);
270 rgbv[1] = _mm_unpackhi_ps(a0, a1);
271 rgbv[2] = _mm_unpacklo_ps(a2, a3);
272 rgbv[3] = _mm_unpackhi_ps(a2, a3);
273 /**/
274
275 __m128 orv = mrgbv[0][0]*rgbv[0] + mrgbv[0][1]*rgbv[1] + mrgbv[0][2]*rgbv[2];
276 __m128 ogv = mrgbv[1][0]*rgbv[0] + mrgbv[1][1]*rgbv[1] + mrgbv[1][2]*rgbv[2];
277 __m128 obv = mrgbv[2][0]*rgbv[0] + mrgbv[2][1]*rgbv[1] + mrgbv[2][2]*rgbv[2];
278
279 /**/
280 a0 = _mm_unpacklo_ps(orv, obv);
281 a1 = _mm_unpacklo_ps(ogv, rgbv[3]);
282 a2 = _mm_unpackhi_ps(orv, obv);
283 a3 = _mm_unpackhi_ps(ogv, rgbv[3]);
284
285 rgbv[0] = _mm_unpacklo_ps(a0, a1);
286 rgbv[1] = _mm_unpackhi_ps(a0, a1);
287 rgbv[2] = _mm_unpacklo_ps(a2, a3);
288 rgbv[3] = _mm_unpackhi_ps(a2, a3);
289 /**/
290
291 //rgbv[0] = orv;
292 //rgbv[1] = ogv;
293 //rgbv[2] = obv;
294
295 _mm_storeu_ps(&optr[0], rgbv[0]);
296 _mm_storeu_ps(&optr[4], rgbv[1]);
297 _mm_storeu_ps(&optr[8], rgbv[2]);
298 _mm_storeu_ps(&optr[12], rgbv[3]);
299
300 ptr += 16; optr += 16;
301 }
302 }
303 stopTime.set();
304 long elapsedTime = stopTime.etime(startTime) / 1000;
305 //std::cout<<"process_sse2: elpased time = "<<elapsedTime<<std::endl;
306 if( minTime==0 ) minTime = elapsedTime;
307 else {
308 if( minTime > elapsedTime ) minTime = elapsedTime;
309 }
310 if( maxTime==0 ) maxTime = elapsedTime;
311 else {
312 if( maxTime < elapsedTime ) maxTime = elapsedTime;
313 }
314 }
315 std::cout<<"process_sse2_shuffle: in_place="<<in_place<<" interleaved="<<interleaved<<std::endl;
316 std::cout<<" minimum time = "<<minTime<<std::endl;
317 std::cout<<" maximum time = "<<maxTime<<std::endl;
318 #endif // __SSE2__
319 }
320
321
322
process_sse2_alt(float * buf,float * obuf,size_t size,bool in_place,bool interleaved)323 void process_sse2_alt( float* buf, float* obuf, size_t size, bool in_place, bool interleaved )
324 {
325 #ifdef __SSE2__
326 size_t chunk_size = 64;
327 size_t nchunks = size / chunk_size;
328
329 long minTime = 0;
330 long maxTime = 0;
331
332 float mc0[4] __attribute__ ((aligned (16)));
333 float mc1[4] __attribute__ ((aligned (16)));
334 float mc2[4] __attribute__ ((aligned (16)));
335 for(int i = 0; i < 3; i++) {
336 mc0[i] = mrgb[i][0];
337 mc1[i] = mrgb[i][1];
338 mc2[i] = mrgb[i][2];
339 }
340 mc0[3] = mc1[3] = mc2[3] = 0;
341
342 __m128 mcv[3];
343 mcv[0] = _mm_loadu_ps(&mc0[0]);
344 mcv[1] = _mm_loadu_ps(&mc1[0]);
345 mcv[2] = _mm_loadu_ps(&mc2[0]);
346
347 for(int i = 0; i < 20; i++) {
348 //BENCHFUN;
349 MyTime startTime;
350 MyTime stopTime;
351 startTime.set();
352 float* ptr = in_place ? obuf : buf;
353 float* optr = obuf;
354
355 if(in_place) memcpy(obuf, buf, sizeof(float)*4*size);
356
357 __m128 rv, bv, gv;
358
359 float orgb[4] __attribute__ ((aligned (16)));
360
361 for( int ci = 0; ci < nchunks; ci++ ) {
362 //printf("processing chunk %d\n", ci);
363 for( int pi = 0; pi < chunk_size-3; pi+=1 ) {
364 rv = _mm_set1_ps(ptr[0]);
365 gv = _mm_set1_ps(ptr[1]);
366 bv = _mm_set1_ps(ptr[2]);
367
368 _mm_storeu_ps(&orgb[0], mcv[0]*rv + mcv[1]*gv + mcv[2]*bv);
369
370 optr[0] = orgb[0]; optr[1] = orgb[1]; optr[2] = orgb[2];
371
372 ptr += 4; optr += 4;
373 }
374 }
375 stopTime.set();
376 long elapsedTime = stopTime.etime(startTime) / 1000;
377 //std::cout<<"process_sse2: elpased time = "<<elapsedTime<<std::endl;
378 if( minTime==0 ) minTime = elapsedTime;
379 else {
380 if( minTime > elapsedTime ) minTime = elapsedTime;
381 }
382 if( maxTime==0 ) maxTime = elapsedTime;
383 else {
384 if( maxTime < elapsedTime ) maxTime = elapsedTime;
385 }
386 }
387 std::cout<<"process_sse2_alt: in_place="<<in_place<<" interleaved="<<interleaved<<std::endl;
388 std::cout<<" minimum time = "<<minTime<<std::endl;
389 std::cout<<" maximum time = "<<maxTime<<std::endl;
390 #endif // __SSE2__
391 }
392
393
394
395
396
397 int
main(int argc,char ** argv)398 main( int argc, char **argv )
399 {
400 size_t size = 50*1024*1024;
401 size_t chunk_size = 64;
402 size_t nchunks = size / chunk_size;
403 float* buf = (float*)malloc_aligned(sizeof(float)*4*size);
404 float* obuf = (float*)malloc_aligned(sizeof(float)*4*size);
405
406 memset(buf, 0, sizeof(float)*3*size);
407
408 printf("buf: %p\n", (void*)buf);
409
410 mrgb[0][0] = 0.5;
411 mrgb[0][1] = 0.5;
412 mrgb[0][2] = 0.5;
413 mrgb[1][0] = 0.5;
414 mrgb[1][1] = 0.5;
415 mrgb[1][2] = 0.5;
416 mrgb[2][0] = 0.5;
417 mrgb[2][1] = 0.5;
418 mrgb[2][2] = 0.5;
419
420 memcpy(obuf, buf, sizeof(float)*4*size);
421
422 //process(buf, obuf, size);
423 process_sse2(buf, obuf, size, false, true);
424
425 process(buf, obuf, size, false, false);
426 //process(buf, obuf, size, true, false);
427 process(buf, obuf, size, false, true);
428 //process(buf, obuf, size, true, true);
429
430 process_sse2(buf, obuf, size, false, false);
431 //process_sse2(buf, obuf, size, true, false);
432 process_sse2(buf, obuf, size, false, true);
433 //process_sse2(buf, obuf, size, true, true);
434
435 process_sse2_shuffle(buf, obuf, size, false, true);
436 //process_sse2_shuffle(buf, obuf, size, true, true);
437
438 process_sse2_alt(buf, obuf, size, false, true);
439 //process_sse2_alt(buf, obuf, size, true, true);
440 return( 0 );
441 }
442