1 #include <string.h>
2 #include <vips/vips.h>
3 
4 #ifdef __SSE2__
5 #include "xmmintrin.h"
6 #endif //__SSE2__
7 
8 #define BENCHMARK
9 #include "../rt/rtengine/StopWatch.h"
10 
11 float mrgb[3][3];
12 
malloc_aligned(size_t size)13 void* malloc_aligned(size_t size)
14 {
15   void *mem = malloc(size+15+sizeof(void*));
16   if (!mem) return mem;
17   void *ptr = (void*)( ((uintptr_t) mem + 15 + sizeof(void*)) / 16 * 16 );
18   uintptr_t* tptr = (uintptr_t*)ptr;
19   tptr -= 1;
20   *tptr = (uintptr_t)mem;
21   //((void**)ptr)[-1] = mem;
22   return ptr;
23 }
24 
free_aligned(void * ptr)25 void free_aligned(void* ptr)
26 {
27   if (ptr) free(((void**)ptr)[-1]);
28 }
29 
30 
process(float * buf,float * obuf,size_t size,bool in_place,bool interleaved)31 void process( float* buf, float* obuf, size_t size, bool in_place, bool interleaved )
32 {
33   size_t chunk_size = 64;
34   size_t nchunks = size / chunk_size;
35 
36   long minTime = 0;
37   long maxTime = 0;
38 
39   for(int i = 0; i < 20; i++) {
40     //BENCHFUN;
41     MyTime startTime;
42     MyTime stopTime;
43     startTime.set();
44     float* ptr = in_place ? obuf : buf;
45     //float r, g, b;
46     float* optr = obuf;
47 
48     if(in_place) memcpy(obuf, buf, sizeof(float)*4*size);
49 
50     if(interleaved) {
51 
52       // RGBARGBARGBARGBA... channel arrangement
53 
54       for( int ci = 0; ci < nchunks; ci++ ) {
55         //printf("processing chunk %d\n", ci);
56         for( int pi = 0; pi < chunk_size; pi++ ) {
57           if(true) {
58             float r = ptr[0]; float g = ptr[1]; float b = ptr[2];
59             optr[0] = mrgb[0][0] * r + mrgb[0][1] * g + mrgb[0][2] * b;
60             optr[1] = mrgb[1][0] * r + mrgb[1][1] * g + mrgb[1][2] * b;
61             optr[2] = mrgb[2][0] * r + mrgb[2][1] * g + mrgb[2][2] * b;
62           } else {
63             optr[0] = mrgb[0][0] * ptr[0] + mrgb[0][1] * ptr[1] + mrgb[0][2] * ptr[2];
64             optr[1] = mrgb[1][0] * ptr[0] + mrgb[1][1] * ptr[1] + mrgb[1][2] * ptr[2];
65             optr[2] = mrgb[2][0] * ptr[0] + mrgb[2][1] * ptr[1] + mrgb[2][2] * ptr[2];
66           }
67           ptr += 4; optr += 4;
68         }
69       }
70     } else {
71 
72       // RRRR...GGGG...BBBB... channel arrangement
73 
74       float* rp = ptr;
75       float* gp = &(ptr[size]);
76       float* bp = &(ptr[size*2]);
77       float* orp = optr;
78       float* ogp = &(optr[size]);
79       float* obp = &(optr[size*2]);
80       for( int ci = 0; ci < nchunks; ci++ ) {
81         //printf("processing chunk %d\n", ci);
82         for( int pi = 0; pi < chunk_size; pi++ ) {
83           if( true ) {
84             float r = *rp; float g = *gp; float b = *bp;
85             *orp = mrgb[0][0] * r + mrgb[0][1] * g + mrgb[0][2] * b;
86             *ogp = mrgb[1][0] * r + mrgb[1][1] * g + mrgb[1][2] * b;
87             *obp = mrgb[2][0] * r + mrgb[2][1] * g + mrgb[2][2] * b;
88           } else {
89             *orp = mrgb[0][0] * (*rp) + mrgb[0][1] * (*gp) + mrgb[0][2] * (*bp);
90             *ogp = mrgb[1][0] * (*rp) + mrgb[1][1] * (*gp) + mrgb[1][2] * (*bp);
91             *obp = mrgb[2][0] * (*rp) + mrgb[2][1] * (*gp) + mrgb[2][2] * (*bp);
92           }
93           rp+=1;  gp+=1;  bp+=1;
94           orp+=1; ogp+=1; obp+=1;
95         }
96       }
97     }
98     stopTime.set();
99     long elapsedTime = stopTime.etime(startTime) / 1000;
100     //std::cout<<"process: elpased time = "<<elapsedTime<<std::endl;
101     if( minTime==0 ) minTime = elapsedTime;
102     else {
103       if( minTime > elapsedTime ) minTime = elapsedTime;
104     }
105     if( maxTime==0 ) maxTime = elapsedTime;
106     else {
107       if( maxTime < elapsedTime ) maxTime = elapsedTime;
108     }
109   }
110   std::cout<<"process: in_place="<<in_place<<"  interleaved="<<interleaved<<std::endl;
111   std::cout<<"    minimum time = "<<minTime<<std::endl;
112   std::cout<<"    maximum time = "<<maxTime<<std::endl;
113 }
114 
115 
process_sse2(float * buf,float * obuf,size_t size,bool in_place,bool interleaved)116 void process_sse2( float* buf, float* obuf, size_t size, bool in_place, bool interleaved )
117 {
118 #ifdef __SSE2__
119   size_t chunk_size = 64;
120   size_t nchunks = size / chunk_size;
121 
122   long minTime = 0;
123   long maxTime = 0;
124 
125   __m128 mrgbv[3][3];
126   for(int i = 0; i < 3; i++) {
127       for(int j = 0; j < 3; j++) {
128           mrgbv[i][j] = _mm_set1_ps(mrgb[i][j]);
129       }
130   }
131 
132   for(int i = 0; i < 20; i++) {
133     //BENCHFUN;
134     MyTime startTime;
135     MyTime stopTime;
136     startTime.set();
137     float* ptr = in_place ? obuf : buf;
138     float* optr = obuf;
139 
140     if(in_place) memcpy(obuf, buf, sizeof(float)*4*size);
141 
142     if(interleaved) {
143 
144       // RGBARGBARGBARGBA... channel arrangement
145 
146       float rin[4] __attribute__ ((aligned (16)));
147       float gin[4] __attribute__ ((aligned (16)));
148       float bin[4] __attribute__ ((aligned (16)));
149       float rout[4] __attribute__ ((aligned (16)));
150       float gout[4] __attribute__ ((aligned (16)));
151       float bout[4] __attribute__ ((aligned (16)));
152 
153       for( int ci = 0; ci < nchunks; ci++ ) {
154         //printf("processing chunk %d\n", ci);
155         for( int pi = 0; pi < chunk_size-3; pi+=4 ) {
156           rin[0] = ptr[0]; gin[0] = ptr[1];  bin[0] = ptr[2];
157           rin[1] = ptr[3]; gin[1] = ptr[4];  bin[1] = ptr[5];
158           rin[2] = ptr[6]; gin[2] = ptr[7];  bin[2] = ptr[8];
159           rin[3] = ptr[9]; gin[3] = ptr[10]; bin[3] = ptr[11];
160 
161           __m128 rv = _mm_loadu_ps(&rin[0]);
162           __m128 gv = _mm_loadu_ps(&gin[0]);
163           __m128 bv = _mm_loadu_ps(&bin[0]);
164 
165           _mm_storeu_ps(&rout[0], mrgbv[0][0]*rv + mrgbv[0][1]*gv + mrgbv[0][2]*bv);
166           _mm_storeu_ps(&gout[0], mrgbv[1][0]*rv + mrgbv[1][1]*gv + mrgbv[1][2]*bv);
167           _mm_storeu_ps(&bout[0], mrgbv[2][0]*rv + mrgbv[2][1]*gv + mrgbv[2][2]*bv);
168 
169           optr[0] = rout[0];  optr[1] = gout[0];  optr[2] = bout[0];
170           optr[3] = rout[1];  optr[4] = gout[1];  optr[5] = bout[1];
171           optr[6] = rout[2];  optr[7] = gout[2];  optr[8] = bout[2];
172           optr[9] = rout[3];  optr[10] = gout[3]; optr[11] = bout[3];
173 
174           ptr += 16; optr += 16;
175         }
176       }
177     } else {
178 
179       // RRRR...GGGG...BBBB... channel arrangement
180 
181       float* rp = ptr;
182       float* gp = &(ptr[size]);
183       float* bp = &(ptr[size*2]);
184       float* orp = optr;
185       float* ogp = &(optr[size]);
186       float* obp = &(optr[size*2]);
187 
188       for( int ci = 0; ci < nchunks; ci++ ) {
189         //printf("processing chunk %d\n", ci);
190         for( int pi = 0; pi < chunk_size-3; pi+=4 ) {
191           __m128 rv = _mm_loadu_ps(&rp[0]);
192           __m128 gv = _mm_loadu_ps(&gp[0]);
193           __m128 bv = _mm_loadu_ps(&bp[0]);
194 
195           _mm_storeu_ps(&orp[0], mrgbv[0][0]*rv + mrgbv[0][1]*gv + mrgbv[0][2]*bv);
196           _mm_storeu_ps(&ogp[0], mrgbv[1][0]*rv + mrgbv[1][1]*gv + mrgbv[1][2]*bv);
197           _mm_storeu_ps(&obp[0], mrgbv[2][0]*rv + mrgbv[2][1]*gv + mrgbv[2][2]*bv);
198 
199           rp+=4;  gp+=4;  bp+=4;
200           orp+=4; ogp+=4; obp+=4;
201         }
202       }
203     }
204     stopTime.set();
205     long elapsedTime = stopTime.etime(startTime) / 1000;
206     //std::cout<<"process_sse2: elpased time = "<<elapsedTime<<std::endl;
207     if( minTime==0 ) minTime = elapsedTime;
208     else {
209       if( minTime > elapsedTime ) minTime = elapsedTime;
210     }
211     if( maxTime==0 ) maxTime = elapsedTime;
212     else {
213       if( maxTime < elapsedTime ) maxTime = elapsedTime;
214     }
215   }
216   std::cout<<"process_sse2: in_place="<<in_place<<"  interleaved="<<interleaved<<std::endl;
217   std::cout<<"    minimum time = "<<minTime<<std::endl;
218   std::cout<<"    maximum time = "<<maxTime<<std::endl;
219 #endif // __SSE2__
220 }
221 
222 
process_sse2_shuffle(float * buf,float * obuf,size_t size,bool in_place,bool interleaved)223 void process_sse2_shuffle( float* buf, float* obuf, size_t size, bool in_place, bool interleaved )
224 {
225 #ifdef __SSE2__
226   size_t chunk_size = 64;
227   size_t nchunks = size / chunk_size;
228 
229   long minTime = 0;
230   long maxTime = 0;
231 
232   __m128 mrgbv[3][3];
233   for(int i = 0; i < 3; i++) {
234       for(int j = 0; j < 3; j++) {
235           mrgbv[i][j] = _mm_set1_ps(mrgb[i][j]);
236       }
237   }
238 
239   for(int i = 0; i < 20; i++) {
240     //BENCHFUN;
241     MyTime startTime;
242     MyTime stopTime;
243     startTime.set();
244 
245     float* ptr = in_place ? obuf : buf;
246     float* optr = obuf;
247 
248     if(in_place) memcpy(obuf, buf, sizeof(float)*4*size);
249 
250     for( int ci = 0; ci < nchunks; ci++ ) {
251       //printf("processing chunk %d\n", ci);
252       for( int pi = 0; pi < chunk_size-3; pi+=4 ) {
253         __m128 rgbv[4];
254         __m128 a0;
255         __m128 a1;
256         __m128 a2;
257         __m128 a3;
258         rgbv[0] = _mm_loadu_ps(&ptr[0]);
259         rgbv[1] = _mm_loadu_ps(&ptr[4]);
260         rgbv[2] = _mm_loadu_ps(&ptr[8]);
261         rgbv[3] = _mm_loadu_ps(&ptr[12]);
262 
263         /**/
264         a0 = _mm_unpacklo_ps(rgbv[0], rgbv[2]);
265         a1 = _mm_unpacklo_ps(rgbv[1], rgbv[3]);
266         a2 = _mm_unpackhi_ps(rgbv[0], rgbv[2]);
267         a3 = _mm_unpackhi_ps(rgbv[1], rgbv[3]);
268 
269         rgbv[0] = _mm_unpacklo_ps(a0, a1);
270         rgbv[1] = _mm_unpackhi_ps(a0, a1);
271         rgbv[2] = _mm_unpacklo_ps(a2, a3);
272         rgbv[3] = _mm_unpackhi_ps(a2, a3);
273         /**/
274 
275         __m128 orv = mrgbv[0][0]*rgbv[0] + mrgbv[0][1]*rgbv[1] + mrgbv[0][2]*rgbv[2];
276         __m128 ogv = mrgbv[1][0]*rgbv[0] + mrgbv[1][1]*rgbv[1] + mrgbv[1][2]*rgbv[2];
277         __m128 obv = mrgbv[2][0]*rgbv[0] + mrgbv[2][1]*rgbv[1] + mrgbv[2][2]*rgbv[2];
278 
279         /**/
280         a0 = _mm_unpacklo_ps(orv, obv);
281         a1 = _mm_unpacklo_ps(ogv, rgbv[3]);
282         a2 = _mm_unpackhi_ps(orv, obv);
283         a3 = _mm_unpackhi_ps(ogv, rgbv[3]);
284 
285         rgbv[0] = _mm_unpacklo_ps(a0, a1);
286         rgbv[1] = _mm_unpackhi_ps(a0, a1);
287         rgbv[2] = _mm_unpacklo_ps(a2, a3);
288         rgbv[3] = _mm_unpackhi_ps(a2, a3);
289         /**/
290 
291         //rgbv[0] = orv;
292         //rgbv[1] = ogv;
293         //rgbv[2] = obv;
294 
295         _mm_storeu_ps(&optr[0], rgbv[0]);
296         _mm_storeu_ps(&optr[4], rgbv[1]);
297         _mm_storeu_ps(&optr[8], rgbv[2]);
298         _mm_storeu_ps(&optr[12], rgbv[3]);
299 
300         ptr += 16; optr += 16;
301       }
302     }
303     stopTime.set();
304     long elapsedTime = stopTime.etime(startTime) / 1000;
305     //std::cout<<"process_sse2: elpased time = "<<elapsedTime<<std::endl;
306     if( minTime==0 ) minTime = elapsedTime;
307     else {
308       if( minTime > elapsedTime ) minTime = elapsedTime;
309     }
310     if( maxTime==0 ) maxTime = elapsedTime;
311     else {
312       if( maxTime < elapsedTime ) maxTime = elapsedTime;
313     }
314   }
315   std::cout<<"process_sse2_shuffle: in_place="<<in_place<<"  interleaved="<<interleaved<<std::endl;
316   std::cout<<"    minimum time = "<<minTime<<std::endl;
317   std::cout<<"    maximum time = "<<maxTime<<std::endl;
318 #endif // __SSE2__
319 }
320 
321 
322 
process_sse2_alt(float * buf,float * obuf,size_t size,bool in_place,bool interleaved)323 void process_sse2_alt( float* buf, float* obuf, size_t size, bool in_place, bool interleaved )
324 {
325 #ifdef __SSE2__
326   size_t chunk_size = 64;
327   size_t nchunks = size / chunk_size;
328 
329   long minTime = 0;
330   long maxTime = 0;
331 
332   float mc0[4] __attribute__ ((aligned (16)));
333   float mc1[4] __attribute__ ((aligned (16)));
334   float mc2[4] __attribute__ ((aligned (16)));
335   for(int i = 0; i < 3; i++) {
336     mc0[i] = mrgb[i][0];
337     mc1[i] = mrgb[i][1];
338     mc2[i] = mrgb[i][2];
339   }
340   mc0[3] = mc1[3] = mc2[3] = 0;
341 
342   __m128 mcv[3];
343   mcv[0] = _mm_loadu_ps(&mc0[0]);
344   mcv[1] = _mm_loadu_ps(&mc1[0]);
345   mcv[2] = _mm_loadu_ps(&mc2[0]);
346 
347   for(int i = 0; i < 20; i++) {
348     //BENCHFUN;
349     MyTime startTime;
350     MyTime stopTime;
351     startTime.set();
352     float* ptr = in_place ? obuf : buf;
353     float* optr = obuf;
354 
355     if(in_place) memcpy(obuf, buf, sizeof(float)*4*size);
356 
357     __m128 rv, bv, gv;
358 
359     float orgb[4] __attribute__ ((aligned (16)));
360 
361     for( int ci = 0; ci < nchunks; ci++ ) {
362       //printf("processing chunk %d\n", ci);
363       for( int pi = 0; pi < chunk_size-3; pi+=1 ) {
364         rv = _mm_set1_ps(ptr[0]);
365         gv = _mm_set1_ps(ptr[1]);
366         bv = _mm_set1_ps(ptr[2]);
367 
368         _mm_storeu_ps(&orgb[0], mcv[0]*rv + mcv[1]*gv + mcv[2]*bv);
369 
370         optr[0] = orgb[0];  optr[1] = orgb[1];  optr[2] = orgb[2];
371 
372         ptr += 4; optr += 4;
373       }
374     }
375     stopTime.set();
376     long elapsedTime = stopTime.etime(startTime) / 1000;
377     //std::cout<<"process_sse2: elpased time = "<<elapsedTime<<std::endl;
378     if( minTime==0 ) minTime = elapsedTime;
379     else {
380       if( minTime > elapsedTime ) minTime = elapsedTime;
381     }
382     if( maxTime==0 ) maxTime = elapsedTime;
383     else {
384       if( maxTime < elapsedTime ) maxTime = elapsedTime;
385     }
386   }
387   std::cout<<"process_sse2_alt: in_place="<<in_place<<"  interleaved="<<interleaved<<std::endl;
388   std::cout<<"    minimum time = "<<minTime<<std::endl;
389   std::cout<<"    maximum time = "<<maxTime<<std::endl;
390 #endif // __SSE2__
391 }
392 
393 
394 
395 
396 
397 int
main(int argc,char ** argv)398 main( int argc, char **argv )
399 {
400   size_t size = 50*1024*1024;
401   size_t chunk_size = 64;
402   size_t nchunks = size / chunk_size;
403   float* buf = (float*)malloc_aligned(sizeof(float)*4*size);
404   float* obuf = (float*)malloc_aligned(sizeof(float)*4*size);
405 
406   memset(buf, 0, sizeof(float)*3*size);
407 
408   printf("buf: %p\n", (void*)buf);
409 
410   mrgb[0][0] = 0.5;
411   mrgb[0][1] = 0.5;
412   mrgb[0][2] = 0.5;
413   mrgb[1][0] = 0.5;
414   mrgb[1][1] = 0.5;
415   mrgb[1][2] = 0.5;
416   mrgb[2][0] = 0.5;
417   mrgb[2][1] = 0.5;
418   mrgb[2][2] = 0.5;
419 
420   memcpy(obuf, buf, sizeof(float)*4*size);
421 
422   //process(buf, obuf, size);
423   process_sse2(buf, obuf, size, false, true);
424 
425   process(buf, obuf, size, false, false);
426   //process(buf, obuf, size, true, false);
427   process(buf, obuf, size, false, true);
428   //process(buf, obuf, size, true, true);
429 
430   process_sse2(buf, obuf, size, false, false);
431   //process_sse2(buf, obuf, size, true, false);
432   process_sse2(buf, obuf, size, false, true);
433   //process_sse2(buf, obuf, size, true, true);
434 
435   process_sse2_shuffle(buf, obuf, size, false, true);
436   //process_sse2_shuffle(buf, obuf, size, true, true);
437 
438   process_sse2_alt(buf, obuf, size, false, true);
439   //process_sse2_alt(buf, obuf, size, true, true);
440   return( 0 );
441 }
442