1 /*
2 Copyright 2014-2017 Jay Sorg
3 
4 Permission to use, copy, modify, distribute, and sell this software and its
5 documentation for any purpose is hereby granted without fee, provided that
6 the above copyright notice appear in all copies and that both that
7 copyright notice and this permission notice appear in supporting
8 documentation.
9 
10 The above copyright notice and this permission notice shall be included in
11 all copies or substantial portions of the Software.
12 
13 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
16 OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
17 AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
18 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
19 
rgb (a8r8g8b8) to yuv (nv12) conversion speed testing
21 
22 */
23 
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <string.h>
27 #include <fcntl.h>
28 #include <unistd.h>
29 #include <sys/types.h>
30 #include <sys/stat.h>
31 #include <sys/time.h>
32 
/* Pick the accelerated routine that main() calls through the
   a8r8g8b8_to_nv12_box_accel name.
   NOTE(review): when neither USE_SIMD_AMD64 nor USE_SIMD_X86 is defined the
   name is left undefined by this section; when both are defined the second
   #define redefines the first. */
#if defined(USE_SIMD_AMD64)
#define a8r8g8b8_to_nv12_box_accel a8r8g8b8_to_nv12_box_amd64_sse2
#endif

#if defined(USE_SIMD_X86)
#define a8r8g8b8_to_nv12_box_accel a8r8g8b8_to_nv12_box_x86_sse2
#endif
40 
41 /******************************************************************************/
42 //Y = ( (  66 * R + 129 * G +  25 * B + 128) >> 8) +  16
43 //U = ( ( -38 * R -  74 * G + 112 * B + 128) >> 8) + 128
44 //V = ( ( 112 * R -  94 * G -  18 * B + 128) >> 8) + 128
45 
46 //C = Y - 16
47 //D = U - 128
48 //E = V - 128
49 //R = clip(( 298 * C           + 409 * E + 128) >> 8)
50 //G = clip(( 298 * C - 100 * D - 208 * E + 128) >> 8)
51 //B = clip(( 298 * C + 516 * D           + 128) >> 8)
52 
53 /******************************************************************************/
/* Clamp _val to the inclusive range [_lo, _hi].
   The whole expansion is parenthesized so the macro can be embedded in a
   larger expression (e.g. 1 + RDPCLAMP(x, 0, 255)).  The arguments may be
   evaluated more than once, so do not pass expressions with side effects. */
#define RDPCLAMP(_val, _lo, _hi) \
    ((_val) < (_lo) ? (_lo) : ((_val) > (_hi) ? (_hi) : (_val)))
56 
// floating point
// NOTE: despite the name, this macro converts RGB to YUV: it reads _R, _G, _B
// and assigns the results to _Y, _U, _V (fractions are dropped when the
// targets are integers).  Wrapped in do { } while (0) so it behaves as a
// single statement; invoke it with a trailing semicolon.
#define YUV2RGB1(_Y, _U, _V, _R, _G, _B) \
  do { \
    (_Y) =  (0.257 * (_R)) + (0.504 * (_G)) + (0.098 * (_B)) +  16; \
    (_U) = -(0.148 * (_R)) - (0.291 * (_G)) + (0.439 * (_B)) + 128; \
    (_V) =  (0.439 * (_R)) - (0.368 * (_G)) - (0.071 * (_B)) + 128; \
  } while (0)
62 
// 12 bit fixed point variant
// NOTE: despite the name, this macro converts RGB to YUV: it reads _R, _G, _B
// and assigns _Y, _U, _V.  No rounding term is added before the shift and the
// results are not clamped.  Wrapped in do { } while (0) so it behaves as a
// single statement; invoke it with a trailing semicolon.
#define YUV2RGB3(_Y, _U, _V, _R, _G, _B) \
  do { \
    (_Y) = (( 1053 * (_R) + 2064 * (_G) +  401 * (_B)) >> 12) +  16; \
    (_U) = (( -606 * (_R) - 1192 * (_G) + 1798 * (_B)) >> 12) + 128; \
    (_V) = (( 1798 * (_R) - 1507 * (_G) -  291 * (_B)) >> 12) + 128; \
  } while (0)
67 
// 16 bit fixed point variant
// NOTE: despite the name, this macro converts RGB to YUV: it reads _R, _G, _B
// and assigns _Y, _U, _V.  No rounding term is added before the shift and the
// results are not clamped.  Wrapped in do { } while (0) so it behaves as a
// single statement; invoke it with a trailing semicolon.
#define YUV2RGB2(_Y, _U, _V, _R, _G, _B) \
  do { \
    (_Y) = (( 16843 * (_R) + 33030 * (_G) +  6423 * (_B)) >> 16) +  16; \
    (_U) = (( -9699 * (_R) - 19071 * (_G) + 28770 * (_B)) >> 16) + 128; \
    (_V) = (( 28770 * (_R) - 24117 * (_G) -  4653 * (_B)) >> 16) + 128; \
  } while (0)
72 
// original
// NOTE: despite the name, this macro converts RGB to YUV: it reads _R, _G, _B
// and assigns _Y, _U, _V using 8 bit fixed point coefficients with a +128
// rounding term.  Results are not clamped here.  Wrapped in do { } while (0)
// so it behaves as a single statement; invoke it with a trailing semicolon.
#define YUV2RGB(_Y, _U, _V, _R, _G, _B) \
  do { \
    (_Y) = (( 66 * (_R) + 129 * (_G) +  25 * (_B) + 128) >> 8) +  16; \
    (_U) = ((-38 * (_R) -  74 * (_G) + 112 * (_B) + 128) >> 8) + 128; \
    (_V) = ((112 * (_R) -  94 * (_G) -  18 * (_B) + 128) >> 8) + 128; \
  } while (0)
78 
// per-term variant: every product is scaled (<< 4) and shifted (>> 16) on its
// own before the terms are combined, so each term is truncated separately.
// NOTE: despite the name, this macro converts RGB to YUV: it reads _R, _G, _B
// and assigns _Y, _U, _V.  Wrapped in do { } while (0) so it behaves as a
// single statement; invoke it with a trailing semicolon.
#define YUV2RGB4(_Y, _U, _V, _R, _G, _B) \
  do { \
    (_Y) = ( ((1053 * ((_R) << 4)) >> 16) + ((2064 * ((_G) << 4)) >> 16) +  (( 401 * ((_B) << 4)) >> 16)) +  16; \
    (_U) = ( ((1798 * ((_B) << 4)) >> 16) - (( 606 * ((_R) << 4)) >> 16) -  ((1192 * ((_G) << 4)) >> 16)) + 128; \
    (_V) = ( ((1798 * ((_R) << 4)) >> 16) - ((1507 * ((_G) << 4)) >> 16) -  (( 291 * ((_B) << 4)) >> 16)) + 128; \
  } while (0)
83 
84 /******************************************************************************/
/* Clamp an intermediate Y/U/V value to the 0..255 byte range. */
static int
a8r8g8b8_clamp_byte(int value)
{
    if (value < 0)
    {
        return 0;
    }
    if (value > 255)
    {
        return 255;
    }
    return value;
}

/* Convert one 32 bit pixel (R in bits 16-23, G in bits 8-15, B in bits 0-7,
   the top byte is ignored) to clamped Y, U and V values using the 8 bit
   fixed point coefficients
     Y = (( 66 * R + 129 * G +  25 * B + 128) >> 8) +  16
     U = ((-38 * R -  74 * G + 112 * B + 128) >> 8) + 128
     V = ((112 * R -  94 * G -  18 * B + 128) >> 8) + 128 */
static void
a8r8g8b8_pixel_to_yuv(unsigned int pixel, int *y, int *u, int *v)
{
    int r = (int) ((pixel >> 16) & 0xff);
    int g = (int) ((pixel >>  8) & 0xff);
    int b = (int) ((pixel >>  0) & 0xff);

    *y = a8r8g8b8_clamp_byte((( 66 * r + 129 * g +  25 * b + 128) >> 8) +  16);
    *u = a8r8g8b8_clamp_byte(((-38 * r -  74 * g + 112 * b + 128) >> 8) + 128);
    *v = a8r8g8b8_clamp_byte(((112 * r -  94 * g -  18 * b + 128) >> 8) + 128);
}

/******************************************************************************/
/* Convert a 32 bit per pixel RGB image to NV12: a full resolution Y plane
   plus an interleaved U,V plane in which each U,V pair is the rounded
   average ("box" filter) of a 2x2 block of source pixels.

   s8 / src_stride        source pixels and the byte distance between rows;
                          pixels are loaded as 32 bit words, so every row
                          must be suitably aligned for that
   d8_y / dst_stride_y    destination Y plane and its row stride in bytes
   d8_uv / dst_stride_uv  destination U,V plane and its row stride in bytes
   width / height         size in pixels; both are rounded down to an even
                          number, a trailing odd column or row is not
                          converted

   Always returns 0. */
static int
a8r8g8b8_to_nv12_box(char *s8, int src_stride,
                     char *d8_y, int dst_stride_y,
                     char *d8_uv, int dst_stride_uv,
                     int width, int height)
{
    int row;
    int col;
    int quad;
    int y;
    int u;
    int v;
    int u_sum;
    int v_sum;
    int lwidth;
    int lheight;
    const unsigned int *src_rows[2];
    unsigned char *y_rows[2];
    unsigned char *uv_row;

    /* must be even */
    lwidth = width & ~1;
    lheight = height & ~1;
    for (row = 0; row < lheight; row += 2)
    {
        src_rows[0] = (const unsigned int *) (s8 + src_stride * row);
        src_rows[1] = (const unsigned int *) (s8 + src_stride * (row + 1));
        y_rows[0] = (unsigned char *) (d8_y + dst_stride_y * row);
        y_rows[1] = (unsigned char *) (d8_y + dst_stride_y * (row + 1));
        uv_row = (unsigned char *) (d8_uv + dst_stride_uv * (row / 2));
        for (col = 0; col < lwidth; col += 2)
        {
            u_sum = 0;
            v_sum = 0;
            /* quad walks the 2x2 block: top-left, top-right,
               bottom-left, bottom-right */
            for (quad = 0; quad < 4; quad++)
            {
                a8r8g8b8_pixel_to_yuv(src_rows[quad >> 1][col + (quad & 1)],
                                      &y, &u, &v);
                y_rows[quad >> 1][col + (quad & 1)] = (unsigned char) y;
                u_sum += u;
                v_sum += v;
            }
            /* one U,V pair per 2x2 block, rounded to nearest */
            uv_row[col] = (unsigned char) ((u_sum + 2) / 4);
            uv_row[col + 1] = (unsigned char) ((v_sum + 2) / 4);
        }
    }

    return 0;
}
178 
/* Called by main() when the program is started without arguments.
   Produces no output and reports success (0). */
int
output_params(void)
{
    const int status = 0;

    return status;
}
183 
/* Print len bytes starting at p to stdout, 16 bytes per row.
   Each row shows the row's starting offset as four hex digits, the bytes in
   hex (short rows are padded with blanks so the columns line up), and then
   the same bytes as characters, with '.' standing in for anything outside
   0x20..0x7e.  Nothing is printed when len is zero or negative. */
void hexdump(const void* p, int len)
{
    const unsigned char *bytes = (const unsigned char *) p;
    int row_start = 0;

    while (row_start < len)
    {
        const unsigned char *row = bytes + row_start;
        int remaining = len - row_start;
        int row_len = remaining > 16 ? 16 : remaining;
        int col;

        printf("%04x ", row_start);

        /* hex column: real bytes first, blanks for the unused slots */
        for (col = 0; col < 16; col++)
        {
            if (col < row_len)
            {
                printf("%02x ", row[col]);
            }
            else
            {
                printf("   ");
            }
        }

        /* character column */
        for (col = 0; col < row_len; col++)
        {
            int printable = row[col] >= 0x20 && row[col] < 0x7f;

            printf("%c", printable ? row[col] : '.');
        }

        printf("\n");
        row_start += row_len;
    }
}
224 
/* Compare the first `bytes` bytes of data1 and data2.
   Returns 1 and stores the index of the first differing byte in *offset
   when the buffers differ; returns 0 and leaves *offset untouched when they
   are equal (or when bytes is zero or negative). */
int lmemcmp(const void* data1, const void* data2, int bytes, int* offset)
{
    const unsigned char *lhs = (const unsigned char *) data1;
    const unsigned char *rhs = (const unsigned char *) data2;
    int pos = 0;

    while (pos < bytes)
    {
        if (lhs[pos] != rhs[pos])
        {
            *offset = pos;
            return 1;
        }
        pos++;
    }
    return 0;
}
246 
/* Return the current wall-clock time in milliseconds, reduced to the low
   31 bits so the result is always a non-negative int.

   The milliseconds are computed in unsigned 64 bit arithmetic: multiplying
   tv_sec by 1000 overflows a 32 bit time_t and does not fit in an int
   otherwise.  Subtracting two results gives the elapsed milliseconds; the
   value wraps about every 24.8 days (2^31 ms), and a difference taken across
   that wrap is negative.

   Returns 0 if gettimeofday() fails. */
int get_mstime(void)
{
    struct timeval tp;
    unsigned long long msec;

    if (gettimeofday(&tp, NULL) != 0)
    {
        return 0;
    }
    msec = (unsigned long long) tp.tv_sec * 1000ULL;
    msec += (unsigned long long) (tp.tv_usec / 1000);
    return (int) (msec & 0x7fffffffULL);
}
254 
/* Prototypes for the SSE2 implementations; this file only declares them.
   Both take the same parameter list as a8r8g8b8_to_nv12_box(), and main()
   calls whichever one the a8r8g8b8_to_nv12_box_accel macro names. */
int
a8r8g8b8_to_nv12_box_x86_sse2(char *s8, int src_stride,
                              char *d8_y, int dst_stride_y,
                              char *d8_uv, int dst_stride_uv,
                              int width, int height);
int
a8r8g8b8_to_nv12_box_amd64_sse2(char *s8, int src_stride,
                                char *d8_y, int dst_stride_y,
                                char *d8_uv, int dst_stride_uv,
                                int width, int height);
265 
/* Round _ptr up to the next 16 byte boundary (a pointer that is already
   aligned is returned unchanged) and yield it as char *.  The buffer behind
   _ptr must have at least 15 spare bytes for the result to stay inside it.
   The argument is parenthesized so pointer expressions can be passed, and
   the mask is built in size_t so it covers the full pointer width. */
#define AL(_ptr) ((char*)((((size_t)(_ptr)) + 15) & ~(size_t)15))
267 
/* Benchmark driver.

   Without arguments it only calls output_params().  With any argument it
   fills a 1920x1080 32 bit RGB frame from /dev/urandom, converts it to NV12
   100 times with the C routine and 100 times with the routine selected by
   a8r8g8b8_to_nv12_box_accel, prints the elapsed time reported by
   get_mstime() for each, and compares the two NV12 results byte for byte.

   Returns 0 when the outputs match, 1 when they differ or when a buffer
   cannot be allocated.  If the random data cannot be read completely,
   "error" is printed and the run continues with the unread part of the
   frame left as zeros. */
int main(int argc, char** argv)
{
    enum
    {
        FRAME_WIDTH = 1920,
        FRAME_HEIGHT = 1080,
        RGB_BYTES = FRAME_WIDTH * FRAME_HEIGHT * 4,
        YUV_BYTES = FRAME_WIDTH * FRAME_HEIGHT * 2,     /* allocation size */
        NV12_BYTES = FRAME_WIDTH * FRAME_HEIGHT * 3 / 2, /* bytes compared */
        LOOPS = 100
    };
    int index;
    int offset = 0;
    int fd;
    int got = 0;
    int stime;
    int etime;
    int ret = 0;
    char* rgb_data;
    char* yuv_data1;
    char* yuv_data2;
    char* al_rgb_data;
    char* al_yuv_data1;
    char* al_yuv_data2;

    (void) argv;
    if (argc == 1)
    {
        return output_params();
    }

    /* 16 extra bytes leave room for AL() to round up to a 16 byte boundary.
       The RGB buffer is zeroed so a failed or short read never leaves the
       converters reading uninitialized memory. */
    rgb_data = (char*)calloc(1, RGB_BYTES + 16);
    yuv_data1 = (char*)malloc(YUV_BYTES + 16);
    yuv_data2 = (char*)malloc(YUV_BYTES + 16);
    if (rgb_data == NULL || yuv_data1 == NULL || yuv_data2 == NULL)
    {
        printf("out of memory\n");
        ret = 1;
        goto cleanup;
    }
    al_rgb_data = AL(rgb_data);
    al_yuv_data1 = AL(yuv_data1);
    al_yuv_data2 = AL(yuv_data2);

    fd = open("/dev/urandom", O_RDONLY);
    if (fd >= 0)
    {
        /* read() may return fewer bytes than requested; keep going until
           the frame is full or the read fails */
        while (got < RGB_BYTES)
        {
            ssize_t rv = read(fd, al_rgb_data + got, RGB_BYTES - got);

            if (rv <= 0)
            {
                break;
            }
            got += (int) rv;
        }
        close(fd);
    }
    if (got != RGB_BYTES)
    {
        printf("error\n");
    }

    stime = get_mstime();
    for (index = 0; index < LOOPS; index++)
    {
        a8r8g8b8_to_nv12_box(al_rgb_data, FRAME_WIDTH * 4,
                             al_yuv_data1, FRAME_WIDTH,
                             al_yuv_data1 + FRAME_WIDTH * FRAME_HEIGHT,
                             FRAME_WIDTH, FRAME_WIDTH, FRAME_HEIGHT);
    }
    etime = get_mstime();
    printf("a8r8g8b8_to_nv12_box took %d\n", etime - stime);

    stime = get_mstime();
    for (index = 0; index < LOOPS; index++)
    {
        a8r8g8b8_to_nv12_box_accel(al_rgb_data, FRAME_WIDTH * 4,
                                   al_yuv_data2, FRAME_WIDTH,
                                   al_yuv_data2 + FRAME_WIDTH * FRAME_HEIGHT,
                                   FRAME_WIDTH, FRAME_WIDTH, FRAME_HEIGHT);
    }
    etime = get_mstime();
    printf("a8r8g8b8_to_nv12_box_x86_sse2 took %d\n", etime - stime);

    if (lmemcmp(al_yuv_data1, al_yuv_data2, NV12_BYTES, &offset) != 0)
    {
        ret = 1;
        printf("no match at offset %d\n", offset);
        printf("first\n");
        hexdump(al_yuv_data1 + offset, 16);
        printf("second\n");
        hexdump(al_yuv_data2 + offset, 16);
    }
    else
    {
        printf("match\n");
    }

cleanup:
    free(rgb_data);
    free(yuv_data1);
    free(yuv_data2);
    return ret;
}
340