1 /*
2 * * Copyright (C) 2006-2011 Anders Brander <anders@brander.dk>,
3 * * Anders Kvist <akv@lnxbx.dk> and Klaus Post <klauspost@gmail.com>
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; either version 2
8 * of the License, or (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 */
19
20 #include "floatplanarimage.h"
21
22 namespace RawStudio {
23 namespace FFTFilter {
24
25 #if defined (__i386__) || defined (__x86_64__)
26
27 #if defined (__x86_64__)
28
29 // Only 64 bits, and only if pixelsize is 4
/**
 * Convert one job's worth of rows from interleaved 16-bit RGB(X) into the
 * three planar float channels Y/Cb/Cr (p[0]/p[1]/p[2]), applying per-channel
 * white-balance correction and an approximate square root (a gamma-like
 * transfer the matching pack routine undoes by squaring).
 *
 * 64-bit only (uses xmm8..xmm15) and only valid when the source pixel size
 * is 4 gushorts (the loop consumes 32 bytes = 4 pixels per iteration).
 *
 * @param j  conversion job: source image plus the [start_y, end_y) row range.
 */
void FloatPlanarImage::unpackInterleavedYUV_SSE2( const ImgConvertJob* j )
{
  RS_IMAGE16* image = j->rs;
  // Constant table is built in-place at the start of plane 0's buffer:
  //   temp[0..3]   per-channel gain {redCorrection, 1, blueCorrection, 0},
  //                laid out to match the r,g,b,x word order of a pixel
  //   temp[4..15]  RGB->Y weights   (each replicated 4x for mulps)
  //   temp[16..27] RGB->Cb weights  (ITU-R BT.601-style coefficients)
  //   temp[28..39] RGB->Cr weights
  // NOTE(review): assumes these 40 floats are not overwritten by the Y rows
  // stored below (rows start at getAt(ox, y+oy)) — confirm the ox/oy border
  // keeps the scratch area clear until the constants have been consumed.
  float* temp = p[0]->data;
  temp[0] = redCorrection; temp[1] = 1.0f; temp[2] = blueCorrection; temp[3] = 0.0f;
  for (int i = 0; i < 4; i++) {
    temp[i+4] = (0.299); //r->Y
    temp[i+8] = (0.587); //g->Y
    temp[i+12] = (0.114); //b->Y

    temp[i+16] = (-0.169); //r->Cb
    temp[i+20] = (-0.331); //g->Cb
    temp[i+24] = (0.499); //b->Cb

    temp[i+28] = (0.499); //r->Cr
    temp[i+32] = (-0.418); //g->Cr
    temp[i+36] = (-0.0813); //b->Cr
  }

  // Preload the correction gains once; xmm15 must survive untouched until the
  // row loop's asm below. NOTE(review): nothing tells the compiler xmm15 is
  // live across separate asm statements (no clobber is declared) — this works
  // only because GCC does not allocate xmm registers between them; fragile.
  asm volatile
  (
    "movaps 0(%0), %%xmm15\n" // per-channel correction {redCorr, 1, blueCorr, 0}
    : // no output registers
    : "r" (temp)
    : // %0
  );
  for (int y = j->start_y; y < j->end_y; y++ ) {
    const gushort* pix = GET_PIXEL(image,0,y);
    gfloat *Y = p[0]->getAt(ox, y+oy);
    gfloat *Cb = p[1]->getAt(ox, y+oy);
    gfloat *Cr = p[2]->getAt(ox, y+oy);
    // Round width up to a multiple of 4 pixels; the trailing partial group is
    // processed in full, so all buffers must be padded/aligned to 16 bytes.
    gint w = (3+image->w) >>2;
    asm volatile
    (
      // NOTE(review): non-local label — will collide if this asm is ever
      // emitted twice in one TU (inlining/cloning); local labels (1:/1b)
      // would be safer.
      "unpack_next_pixel:\n"
      "movaps (%0), %%xmm0\n" // Load xx,b1,g1,r1,xx,b0,g0,r0
      "movaps 16(%0), %%xmm2\n" // Load xx,b3,g3,r3,xx,b2,g2,r2
      "prefetchnta 64(%0)\n" // Prefetch next
      "pxor %%xmm5,%%xmm5\n" // zero, for zero-extending words to dwords
      "movaps %%xmm0, %%xmm1\n"
      "movaps %%xmm2, %%xmm3\n"

      "punpcklwd %%xmm5,%%xmm0\n" //00xx 00b0 00g0 00r0
      "punpckhwd %%xmm5,%%xmm1\n" //00xx 00b1 00g1 00r1
      "punpcklwd %%xmm5,%%xmm2\n" //00xx 00b2 00g2 00r2
      "punpckhwd %%xmm5,%%xmm3\n" //00xx 00b3 00g3 00r3

      "cvtdq2ps %%xmm0, %%xmm0\n" // doubleword to float
      "cvtdq2ps %%xmm1, %%xmm1\n"
      "cvtdq2ps %%xmm2, %%xmm2\n" // doubleword to float
      "cvtdq2ps %%xmm3, %%xmm3\n"

      "mulps %%xmm15, %%xmm0\n" // Multiply by redcorrection/bluecorrection
      "mulps %%xmm15, %%xmm1\n" // Multiply by redcorrection/bluecorrection
      "mulps %%xmm15, %%xmm2\n" // Multiply by redcorrection/bluecorrection
      "mulps %%xmm15, %%xmm3\n" // Multiply by redcorrection/bluecorrection

      // Approximate sqrt(x) as rcp(rsqrt(x)): two fast low-precision
      // instructions instead of the slow exact sqrtps.
      "rsqrtps %%xmm0, %%xmm0\n" // 1 / sqrt()
      "rsqrtps %%xmm1, %%xmm1\n"
      "rsqrtps %%xmm2, %%xmm2\n"
      "rsqrtps %%xmm3, %%xmm3\n"

      "rcpps %%xmm0, %%xmm0\n" // sqrt
      "rcpps %%xmm1, %%xmm1\n" // sqrt
      "rcpps %%xmm2, %%xmm2\n" // sqrt
      "rcpps %%xmm3, %%xmm3\n" // sqrt

      // Transpose from 4x interleaved (r,g,b,x) to one register per channel.
      "movaps %%xmm0, %%xmm5\n"
      "movaps %%xmm2, %%xmm7\n"
      "unpcklps %%xmm1, %%xmm0\n" //g1 g0 r1 r0
      "unpcklps %%xmm3, %%xmm2\n" //g3 g2 r3 r2

      "movaps %%xmm0, %%xmm4\n" //g1 g0 r1 r0
      "movlhps %%xmm2, %%xmm0\n" //r3 r2 r1 r0
      "movhlps %%xmm4, %%xmm2\n" //g3 g2 g1 g0

      "unpckhps %%xmm1, %%xmm5\n" //xx xx b1 b0
      "unpckhps %%xmm3, %%xmm7\n" //xx xx b3 b2
      "movlhps %%xmm7, %%xmm5\n" //b3 b2 b1 b0

      "movaps %%xmm2, %%xmm1\n" // Green in xmm1
      "movaps %%xmm2, %%xmm4\n" // Green (copy) in xmm4
      "movaps %%xmm5, %%xmm2\n" // Blue in xmm2
      "movaps %%xmm0, %%xmm3\n" // Red (copy) in xmm3

      // Coefficient offsets into temp: 16=r->Y, 32=g->Y, 48=b->Y, 64..96=Cb,
      // 112..144=Cr (matches the table built above, 16 bytes per weight).
      "mulps 16(%5), %%xmm3\n" // R->Y
      "mulps 32(%5), %%xmm4\n" // G->Y
      "mulps 48(%5), %%xmm5\n" // B->Y

      "movaps %%xmm0, %%xmm6\n" // Red (copy) in xmm6
      "movaps %%xmm1, %%xmm7\n" // Green (copy) in xmm7
      "movaps %%xmm2, %%xmm8\n" // Blue (copy) in xmm8

      "mulps 64(%5), %%xmm0\n" // R->Cb
      "mulps 80(%5), %%xmm1\n" // G->Cb
      "mulps 96(%5), %%xmm2\n" // B->Cb

      "addps %%xmm4, %%xmm3\n" // Add Y
      "addps %%xmm1, %%xmm0\n" // Add Cb

      "mulps 112(%5), %%xmm6\n" // R->Cr
      "mulps 128(%5), %%xmm7\n" // G->Cr
      "mulps 144(%5), %%xmm8\n" // B->Cr

      "addps %%xmm5, %%xmm3\n" // Add Y (finished)
      "addps %%xmm2, %%xmm0\n" // Add Cb (finished)
      "addps %%xmm7, %%xmm6\n" // Add Cr
      "addps %%xmm8, %%xmm6\n" // Add Cr (finished)

      // Non-temporal stores: planes are written once and not re-read soon,
      // so bypass the cache. Paired with the sfence after the row loop.
      "movntdq %%xmm3, (%1)\n" // Store Y
      "movntdq %%xmm0, (%2)\n" // Store Cb
      "movntdq %%xmm6, (%3)\n" // Store Cr

      "add $32, %0\n" // 4 source pixels * 4 words
      "add $16, %1\n" // 4 floats per plane
      "add $16, %2\n"
      "add $16, %3\n"
      "dec %4\n"
      "jnz unpack_next_pixel\n"
      : // no output registers
      : "r" (pix), "r" (Y), "r" (Cb), "r" (Cr), "r" (w), "r" (temp)
      // %0 %1 %2 %3 %4 %5
      // NOTE(review): clobber list looks stale — %rax/%rbx/%rcx are never
      // referenced, while the xmm0-8 registers and written memory are
      // clobbered but not declared ("memory" is missing). Verify against a
      // modern GCC before touching optimization flags.
      : "%rax", "%rbx", "%rcx"
    );
  }
  // emms is an MMX-state instruction (no MMX is used here — presumably kept
  // defensively); sfence orders the movntdq non-temporal stores above.
  asm volatile ( "emms\nsfence\n" );

}
158 #endif // defined (__x86_64__)
159
160 #if defined (__x86_64__)
161
/**
 * Inverse of unpackInterleavedYUV_SSE2 (64-bit variant): convert the planar
 * float Y/Cb/Cr channels back into interleaved 16-bit RGB(X), squaring each
 * value to undo the approximate sqrt applied during unpacking and dividing
 * out the white-balance correction.
 *
 * Float->ushort conversion uses the classic SSE2 unsigned-saturation trick:
 * subtract 32768 as dwords, packssdw (signed saturation), then xor 0x8000 to
 * shift back into unsigned range — packusdw does not exist before SSE4.1.
 *
 * @param j  conversion job: destination image plus the [start_y, end_y) rows.
 */
void FloatPlanarImage::packInterleavedYUV_SSE2( const ImgConvertJob* j)
{
  RS_IMAGE16* image = j->rs;
  // Constant table built in plane 0's buffer (see the unpack routine for the
  // same aliasing caveat): inverse BT.601-style coefficients, inverse gains,
  // and the two integer constants for the saturation trick above.
  float* temp = p[0]->data;
  for (int i = 0; i < 4; i++) {
    temp[i] = 1.402f; // Cr to r
    temp[i+4] = -0.714f; // Cr to g
    temp[i+8] = -0.344f; // Cb to g
    temp[i+12] = 1.772f; // Cb to b
    temp[i+16] = (1.0f/redCorrection); // Red correction
    temp[i+20] = (1.0f/blueCorrection); // Blue correction
    *((gint*)&temp[i+24]) = 32768; // Subtract
    *((guint*)&temp[i+28]) = 0x80008000; // xor sign shift
  }

  // Preload all constants into high xmm registers once per job.
  // NOTE(review): relies on xmm7-xmm15 staying untouched by compiler-generated
  // code between this asm and the loop asm below — no clobbers declared.
  asm volatile
  (
    "movaps (%0), %%xmm10\n" // Cr to r
    "movaps 16(%0), %%xmm11\n" // Cr to g
    "movaps 32(%0), %%xmm12\n" // Cb to g
    "movaps 48(%0), %%xmm13\n" // Cb to b
    "movaps 64(%0), %%xmm14\n" // Red Correction
    "movaps 80(%0), %%xmm15\n" // Blue Correction
    "movaps 96(%0), %%xmm9\n" // dword 32768 (saturation bias)
    "pxor %%xmm8, %%xmm8\n" // Zero
    "movaps 112(%0), %%xmm7\n" // word 0x8000 (sign-shift mask)
    : // no output registers
    : "r" (temp)
    : // %0
  );
  for (int y = j->start_y; y < j->end_y; y++ ) {
    gfloat *Y = p[0]->getAt(ox, y+oy);
    gfloat *Cb = p[1]->getAt(ox, y+oy);
    gfloat *Cr = p[2]->getAt(ox, y+oy);
    gushort* out = GET_PIXEL(image,0,y);
    // Width rounded up to whole 4-pixel groups; buffers must be padded.
    guint n = (image->w+3)>>2;
    asm volatile
    (
      "loopback_YUV_SSE2_64:" // NOTE(review): non-local label, see unpack
      "movaps (%2), %%xmm1\n" // xmm1: Cb (4 pixels)
      "movaps (%3), %%xmm2\n" // xmm2: Cr
      "movaps (%1), %%xmm0\n" // xmm0: Y
      "movaps %%xmm1, %%xmm3\n" // xmm3: Cb
      "movaps %%xmm2, %%xmm4\n" // xmm4: Cr
      "mulps %%xmm12, %%xmm1\n" // xmm1: Cb for green
      "mulps %%xmm11, %%xmm2\n" // xmm2: Cr for green
      "addps %%xmm0, %%xmm1\n" // xmm1: Add Y for green
      "mulps %%xmm13, %%xmm3\n" // xmm3: Cb for blue
      "mulps %%xmm10, %%xmm4\n" // xmm4: Cr for red
      "addps %%xmm2, %%xmm1\n" // Green ready in xmm1
      "addps %%xmm0, %%xmm3\n" // Add Y to blue
      "addps %%xmm0, %%xmm4\n" // Add Y to red - xmm 0 free
      "mulps %%xmm1, %%xmm1\n" // Square green (undo unpack's sqrt)
      "cvtps2dq %%xmm1, %%xmm1\n" // Convert green to dwords
      "mulps %%xmm3, %%xmm3\n" // Square blue
      "mulps %%xmm4, %%xmm4\n" // Square red
      "mulps %%xmm15, %%xmm3\n" // Multiply blue correction - maybe not needed later
      "mulps %%xmm14, %%xmm4\n" // Multiply red correction - maybe not needed later
      "psubd %%xmm9, %%xmm1\n" // g = g - 32768 ( to avoid saturation)
      "cvtps2dq %%xmm3, %%xmm3\n" // Convert blue to dwords
      "packssdw %%xmm1,%%xmm1\n" // g3g2 g1g0 g3g2 g1g0
      "cvtps2dq %%xmm4, %%xmm4\n" // Convert red to dwords
      "pxor %%xmm7, %%xmm1\n" // Shift sign
      "psubd %%xmm9, %%xmm3\n" // b = b - 32768 ( to avoid saturation)
      "psubd %%xmm9, %%xmm4\n" // r = r - 32768 ( to avoid saturation)
      "packssdw %%xmm3,%%xmm3\n" // b3b2 b1b0 b3b2 b1b0
      "packssdw %%xmm4,%%xmm4\n" // r3r2 r1r0 r3r2 r1r0
      "pxor %%xmm7, %%xmm3\n" // Shift sign (b)
      "pxor %%xmm7, %%xmm4\n" // Shift sign (r)
      "punpcklwd %%xmm1, %%xmm4\n" // g3r3 g2r2 g1r1 g0r0
      "punpcklwd %%xmm8, %%xmm3\n" // 00b3 00b2 00b1 00b0
      "movdqa %%xmm4, %%xmm0\n" // Copy r&g
      "punpckldq %%xmm3, %%xmm4\n" // Interleave lower blue into reg&green in xmm4 Now 00b1 g1r1 00b0 g0r0
      "punpckhdq %%xmm3, %%xmm0\n" // Interleave higher blue into reg&green in xmm0 Now 00b3 g3r3 00b2 g2r2
      "movntdq %%xmm4, (%0)\n" // Store low pixels (non-temporal, see sfence)
      "movntdq %%xmm0, 16(%0)\n" // Store high pixels
      "add $32, %0\n" // 4 output pixels * 8 bytes
      "add $16, %1\n" // 4 floats per plane
      "add $16, %2\n"
      "add $16, %3\n"
      "dec %4\n"
      "jnz loopback_YUV_SSE2_64\n"
      : // no output registers
      : "r" (out), "r" (Y), "r" (Cb), "r" (Cr), "r"(n)
      // %0 %1 %2 %3 %4
      // NOTE(review): xmm0-xmm4 and output memory are clobbered but not
      // declared; "memory" clobber is missing.
    );
  }
  // emms for MMX state (none used — defensive); sfence orders the
  // non-temporal movntdq stores above.
  asm volatile ( "emms\nsfence\n" );
}
251
252 #if 0
/**
 * Disabled (#if 0) SSE4.1 variant of packInterleavedYUV_SSE2. With packusdw
 * available, float->ushort conversion saturates to unsigned directly, so the
 * subtract-32768 / packssdw / xor-0x8000 trick of the SSE2 path is dropped
 * (note: no 32768 or 0x8000 constants in the table below). Otherwise the
 * data flow is identical to the SSE2 routine. Kept for reference / future
 * runtime CPU dispatch — do not enable without adding an SSE4.1 check.
 */
void FloatPlanarImage::packInterleavedYUV_SSE4( const ImgConvertJob* j)
{
  RS_IMAGE16* image = j->rs;
  // Inverse YCbCr coefficients and inverse white-balance gains, replicated
  // 4x per lane; built in plane 0's buffer as in the SSE2 path.
  float* temp = p[0]->data;
  for (int i = 0; i < 4; i++) {
    temp[i] = 1.402f; // Cr to r
    temp[i+4] = -0.714f; // Cr to g
    temp[i+8] = -0.344f; // Cb to g
    temp[i+12] = 1.772f; // Cb to b
    temp[i+16] = (1.0f/redCorrection); // Red correction
    temp[i+20] = (1.0f/blueCorrection); // Blue correction
  }

  // Preload constants; same cross-asm register-liveness caveat as the SSE2
  // routine (no clobbers declared for xmm10-xmm15).
  asm volatile
  (
    "movaps (%0), %%xmm10\n" // Cr to r
    "movaps 16(%0), %%xmm11\n" // Cr to g
    "movaps 32(%0), %%xmm12\n" // Cb to g
    "movaps 48(%0), %%xmm13\n" // Cb to b
    "movaps 64(%0), %%xmm14\n" // Red Correction
    "movaps 80(%0), %%xmm15\n" // Blue Correction
    : // no output registers
    : "r" (temp)
    : // %0
  );
  for (int y = j->start_y; y < j->end_y; y++ ) {
    gfloat *Y = p[0]->getAt(ox, y+oy);
    gfloat *Cb = p[1]->getAt(ox, y+oy);
    gfloat *Cr = p[2]->getAt(ox, y+oy);
    gushort* out = GET_PIXEL(image,0,y);
    // Width rounded up to whole 4-pixel groups; buffers must be padded.
    guint n = (image->w+3)>>2;
    asm volatile
    (
      "loopback_YUV_SSE4_64:"
      "movaps (%2), %%xmm1\n" // xmm1: Cb (4 pixels)
      "movaps (%3), %%xmm2\n" // xmm2: Cr
      "movaps (%1), %%xmm0\n" // xmm0: Y
      "movaps %%xmm1, %%xmm3\n" // xmm3: Cb
      "movaps %%xmm2, %%xmm4\n" // xmm4: Cr
      "mulps %%xmm12, %%xmm1\n" // xmm1: Cb for green
      "mulps %%xmm11, %%xmm2\n" // xmm2: Cr for green
      "addps %%xmm0, %%xmm1\n" // xmm1: Add Y for green
      "mulps %%xmm13, %%xmm3\n" // xmm3: Cb for blue
      "mulps %%xmm10, %%xmm4\n" // xmm4: Cr for red
      "addps %%xmm2, %%xmm1\n" // Green ready in xmm1
      "addps %%xmm0, %%xmm3\n" // Add Y to blue
      "addps %%xmm0, %%xmm4\n" // Add Y to red - xmm 0 free
      "mulps %%xmm1, %%xmm1\n" // Square green (undo unpack's sqrt)
      "mulps %%xmm3, %%xmm3\n" // Square blue
      "mulps %%xmm4, %%xmm4\n" // Square red
      "cvtps2dq %%xmm1, %%xmm1\n" // Convert green to dwords
      "mulps %%xmm15, %%xmm3\n" // Multiply blue correction - maybe not needed later
      "mulps %%xmm14, %%xmm4\n" // Multiply red correction - maybe not needed later
      "cvtps2dq %%xmm4, %%xmm4\n" // Convert red to dwords
      "cvtps2dq %%xmm3, %%xmm3\n" // Convert blue to dwords
      "packusdw %%xmm1, %%xmm1\n" // green g3g2 g1g0 g3g2 g1g0 (SSE4.1 unsigned saturation)
      "packusdw %%xmm3, %%xmm3\n" // blue
      "packusdw %%xmm4, %%xmm4\n" // red
      "pxor %%xmm0,%%xmm0\n" // Not really needed, but almost a no-op, so we play nice
      "punpcklwd %%xmm1,%%xmm4\n" // red + green interleaved g3r3 g2r2 g1r1 g0r0
      "punpcklwd %%xmm0,%%xmm3\n" // blue zero interleaved 00b3 00b2 00b1 00b0
      "movdqa %%xmm4, %%xmm1\n" // Copy r+g
      "punpckldq %%xmm3,%%xmm4\n" // interleave r+g and blue low
      "punpckhdq %%xmm3,%%xmm1\n" // interleave r+g and blue high

      "movntdq %%xmm4, (%0)\n" // Store low pixels
      "movntdq %%xmm1, 16(%0)\n" // Store high pixels
      "add $32, %0\n"
      "add $16, %1\n"
      "add $16, %2\n"
      "add $16, %3\n"
      "dec %4\n"
      "jnz loopback_YUV_SSE4_64\n"
      : // no output registers
      : "r" (out), "r" (Y), "r" (Cb), "r" (Cr), "r"(n)
      : // %0 %1 %2 %3 %4
    );
  }
  // emms (defensive; no MMX used) + sfence for the non-temporal stores.
  asm volatile ( "emms\nsfence\n" );
}
333 #endif
334
335 #else // 32 bits
336
/**
 * 32-bit (i386) variant of packInterleavedYUV_SSE2: planar float Y/Cb/Cr back
 * to interleaved 16-bit RGB(X), squaring to undo the unpack sqrt and dividing
 * out the white-balance gains. Same math as the 64-bit path, but only
 * xmm0-xmm7 exist here, so:
 *  - the YCbCr coefficients and gains stay in memory and are re-read with
 *    mulps mem operands each iteration instead of living in registers;
 *  - the loop counter cannot get a register and lives in temp[28]
 *    (decremented by "decl 112(%4)" below);
 *  - constants go in a stack-local aligned array, not in plane 0's buffer.
 */
void FloatPlanarImage::packInterleavedYUV_SSE2( const ImgConvertJob* j)
{
  RS_IMAGE16* image = j->rs;
  // 16-byte alignment required: every access below uses movaps/movdqa.
  float temp[32] __attribute__ ((aligned (16)));
  for (int i = 0; i < 4; i++) {
    temp[i] = 1.402f; // Cr to r
    temp[i+4] = -0.714f; // Cr to g
    temp[i+8] = -0.344f; // Cb to g
    temp[i+12] = 1.772f; // Cb to b
    temp[i+16] = (1.0f/redCorrection); // Red correction
    temp[i+20] = (1.0f/blueCorrection); // Blue correction
    *((gint*)&temp[i+24]) = 32768; // Subtract
    *((guint*)&temp[i+28]) = 0x80008000; // xor sign shift
  }
  // Loop counter slot at temp[28] (byte offset 112), reused after xmm5 has
  // been loaded from it below. NOTE(review): the int*/float* cast violates
  // strict aliasing; works with the flags this project builds with.
  int* itemp = (int*)(&temp[28]);

  // Preload the three constants that do fit in registers. Same caveat as the
  // 64-bit path: xmm5/xmm6/xmm7 must survive compiler-generated code between
  // this asm and the loop asm — no clobbers are declared.
  asm volatile
  (
    "movaps 96(%0), %%xmm7\n" // dword 32768 (saturation bias)
    "movaps 112(%0), %%xmm5\n" // word 0x8000 (sign-shift mask)
    "pxor %%xmm6, %%xmm6\n" // Zero
    : // no output registers
    : "r" (temp)
    : // %0
  );
  for (int y = j->start_y; y < j->end_y; y++ ) {
    gfloat *Y = p[0]->getAt(ox, y+oy);
    gfloat *Cb = p[1]->getAt(ox, y+oy);
    gfloat *Cr = p[2]->getAt(ox, y+oy);
    gushort* out = GET_PIXEL(image,0,y);
    // Row counter: width rounded up to whole 4-pixel groups, stored in the
    // temp slot (overwrites the 0x80008000 words already held in xmm5).
    itemp[0] = (image->w+3)>>2;
    asm volatile
    (
      "loopback_YUV_SSE2_32:" // NOTE(review): non-local label, see 64-bit path
      "movaps (%2), %%xmm1\n" // xmm1: Cb (4 pixels)
      "movaps (%3), %%xmm2\n" // xmm2: Cr
      "movaps (%1), %%xmm0\n" // xmm0: Y
      "movaps %%xmm1, %%xmm3\n" // xmm3: Cb
      "movaps %%xmm2, %%xmm4\n" // xmm4: Cr
      "mulps 32(%4), %%xmm1\n" // xmm1: Cb for green
      "mulps 16(%4), %%xmm2\n" // xmm2: Cr for green
      "addps %%xmm0, %%xmm1\n" // xmm1: Add Y for green
      "mulps 48(%4), %%xmm3\n" // xmm3: Cb for blue
      "mulps (%4), %%xmm4\n" // xmm4: Cr for red
      "addps %%xmm2, %%xmm1\n" // Green ready in xmm1
      "addps %%xmm0, %%xmm3\n" // Add Y to blue
      "addps %%xmm0, %%xmm4\n" // Add Y to red - xmm 0 free
      "mulps %%xmm1, %%xmm1\n" // Square green (undo unpack's sqrt)
      "mulps %%xmm3, %%xmm3\n" // Square blue
      "mulps %%xmm4, %%xmm4\n" // Square red
      "cvtps2dq %%xmm1, %%xmm1\n" // Convert green to dwords
      "mulps 80(%4), %%xmm3\n" // Multiply blue correction - maybe not needed later
      "mulps 64(%4), %%xmm4\n" // Multiply red correction - maybe not needed later
      "psubd %%xmm7, %%xmm1\n" // g = g - 32768 ( to avoid saturation)
      "cvtps2dq %%xmm3, %%xmm3\n" // Convert blue to dwords
      "packssdw %%xmm1,%%xmm1\n" // g3g2 g1g0 g3g2 g1g0
      "cvtps2dq %%xmm4, %%xmm4\n" // Convert red to dwords
      "pxor %%xmm5, %%xmm1\n" // Shift sign
      "psubd %%xmm7, %%xmm3\n" // b = b - 32768 ( to avoid saturation)
      "psubd %%xmm7, %%xmm4\n" // r = r - 32768 ( to avoid saturation)
      "packssdw %%xmm3,%%xmm3\n" // b3b2 b1b0 b3b2 b1b0
      "packssdw %%xmm4,%%xmm4\n" // r3r2 r1r0 r3r2 r1r0
      "pxor %%xmm5, %%xmm3\n" // Shift sign (b)
      "pxor %%xmm5, %%xmm4\n" // Shift sign (r)
      "punpcklwd %%xmm1, %%xmm4\n" // g3r3 g2r2 g1r1 g0r0
      "punpcklwd %%xmm6, %%xmm3\n" // 00b3 00b2 00b1 00b0
      "movdqa %%xmm4, %%xmm0\n" // Copy r&g
      "punpckldq %%xmm3, %%xmm4\n" // Interleave lower blue into reg&green in xmm4 Now 00b1 g1r1 00b0 g0r0
      "punpckhdq %%xmm3, %%xmm0\n" // Interleave higher blue into reg&green in xmm0 Now 00b3 g3r3 00b2 g2r2

      // Plain (cached) stores here, unlike the 64-bit path's movntdq —
      // hence no sfence is needed after the loop.
      "movdqa %%xmm4, (%0)\n" // Store low pixels
      "movdqa %%xmm0, 16(%0)\n" // Store high pixels
      "add $32, %0\n"
      "add $16, %1\n"
      "add $16, %2\n"
      "add $16, %3\n"
      "decl 112(%4)\n" // decrement in-memory loop counter (temp[28])
      "jnz loopback_YUV_SSE2_32\n"
      "emms\n" // defensive MMX-state clear, executed once per row
      : // no output registers
      : "r" (out), "r" (Y), "r" (Cb), "r" (Cr), "r"(temp)
      // %0 %1 %2 %3 %4
      // NOTE(review): xmm0-xmm4 and written memory are clobbered but not
      // declared; "memory" clobber is missing.
    );
  }
}
422
423 #endif
424
425 #endif // defined (__i386__) || defined (__x86_64__)
426
427 }}// namespace RawStudio::FFTFilter
428