/*
 *  Copyright 2015 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/rotate_row.h"
#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))

// Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit.
#if defined(HAS_TRANSPOSEWX8_SSSE3)
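// A scalar sketch of what the SSSE3 kernel below computes, equivalent in
// effect to libyuv's TransposeWx8_C: an 8-row by `width`-column tile is
// transposed so that dst[x * dst_stride + y] = src[y * src_stride + x].
// Illustrative only, not part of the original file, hence compiled out.
#if 0
static void TransposeWx8_C_Sketch(const uint8_t* src,
                                  int src_stride,
                                  uint8_t* dst,
                                  int dst_stride,
                                  int width) {
  int x, y;
  for (x = 0; x < width; ++x) {
    for (y = 0; y < 8; ++y) {
      dst[x * dst_stride + y] = src[y * src_stride + x];
    }
  }
}
#endif
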
void TransposeWx8_SSSE3(const uint8_t* src,
                        int src_stride,
                        uint8_t* dst,
                        int dst_stride,
                        int width) {
  asm volatile(
      // Read in the data from the source pointer.
      // First round of bit swap.
      LABELALIGN
      "1:                                        \n"
      "movq        (%0),%%xmm0                   \n"
      "movq        (%0,%3),%%xmm1                \n"
      "lea         (%0,%3,2),%0                  \n"
      "punpcklbw   %%xmm1,%%xmm0                 \n"
      "movq        (%0),%%xmm2                   \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "palignr     $0x8,%%xmm1,%%xmm1            \n"
      "movq        (%0,%3),%%xmm3                \n"
      "lea         (%0,%3,2),%0                  \n"
      "punpcklbw   %%xmm3,%%xmm2                 \n"
      "movdqa      %%xmm2,%%xmm3                 \n"
      "movq        (%0),%%xmm4                   \n"
      "palignr     $0x8,%%xmm3,%%xmm3            \n"
      "movq        (%0,%3),%%xmm5                \n"
      "lea         (%0,%3,2),%0                  \n"
      "punpcklbw   %%xmm5,%%xmm4                 \n"
      "movdqa      %%xmm4,%%xmm5                 \n"
      "movq        (%0),%%xmm6                   \n"
      "palignr     $0x8,%%xmm5,%%xmm5            \n"
      "movq        (%0,%3),%%xmm7                \n"
      "lea         (%0,%3,2),%0                  \n"
      "punpcklbw   %%xmm7,%%xmm6                 \n"
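      // Rewind src by 8 rows (stride negated, then scaled by 8) and advance
      // 8 bytes to the next 8-column tile; the second neg below restores the
      // stride.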
      "neg         %3                            \n"
      "movdqa      %%xmm6,%%xmm7                 \n"
      "lea         0x8(%0,%3,8),%0               \n"
      "palignr     $0x8,%%xmm7,%%xmm7            \n"
      "neg         %3                            \n"
      // Second round of bit swap.
      "punpcklwd   %%xmm2,%%xmm0                 \n"
      "punpcklwd   %%xmm3,%%xmm1                 \n"
      "movdqa      %%xmm0,%%xmm2                 \n"
      "movdqa      %%xmm1,%%xmm3                 \n"
      "palignr     $0x8,%%xmm2,%%xmm2            \n"
      "palignr     $0x8,%%xmm3,%%xmm3            \n"
      "punpcklwd   %%xmm6,%%xmm4                 \n"
      "punpcklwd   %%xmm7,%%xmm5                 \n"
      "movdqa      %%xmm4,%%xmm6                 \n"
      "movdqa      %%xmm5,%%xmm7                 \n"
      "palignr     $0x8,%%xmm6,%%xmm6            \n"
      "palignr     $0x8,%%xmm7,%%xmm7            \n"
      // Third round of bit swap.
      // Write to the destination pointer.
      "punpckldq   %%xmm4,%%xmm0                 \n"
      "movq        %%xmm0,(%1)                   \n"
      "movdqa      %%xmm0,%%xmm4                 \n"
      "palignr     $0x8,%%xmm4,%%xmm4            \n"
      "movq        %%xmm4,(%1,%4)                \n"
      "lea         (%1,%4,2),%1                  \n"
      "punpckldq   %%xmm6,%%xmm2                 \n"
      "movdqa      %%xmm2,%%xmm6                 \n"
      "movq        %%xmm2,(%1)                   \n"
      "palignr     $0x8,%%xmm6,%%xmm6            \n"
      "punpckldq   %%xmm5,%%xmm1                 \n"
      "movq        %%xmm6,(%1,%4)                \n"
      "lea         (%1,%4,2),%1                  \n"
      "movdqa      %%xmm1,%%xmm5                 \n"
      "movq        %%xmm1,(%1)                   \n"
      "palignr     $0x8,%%xmm5,%%xmm5            \n"
      "movq        %%xmm5,(%1,%4)                \n"
      "lea         (%1,%4,2),%1                  \n"
      "punpckldq   %%xmm7,%%xmm3                 \n"
      "movq        %%xmm3,(%1)                   \n"
      "movdqa      %%xmm3,%%xmm7                 \n"
      "palignr     $0x8,%%xmm7,%%xmm7            \n"
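      // Decrement the column count before the final store: movq and lea do
      // not modify flags, so the jg at the bottom still tests this sub.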
      "sub         $0x8,%2                       \n"
      "movq        %%xmm7,(%1,%4)                \n"
      "lea         (%1,%4,2),%1                  \n"
      "jg          1b                            \n"
      : "+r"(src),                    // %0
        "+r"(dst),                    // %1
        "+r"(width)                   // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "r"((intptr_t)(dst_stride))   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // defined(HAS_TRANSPOSEWX8_SSSE3)

// Transpose 16x8. 64 bit only; uses the full xmm8-xmm15 register file.
#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
void TransposeWx8_Fast_SSSE3(const uint8_t* src,
                             int src_stride,
                             uint8_t* dst,
                             int dst_stride,
                             int width) {
  asm volatile(
      // Read in the data from the source pointer.
      // First round of bit swap.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      (%0,%3),%%xmm1                \n"
      "lea         (%0,%3,2),%0                  \n"
      "movdqa      %%xmm0,%%xmm8                 \n"
      "punpcklbw   %%xmm1,%%xmm0                 \n"
      "punpckhbw   %%xmm1,%%xmm8                 \n"
      "movdqu      (%0),%%xmm2                   \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "movdqa      %%xmm8,%%xmm9                 \n"
      "palignr     $0x8,%%xmm1,%%xmm1            \n"
      "palignr     $0x8,%%xmm9,%%xmm9            \n"
      "movdqu      (%0,%3),%%xmm3                \n"
      "lea         (%0,%3,2),%0                  \n"
      "movdqa      %%xmm2,%%xmm10                \n"
      "punpcklbw   %%xmm3,%%xmm2                 \n"
      "punpckhbw   %%xmm3,%%xmm10                \n"
      "movdqa      %%xmm2,%%xmm3                 \n"
      "movdqa      %%xmm10,%%xmm11               \n"
      "movdqu      (%0),%%xmm4                   \n"
      "palignr     $0x8,%%xmm3,%%xmm3            \n"
      "palignr     $0x8,%%xmm11,%%xmm11          \n"
      "movdqu      (%0,%3),%%xmm5                \n"
      "lea         (%0,%3,2),%0                  \n"
      "movdqa      %%xmm4,%%xmm12                \n"
      "punpcklbw   %%xmm5,%%xmm4                 \n"
      "punpckhbw   %%xmm5,%%xmm12                \n"
      "movdqa      %%xmm4,%%xmm5                 \n"
      "movdqa      %%xmm12,%%xmm13               \n"
      "movdqu      (%0),%%xmm6                   \n"
      "palignr     $0x8,%%xmm5,%%xmm5            \n"
      "palignr     $0x8,%%xmm13,%%xmm13          \n"
      "movdqu      (%0,%3),%%xmm7                \n"
      "lea         (%0,%3,2),%0                  \n"
      "movdqa      %%xmm6,%%xmm14                \n"
      "punpcklbw   %%xmm7,%%xmm6                 \n"
      "punpckhbw   %%xmm7,%%xmm14                \n"
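      // Rewind src by 8 rows and advance 16 bytes to the next 16-column
      // tile, using the same negate/lea/negate pattern as the kernel above.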
      "neg         %3                            \n"
      "movdqa      %%xmm6,%%xmm7                 \n"
      "movdqa      %%xmm14,%%xmm15               \n"
      "lea         0x10(%0,%3,8),%0              \n"
      "palignr     $0x8,%%xmm7,%%xmm7            \n"
      "palignr     $0x8,%%xmm15,%%xmm15          \n"
      "neg         %3                            \n"
      // Second round of bit swap.
      "punpcklwd   %%xmm2,%%xmm0                 \n"
      "punpcklwd   %%xmm3,%%xmm1                 \n"
      "movdqa      %%xmm0,%%xmm2                 \n"
      "movdqa      %%xmm1,%%xmm3                 \n"
      "palignr     $0x8,%%xmm2,%%xmm2            \n"
      "palignr     $0x8,%%xmm3,%%xmm3            \n"
      "punpcklwd   %%xmm6,%%xmm4                 \n"
      "punpcklwd   %%xmm7,%%xmm5                 \n"
      "movdqa      %%xmm4,%%xmm6                 \n"
      "movdqa      %%xmm5,%%xmm7                 \n"
      "palignr     $0x8,%%xmm6,%%xmm6            \n"
      "palignr     $0x8,%%xmm7,%%xmm7            \n"
      "punpcklwd   %%xmm10,%%xmm8                \n"
      "punpcklwd   %%xmm11,%%xmm9                \n"
      "movdqa      %%xmm8,%%xmm10                \n"
      "movdqa      %%xmm9,%%xmm11                \n"
      "palignr     $0x8,%%xmm10,%%xmm10          \n"
      "palignr     $0x8,%%xmm11,%%xmm11          \n"
      "punpcklwd   %%xmm14,%%xmm12               \n"
      "punpcklwd   %%xmm15,%%xmm13               \n"
      "movdqa      %%xmm12,%%xmm14               \n"
      "movdqa      %%xmm13,%%xmm15               \n"
      "palignr     $0x8,%%xmm14,%%xmm14          \n"
      "palignr     $0x8,%%xmm15,%%xmm15          \n"
      // Third round of bit swap.
      // Write to the destination pointer.
      "punpckldq   %%xmm4,%%xmm0                 \n"
      "movq        %%xmm0,(%1)                   \n"
      "movdqa      %%xmm0,%%xmm4                 \n"
      "palignr     $0x8,%%xmm4,%%xmm4            \n"
      "movq        %%xmm4,(%1,%4)                \n"
      "lea         (%1,%4,2),%1                  \n"
      "punpckldq   %%xmm6,%%xmm2                 \n"
      "movdqa      %%xmm2,%%xmm6                 \n"
      "movq        %%xmm2,(%1)                   \n"
      "palignr     $0x8,%%xmm6,%%xmm6            \n"
      "punpckldq   %%xmm5,%%xmm1                 \n"
      "movq        %%xmm6,(%1,%4)                \n"
      "lea         (%1,%4,2),%1                  \n"
      "movdqa      %%xmm1,%%xmm5                 \n"
      "movq        %%xmm1,(%1)                   \n"
      "palignr     $0x8,%%xmm5,%%xmm5            \n"
      "movq        %%xmm5,(%1,%4)                \n"
      "lea         (%1,%4,2),%1                  \n"
      "punpckldq   %%xmm7,%%xmm3                 \n"
      "movq        %%xmm3,(%1)                   \n"
      "movdqa      %%xmm3,%%xmm7                 \n"
      "palignr     $0x8,%%xmm7,%%xmm7            \n"
      "movq        %%xmm7,(%1,%4)                \n"
      "lea         (%1,%4,2),%1                  \n"
      "punpckldq   %%xmm12,%%xmm8                \n"
      "movq        %%xmm8,(%1)                   \n"
      "movdqa      %%xmm8,%%xmm12                \n"
      "palignr     $0x8,%%xmm12,%%xmm12          \n"
      "movq        %%xmm12,(%1,%4)               \n"
      "lea         (%1,%4,2),%1                  \n"
      "punpckldq   %%xmm14,%%xmm10               \n"
      "movdqa      %%xmm10,%%xmm14               \n"
      "movq        %%xmm10,(%1)                  \n"
      "palignr     $0x8,%%xmm14,%%xmm14          \n"
      "punpckldq   %%xmm13,%%xmm9                \n"
      "movq        %%xmm14,(%1,%4)               \n"
      "lea         (%1,%4,2),%1                  \n"
      "movdqa      %%xmm9,%%xmm13                \n"
      "movq        %%xmm9,(%1)                   \n"
      "palignr     $0x8,%%xmm13,%%xmm13          \n"
      "movq        %%xmm13,(%1,%4)               \n"
      "lea         (%1,%4,2),%1                  \n"
      "punpckldq   %%xmm15,%%xmm11               \n"
      "movq        %%xmm11,(%1)                  \n"
      "movdqa      %%xmm11,%%xmm15               \n"
      "palignr     $0x8,%%xmm15,%%xmm15          \n"
      "sub         $0x10,%2                      \n"
      "movq        %%xmm15,(%1,%4)               \n"
      "lea         (%1,%4,2),%1                  \n"
      "jg          1b                            \n"
      : "+r"(src),                    // %0
        "+r"(dst),                    // %1
        "+r"(width)                   // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "r"((intptr_t)(dst_stride))   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
        "xmm15");
}
#endif  // defined(HAS_TRANSPOSEWX8_FAST_SSSE3)

// Transpose UV 8x8. 64 bit only; deinterleaves U to dst_a and V to dst_b.
#if defined(HAS_TRANSPOSEUVWX8_SSE2)
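// A scalar sketch of the kernel below, equivalent in effect to libyuv's
// TransposeUVWx8_C: the source rows hold interleaved UV pairs, and the
// transpose sends U bytes to dst_a and V bytes to dst_b. Illustrative only,
// hence compiled out.
#if 0
static void TransposeUVWx8_C_Sketch(const uint8_t* src,
                                    int src_stride,
                                    uint8_t* dst_a,
                                    int dst_stride_a,
                                    uint8_t* dst_b,
                                    int dst_stride_b,
                                    int width) {
  int x, y;
  for (x = 0; x < width; ++x) {
    for (y = 0; y < 8; ++y) {
      dst_a[x * dst_stride_a + y] = src[y * src_stride + x * 2];
      dst_b[x * dst_stride_b + y] = src[y * src_stride + x * 2 + 1];
    }
  }
}
#endif
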
void TransposeUVWx8_SSE2(const uint8_t* src,
                         int src_stride,
                         uint8_t* dst_a,
                         int dst_stride_a,
                         uint8_t* dst_b,
                         int dst_stride_b,
                         int width) {
  asm volatile(
      // Read in the data from the source pointer.
      // First round of bit swap.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      (%0,%4),%%xmm1                \n"
      "lea         (%0,%4,2),%0                  \n"
      "movdqa      %%xmm0,%%xmm8                 \n"
      "punpcklbw   %%xmm1,%%xmm0                 \n"
      "punpckhbw   %%xmm1,%%xmm8                 \n"
      "movdqa      %%xmm8,%%xmm1                 \n"
      "movdqu      (%0),%%xmm2                   \n"
      "movdqu      (%0,%4),%%xmm3                \n"
      "lea         (%0,%4,2),%0                  \n"
      "movdqa      %%xmm2,%%xmm8                 \n"
      "punpcklbw   %%xmm3,%%xmm2                 \n"
      "punpckhbw   %%xmm3,%%xmm8                 \n"
      "movdqa      %%xmm8,%%xmm3                 \n"
      "movdqu      (%0),%%xmm4                   \n"
      "movdqu      (%0,%4),%%xmm5                \n"
      "lea         (%0,%4,2),%0                  \n"
      "movdqa      %%xmm4,%%xmm8                 \n"
      "punpcklbw   %%xmm5,%%xmm4                 \n"
      "punpckhbw   %%xmm5,%%xmm8                 \n"
      "movdqa      %%xmm8,%%xmm5                 \n"
      "movdqu      (%0),%%xmm6                   \n"
      "movdqu      (%0,%4),%%xmm7                \n"
      "lea         (%0,%4,2),%0                  \n"
      "movdqa      %%xmm6,%%xmm8                 \n"
      "punpcklbw   %%xmm7,%%xmm6                 \n"
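      // Rewind src by 8 rows and advance 16 bytes (8 UV pairs) to the next
      // tile, negating the stride around the lea as in the kernels above.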
      "neg         %4                            \n"
      "lea         0x10(%0,%4,8),%0              \n"
      "punpckhbw   %%xmm7,%%xmm8                 \n"
      "movdqa      %%xmm8,%%xmm7                 \n"
      "neg         %4                            \n"
      // Second round of bit swap.
      "movdqa      %%xmm0,%%xmm8                 \n"
      "movdqa      %%xmm1,%%xmm9                 \n"
      "punpckhwd   %%xmm2,%%xmm8                 \n"
      "punpckhwd   %%xmm3,%%xmm9                 \n"
      "punpcklwd   %%xmm2,%%xmm0                 \n"
      "punpcklwd   %%xmm3,%%xmm1                 \n"
      "movdqa      %%xmm8,%%xmm2                 \n"
      "movdqa      %%xmm9,%%xmm3                 \n"
      "movdqa      %%xmm4,%%xmm8                 \n"
      "movdqa      %%xmm5,%%xmm9                 \n"
      "punpckhwd   %%xmm6,%%xmm8                 \n"
      "punpckhwd   %%xmm7,%%xmm9                 \n"
      "punpcklwd   %%xmm6,%%xmm4                 \n"
      "punpcklwd   %%xmm7,%%xmm5                 \n"
      "movdqa      %%xmm8,%%xmm6                 \n"
      "movdqa      %%xmm9,%%xmm7                 \n"
      // Third round of bit swap.
      // Write to the destination pointer.
      "movdqa      %%xmm0,%%xmm8                 \n"
      "punpckldq   %%xmm4,%%xmm0                 \n"
      "movlpd      %%xmm0,(%1)                   \n"  // Write back U channel
      "movhpd      %%xmm0,(%2)                   \n"  // Write back V channel
      "punpckhdq   %%xmm4,%%xmm8                 \n"
      "movlpd      %%xmm8,(%1,%5)                \n"
      "lea         (%1,%5,2),%1                  \n"
      "movhpd      %%xmm8,(%2,%6)                \n"
      "lea         (%2,%6,2),%2                  \n"
      "movdqa      %%xmm2,%%xmm8                 \n"
      "punpckldq   %%xmm6,%%xmm2                 \n"
      "movlpd      %%xmm2,(%1)                   \n"
      "movhpd      %%xmm2,(%2)                   \n"
      "punpckhdq   %%xmm6,%%xmm8                 \n"
      "movlpd      %%xmm8,(%1,%5)                \n"
      "lea         (%1,%5,2),%1                  \n"
      "movhpd      %%xmm8,(%2,%6)                \n"
      "lea         (%2,%6,2),%2                  \n"
      "movdqa      %%xmm1,%%xmm8                 \n"
      "punpckldq   %%xmm5,%%xmm1                 \n"
      "movlpd      %%xmm1,(%1)                   \n"
      "movhpd      %%xmm1,(%2)                   \n"
      "punpckhdq   %%xmm5,%%xmm8                 \n"
      "movlpd      %%xmm8,(%1,%5)                \n"
      "lea         (%1,%5,2),%1                  \n"
      "movhpd      %%xmm8,(%2,%6)                \n"
      "lea         (%2,%6,2),%2                  \n"
      "movdqa      %%xmm3,%%xmm8                 \n"
      "punpckldq   %%xmm7,%%xmm3                 \n"
      "movlpd      %%xmm3,(%1)                   \n"
      "movhpd      %%xmm3,(%2)                   \n"
      "punpckhdq   %%xmm7,%%xmm8                 \n"
      "sub         $0x8,%3                       \n"
      "movlpd      %%xmm8,(%1,%5)                \n"
      "lea         (%1,%5,2),%1                  \n"
      "movhpd      %%xmm8,(%2,%6)                \n"
      "lea         (%2,%6,2),%2                  \n"
      "jg          1b                            \n"
      : "+r"(src),                      // %0
        "+r"(dst_a),                    // %1
        "+r"(dst_b),                    // %2
        "+r"(width)                     // %3
      : "r"((intptr_t)(src_stride)),    // %4
        "r"((intptr_t)(dst_stride_a)),  // %5
        "r"((intptr_t)(dst_stride_b))   // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7", "xmm8", "xmm9");
}
#endif  // defined(HAS_TRANSPOSEUVWX8_SSE2)
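
// Usage sketch, not part of the original file: libyuv's TransposePlane in
// rotate.cc dispatches to a Wx8 kernel roughly as below, consuming 8 source
// rows per call; the exact kernel selection and remainder handling here are
// assumptions for illustration, so the block is compiled out.
#if 0
static void TransposePlane_Sketch(const uint8_t* src, int src_stride,
                                  uint8_t* dst, int dst_stride,
                                  int width, int height) {
  int i = height;
  while (i >= 8) {
    TransposeWx8_SSSE3(src, src_stride, dst, dst_stride, width);
    src += 8 * src_stride;  // Advance to the next 8 source rows.
    dst += 8;               // Each pass fills 8 destination columns.
    i -= 8;
  }
  // A scalar TransposeWxH_C-style kernel would handle the remaining rows.
}
#endif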
#endif  // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif