/*
 *  Copyright 2015 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10
11 #include "libyuv/rotate_row.h"
12 #include "libyuv/row.h"
13
14 #ifdef __cplusplus
15 namespace libyuv {
16 extern "C" {
17 #endif
18
// This module is for GCC x86 and x64.
20 #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
21
// Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit.
#if defined(HAS_TRANSPOSEWX8_SSSE3)
// Transposes a (width x 8)-byte tile: each loop pass loads 8 bytes from each
// of 8 source rows (rows src_stride apart), interleaves them with
// punpcklbw/punpcklwd/punpckldq plus palignr shuffles, and stores the 8
// resulting columns as 8-byte rows dst_stride apart.
// The loop consumes 8 source columns per pass ("sub $0x8,%2"), so width is
// presumably a multiple of 8 — TODO confirm callers guarantee this.
// Only xmm0-xmm7 are used, so this builds for both 32- and 64-bit x86.
void TransposeWx8_SSSE3(const uint8_t* src,
                        int src_stride,
                        uint8_t* dst,
                        int dst_stride,
                        int width) {
  asm volatile(
      // Read in the data from the source pointer.
      // First round of bit swap.
      LABELALIGN
      "1: \n"
      "movq (%0),%%xmm0 \n"
      "movq (%0,%3),%%xmm1 \n"
      "lea (%0,%3,2),%0 \n"
      "punpcklbw %%xmm1,%%xmm0 \n"
      "movq (%0),%%xmm2 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "palignr $0x8,%%xmm1,%%xmm1 \n"
      "movq (%0,%3),%%xmm3 \n"
      "lea (%0,%3,2),%0 \n"
      "punpcklbw %%xmm3,%%xmm2 \n"
      "movdqa %%xmm2,%%xmm3 \n"
      "movq (%0),%%xmm4 \n"
      "palignr $0x8,%%xmm3,%%xmm3 \n"
      "movq (%0,%3),%%xmm5 \n"
      "lea (%0,%3,2),%0 \n"
      "punpcklbw %%xmm5,%%xmm4 \n"
      "movdqa %%xmm4,%%xmm5 \n"
      "movq (%0),%%xmm6 \n"
      "palignr $0x8,%%xmm5,%%xmm5 \n"
      "movq (%0,%3),%%xmm7 \n"
      "lea (%0,%3,2),%0 \n"
      "punpcklbw %%xmm7,%%xmm6 \n"
      // Temporarily negate the stride so the lea below rewinds src by 8 rows
      // while advancing it 8 bytes to the next column group; restore after.
      "neg %3 \n"
      "movdqa %%xmm6,%%xmm7 \n"
      "lea 0x8(%0,%3,8),%0 \n"
      "palignr $0x8,%%xmm7,%%xmm7 \n"
      "neg %3 \n"
      // Second round of bit swap.
      "punpcklwd %%xmm2,%%xmm0 \n"
      "punpcklwd %%xmm3,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "movdqa %%xmm1,%%xmm3 \n"
      "palignr $0x8,%%xmm2,%%xmm2 \n"
      "palignr $0x8,%%xmm3,%%xmm3 \n"
      "punpcklwd %%xmm6,%%xmm4 \n"
      "punpcklwd %%xmm7,%%xmm5 \n"
      "movdqa %%xmm4,%%xmm6 \n"
      "movdqa %%xmm5,%%xmm7 \n"
      "palignr $0x8,%%xmm6,%%xmm6 \n"
      "palignr $0x8,%%xmm7,%%xmm7 \n"
      // Third round of bit swap.
      // Write to the destination pointer.
      "punpckldq %%xmm4,%%xmm0 \n"
      "movq %%xmm0,(%1) \n"
      "movdqa %%xmm0,%%xmm4 \n"
      "palignr $0x8,%%xmm4,%%xmm4 \n"
      "movq %%xmm4,(%1,%4) \n"
      "lea (%1,%4,2),%1 \n"
      "punpckldq %%xmm6,%%xmm2 \n"
      "movdqa %%xmm2,%%xmm6 \n"
      "movq %%xmm2,(%1) \n"
      "palignr $0x8,%%xmm6,%%xmm6 \n"
      "punpckldq %%xmm5,%%xmm1 \n"
      "movq %%xmm6,(%1,%4) \n"
      "lea (%1,%4,2),%1 \n"
      "movdqa %%xmm1,%%xmm5 \n"
      "movq %%xmm1,(%1) \n"
      "palignr $0x8,%%xmm5,%%xmm5 \n"
      "movq %%xmm5,(%1,%4) \n"
      "lea (%1,%4,2),%1 \n"
      "punpckldq %%xmm7,%%xmm3 \n"
      "movq %%xmm3,(%1) \n"
      "movdqa %%xmm3,%%xmm7 \n"
      "palignr $0x8,%%xmm7,%%xmm7 \n"
      // 8 columns consumed per pass; loop while width remains positive.
      "sub $0x8,%2 \n"
      "movq %%xmm7,(%1,%4) \n"
      "lea (%1,%4,2),%1 \n"
      "jg 1b \n"
      : "+r"(src),                    // %0
        "+r"(dst),                    // %1
        "+r"(width)                   // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "r"((intptr_t)(dst_stride))   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // defined(HAS_TRANSPOSEWX8_SSSE3)
111
// Transpose 16x8. 64 bit
#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
// Same transpose as TransposeWx8_SSSE3, but processes 16 source columns per
// loop pass ("sub $0x10,%2") by using the high halves of each 16-byte load
// (punpckhbw into xmm8-xmm15). Requires the extra xmm8-xmm15 registers, so
// this is x86_64 only. width is presumably a multiple of 16 — TODO confirm
// callers guarantee this.
void TransposeWx8_Fast_SSSE3(const uint8_t* src,
                             int src_stride,
                             uint8_t* dst,
                             int dst_stride,
                             int width) {
  asm volatile(
      // Read in the data from the source pointer.
      // First round of bit swap.
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu (%0,%3),%%xmm1 \n"
      "lea (%0,%3,2),%0 \n"
      "movdqa %%xmm0,%%xmm8 \n"
      "punpcklbw %%xmm1,%%xmm0 \n"
      "punpckhbw %%xmm1,%%xmm8 \n"
      "movdqu (%0),%%xmm2 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm8,%%xmm9 \n"
      "palignr $0x8,%%xmm1,%%xmm1 \n"
      "palignr $0x8,%%xmm9,%%xmm9 \n"
      "movdqu (%0,%3),%%xmm3 \n"
      "lea (%0,%3,2),%0 \n"
      "movdqa %%xmm2,%%xmm10 \n"
      "punpcklbw %%xmm3,%%xmm2 \n"
      "punpckhbw %%xmm3,%%xmm10 \n"
      "movdqa %%xmm2,%%xmm3 \n"
      "movdqa %%xmm10,%%xmm11 \n"
      "movdqu (%0),%%xmm4 \n"
      "palignr $0x8,%%xmm3,%%xmm3 \n"
      "palignr $0x8,%%xmm11,%%xmm11 \n"
      "movdqu (%0,%3),%%xmm5 \n"
      "lea (%0,%3,2),%0 \n"
      "movdqa %%xmm4,%%xmm12 \n"
      "punpcklbw %%xmm5,%%xmm4 \n"
      "punpckhbw %%xmm5,%%xmm12 \n"
      "movdqa %%xmm4,%%xmm5 \n"
      "movdqa %%xmm12,%%xmm13 \n"
      "movdqu (%0),%%xmm6 \n"
      "palignr $0x8,%%xmm5,%%xmm5 \n"
      "palignr $0x8,%%xmm13,%%xmm13 \n"
      "movdqu (%0,%3),%%xmm7 \n"
      "lea (%0,%3,2),%0 \n"
      "movdqa %%xmm6,%%xmm14 \n"
      "punpcklbw %%xmm7,%%xmm6 \n"
      "punpckhbw %%xmm7,%%xmm14 \n"
      // Temporarily negate the stride so the lea below rewinds src by 8 rows
      // while advancing it 16 bytes to the next column group; restore after.
      "neg %3 \n"
      "movdqa %%xmm6,%%xmm7 \n"
      "movdqa %%xmm14,%%xmm15 \n"
      "lea 0x10(%0,%3,8),%0 \n"
      "palignr $0x8,%%xmm7,%%xmm7 \n"
      "palignr $0x8,%%xmm15,%%xmm15 \n"
      "neg %3 \n"
      // Second round of bit swap.
      "punpcklwd %%xmm2,%%xmm0 \n"
      "punpcklwd %%xmm3,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "movdqa %%xmm1,%%xmm3 \n"
      "palignr $0x8,%%xmm2,%%xmm2 \n"
      "palignr $0x8,%%xmm3,%%xmm3 \n"
      "punpcklwd %%xmm6,%%xmm4 \n"
      "punpcklwd %%xmm7,%%xmm5 \n"
      "movdqa %%xmm4,%%xmm6 \n"
      "movdqa %%xmm5,%%xmm7 \n"
      "palignr $0x8,%%xmm6,%%xmm6 \n"
      "palignr $0x8,%%xmm7,%%xmm7 \n"
      "punpcklwd %%xmm10,%%xmm8 \n"
      "punpcklwd %%xmm11,%%xmm9 \n"
      "movdqa %%xmm8,%%xmm10 \n"
      "movdqa %%xmm9,%%xmm11 \n"
      "palignr $0x8,%%xmm10,%%xmm10 \n"
      "palignr $0x8,%%xmm11,%%xmm11 \n"
      "punpcklwd %%xmm14,%%xmm12 \n"
      "punpcklwd %%xmm15,%%xmm13 \n"
      "movdqa %%xmm12,%%xmm14 \n"
      "movdqa %%xmm13,%%xmm15 \n"
      "palignr $0x8,%%xmm14,%%xmm14 \n"
      "palignr $0x8,%%xmm15,%%xmm15 \n"
      // Third round of bit swap.
      // Write to the destination pointer.
      "punpckldq %%xmm4,%%xmm0 \n"
      "movq %%xmm0,(%1) \n"
      "movdqa %%xmm0,%%xmm4 \n"
      "palignr $0x8,%%xmm4,%%xmm4 \n"
      "movq %%xmm4,(%1,%4) \n"
      "lea (%1,%4,2),%1 \n"
      "punpckldq %%xmm6,%%xmm2 \n"
      "movdqa %%xmm2,%%xmm6 \n"
      "movq %%xmm2,(%1) \n"
      "palignr $0x8,%%xmm6,%%xmm6 \n"
      "punpckldq %%xmm5,%%xmm1 \n"
      "movq %%xmm6,(%1,%4) \n"
      "lea (%1,%4,2),%1 \n"
      "movdqa %%xmm1,%%xmm5 \n"
      "movq %%xmm1,(%1) \n"
      "palignr $0x8,%%xmm5,%%xmm5 \n"
      "movq %%xmm5,(%1,%4) \n"
      "lea (%1,%4,2),%1 \n"
      "punpckldq %%xmm7,%%xmm3 \n"
      "movq %%xmm3,(%1) \n"
      "movdqa %%xmm3,%%xmm7 \n"
      "palignr $0x8,%%xmm7,%%xmm7 \n"
      "movq %%xmm7,(%1,%4) \n"
      "lea (%1,%4,2),%1 \n"
      "punpckldq %%xmm12,%%xmm8 \n"
      "movq %%xmm8,(%1) \n"
      "movdqa %%xmm8,%%xmm12 \n"
      "palignr $0x8,%%xmm12,%%xmm12 \n"
      "movq %%xmm12,(%1,%4) \n"
      "lea (%1,%4,2),%1 \n"
      "punpckldq %%xmm14,%%xmm10 \n"
      "movdqa %%xmm10,%%xmm14 \n"
      "movq %%xmm10,(%1) \n"
      "palignr $0x8,%%xmm14,%%xmm14 \n"
      "punpckldq %%xmm13,%%xmm9 \n"
      "movq %%xmm14,(%1,%4) \n"
      "lea (%1,%4,2),%1 \n"
      "movdqa %%xmm9,%%xmm13 \n"
      "movq %%xmm9,(%1) \n"
      "palignr $0x8,%%xmm13,%%xmm13 \n"
      "movq %%xmm13,(%1,%4) \n"
      "lea (%1,%4,2),%1 \n"
      "punpckldq %%xmm15,%%xmm11 \n"
      "movq %%xmm11,(%1) \n"
      "movdqa %%xmm11,%%xmm15 \n"
      "palignr $0x8,%%xmm15,%%xmm15 \n"
      // 16 columns consumed per pass; loop while width remains positive.
      "sub $0x10,%2 \n"
      "movq %%xmm15,(%1,%4) \n"
      "lea (%1,%4,2),%1 \n"
      "jg 1b \n"
      : "+r"(src),                    // %0
        "+r"(dst),                    // %1
        "+r"(width)                   // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "r"((intptr_t)(dst_stride))   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
        "xmm15");
}
#endif  // defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
254
// Transpose UV 8x8. 64 bit.
#if defined(HAS_TRANSPOSEUVWX8_SSE2)
// Transposes an 8-row block of interleaved UV pairs: each pass loads 16 bytes
// (8 UV pairs) from each of 8 source rows (src_stride apart), transposes
// them, and deinterleaves on store — movlpd writes the low 8 bytes (U plane)
// to dst_a and movhpd writes the high 8 bytes (V plane) to dst_b, with
// independent dst_stride_a / dst_stride_b row strides. 8 UV columns are
// consumed per pass ("sub $0x8,%3"), so width is presumably a multiple of
// 8 — TODO confirm callers guarantee this. Uses xmm8/xmm9, so x86_64 only.
void TransposeUVWx8_SSE2(const uint8_t* src,
                         int src_stride,
                         uint8_t* dst_a,
                         int dst_stride_a,
                         uint8_t* dst_b,
                         int dst_stride_b,
                         int width) {
  asm volatile(
      // Read in the data from the source pointer.
      // First round of bit swap.
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu (%0,%4),%%xmm1 \n"
      "lea (%0,%4,2),%0 \n"
      "movdqa %%xmm0,%%xmm8 \n"
      "punpcklbw %%xmm1,%%xmm0 \n"
      "punpckhbw %%xmm1,%%xmm8 \n"
      "movdqa %%xmm8,%%xmm1 \n"
      "movdqu (%0),%%xmm2 \n"
      "movdqu (%0,%4),%%xmm3 \n"
      "lea (%0,%4,2),%0 \n"
      "movdqa %%xmm2,%%xmm8 \n"
      "punpcklbw %%xmm3,%%xmm2 \n"
      "punpckhbw %%xmm3,%%xmm8 \n"
      "movdqa %%xmm8,%%xmm3 \n"
      "movdqu (%0),%%xmm4 \n"
      "movdqu (%0,%4),%%xmm5 \n"
      "lea (%0,%4,2),%0 \n"
      "movdqa %%xmm4,%%xmm8 \n"
      "punpcklbw %%xmm5,%%xmm4 \n"
      "punpckhbw %%xmm5,%%xmm8 \n"
      "movdqa %%xmm8,%%xmm5 \n"
      "movdqu (%0),%%xmm6 \n"
      "movdqu (%0,%4),%%xmm7 \n"
      "lea (%0,%4,2),%0 \n"
      "movdqa %%xmm6,%%xmm8 \n"
      "punpcklbw %%xmm7,%%xmm6 \n"
      // Temporarily negate the stride so the lea below rewinds src by 8 rows
      // while advancing it 16 bytes to the next column group; restore after.
      "neg %4 \n"
      "lea 0x10(%0,%4,8),%0 \n"
      "punpckhbw %%xmm7,%%xmm8 \n"
      "movdqa %%xmm8,%%xmm7 \n"
      "neg %4 \n"
      // Second round of bit swap.
      "movdqa %%xmm0,%%xmm8 \n"
      "movdqa %%xmm1,%%xmm9 \n"
      "punpckhwd %%xmm2,%%xmm8 \n"
      "punpckhwd %%xmm3,%%xmm9 \n"
      "punpcklwd %%xmm2,%%xmm0 \n"
      "punpcklwd %%xmm3,%%xmm1 \n"
      "movdqa %%xmm8,%%xmm2 \n"
      "movdqa %%xmm9,%%xmm3 \n"
      "movdqa %%xmm4,%%xmm8 \n"
      "movdqa %%xmm5,%%xmm9 \n"
      "punpckhwd %%xmm6,%%xmm8 \n"
      "punpckhwd %%xmm7,%%xmm9 \n"
      "punpcklwd %%xmm6,%%xmm4 \n"
      "punpcklwd %%xmm7,%%xmm5 \n"
      "movdqa %%xmm8,%%xmm6 \n"
      "movdqa %%xmm9,%%xmm7 \n"
      // Third round of bit swap.
      // Write to the destination pointer.
      "movdqa %%xmm0,%%xmm8 \n"
      "punpckldq %%xmm4,%%xmm0 \n"
      "movlpd %%xmm0,(%1) \n"  // Write back U channel
      "movhpd %%xmm0,(%2) \n"  // Write back V channel
      "punpckhdq %%xmm4,%%xmm8 \n"
      "movlpd %%xmm8,(%1,%5) \n"
      "lea (%1,%5,2),%1 \n"
      "movhpd %%xmm8,(%2,%6) \n"
      "lea (%2,%6,2),%2 \n"
      "movdqa %%xmm2,%%xmm8 \n"
      "punpckldq %%xmm6,%%xmm2 \n"
      "movlpd %%xmm2,(%1) \n"
      "movhpd %%xmm2,(%2) \n"
      "punpckhdq %%xmm6,%%xmm8 \n"
      "movlpd %%xmm8,(%1,%5) \n"
      "lea (%1,%5,2),%1 \n"
      "movhpd %%xmm8,(%2,%6) \n"
      "lea (%2,%6,2),%2 \n"
      "movdqa %%xmm1,%%xmm8 \n"
      "punpckldq %%xmm5,%%xmm1 \n"
      "movlpd %%xmm1,(%1) \n"
      "movhpd %%xmm1,(%2) \n"
      "punpckhdq %%xmm5,%%xmm8 \n"
      "movlpd %%xmm8,(%1,%5) \n"
      "lea (%1,%5,2),%1 \n"
      "movhpd %%xmm8,(%2,%6) \n"
      "lea (%2,%6,2),%2 \n"
      "movdqa %%xmm3,%%xmm8 \n"
      "punpckldq %%xmm7,%%xmm3 \n"
      "movlpd %%xmm3,(%1) \n"
      "movhpd %%xmm3,(%2) \n"
      "punpckhdq %%xmm7,%%xmm8 \n"
      // 8 UV columns consumed per pass; loop while width remains positive.
      "sub $0x8,%3 \n"
      "movlpd %%xmm8,(%1,%5) \n"
      "lea (%1,%5,2),%1 \n"
      "movhpd %%xmm8,(%2,%6) \n"
      "lea (%2,%6,2),%2 \n"
      "jg 1b \n"
      : "+r"(src),                      // %0
        "+r"(dst_a),                    // %1
        "+r"(dst_b),                    // %2
        "+r"(width)                     // %3
      : "r"((intptr_t)(src_stride)),    // %4
        "r"((intptr_t)(dst_stride_a)),  // %5
        "r"((intptr_t)(dst_stride_b))   // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7", "xmm8", "xmm9");
}
#endif  // defined(HAS_TRANSPOSEUVWX8_SSE2)
368 #endif // defined(__x86_64__) || defined(__i386__)
369
370 #ifdef __cplusplus
371 } // extern "C"
372 } // namespace libyuv
373 #endif
374