1;
2; Copyright (c) 2016, Alliance for Open Media. All rights reserved
3;
4; This source code is subject to the terms of the BSD 2 Clause License and
5; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6; was not distributed with this source code in the LICENSE file, you can
7; obtain it at www.aomedia.org/license/software. If the Alliance for Open
8; Media Patent License 1.0 was not distributed with this source code in the
9; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10;
11
12;
13
14%include "third_party/x86inc/x86inc.asm"
15
16SECTION_RODATA
17
; Multiplier for pmulhrsw: that instruction rounds and shifts right by 15,
; so the 14-bit fixed-point constant 11585 is stored pre-doubled.
pw_11585x2: times 8 dw 2 * 11585
; Rounding term (half of 1 << 14) added to 32-bit products before the
; arithmetic right shift by 14.
pd_8192:    times 4 dd 1 << 13
20
; TRANSFORM_COEFFS a, b
;   Emits two 16-byte word tables used as pmaddwd operands on interleaved
;   (x, y) word pairs:
;     pw_<a>_<b>   = { a,  b } repeated 4 times  ->  a*x + b*y
;     pw_<b>_m<a>  = { b, -a } repeated 4 times  ->  b*x - a*y
%macro TRANSFORM_COEFFS 2
pw_%1_%2:   times 4 dw  %1,  %2
pw_%2_m%1:  times 4 dw  %2, -%1
%endmacro

; Trigonometric constants scaled by 2^14 and rounded to integers.
TRANSFORM_COEFFS 11585,  11585   ; cos(pi/4),    cos(pi/4)
TRANSFORM_COEFFS 15137,   6270   ; cos(pi/8),    sin(pi/8)
TRANSFORM_COEFFS 16069,   3196   ; cos(pi/16),   sin(pi/16)
TRANSFORM_COEFFS  9102,  13623   ; sin(3*pi/16), cos(3*pi/16)
30
%macro STORE_OUTPUT 2 ; index, result
  ; Sign-extend the eight 16-bit values in m%2 to 32 bits and write them
  ; as two aligned 16-byte stores starting at byte offset 4*index from
  ; outputq (low four values first, then the high four).
  ;
  ; Clobbers m11 (sign mask), m12 (copy of the input) and m%2 itself, so
  ; the result register must not be m11 or m12.
  mova               m12, m%2               ; copy kept for the high half
  pxor               m11, m11
  pcmpgtw            m11, m12               ; 0xFFFF in every negative lane
  punpcklwd          m%2, m11               ; low  4 words -> 4 dwords
  punpckhwd          m12, m11               ; high 4 words -> 4 dwords
  mova               [outputq + 4*%1 +  0], m%2
  mova               [outputq + 4*%1 + 16], m12
%endmacro
45
46SECTION .text
47
%if ARCH_X86_64
;-----------------------------------------------------------------------
; fdct8x8(input, output, stride)
;   8x8 forward DCT (SSSE3, x86-64 only): column pass, transpose,
;   row pass, transpose, halve, widen to 32 bits and store.
;
; In:   inputq  = 8 rows of eight 16-bit samples; each row is read with
;                 an aligned 16-byte load
;       outputq = 64 32-bit coefficients, written as 8 rows of 32 bytes
;                 with aligned 16-byte stores
;       stride  = distance between input rows in 16-bit elements
;                 (row n is read at byte offset 2 * stride * n)
; Uses: r3, r4 as scratch; m0-m12 (m8 = rounding constant, m12 = 2*11585
;       until the final scaling step, where both are reused as temporaries)
;
; NOTE(review): stride is presumably declared as a 32-bit int in the C
; prototype - confirm. The calling conventions leave the upper half of
; the argument register undefined in that case, so it is sign-extended
; before being used in 64-bit address arithmetic.
;-----------------------------------------------------------------------
INIT_XMM ssse3
cglobal fdct8x8, 3, 5, 13, input, output, stride

  movsxdifnidn       strideq, strided

  mova               m8, [GLOBAL(pd_8192)]
  mova              m12, [GLOBAL(pw_11585x2)]

  ; r3 = byte offset of one row, r4 = byte offset of two rows
  lea                r3, [2 * strideq]
  lea                r4, [4 * strideq]
  mova               m0, [inputq]
  mova               m1, [inputq + r3]
  lea                inputq, [inputq + r4]
  mova               m2, [inputq]
  mova               m3, [inputq + r3]
  lea                inputq, [inputq + r4]
  mova               m4, [inputq]
  mova               m5, [inputq + r3]
  lea                inputq, [inputq + r4]
  mova               m6, [inputq]
  mova               m7, [inputq + r3]

  ; left shift by 2 to increase forward transformation precision
  psllw              m0, 2
  psllw              m1, 2
  psllw              m2, 2
  psllw              m3, 2
  psllw              m4, 2
  psllw              m5, 2
  psllw              m6, 2
  psllw              m7, 2

  ; column transform
  ; stage 1
  paddw m10, m0, m7
  psubw m0, m7

  paddw m9, m1, m6
  psubw m1, m6

  paddw m7, m2, m5
  psubw m2, m5

  paddw m6, m3, m4
  psubw m3, m4

  ; stage 2
  paddw m5, m9, m7
  psubw m9, m7

  paddw m4, m10, m6
  psubw m10, m6

  paddw m7, m1, m2
  psubw m1, m2

  ; stage 3
  paddw m6, m4, m5
  psubw m4, m5

  ; multiply by 11585 / 2^14 with rounding (m12 holds 2 * 11585)
  pmulhrsw m1, m12
  pmulhrsw m7, m12

  ; sin(pi / 8), cos(pi / 8)
  punpcklwd m2, m10, m9
  punpckhwd m10, m9
  pmaddwd m5, m2, [GLOBAL(pw_15137_6270)]
  pmaddwd m2, [GLOBAL(pw_6270_m15137)]
  pmaddwd m9, m10, [GLOBAL(pw_15137_6270)]
  pmaddwd m10, [GLOBAL(pw_6270_m15137)]
  ; round and scale the 32-bit products back to 16 bits
  paddd m5, m8
  paddd m2, m8
  paddd m9, m8
  paddd m10, m8
  psrad m5, 14
  psrad m2, 14
  psrad m9, 14
  psrad m10, 14
  packssdw m5, m9
  packssdw m2, m10

  pmulhrsw m6, m12
  pmulhrsw m4, m12

  paddw m9, m3, m1
  psubw m3, m1

  paddw m10, m0, m7
  psubw m0, m7

  ; stage 4
  ; sin(pi / 16), cos(pi / 16)
  punpcklwd m1, m10, m9
  punpckhwd m10, m9
  pmaddwd m7, m1, [GLOBAL(pw_16069_3196)]
  pmaddwd m1, [GLOBAL(pw_3196_m16069)]
  pmaddwd m9, m10, [GLOBAL(pw_16069_3196)]
  pmaddwd m10, [GLOBAL(pw_3196_m16069)]
  paddd m7, m8
  paddd m1, m8
  paddd m9, m8
  paddd m10, m8
  psrad m7, 14
  psrad m1, 14
  psrad m9, 14
  psrad m10, 14
  packssdw m7, m9
  packssdw m1, m10

  ; sin(3 * pi / 16), cos(3 * pi / 16)
  punpcklwd m11, m0, m3
  punpckhwd m0, m3
  pmaddwd m9, m11, [GLOBAL(pw_9102_13623)]
  pmaddwd m11, [GLOBAL(pw_13623_m9102)]
  pmaddwd m3, m0, [GLOBAL(pw_9102_13623)]
  pmaddwd m0, [GLOBAL(pw_13623_m9102)]
  paddd m9, m8
  paddd m11, m8
  paddd m3, m8
  paddd m0, m8
  psrad m9, 14
  psrad m11, 14
  psrad m3, 14
  psrad m0, 14
  packssdw m9, m3
  packssdw m11, m0

  ; transpose
  ; stage 1
  punpcklwd m0, m6, m7
  punpcklwd m3, m5, m11
  punpckhwd m6, m7
  punpckhwd m5, m11
  punpcklwd m7, m4, m9
  punpcklwd m10, m2, m1
  punpckhwd m4, m9
  punpckhwd m2, m1

  ; stage 2
  punpckldq m9, m0, m3
  punpckldq m1, m6, m5
  punpckhdq m0, m3
  punpckhdq m6, m5
  punpckldq m3, m7, m10
  punpckldq m5, m4, m2
  punpckhdq m7, m10
  punpckhdq m4, m2

  ; stage 3
  punpcklqdq m10, m9, m3
  punpckhqdq m9, m3
  punpcklqdq m2, m0, m7
  punpckhqdq m0, m7
  punpcklqdq m3, m1, m5
  punpckhqdq m1, m5
  punpcklqdq m7, m6, m4
  punpckhqdq m6, m4

  ; row transform
  ; stage 1
  paddw m5, m10, m6
  psubw m10, m6

  paddw m4, m9, m7
  psubw m9, m7

  paddw m6, m2, m1
  psubw m2, m1

  paddw m7, m0, m3
  psubw m0, m3

  ;stage 2
  paddw m1, m5, m7
  psubw m5, m7

  paddw m3, m4, m6
  psubw m4, m6

  paddw m7, m9, m2
  psubw m9, m2

  ; stage 3
  punpcklwd m6, m1, m3
  punpckhwd m1, m3
  pmaddwd m2, m6, [GLOBAL(pw_11585_11585)]
  pmaddwd m6, [GLOBAL(pw_11585_m11585)]
  pmaddwd m3, m1, [GLOBAL(pw_11585_11585)]
  pmaddwd m1, [GLOBAL(pw_11585_m11585)]
  paddd m2, m8
  paddd m6, m8
  paddd m3, m8
  paddd m1, m8
  psrad m2, 14
  psrad m6, 14
  psrad m3, 14
  psrad m1, 14
  packssdw m2, m3
  packssdw m6, m1

  pmulhrsw m7, m12
  pmulhrsw m9, m12

  punpcklwd m3, m5, m4
  punpckhwd m5, m4
  pmaddwd m1, m3, [GLOBAL(pw_15137_6270)]
  pmaddwd m3, [GLOBAL(pw_6270_m15137)]
  pmaddwd m4, m5, [GLOBAL(pw_15137_6270)]
  pmaddwd m5, [GLOBAL(pw_6270_m15137)]
  paddd m1, m8
  paddd m3, m8
  paddd m4, m8
  paddd m5, m8
  psrad m1, 14
  psrad m3, 14
  psrad m4, 14
  psrad m5, 14
  packssdw m1, m4
  packssdw m3, m5

  paddw m4, m0, m9
  psubw m0, m9

  paddw m5, m10, m7
  psubw m10, m7

  ; stage 4
  punpcklwd m9, m5, m4
  punpckhwd m5, m4
  pmaddwd m7, m9, [GLOBAL(pw_16069_3196)]
  pmaddwd m9, [GLOBAL(pw_3196_m16069)]
  pmaddwd m4, m5, [GLOBAL(pw_16069_3196)]
  pmaddwd m5, [GLOBAL(pw_3196_m16069)]
  paddd m7, m8
  paddd m9, m8
  paddd m4, m8
  paddd m5, m8
  psrad m7, 14
  psrad m9, 14
  psrad m4, 14
  psrad m5, 14
  packssdw m7, m4
  packssdw m9, m5

  punpcklwd m4, m10, m0
  punpckhwd m10, m0
  pmaddwd m5, m4, [GLOBAL(pw_9102_13623)]
  pmaddwd m4, [GLOBAL(pw_13623_m9102)]
  pmaddwd m0, m10, [GLOBAL(pw_9102_13623)]
  pmaddwd m10, [GLOBAL(pw_13623_m9102)]
  paddd m5, m8
  paddd m4, m8
  paddd m0, m8
  paddd m10, m8
  psrad m5, 14
  psrad m4, 14
  psrad m0, 14
  psrad m10, 14
  packssdw m5, m0
  packssdw m4, m10

  ; transpose
  ; stage 1
  punpcklwd m0, m2, m7
  punpcklwd m10, m1, m4
  punpckhwd m2, m7
  punpckhwd m1, m4
  punpcklwd m7, m6, m5
  punpcklwd m4, m3, m9
  punpckhwd m6, m5
  punpckhwd m3, m9

  ; stage 2
  punpckldq m5, m0, m10
  punpckldq m9, m2, m1
  punpckhdq m0, m10
  punpckhdq m2, m1
  punpckldq m10, m7, m4
  punpckldq m1, m6, m3
  punpckhdq m7, m4
  punpckhdq m6, m3

  ; stage 3
  punpcklqdq m4, m5, m10
  punpckhqdq m5, m10
  punpcklqdq m3, m0, m7
  punpckhqdq m0, m7
  punpcklqdq m10, m9, m1
  punpckhqdq m9, m1
  punpcklqdq m7, m2, m6
  punpckhqdq m2, m6

  ; Halve every result, rounding toward zero: subtracting the sign mask
  ; (x >> 15 is -1 for negative x) adds 1 to negative values before the
  ; arithmetic shift. m8 is reused as a temporary from here on; the
  ; rounding constant is no longer needed.
  psraw m1, m4, 15
  psraw m6, m5, 15
  psraw m8, m3, 15
  psraw m11, m0, 15

  psubw m4, m1
  psubw m5, m6
  psubw m3, m8
  psubw m0, m11

  psraw m4, 1
  psraw m5, 1
  psraw m3, 1
  psraw m0, 1

  psraw m1, m10, 15
  psraw m6, m9, 15
  psraw m8, m7, 15
  psraw m11, m2, 15

  psubw m10, m1
  psubw m9, m6
  psubw m7, m8
  psubw m2, m11

  psraw m10, 1
  psraw m9, 1
  psraw m7, 1
  psraw m2, 1

  ; Widen to 32 bits and store; each call writes 8 coefficients
  ; (32 bytes) at coefficient index 0, 8, ..., 56. STORE_OUTPUT
  ; clobbers m11 and m12.
  STORE_OUTPUT  0,  4
  STORE_OUTPUT  8,  5
  STORE_OUTPUT 16,  3
  STORE_OUTPUT 24,  0
  STORE_OUTPUT 32, 10
  STORE_OUTPUT 40,  9
  STORE_OUTPUT 48,  7
  STORE_OUTPUT 56,  2

  RET
%endif
380