1; Copyright © 2018, VideoLAN and dav1d authors
2; Copyright © 2018, Two Orioles, LLC
3; All rights reserved.
4;
5; Redistribution and use in source and binary forms, with or without
6; modification, are permitted provided that the following conditions are met:
7;
8; 1. Redistributions of source code must retain the above copyright notice, this
9;    list of conditions and the following disclaimer.
10;
11; 2. Redistributions in binary form must reproduce the above copyright notice,
12;    this list of conditions and the following disclaimer in the documentation
13;    and/or other materials provided with the distribution.
14;
15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
26%include "config.asm"
27%include "ext/x86/x86inc.asm"
28
29
SECTION_RODATA 16

; pshufb byte-index masks, written as two rows of four 16-bit word slots each.

; deint_shuf: even-numbered words go to the low qword, odd-numbered words to
; the high qword.
deint_shuf:
    db  0,  1,  4,  5,  8,  9, 12, 13
    db  2,  3,  6,  7, 10, 11, 14, 15

; deint_shuf1: interleaves word i of the low qword with word i of the high
; qword (low-qword word first).
deint_shuf1:
    db  0,  1,  8,  9,  2,  3, 10, 11
    db  4,  5, 12, 13,  6,  7, 14, 15
; deint_shuf2: same interleave, but the high-qword word comes first.
deint_shuf2:
    db  8,  9,  0,  1, 10, 11,  2,  3
    db 12, 13,  4,  5, 14, 15,  6,  7
36
; COEF_PAIR a, b[, mode]
; Emits 16-byte tables holding a word pair repeated four times (pmaddwd
; operands):
;   pw_<a>_m<b>   = { a, -b }   always
;   pw_<b>_<a>    = { b,  a }   unless mode == 2
;   pw_m<a>_m<b>  = {-a, -b }   only when mode is non-zero
%macro COEF_PAIR 2-3 0
pw_%1_m%2:
    times 4 dw %1, -%2
%if %3 != 2
pw_%2_%1:
    times 4 dw %2, %1
%endif
%if %3 != 0
pw_m%1_m%2:
    times 4 dw -%1, -%2
%endif
%endmacro
46
; adst4 multiplier tables. Each table is one {a, b} word pair repeated four
; times, so a pmaddwd against interleaved inputs yields a*lo + b*hi in every
; dword lane. -6688 is -2 * 3344.
pw_1321_3803:   times 4 dw  1321,  3803
pw_2482_m1321:  times 4 dw  2482, -1321
pw_3344_2482:   times 4 dw  3344,  2482
pw_3344_m3803:  times 4 dw  3344, -3803
pw_3344_m3344:  times 4 dw  3344, -3344
; label now carries its colon like every other table in this section
pw_0_3344:      times 4 dw     0,  3344
pw_m6688_m3803: times 4 dw -6688, -3803
55
; Coefficient pair tables generated through COEF_PAIR. The emission order is
; kept as is; only the grouping comments are new.

; default mode: pw_<a>_m<b> and pw_<b>_<a>
COEF_PAIR 2896, 2896
COEF_PAIR 1567, 3784
COEF_PAIR 799,  4017
COEF_PAIR 3406, 2276
COEF_PAIR 401,  4076
COEF_PAIR 1931, 3612
COEF_PAIR 3166, 2598
COEF_PAIR 3920, 1189
; mode 1 additionally emits pw_m<a>_m<b>
COEF_PAIR 3784, 1567, 1
COEF_PAIR 995,  3973
COEF_PAIR 1751, 3703
COEF_PAIR 3513, 2106
COEF_PAIR 3857, 1380
COEF_PAIR 4017, 799,  1
COEF_PAIR 201,  4091
COEF_PAIR 2440, 3290
COEF_PAIR 3035, 2751
COEF_PAIR 4052, 601
COEF_PAIR 2276, 3406, 1
; mode 2 emits pw_<a>_m<b> and pw_m<a>_m<b>, but no pw_<b>_<a>
COEF_PAIR 4076, 401,  2
COEF_PAIR 2598, 3166, 2
COEF_PAIR 3612, 1931, 2
COEF_PAIR 1189, 3920, 2
79
; Splatted constants, 16 bytes each.

; dword rounding term added before the ">> 12" shifts
pd_2048:     times 4 dd 2048

; plain word multipliers / constants
pw_2048:     times 8 dw 2048
pw_m2048:    times 8 dw -2048
pw_4096:     times 8 dw 4096
pw_16384:    times 8 dw 16384
pw_m16384:   times 8 dw -16384
pw_1697x16:  times 8 dw 1697*16
pw_1697x8:   times 8 dw 1697*8
pw_2896x8:   times 8 dw 2896*8
pw_3344x8:   times 8 dw 3344*8
pw_8192:     times 8 dw 8192
pw_m8192:    times 8 dw -8192
pw_5:        times 8 dw 5

; coefficients pre-multiplied by 8 (first group)
pw_201x8:    times 8 dw 201*8
pw_4091x8:   times 8 dw 4091*8
pw_m2751x8:  times 8 dw -2751*8
pw_3035x8:   times 8 dw 3035*8
pw_1751x8:   times 8 dw 1751*8
pw_3703x8:   times 8 dw 3703*8
pw_m1380x8:  times 8 dw -1380*8
pw_3857x8:   times 8 dw 3857*8
pw_995x8:    times 8 dw 995*8
pw_3973x8:   times 8 dw 3973*8
pw_m2106x8:  times 8 dw -2106*8
pw_3513x8:   times 8 dw 3513*8
pw_2440x8:   times 8 dw 2440*8
pw_3290x8:   times 8 dw 3290*8
pw_m601x8:   times 8 dw -601*8
pw_4052x8:   times 8 dw 4052*8

; coefficients pre-multiplied by 8 (second group)
pw_4095x8:   times 8 dw 4095*8
pw_101x8:    times 8 dw 101*8
pw_2967x8:   times 8 dw 2967*8
pw_m2824x8:  times 8 dw -2824*8
pw_3745x8:   times 8 dw 3745*8
pw_1660x8:   times 8 dw 1660*8
pw_3822x8:   times 8 dw 3822*8
pw_m1474x8:  times 8 dw -1474*8
pw_3996x8:   times 8 dw 3996*8
pw_897x8:    times 8 dw 897*8
pw_3461x8:   times 8 dw 3461*8
pw_m2191x8:  times 8 dw -2191*8
pw_3349x8:   times 8 dw 3349*8
pw_2359x8:   times 8 dw 2359*8
pw_4036x8:   times 8 dw 4036*8
pw_m700x8:   times 8 dw -700*8
pw_4065x8:   times 8 dw 4065*8
pw_501x8:    times 8 dw 501*8
pw_3229x8:   times 8 dw 3229*8
pw_m2520x8:  times 8 dw -2520*8
pw_3564x8:   times 8 dw 3564*8
pw_2019x8:   times 8 dw 2019*8
pw_3948x8:   times 8 dw 3948*8
pw_m1092x8:  times 8 dw -1092*8
pw_3889x8:   times 8 dw 3889*8
pw_1285x8:   times 8 dw 1285*8
pw_3659x8:   times 8 dw 3659*8
pw_m1842x8:  times 8 dw -1842*8
pw_3102x8:   times 8 dw 3102*8
pw_2675x8:   times 8 dw 2675*8
pw_4085x8:   times 8 dw 4085*8
pw_m301x8:   times 8 dw -301*8
142
143SECTION .text
144
; REPX {template}, arg1[, arg2, ...]
; Expands the template once per argument, in order, substituting the argument
; for every "x" token in the template.
%macro REPX 2-*
    %xdefine %%op(x) %1
    %assign  %%count %0 - 1
%rep %%count
    %rotate 1
    %%op(%1)
%endrep
%endmacro
152
; m(name): mangled symbol of function "name" for the current private prefix
; and instruction-set SUFFIX.
%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)

; o(sym): address expression for a symbol. When not building for x86-64 the
; symbol is addressed relative to r5, which INV_TXFM_FN loads with $$ (PIC).
%if ARCH_X86_64 == 0
%define o(x) r5-$$+x
%else
%define o(x) x
%endif
160
; WRITE_4X4 src0, src1, tmp0, tmp1, tmp2, row0, row1, row2, row3
;
; Adds a 4x4 block of 16-bit residuals to the 8-bit pixels at dstq and stores
; the result with unsigned saturation.
;   src0     register number: low qword -> row0, high qword -> row1
;   src1     register number: low qword -> row2, high qword -> row3
;   tmp0-2   scratch register numbers
;   row0-3   destination row index (0..3) for each of the four residual rows
; Clobbers r2 (set to dstq + 2*strideq) and the three scratch registers.
%macro WRITE_4X4 9
    lea         r2, [dstq+strideq*2]        ; base address of rows 2 and 3

    ; Derive %%addr1..%%addr4 from the row indices: bit 1 selects the base
    ; pointer (dstq or r2), bit 0 selects whether one stride is added.
%assign %%n 1
%rotate 5                                   ; %1 is now the first row index
%rep 4
    %if (%1 & 2) == 0
        CAT_XDEFINE %%addr, %%n, dstq + strideq*(%1&1)
    %else
        CAT_XDEFINE %%addr, %%n, r2   + strideq*(%1&1)
    %endif
    %rotate 1
    %assign %%n %%n + 1
%endrep
    ; nine rotations in total, so %1..%9 are back in their original slots

    movd        m%3, [%%addr1]
    movd        m%5, [%%addr2]
    punpckldq   m%3, m%5                    ; dword 0: first row, dword 1: second row
    movd        m%4, [%%addr3]
    movd        m%5, [%%addr4]
    punpckldq   m%4, m%5                    ; dword 0: third row, dword 1: fourth row

    pxor        m%5, m%5
    punpcklbw   m%3, m%5                    ; zero-extend pixels to words
    paddw       m%3, m%1                    ; first/second row + residual
    punpcklbw   m%4, m%5
    paddw       m%4, m%2                    ; third/fourth row + residual
    packuswb    m%3, m%4                    ; saturate to bytes, four rows of 4 pixels

    movd        [%%addr1], m%3              ; dword 0
    pshuflw     m%4, m%3, q1032             ; dword 1 -> dword 0
    movd        [%%addr2], m%4
    punpckhqdq  m%3, m%3                    ; dword 2 -> dword 0
    movd        [%%addr3], m%3
    psrlq       m%3, 32                     ; dword 3 -> dword 0
    movd        [%%addr4], m%3
%endmacro
199
; ITX4_END row0, row1, row2, row3[, rnd]
; Shared tail of the 4x4 transforms. Unless rnd is 0, m0 and m1 are first
; multiplied by pw_<rnd> with pmulhrsw (the default 2048 amounts to
; (x + 8) >> 4). The result is then added to the picture through WRITE_4X4
; using the given row order, and the routine returns with a plain ret.
; Uses m2-m4 as scratch.
%macro ITX4_END 4-5 2048
%if %5 != 0
    mova        m2, [o(pw_%5)]
    REPX        {pmulhrsw x, m2}, m0, m1
%endif

    WRITE_4X4   0, 1, 2, 3, 4, %1, %2, %3, %4
    ret
%endmacro
210
; ITX_MUL2X_PACK dst/src, tmp, rnd, coef1, coef2[, flags]
;
; m<dst/src> holds interleaved word pairs (a, b). Two pmaddwd products are
; formed, rounded with m<rnd> and shifted right by 12:
;   flags == 0 : tmp = a*coef1 - b*coef2      dst = a*coef2 + b*coef1
;   flags & 1  : the two products are swapped between tmp and dst
;   flags & 2  : coef1/coef2 are register numbers holding the pair tables
;                (tmp = m<coef1> . src, dst = src . m<coef2>)
;   flags & 4  : skip the final pack and leave the dword results in place
; Without flag 4 the results are packed into dst: the dst product in the low
; qword and the tmp product in the high qword.
%macro ITX_MUL2X_PACK 5-6 0
%if %6 & 2
    pmaddwd     m%2, m%4, m%1
    pmaddwd     m%1, m%5
%elif %6 & 1
    pmaddwd     m%2, m%1, [o(pw_%5_%4)]
    pmaddwd     m%1, [o(pw_%4_m%5)]
%else
    pmaddwd     m%2, m%1, [o(pw_%4_m%5)]
    pmaddwd     m%1, [o(pw_%5_%4)]
%endif
    REPX        {paddd x, m%3}, m%2, m%1
    REPX        {psrad x, 12}, m%2, m%1
%if (%6 & 4) == 0
    packssdw    m%1, m%2
%endif
%endmacro
231
; IDCT4_1D_PACKED
; One 4-point inverse DCT pass on packed data.
;   in : m0 = in0 (low) | in1 (high), m1 = in2 (low) | in3 (high)
;   out: m0 = out0 (low) | out1 (high), m1 = out3 (low) | out2 (high)
; Clobbers m2 and m3 (m3 is left holding pd_2048). The optional macro
; argument is accepted but not referenced.
%macro IDCT4_1D_PACKED 0-1
    punpckhwd   m2, m0, m1                  ; (in1, in3) pairs
    punpcklwd   m0, m1                      ; (in0, in2) pairs
    mova        m3, [o(pd_2048)]            ; rounding term
    ITX_MUL2X_PACK 2, 1, 3, 1567, 3784      ; odd half
    ITX_MUL2X_PACK 0, 1, 3, 2896, 2896      ; even half
    psubsw      m1, m0, m2                  ; low: out3, high: out2
    paddsw      m0, m2                      ; low: out0, high: out1
%endmacro
241
; INV_TXFM_FN type1, type2, size, xmm/stack
;
; Declares the public entry point inv_txfm_add_<type1>_<type2>_<size>. It
; puts the address of i<type2>_<size>_internal.pass2 into tx2q and transfers
; control to i<type1>_<size>_internal, which ends its first pass with
; "jmp tx2q". On x86-32, r5 is loaded with $$ so that o() can address
; constants.
;
; For dct_dct with eob == 0 control instead continues with whatever code
; follows this macro invocation (the size-specific macros put a shortcut
; there).
;
; With an epilogue the first transform is called and RET runs the epilogue.
; Without one it is entered by a jump; for non-dct_dct pairs the jump is only
; assembled when the target lies after the aligned end of this stub, so a
; stub placed directly in front of its target falls through into it.
%macro INV_TXFM_FN 4+
cglobal inv_txfm_add_%1_%2_%3, 4, 6, %4, dst, stride, coeff, eob, tx2
    %define %%pass1 m(i%1_%3_internal)
%if ARCH_X86_32
    LEA         r5, $$
%endif
%if has_epilogue
%ifidn %1_%2, dct_dct
    test        eobd, eobd
    jz %%done
%endif
    lea         tx2q, [o(m(i%2_%3_internal).pass2)]
    call %%pass1
    RET
%%done:
%else
    lea         tx2q, [o(m(i%2_%3_internal).pass2)]
%ifidn %1_%2, dct_dct
    test        eobd, eobd
    jnz %%pass1
%else
    times ((%%done - %%pass1) >> 31) & 1 jmp %%pass1
ALIGN function_align
%%done:
%endif
%endif
%endmacro
269
; INV_TXFM_4X4_FN type1, type2
; Instantiates the 4x4 entry point for one transform pair. For dct_dct it
; also emits the eob == 0 shortcut: the first coefficient is broadcast,
; multiplied twice by 2896*8 with pmulhrsw, and handed to the shared
; rounding/write-out tail at iadst_4x4_internal.end2.
%macro INV_TXFM_4X4_FN 2
    INV_TXFM_FN %1, %2, 4x4, 6
%ifidn %1_%2, dct_dct
    pshuflw     m0, [coeffq], q0000         ; broadcast coeff[0] over the low qword
    mov         [coeffq], eobd              ; eobd is 0 on this path: clears the first 4 bytes
    punpcklqdq  m0, m0                      ; ...and over the high qword
    mova        m1, [o(pw_2896x8)]
    pmulhrsw    m0, m1
    pmulhrsw    m0, m1
    mova        m1, m0                      ; same value for all four rows
    TAIL_CALL m(iadst_4x4_internal).end2
%endif
%endmacro
283
INIT_XMM ssse3

; The last stub must stay directly in front of idct_4x4_internal: it has no
; jump of its own when the target follows immediately (see INV_TXFM_FN).
INV_TXFM_4X4_FN dct, dct
INV_TXFM_4X4_FN dct, adst
INV_TXFM_4X4_FN dct, flipadst
INV_TXFM_4X4_FN dct, identity

; 4x4 inverse DCT.
; Entry: first pass. Loads the 16 coefficients, transforms them, reorders the
;        result into the layout the second pass expects and jumps to tx2q.
; .pass2: second pass on m0/m1, clears the coefficient buffer and adds the
;        rounded result to the picture.
cglobal idct_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    mova        m0, [coeffq+16*0]           ; low: in0, high: in1
    mova        m1, [coeffq+16*1]           ; low: in2, high: in3

    IDCT4_1D_PACKED

    shufps      m3, m0, m1, q1331
    shufps      m0, m1, q0220
    mova        m2, [o(deint_shuf)]
    pshufb      m0, m2                      ; low: in0, high: in1 of the next pass
    pshufb      m1, m3, m2                  ; low: in2, high: in3 of the next pass
    jmp         tx2q

.pass2:
    IDCT4_1D_PACKED

    pxor        m2, m2
    REPX        {mova [coeffq+16*x], m2}, 0, 1   ; zero all 16 coefficients

    ITX4_END    0, 1, 3, 2                  ; m1 holds out3 (low) | out2 (high)
312
; The last stub must stay directly in front of iadst_4x4_internal.
INV_TXFM_4X4_FN adst, dct
INV_TXFM_4X4_FN adst, adst
INV_TXFM_4X4_FN adst, flipadst
INV_TXFM_4X4_FN adst, identity

; 4x4 inverse ADST.
; Entry : first pass; transforms, transposes and jumps to tx2q.
; .pass2: second pass, falls through into .end.
; .end  : clears the coefficient buffer (also jumped to from other 4x4 code).
; .end2 : rounds m0/m1 and adds them to the picture, rows in natural order.
; .main : the 1-D transform itself, see below.
cglobal iadst_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    mova        m0, [coeffq+16*0]
    mova        m1, [coeffq+16*1]
    call .main
    ; 4x4 word transpose of m0/m1
    punpckhwd   m2, m0, m1
    punpcklwd   m0, m1
    punpckhwd   m1, m0, m2                  ; low: in2, high: in3 of the next pass
    punpcklwd   m0, m2                      ; low: in0, high: in1 of the next pass
    jmp         tx2q

.pass2:
    call .main

.end:
    pxor        m2, m2
    REPX        {mova [coeffq+16*x], m2}, 0, 1

.end2:
    ITX4_END    0, 1, 2, 3

; .main: 4-point inverse ADST on packed data.
;   in : m0 = in0 (low) | in1 (high), m1 = in2 (low) | in3 (high)
;   out: m0 = out0 (low) | out1 (high), m1 = out2 (low) | out3 (high)
; Clobbers m2-m5.
ALIGN function_align
.main:
    punpcklwd   m2, m0, m1                  ; (in0, in2) pairs
    punpckhwd   m0, m1                      ; (in1, in3) pairs
    mova        m3, m0
    pmaddwd     m1, m2, [o(pw_3344_m3344)]  ; 3344*in0 - 3344*in2
    pmaddwd     m0, [o(pw_0_3344)]          ; 3344*in3
    paddd       m1, m0                      ; t2
    pmaddwd     m0, m2, [o(pw_1321_3803)]   ; 1321*in0 + 3803*in2
    pmaddwd     m2, [o(pw_2482_m1321)]      ; 2482*in0 - 1321*in2
    pmaddwd     m4, m3, [o(pw_3344_2482)]   ; 3344*in1 + 2482*in3
    pmaddwd     m5, m3, [o(pw_3344_m3803)]  ; 3344*in1 - 3803*in3
    paddd       m4, m0                      ; t0 + t3
    pmaddwd     m3, [o(pw_m6688_m3803)]     ; -2*3344*in1 - 3803*in3
    mova        m0, [o(pd_2048)]
    paddd       m1, m0                      ; t2 + rnd
    paddd       m2, m0
    paddd       m0, m4                      ; t0 + t3 + rnd
    paddd       m5, m2                      ; t1 + t3 + rnd
    paddd       m2, m4
    paddd       m2, m3                      ; t0 + t1 - t3 + rnd
    REPX        {psrad x, 12}, m1, m0, m5, m2
    packssdw    m0, m5                      ; low: out0, high: out1
    packssdw    m1, m2                      ; low: out2, high: out3
    ret
364
; The last stub must stay directly in front of iflipadst_4x4_internal.
INV_TXFM_4X4_FN flipadst, dct
INV_TXFM_4X4_FN flipadst, adst
INV_TXFM_4X4_FN flipadst, flipadst
INV_TXFM_4X4_FN flipadst, identity

; 4x4 inverse flipped ADST: reuses iadst_4x4_internal.main and reverses the
; order of the outputs.
; Entry : first pass; the transpose below reverses the four outputs while
;         interleaving.
; .pass2: second pass; the reversal is done by writing rows 3, 2, 1, 0.
cglobal iflipadst_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    mova        m0, [coeffq+16*0]
    mova        m1, [coeffq+16*1]
    call m(iadst_4x4_internal).main
    ; transpose with the sources swapped so that out3 comes first
    punpcklwd   m2, m1, m0
    punpckhwd   m1, m0
    punpcklwd   m0, m1, m2                  ; first two transposed rows
    punpckhwd   m1, m2                      ; last two transposed rows
    jmp         tx2q

.pass2:
    call m(iadst_4x4_internal).main

.end:
    pxor        m2, m2
    REPX        {mova [coeffq+16*x], m2}, 0, 1   ; zero all 16 coefficients

.end2:
    ITX4_END    3, 2, 1, 0
390
; The last stub must stay directly in front of iidentity_4x4_internal.
INV_TXFM_4X4_FN identity, dct
INV_TXFM_4X4_FN identity, adst
INV_TXFM_4X4_FN identity, flipadst
INV_TXFM_4X4_FN identity, identity

; 4x4 identity transform. Both passes scale every value as
;   x + pmulhrsw(x, 1697*8)   (that is, x + round(x * 1697 / 4096))
; with a saturating add.
; Entry : first pass; scales, transposes and jumps to tx2q.
; .pass2: second pass; scales and continues at iadst_4x4_internal.end.
cglobal iidentity_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    mova        m0, [coeffq+16*0]
    mova        m1, [coeffq+16*1]
    mova        m3, [o(pw_1697x8)]
    pmulhrsw    m2, m0, m3
    paddsw      m0, m2
    pmulhrsw    m3, m1
    paddsw      m1, m3
    ; 4x4 word transpose of m0/m1
    punpckhwd   m2, m0, m1
    punpcklwd   m0, m1
    punpckhwd   m1, m0, m2                  ; low: in2, high: in3 of the next pass
    punpcklwd   m0, m2                      ; low: in0, high: in1 of the next pass
    jmp         tx2q

.pass2:
    mova        m3, [o(pw_1697x8)]
    pmulhrsw    m2, m3, m0
    paddsw      m0, m2
    pmulhrsw    m3, m1
    paddsw      m1, m3
    jmp m(iadst_4x4_internal).end
417
; IWHT4_1D_PACKED
; One 4-point inverse Walsh-Hadamard pass on the packed values in m0/m1.
; Results (per the lane annotations):
;   m0 high = out0, m1 = out2 (low) | out1 (high), m2 low = out3
; Clobbers m3.
%macro IWHT4_1D_PACKED 0
    punpckhqdq  m3, m0, m1                  ; low: in1, high: in3
    punpcklqdq  m0, m1                      ; low: in0, high: in2
    psubw       m2, m0, m3                  ; low: in0 - in1, high: in2 - in3
    paddw       m0, m3                      ; low: in0 + in1, high: in2 + in3
    punpcklqdq  m0, m0                      ; t0 in both halves
    punpckhqdq  m2, m2                      ; t2 in both halves
    psubw       m1, m0, m2
    psraw       m1, 1                       ; t4 in both halves
    psubw       m1, m3                      ; low: t1 = out2, high: t3 = out1
    psubw       m0, m1                      ; high: out0
    paddw       m2, m1                      ; low: out3
%endmacro
431
; inv_txfm_add_wht_wht_4x4(dst, stride, coeff)
; 4x4 inverse Walsh-Hadamard transform added to the picture. The coefficients
; are loaded, the buffer is zeroed, and the values are arithmetically shifted
; right by 2 before two 1-D passes. No final rounding multiply is applied
; (ITX4_END is invoked with rnd = 0).
cglobal inv_txfm_add_wht_wht_4x4, 3, 3, 4, dst, stride, coeff
    mova        m0, [coeffq+16*0]
    mova        m1, [coeffq+16*1]
    pxor        m2, m2
    REPX        {mova [coeffq+16*x], m2}, 0, 1
    REPX        {psraw x, 2}, m0, m1

    IWHT4_1D_PACKED

    ; regroup the first-pass outputs as input for the second pass
    punpckhwd   m0, m1
    punpcklwd   m3, m1, m2
    punpckhdq   m1, m0, m3
    punpckldq   m0, m3

    IWHT4_1D_PACKED

    shufpd      m0, m2, 0x01                ; m0: old high qword of m0 | low qword of m2
    ITX4_END    0, 3, 2, 1, 0
452
453
; IDCT8_1D_PACKED
; One 8-point inverse DCT pass on packed data (low | high qword):
;   in : m0 = in0 | in1, m1 = in2 | in3, m2 = in4 | in5, m3 = in6 | in7
;   out: m0 = out0 | out1, m1 = out3 | out2, m2 = out4 | out5, m3 = out7 | out6
; Clobbers m4 and m6 (m6 is left holding pd_2048).
%macro IDCT8_1D_PACKED 0
    punpckhwd   m4, m0, m3                  ; (in1, in7) pairs
    punpcklwd   m0, m2                      ; (in0, in4) pairs
    punpckhwd   m2, m1                      ; (in5, in3) pairs
    punpcklwd   m1, m3                      ; (in2, in6) pairs
    mova        m6, [o(pd_2048)]            ; rounding term
    ITX_MUL2X_PACK 4, 3, 6,  799, 4017      ; low: t7a, high: t4a
    ITX_MUL2X_PACK 2, 3, 6, 3406, 2276      ; low: t6a, high: t5a
    ITX_MUL2X_PACK 1, 3, 6, 1567, 3784      ; low: t3,  high: t2
    psubsw      m3, m4, m2                  ; low: t6a, high: t5a
    paddsw      m4, m2                      ; low: t7,  high: t4
    pshufb      m3, [o(deint_shuf1)]        ; pair up t6a/t5a for the next multiply
    ITX_MUL2X_PACK 0, 2, 6, 2896, 2896      ; low: t0,  high: t1
    ITX_MUL2X_PACK 3, 2, 6, 2896, 2896      ; low: t6,  high: t5
    psubsw      m2, m0, m1                  ; low: tmp3, high: tmp2
    paddsw      m0, m1                      ; low: tmp0, high: tmp1
    punpcklqdq  m1, m4, m3                  ; low: t7,  high: t6
    punpckhqdq  m4, m3                      ; low: t4,  high: t5
    psubsw      m3, m0, m1                  ; low: out7, high: out6
    paddsw      m0, m1                      ; low: out0, high: out1
    paddsw      m1, m2, m4                  ; low: out3, high: out2
    psubsw      m2, m4                      ; low: out4, high: out5
%endmacro
477
; ITX_MULSUB_2W src1, src2, tmp1, tmp2, rnd, coef1, coef2[, dst2_in_tmp1]
;
;   dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
;   dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
;
; dst1 is returned in m<src1>. dst2 is returned in m<src2>, or in m<tmp1>
; when dst2_in_tmp1 is non-zero. m<rnd> supplies the rounding term.
; A coef2 value below 8 selects the register form: m<coef2> and m<coef1> are
; then used directly as the pmaddwd operands for dst2 and dst1 instead of the
; pw_<coef2>_<coef1> / pw_<coef1>_m<coef2> tables.
; NOTE(review): in the register form the low/high halves are multiplied in a
; fixed order, which looks like it only matches the packing order for
; dst2_in_tmp1 == 0 - confirm before combining the two options.
%macro ITX_MULSUB_2W 7-8 0
    punpckhwd   m%4, m%1, m%2               ; high (src1, src2) pairs
    punpcklwd   m%1, m%2                    ; low  (src1, src2) pairs

    ; ---- dst2 ----
%if %7 < 8
    pmaddwd     m%2, m%7, m%1
    pmaddwd     m%3, m%7, m%4
%else
    mova        m%2, [o(pw_%7_%6)]
%if %8
    pmaddwd     m%3, m%1, m%2
    pmaddwd     m%2, m%4
%else
    pmaddwd     m%3, m%4, m%2
    pmaddwd     m%2, m%1
%endif
%endif
    REPX        {paddd x, m%5}, m%3, m%2
    REPX        {psrad x, 12}, m%3, m%2
%if %8
    packssdw    m%3, m%2                    ; dst2 in tmp1
%else
    packssdw    m%2, m%3                    ; dst2 in src2
%endif

    ; ---- dst1 ----
%if %7 < 8
    pmaddwd     m%4, m%6
    pmaddwd     m%1, m%6
%elif %8
    mova        m%2, [o(pw_%6_m%7)]         ; src2 register is free here
    pmaddwd     m%4, m%2
    pmaddwd     m%1, m%2
%else
    mova        m%3, [o(pw_%6_m%7)]         ; tmp1 register is free here
    pmaddwd     m%4, m%3
    pmaddwd     m%1, m%3
%endif
    REPX        {paddd x, m%5}, m%4, m%1
    REPX        {psrad x, 12}, m%4, m%1
    packssdw    m%1, m%4                    ; dst1 in src1
%endmacro
523
; IDCT4_1D in0, in1, in2, in3, tmp1, tmp2, rnd
; 4-point inverse DCT on four full registers of words. All arguments are
; register numbers; m<rnd> must hold pd_2048. The outputs replace the inputs:
; out0 -> m<in0>, out1 -> m<in1>, out2 -> m<in2>, out3 -> m<in3>.
; The second multiply uses m<in3> as scratch, so the two ITX_MULSUB_2W steps
; must stay in this order.
%macro IDCT4_1D 7
    ITX_MULSUB_2W %2, %4, %5, %6, %7, 1567, 3784, 1    ; t2 in m%2, t3 in m%5
    ITX_MULSUB_2W %1, %3, %4, %6, %7, 2896, 2896, 1    ; t1 in m%1, t0 in m%4
    psubsw      m%3, m%1, m%2               ; out2 = t1 - t2
    paddsw      m%2, m%1                    ; out1 = t1 + t2
    paddsw      m%1, m%5, m%4               ; out0 = t0 + t3
    psubsw      m%4, m%5                    ; out3 = t0 - t3
%endmacro
532
; WRITE_4X8 row0, row1, row2, row3
; Adds a 4x8 block to the picture as two 4x4 halves: m0/m1 go to the first
; four rows and m2/m3 to the next four, both using the same row order.
; Advances dstq by four rows; m4-m6 are scratch and r2 is clobbered by
; WRITE_4X4.
%macro WRITE_4X8 4
    WRITE_4X4   0, 1, 4, 5, 6, %1, %2, %3, %4
    lea         dstq, [dstq+strideq*4]
    WRITE_4X4   2, 3, 4, 5, 6, %1, %2, %3, %4
%endmacro
538
; INV_4X8
; Word/dword interleave of m0-m3 that produces the packed input layout of the
; 4x8 second pass (low | high qword):
;   m0 = in0 | in1, m1 = in2 | in3, m2 = in4 | in5, m3 = in6 | in7
; Clobbers m4.
%macro INV_4X8 0
    punpckhwd   m4, m2, m3
    punpcklwd   m2, m3
    punpckhwd   m3, m0, m1
    punpcklwd   m0, m1
    punpckhdq   m1, m0, m2                  ; low: in2, high: in3
    punpckldq   m0, m2                      ; low: in0, high: in1
    punpckldq   m2, m3, m4                  ; low: in4, high: in5
    punpckhdq   m3, m4                      ; low: in6, high: in7
%endmacro
549
; INV_TXFM_4X8_FN type1, type2
; Instantiates the 4x8 entry point for one transform pair. For dct_dct it
; also emits the eob == 0 shortcut: the first coefficient is broadcast,
; multiplied three times by 2896*8 and once by 2048 (pmulhrsw), copied to all
; four row registers and written out through iadst_4x8_internal.end3.
%macro INV_TXFM_4X8_FN 2
    INV_TXFM_FN %1, %2, 4x8, 8
%ifidn %1_%2, dct_dct
    pshuflw     m0, [coeffq], q0000         ; broadcast coeff[0] over the low qword
    mov         [coeffq], eobd              ; eobd is 0 on this path: clears the first 4 bytes
    punpcklqdq  m0, m0                      ; ...and over the high qword
    mova        m1, [o(pw_2896x8)]
    pmulhrsw    m0, m1
    pmulhrsw    m0, m1
    pmulhrsw    m0, m1
    pmulhrsw    m0, [o(pw_2048)]
    REPX        {mova x, m0}, m1, m2, m3
    TAIL_CALL m(iadst_4x8_internal).end3
%endif
%endmacro
567
; The last stub must stay directly in front of idct_4x8_internal.
INV_TXFM_4X8_FN dct, dct
INV_TXFM_4X8_FN dct, adst
INV_TXFM_4X8_FN dct, flipadst
INV_TXFM_4X8_FN dct, identity

; 4x8 inverse DCT.
; Entry : loads the coefficients pre-multiplied by 2896*8 (pmulhrsw).
; .pass1: first pass through idct_8x4_internal.main, then the shared
;         transpose at iadst_4x8_internal.pass1_end.
; .pass2: second pass through .main; swaps the qwords of m1/m3 so that the
;         outputs sit in ascending order, then continues at the shared tail
;         with a uniform 2048 multiplier.
; .main : IDCT8_1D_PACKED as a callable routine.
cglobal idct_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    mova        m3, [o(pw_2896x8)]
    pmulhrsw    m0, m3, [coeffq+16*0]
    pmulhrsw    m1, m3, [coeffq+16*1]
    pmulhrsw    m2, m3, [coeffq+16*2]
    pmulhrsw    m3,     [coeffq+16*3]

.pass1:
    call m(idct_8x4_internal).main
    jmp m(iadst_4x8_internal).pass1_end

.pass2:
    call .main
    REPX        {shufps x, x, q1032}, m1, m3
    mova        m4, [o(pw_2048)]
    jmp m(iadst_4x8_internal).end2

ALIGN function_align
.main:
    IDCT8_1D_PACKED
    ret
595
596
; The last stub must stay directly in front of iadst_4x8_internal.
INV_TXFM_4X8_FN adst, dct
INV_TXFM_4X8_FN adst, adst
INV_TXFM_4X8_FN adst, flipadst
INV_TXFM_4X8_FN adst, identity

; 4x8 inverse ADST, plus the tails shared by all 4x8 transforms.
; Entry      : loads the coefficients pre-multiplied by 2896*8 (pmulhrsw).
; .pass1     : first pass through iadst_8x4_internal.main.
; .pass1_end : transposes with INV_4X8 and jumps to tx2q.
; .pass2     : swaps the qwords of m0/m1 into the order .main expects, runs
;              .main and prepares +2048 (m4) / -2048 (m5).
; .end       : merges m4/m5 into one multiplier: low qword m4, high qword m5.
; .end2      : multiplies m0-m3 by m4 (pmulhrsw) and zeroes the coefficients.
; .end3      : adds m0-m3 to the picture and returns.
cglobal iadst_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    mova        m3, [o(pw_2896x8)]
    pmulhrsw    m0, m3, [coeffq+16*0]
    pmulhrsw    m1, m3, [coeffq+16*1]
    pmulhrsw    m2, m3, [coeffq+16*2]
    pmulhrsw    m3,     [coeffq+16*3]

.pass1:
    call m(iadst_8x4_internal).main

.pass1_end:
    INV_4X8
    jmp         tx2q

.pass2:
    REPX        {shufps x, x, q1032}, m0, m1
    call .main
    mova        m4, [o(pw_2048)]
    pxor        m5, m5
    psubw       m5, m4                      ; -2048, undoes the negated high halves

.end:
    punpcklqdq  m4, m5

.end2:
    REPX        {pmulhrsw x, m4}, m0, m1, m2, m3
    pxor        m5, m5
    REPX        {mova [coeffq+16*x], m5}, 0, 1, 2, 3

.end3:
    WRITE_4X8   0, 1, 2, 3
    RET

; .main: 8-point inverse ADST on packed data (low | high qword).
;   in : m0 = in1 | in0, m1 = in3 | in2, m2 = in4 | in5, m3 = in6 | in7
;   out: m0 = out0 | -out1, m1 = out2 | -out3,
;        m2 = out4 | -out5, m3 = out6 | -out7
; Clobbers m4-m7 (m6 is left holding pd_2048).
ALIGN function_align
.main:
    punpckhwd   m4, m3, m0                  ; (in7, in0) pairs
    punpckhwd   m5, m2, m1                  ; (in5, in2) pairs
    punpcklwd   m1, m2                      ; (in3, in4) pairs
    punpcklwd   m0, m3                      ; (in1, in6) pairs
    mova        m6, [o(pd_2048)]            ; rounding term
    ITX_MUL2X_PACK 4, 2, 6,  401, 4076      ; low: t0a, high: t1a
    ITX_MUL2X_PACK 5, 2, 6, 1931, 3612      ; low: t2a, high: t3a
    ITX_MUL2X_PACK 1, 2, 6, 3166, 2598      ; low: t4a, high: t5a
    ITX_MUL2X_PACK 0, 2, 6, 3920, 1189      ; low: t6a, high: t7a

    psubsw      m3, m4, m1                  ; low: t4,  high: t5
    paddsw      m4, m1                      ; low: t0,  high: t1
    psubsw      m2, m5, m0                  ; low: t6,  high: t7
    paddsw      m5, m0                      ; low: t2,  high: t3

    shufps      m1, m3, m2, q1032
    punpckhwd   m2, m1
    punpcklwd   m3, m1
    ITX_MUL2X_PACK 3, 0, 6, 1567, 3784, 1   ; low: t5a, high: t4a
    ITX_MUL2X_PACK 2, 0, 6, 3784, 1567      ; low: t7a, high: t6a

    psubsw      m1, m4, m5                  ; low: t2,   high: t3
    paddsw      m4, m5                      ; low: out0, high: -out7
    psubsw      m5, m3, m2                  ; low: t7,   high: t6
    paddsw      m3, m2                      ; low: out6, high: -out1
    shufps      m0, m4, m3, q3210           ; low: out0, high: -out1
    shufps      m3, m4, q3210               ; low: out6, high: -out7

    mova        m2, [o(pw_2896_m2896)]
    mova        m7, [o(pw_2896_2896)]
    shufps      m4, m1, m5, q1032           ; low: t3,   high: t7
    shufps      m1, m5, q3210               ; low: t2,   high: t6
    punpcklwd   m5, m1, m4
    punpckhwd   m1, m4
    pmaddwd     m4, m2, m1                  ; -out5
    pmaddwd     m2, m5                      ;  out4
    pmaddwd     m1, m7                      ;  out2
    pmaddwd     m5, m7                      ; -out3
    REPX        {paddd x, m6}, m4, m2, m1, m5
    REPX        {psrad x, 12}, m4, m2, m1, m5
    packssdw    m1, m5                      ; low: out2, high: -out3
    packssdw    m2, m4                      ; low: out4, high: -out5
    ret
687
; The last stub must stay directly in front of iflipadst_4x8_internal.
INV_TXFM_4X8_FN flipadst, dct
INV_TXFM_4X8_FN flipadst, adst
INV_TXFM_4X8_FN flipadst, flipadst
INV_TXFM_4X8_FN flipadst, identity

; 4x8 inverse flipped ADST: the ADST kernels with the outputs reversed.
; Entry : loads the coefficients pre-multiplied by 2896*8 (pmulhrsw).
; .pass1: first pass through iadst_8x4_internal.main, then a transpose whose
;         operands are swapped relative to INV_4X8 so the order is reversed.
; .pass2: second pass through iadst_4x8_internal.main, then the registers are
;         swapped (m0<->m3, m1<->m2) with their qwords exchanged. The signs
;         are therefore swapped too: -2048 for the low and +2048 for the high
;         qword, merged at iadst_4x8_internal.end.
cglobal iflipadst_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    mova        m3, [o(pw_2896x8)]
    pmulhrsw    m0, m3, [coeffq+16*0]
    pmulhrsw    m1, m3, [coeffq+16*1]
    pmulhrsw    m2, m3, [coeffq+16*2]
    pmulhrsw    m3,     [coeffq+16*3]

.pass1:
    call m(iadst_8x4_internal).main

    punpcklwd   m4, m3, m2
    punpckhwd   m3, m2
    punpcklwd   m5, m1, m0
    punpckhwd   m1, m0
    punpckldq   m2, m3, m1                  ; low: in4, high: in5
    punpckhdq   m3, m1                      ; low: in6, high: in7
    punpckldq   m0, m4, m5                  ; low: in0, high: in1
    punpckhdq   m1, m4, m5                  ; low: in2, high: in3
    jmp         tx2q

.pass2:
    REPX        {shufps x, x, q1032}, m0, m1
    call m(iadst_4x8_internal).main

    mova        m4, m0
    mova        m5, m1
    pshufd      m0, m3, q1032
    pshufd      m1, m2, q1032
    pshufd      m2, m5, q1032
    pshufd      m3, m4, q1032
    mova        m5, [o(pw_2048)]
    pxor        m4, m4
    psubw       m4, m5                      ; m4 = -2048, m5 = +2048
    jmp m(iadst_4x8_internal).end
728
; The last stub must stay directly in front of iidentity_4x8_internal.
INV_TXFM_4X8_FN identity, dct
INV_TXFM_4X8_FN identity, adst
INV_TXFM_4X8_FN identity, flipadst
INV_TXFM_4X8_FN identity, identity

; 4x8 identity transform.
; Entry : loads the coefficients pre-multiplied by 2896*8 (pmulhrsw).
; .pass1: x + pmulhrsw(x, 1697*8) on every value (saturating add), then the
;         shared transpose at iadst_4x8_internal.pass1_end.
; .pass2: only selects a 4096 multiplier and continues at the shared tail.
cglobal iidentity_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    mova        m3, [o(pw_2896x8)]
    pmulhrsw    m0, m3, [coeffq+16*0]
    pmulhrsw    m1, m3, [coeffq+16*1]
    pmulhrsw    m2, m3, [coeffq+16*2]
    pmulhrsw    m3,     [coeffq+16*3]

.pass1:
    mova        m7, [o(pw_1697x8)]
    pmulhrsw    m4, m7, m0
    paddsw      m0, m4
    pmulhrsw    m5, m7, m1
    paddsw      m1, m5
    pmulhrsw    m6, m7, m2
    paddsw      m2, m6
    pmulhrsw    m7, m3                      ; last use of the constant: overwrite it
    paddsw      m3, m7
    jmp m(iadst_4x8_internal).pass1_end

.pass2:
    mova        m4, [o(pw_4096)]
    jmp m(iadst_4x8_internal).end2
756
757
; WRITE_8X2 coef0, coef1, tmp0, tmp1, tmp2
; Adds two rows of eight 16-bit residuals to the 8-bit pixels at dstq and
; dstq+strideq, storing with unsigned saturation.
;   coef0/coef1  residuals for the first/second row: a register number, or
;                any other operand that paddw accepts (used verbatim)
;   tmp0-2       scratch register numbers
%macro WRITE_8X2 5
    pxor        m%5, m%5
    movq        m%3, [dstq        ]
    punpcklbw   m%3, m%5                    ; zero-extend row 0 to words
    movq        m%4, [dstq+strideq]
    punpcklbw   m%4, m%5                    ; zero-extend row 1 to words
%ifnum %1
    paddw       m%3, m%1
%else
    paddw       m%3, %1
%endif
%ifnum %2
    paddw       m%4, m%2
%else
    paddw       m%4, %2
%endif
    packuswb    m%3, m%4                    ; low qword: row 0, high qword: row 1
    movq        [dstq        ], m%3
    punpckhqdq  m%3, m%3
    movq        [dstq+strideq], m%3
%endmacro
779
; WRITE_8X4 coef0, coef1, coef2, coef3, tmp0, tmp1, tmp2
; Adds four rows of eight residuals to the picture as two WRITE_8X2 steps.
; Advances dstq by two rows in between, so dstq ends up pointing at the
; third row.
%macro WRITE_8X4 7
    WRITE_8X2   %1, %2, %5, %6, %7
    lea         dstq, [dstq+strideq*2]
    WRITE_8X2   %3, %4, %5, %6, %7
%endmacro
785
; INV_TXFM_8X4_FN type1, type2
; Instantiates the 8x4 entry point for one transform pair. For dct_dct it
; also emits the eob == 0 shortcut: the first coefficient is broadcast,
; multiplied three times by 2896*8 and once by 2048 (pmulhrsw), copied to all
; four row registers and handed to iadst_8x4_internal.end2, which zeroes the
; coefficient buffer before writing out.
%macro INV_TXFM_8X4_FN 2
    INV_TXFM_FN %1, %2, 8x4, 8
%ifidn %1_%2, dct_dct
    pshuflw     m0, [coeffq], q0000         ; broadcast coeff[0] over the low qword
    punpcklqdq  m0, m0                      ; ...and over the high qword
    mova        m1, [o(pw_2896x8)]
    pmulhrsw    m0, m1
    pmulhrsw    m0, m1
    pmulhrsw    m0, m1
    pmulhrsw    m0, [o(pw_2048)]
    REPX        {mova x, m0}, m1, m2, m3
    TAIL_CALL m(iadst_8x4_internal).end2
%endif
%endmacro
803
; The last stub must stay directly in front of idct_8x4_internal.
INV_TXFM_8X4_FN dct, dct
INV_TXFM_8X4_FN dct, adst
INV_TXFM_8X4_FN dct, flipadst
INV_TXFM_8X4_FN dct, identity

; 8x4 inverse DCT.
; Entry : loads the coefficients pre-multiplied by 2896*8 (pmulhrsw), runs the
;         packed 8-point DCT (idct_4x8_internal.main) and regroups the result
;         into four full rows in0..in3 for the second pass.
; .pass2: 4-point DCT on full registers (.main), then the shared rounding and
;         write-out tail at iadst_8x4_internal.end.
; .main : IDCT4_1D on m0-m3 with m4/m5 as scratch and m6 = pd_2048.
cglobal idct_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    mova        m3, [o(pw_2896x8)]
    pmulhrsw    m0, m3, [coeffq+16*0]
    pmulhrsw    m1, m3, [coeffq+16*1]
    pmulhrsw    m2, m3, [coeffq+16*2]
    pmulhrsw    m3,     [coeffq+16*3]

    call m(idct_4x8_internal).main

    mova        m4, [o(deint_shuf1)]
    mova        m5, [o(deint_shuf2)]
    pshufb      m0, m4
    pshufb      m1, m5
    pshufb      m2, m4
    pshufb      m3, m5
    punpckhdq   m4, m0, m1
    punpckldq   m0, m1
    punpckhdq   m5, m2, m3
    punpckldq   m2, m3
    punpckhqdq  m1, m0, m2                  ; in1
    punpcklqdq  m0, m2                      ; in0
    punpckhqdq  m3, m4, m5                  ; in3
    punpcklqdq  m2, m4, m5                  ; in2
    jmp         tx2q

.pass2:
    call .main
    jmp m(iadst_8x4_internal).end

ALIGN function_align
.main:
    mova        m6, [o(pd_2048)]
    IDCT4_1D    0, 1, 2, 3, 4, 5, 6
    ret
843
; 8x4 entry points whose first transform is the ADST. The last stub must stay
; directly in front of iadst_8x4_internal.
INV_TXFM_8X4_FN adst, dct
INV_TXFM_8X4_FN adst, adst
INV_TXFM_8X4_FN adst, flipadst
INV_TXFM_8X4_FN adst, identity
848
849cglobal iadst_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
850    mova                 m3, [o(pw_2896x8)]
851    pmulhrsw             m0, m3, [coeffq+16*0]
852    pmulhrsw             m1, m3, [coeffq+16*1]
853    pmulhrsw             m2, m3, [coeffq+16*2]
854    pmulhrsw             m3,     [coeffq+16*3]
855
856    shufps               m0, m0, q1032
857    shufps               m1, m1, q1032
858    call m(iadst_4x8_internal).main
859
860    punpckhwd            m4, m0, m1
861    punpcklwd            m0, m1
862    punpckhwd            m1, m2, m3
863    punpcklwd            m2, m3
864    pxor                 m5, m5
865    psubsw               m3, m5, m1
866    psubsw               m5, m4
867    punpckhdq            m4, m5, m3
868    punpckldq            m5, m3
869    punpckhdq            m3, m0, m2
870    punpckldq            m0, m2
871    punpckhwd            m1, m0, m5      ;in1
872    punpcklwd            m0, m5          ;in0
873    punpcklwd            m2, m3, m4      ;in2
874    punpckhwd            m3, m4          ;in3
875    jmp              tx2q
876
877.pass2:
878    call .main
879
880.end:
881    mova                 m4, [o(pw_2048)]
882    pmulhrsw             m0, m4
883    pmulhrsw             m1, m4
884    pmulhrsw             m2, m4
885    pmulhrsw             m3, m4
886
887.end2:
888    pxor                 m6, m6
889    mova      [coeffq+16*0], m6
890    mova      [coeffq+16*1], m6
891    mova      [coeffq+16*2], m6
892    mova      [coeffq+16*3], m6
893.end3:
894    WRITE_8X4             0, 1, 2, 3, 4, 5, 6
895    RET
896
897ALIGN function_align
898.main:
899    punpckhwd            m6, m0, m2                    ;unpacked in0 in2
900    punpcklwd            m0, m2                        ;unpacked in0 in2
901    punpckhwd            m7, m1, m3                    ;unpacked in1 in3
902    punpcklwd            m1, m3                        ;unpacked in1 in3
903
904    mova                 m2, [o(pw_3344_m3344)]
905    mova                 m4, [o(pw_0_3344)]
906    pmaddwd              m3, m2, m6                    ;3344 * in0 - 3344 * in2
907    pmaddwd              m5, m4, m7                    ;3344 * in3
908    pmaddwd              m2, m0
909    pmaddwd              m4, m1
910    paddd                m3, m5
911    paddd                m2, m4
912    mova                 m4, [o(pd_2048)]
913    paddd                m3, m4                        ;t2 + 2048
914    paddd                m2, m4
915    psrad                m3, 12
916    psrad                m2, 12
917    packssdw             m2, m3                        ;out2
918
919    pmaddwd              m4, m0, [o(pw_1321_3803)]     ;1321 * in0 + 3803 * in2
920    pmaddwd              m0, [o(pw_2482_m1321)]        ;2482 * in0 - 1321 * in2
921    pmaddwd              m3, m1, [o(pw_3344_2482)]     ;3344 * in1 + 2482 * in3
922    pmaddwd              m5, m1, [o(pw_3344_m3803)]    ;3344 * in1 - 3803 * in3
923    paddd                m3, m4                        ;t0 + t3
924
925    pmaddwd              m1, [o(pw_m6688_m3803)]       ;-2 * 3344 * in1 - 3803 * in3
926    mova                 m4, [o(pd_2048)]
927    paddd                m0, m4
928    paddd                m4, m3                        ;t0 + t3 + 2048
929    paddd                m5, m0                        ;t1 + t3 + 2048
930    paddd                m3, m0
931    paddd                m3, m1                        ;t0 + t1 - t3 + 2048
932
933    psrad                m4, 12                        ;out0
934    psrad                m5, 12                        ;out1
935    psrad                m3, 12                        ;out3
936    packssdw             m0, m4, m5                    ;low: out0  high: out1
937
938    pmaddwd              m4, m6, [o(pw_1321_3803)]     ;1321 * in0 + 3803 * in2
939    pmaddwd              m6, [o(pw_2482_m1321)]        ;2482 * in0 - 1321 * in2
940    pmaddwd              m1, m7, [o(pw_3344_2482)]     ;3344 * in1 + 2482 * in3
941    pmaddwd              m5, m7, [o(pw_3344_m3803)]    ;3344 * in1 - 3803 * in3
942    paddd                m1, m4                        ;t0 + t3
943    pmaddwd              m7, [o(pw_m6688_m3803)]       ;-2 * 3344 * in1 - 3803 * in3
944
945    mova                 m4, [o(pd_2048)]
946    paddd                m6, m4
947    paddd                m4, m1                        ;t0 + t3 + 2048
948    paddd                m5, m6                        ;t1 + t3 + 2048
949    paddd                m1, m6
950    paddd                m1, m7                        ;t0 + t1 - t3 + 2048
951
952    psrad                m4, 12                        ;out0
953    psrad                m5, 12                        ;out1
954    psrad                m1, 12                        ;out3
955    packssdw             m3, m1                        ;out3
956    packssdw             m4, m5                        ;low: out0  high: out1
957
958    punpckhqdq           m1, m0, m4                    ;out1
959    punpcklqdq           m0, m4                        ;out0
960    ret
961
; Instantiate INV_TXFM_8X4_FN (defined outside this section) with
; type1 = flipadst and each of the four type2 values.
962INV_TXFM_8X4_FN flipadst, dct
963INV_TXFM_8X4_FN flipadst, adst
964INV_TXFM_8X4_FN flipadst, flipadst
965INV_TXFM_8X4_FN flipadst, identity
966
; iflipadst_8x4_internal
;   first pass: same load, pw_2896x8 scaling and m(iadst_4x8_internal).main
;               call as iadst_8x4_internal, but the interleave that follows
;               takes the result registers in the opposite order (m3, m2,
;               m1, m0) before continuing at tx2q.
;   .pass2:     runs m(iadst_8x4_internal).main, reverses the order of the
;               four output rows and jumps to the shared 8x4 tail.
; Uses m4 and m5 as scratch.
967cglobal iflipadst_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
968    mova                 m3, [o(pw_2896x8)]
969    pmulhrsw             m0, m3, [coeffq+16*0]
970    pmulhrsw             m1, m3, [coeffq+16*1]
971    pmulhrsw             m2, m3, [coeffq+16*2]
972    pmulhrsw             m3,     [coeffq+16*3]
973
974    shufps               m0, m0, q1032               ;swap the two 64-bit halves of m0
975    shufps               m1, m1, q1032               ;swap the two 64-bit halves of m1
976    call m(iadst_4x8_internal).main
977
978    punpckhwd            m5, m3, m2
979    punpcklwd            m3, m2
980    punpckhwd            m2, m1, m0
981    punpcklwd            m1, m0
982
983    pxor                 m0, m0
984    psubsw               m4, m0, m2                  ;m4 = -m2 (saturating)
985    psubsw               m0, m5                      ;m0 = -m5 (saturating)
986    punpckhdq            m2, m0, m4
987    punpckldq            m0, m4
988    punpckhdq            m4, m3, m1
989    punpckldq            m3, m1
990    punpckhwd            m1, m0, m3      ;in1
991    punpcklwd            m0, m3          ;in0
992    punpckhwd            m3, m2, m4      ;in3
993    punpcklwd            m2, m4          ;in2
994    jmp                  tx2q
995
996.pass2:
997    call m(iadst_8x4_internal).main
; Reverse the row order: (m0, m1, m2, m3) <- (m3, m2, m1, m0).
998    mova                 m4, m0
999    mova                 m5, m1
1000    mova                 m0, m3
1001    mova                 m1, m2
1002    mova                 m2, m5
1003    mova                 m3, m4
1004    jmp m(iadst_8x4_internal).end
1005
; Instantiate INV_TXFM_8X4_FN (defined outside this section) with
; type1 = identity and each of the four type2 values.
1006INV_TXFM_8X4_FN identity, dct
1007INV_TXFM_8X4_FN identity, adst
1008INV_TXFM_8X4_FN identity, flipadst
1009INV_TXFM_8X4_FN identity, identity
1010
; iidentity_8x4_internal
;   first pass: loads four 16-byte coefficient rows from coeffq, scales them
;               by pw_2896x8, doubles each value with a saturating add and
;               transposes the result into m0-m3 (in0-in3) before continuing
;               at tx2q.
;   .pass2:     x += pmulhrsw(x, pw_1697x8) for m0-m3, then jumps to the
;               shared 8x4 tail in iadst_8x4_internal.
; Uses m4-m7 as scratch.
1011cglobal iidentity_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
1012    mova                 m3, [o(pw_2896x8)]
1013    pmulhrsw             m0, m3, [coeffq+16*0]
1014    pmulhrsw             m1, m3, [coeffq+16*1]
1015    pmulhrsw             m2, m3, [coeffq+16*2]
1016    pmulhrsw             m3,     [coeffq+16*3]
1017    paddsw               m0, m0                      ;x *= 2 (saturating)
1018    paddsw               m1, m1
1019    paddsw               m2, m2
1020    paddsw               m3, m3
1021
; Word/dword interleaves that regroup the data into in0-in3.
1022    punpckhwd            m4, m0, m1
1023    punpcklwd            m0, m1
1024    punpckhwd            m1, m2, m3
1025    punpcklwd            m2, m3
1026    punpckhdq            m5, m4, m1
1027    punpckldq            m4, m1
1028    punpckhdq            m3, m0, m2
1029    punpckldq            m0, m2
1030    punpckhwd            m1, m0, m4      ;in1
1031    punpcklwd            m0, m4          ;in0
1032    punpcklwd            m2, m3, m5      ;in2
1033    punpckhwd            m3, m5          ;in3
1034    jmp                tx2q
1035
1036.pass2:
1037    mova                 m7, [o(pw_1697x8)]
1038    pmulhrsw             m4, m7, m0                  ;m4-m7 = rounded products with pw_1697x8
1039    pmulhrsw             m5, m7, m1
1040    pmulhrsw             m6, m7, m2
1041    pmulhrsw             m7, m3
1042    paddsw               m0, m4                      ;x += product (saturating)
1043    paddsw               m1, m5
1044    paddsw               m2, m6
1045    paddsw               m3, m7
1046    jmp m(iadst_8x4_internal).end
1047
; INV_TXFM_8X8_FN type1, type2
; Expands INV_TXFM_FN (defined outside this section) for the 8x8 size with the
; extra arguments 8 and 16*4. For the dct_dct pairing it additionally emits a
; path that works from the first coefficient only:
;   - word 0 of [coeffq] is replicated into every word of m0,
;   - m0 is multiplied (pmulhrsw) by pw_2896x8, pw_16384, pw_2896x8 again and
;     finally by pw_16384 shifted right by 3,
;   - .end/.loop call WRITE_8X4 twice with m0 supplied for all four rows,
;     adding 2*stride to dstq after each call,
;   - control then leaves through tx2q, which was pointed at .end3 (RET).
; .end is a label of inv_txfm_add_dct_dct_8x8 (see the m() reference below).
1048%macro INV_TXFM_8X8_FN 2 ; type1, type2
1049    INV_TXFM_FN          %1, %2, 8x8, 8, 16*4
1050%ifidn %1_%2, dct_dct
1051    pshuflw              m0, [coeffq], q0000         ;replicate word 0 into the low four words
1052    punpcklwd            m0, m0                      ;... and into all eight words
1053    mova                 m1, [o(pw_2896x8)]
1054    pmulhrsw             m0, m1
1055    mova                 m2, [o(pw_16384)]
1056    mov            [coeffq], eobd                    ;NOTE(review): presumably eob is 0 on this path, so this clears the coefficient - confirm against INV_TXFM_FN
1057    pmulhrsw             m0, m2
1058    psrlw                m2, 3                       ;per-word logical shift of the constant
1059    pmulhrsw             m0, m1
1060    pmulhrsw             m0, m2
1061.end:
1062    mov                 r3d, 2                       ;two WRITE_8X4 calls
1063    lea                tx2q, [o(m(inv_txfm_add_dct_dct_8x8).end3)]
1064.loop:
1065    WRITE_8X4             0, 0, 0, 0, 1, 2, 3
1066    lea                dstq, [dstq+strideq*2]
1067    dec                 r3d
1068    jg .loop
1069    jmp                tx2q
1070.end3:
1071    RET
1072%endif
1073%endmacro
1074
; LOAD_8ROWS src, stride[, is_rect2 = 0]
; Loads eight rows into m0-m7 from [src + stride*0] .. [src + stride*7].
;   is_rect2 == 0: plain mova loads.
;   is_rect2 != 0: every row is multiplied by pw_2896x8 with pmulhrsw while it
;                  is loaded; m7 holds the constant until the last row, which
;                  overwrites it.
; mova and the pmulhrsw memory operands need the addresses to meet the
; alignment the instruction encoding requires.
1075%macro LOAD_8ROWS 2-3 0 ; src, stride, is_rect2
1076%if %3
1077    mova                 m7, [o(pw_2896x8)]
1078    pmulhrsw             m0, m7, [%1+%2*0]
1079    pmulhrsw             m1, m7, [%1+%2*1]
1080    pmulhrsw             m2, m7, [%1+%2*2]
1081    pmulhrsw             m3, m7, [%1+%2*3]
1082    pmulhrsw             m4, m7, [%1+%2*4]
1083    pmulhrsw             m5, m7, [%1+%2*5]
1084    pmulhrsw             m6, m7, [%1+%2*6]
1085    pmulhrsw             m7, [%1+%2*7]               ;last row replaces the constant
1086%else
1087    mova                 m0, [%1+%2*0]
1088    mova                 m1, [%1+%2*1]
1089    mova                 m2, [%1+%2*2]
1090    mova                 m3, [%1+%2*3]
1091    mova                 m4, [%1+%2*4]
1092    mova                 m5, [%1+%2*5]
1093    mova                 m6, [%1+%2*6]
1094    mova                 m7, [%1+%2*7]
1095%endif
1096%endmacro
1097
; IDCT8_1D_ODDHALF src1, src2, src3, src4, tmp1, tmp2, pd_2048
; Odd half of an 8-point inverse DCT. All arguments are register numbers;
; the last one names the register that holds pd_2048.
; Two ITX_MULSUB_2W rotations (799/4017 and 3406/2276) produce t4a/t7a and
; t5a/t6a, saturating adds/subtracts combine them, and a final 2896/2896
; rotation produces t5/t6. According to the existing per-line notes the
; results are left as: m%1 = t4, m%2 = t5, m%5 (tmp1) = t6, m%4 = t7.
; ITX_MULSUB_2W is defined outside this section.
1098%macro IDCT8_1D_ODDHALF 7 ; src[1-4], tmp[1-2], pd_2048
1099    ITX_MULSUB_2W         %1, %4, %5, %6, %7,  799, 4017    ;t4a, t7a
1100    ITX_MULSUB_2W         %3, %2, %5, %6, %7, 3406, 2276, 1 ;t5a, t6a
1101    psubsw               m%2, m%4, m%5                      ;t6a
1102    paddsw               m%4, m%5                           ;t7
1103    psubsw               m%5, m%1, m%3                      ;t5a
1104    paddsw               m%1, m%3                           ;t4
1105    ITX_MULSUB_2W         %2, %5, %3, %6, %7, 2896, 2896, 1 ;t5, t6
1106%endmacro
1107
; Instantiate INV_TXFM_8X8_FN with type1 = dct and each of the four type2
; values.
1108INV_TXFM_8X8_FN dct, dct
1109INV_TXFM_8X8_FN dct, adst
1110INV_TXFM_8X8_FN dct, flipadst
1111INV_TXFM_8X8_FN dct, identity
1112
; idct_8x8_internal
;   entry / .pass1: loads eight 16-byte rows from coeffq into m0-m7 and runs
;                   .main on them.
;   .pass1_end*:    multiply the results by pw_16384 (pmulhrsw), then
;                   .pass1_end3 transposes the 8x8 block of words and
;                   continues at tx2q. These labels are also jumped to by the
;                   other 8x8 transforms in this file.
;   .pass2:         points tx2q at .end4 and runs .main again.
;   .end/.end2:     multiply by pw_2048 (pmulhrsw) and park rows 5-7 on the
;                   stack.
;   .end3:          two WRITE_8X4 calls (rows 0-3, then rows 4-7), then
;                   continues at tx2q.
;   .end4:          clears the 8 x 16 bytes of coefficients at coeffq, ret.
;   .main:          8-point inverse DCT, see the notes at the label.
; Stack scratch: three 16-byte slots at [rsp+gprsize+16*0..2]. Inside .main
; the same slots are addressed with gprsize*2 - presumably because the call
; that reaches .main pushed one more return address (TODO confirm against
; INV_TXFM_FN's stack setup).
1113cglobal idct_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
1114    LOAD_8ROWS          coeffq, 16
1115
1116.pass1:
1117    call .main
1118
1119.pass1_end:
1120    mova                    m7, [o(pw_16384)]
1121
1122.pass1_end1:
1123    REPX      {pmulhrsw x, m7}, m0, m2, m4, m6
1124    mova    [rsp+gprsize+16*1], m6                 ;spill row 6, m6 is needed as scratch below
1125
1126.pass1_end2:
1127    REPX      {pmulhrsw x, m7}, m1, m3, m5
1128    pmulhrsw                m7, [rsp+gprsize+16*0] ;row 7 was left in stack slot 0
1129
; 8x8 transpose of 16-bit words. In the layout notes "rc" stands for row r,
; column c. Expects row 6 in stack slot 1 and all other rows in registers.
1130.pass1_end3:
1131    punpcklwd               m6, m1, m5             ;10 50 11 51 12 52 13 53
1132    punpckhwd               m1, m5                 ;14 54 15 55 16 56 17 57
1133    punpckhwd               m5, m0, m4             ;04 44 05 45 06 46 07 47
1134    punpcklwd               m0, m4                 ;00 40 01 41 02 42 03 43
1135    punpckhwd               m4, m3, m7             ;34 74 35 75 36 76 37 77
1136    punpcklwd               m3, m7                 ;30 70 31 71 32 72 33 73
1137    punpckhwd               m7, m1, m4             ;16 36 56 76 17 37 57 77
1138    punpcklwd               m1, m4                 ;14 34 54 74 15 35 55 75
1139    punpckhwd               m4, m6, m3             ;12 32 52 72 13 33 53 73
1140    punpcklwd               m6, m3                 ;10 30 50 70 11 31 51 71
1141    mova    [rsp+gprsize+16*2], m6
1142    mova                    m6, [rsp+gprsize+16*1]
1143    punpckhwd               m3, m2, m6             ;24 64 25 65 26 66 27 67
1144    punpcklwd               m2, m6                 ;20 60 21 61 22 62 23 63
1145    punpckhwd               m6, m5, m3             ;06 26 46 66 07 27 47 67
1146    punpcklwd               m5, m3                 ;04 24 44 64 05 25 45 65
1147    punpckhwd               m3, m0, m2             ;02 22 42 62 03 23 43 63
1148    punpcklwd               m0, m2                 ;00 20 40 60 01 21 41 61
1149
1150    punpckhwd               m2, m6, m7             ;07 17 27 37 47 57 67 77
1151    punpcklwd               m6, m7                 ;06 16 26 36 46 56 66 76
1152    mova    [rsp+gprsize+16*0], m2
1153    punpcklwd               m2, m3, m4             ;02 12 22 32 42 52 62 72
1154    punpckhwd               m3, m4                 ;03 13 23 33 43 53 63 73
1155    punpcklwd               m4, m5, m1             ;04 14 24 34 44 54 64 74
1156    punpckhwd               m5, m1                 ;05 15 25 35 45 55 65 75
1157    mova                    m7, [rsp+gprsize+16*2]
1158    punpckhwd               m1, m0, m7             ;01 11 21 31 41 51 61 71
1159    punpcklwd               m0, m7                 ;00 10 20 30 40 50 60 70
1160    mova                    m7, [rsp+gprsize+16*0]
1161    jmp                   tx2q
1162
1163.pass2:
1164    lea                   tx2q, [o(m(idct_8x8_internal).end4)]
1165
1166.pass2_main:
1167    call .main
1168
1169.end:
1170    mova                    m7, [o(pw_2048)]
1171    REPX      {pmulhrsw x, m7}, m0, m2, m4, m6
1172    mova    [rsp+gprsize+16*1], m6                 ;row 6 -> slot 1
1173
1174.end2:
1175    REPX      {pmulhrsw x, m7}, m1, m3, m5
1176    pmulhrsw                m7, [rsp+gprsize+16*0] ;row 7 comes from slot 0
1177    mova    [rsp+gprsize+16*2], m5                 ;row 5 -> slot 2
1178    mova    [rsp+gprsize+16*0], m7                 ;row 7 -> slot 0
1179
; Expects rows 0-4 in m0-m4 and rows 5, 6, 7 in stack slots 2, 1, 0;
; m5-m7 are handed to WRITE_8X4 as temporaries.
1180.end3:
1181    WRITE_8X4                0, 1, 2, 3, 5, 6, 7
1182    lea                   dstq, [dstq+strideq*2]
1183    WRITE_8X4                4, [rsp+gprsize+16*2], [rsp+gprsize+16*1], [rsp+gprsize+16*0], 5, 6, 7
1184    jmp                   tx2q
1185
1186.end4:
1187    pxor                    m7, m7
1188    REPX   {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7
1189    ret
1190
; .main: 8-point inverse DCT.
;   in:  m0-m7 = in0-in7
;   out: m0-m6 = out0-out6, out7 in stack slot 0
; Rows 7, 3 and 1 are spilled so IDCT4_1D can run on the even rows
; (m0, m2, m4, m6) with m1/m3 as temporaries and pd_2048 in m7. The slots are
; then swapped so the odd rows come back into registers while three of the
; even-half results are parked, IDCT8_1D_ODDHALF runs on the odd rows, and
; the final saturating add/subtract pairs combine both halves.
1191ALIGN function_align
1192.main:
1193    mova  [rsp+gprsize*2+16*0], m7
1194    mova  [rsp+gprsize*2+16*1], m3
1195    mova  [rsp+gprsize*2+16*2], m1
1196    mova                    m7, [o(pd_2048)]
1197    IDCT4_1D                 0, 2, 4, 6, 1, 3, 7
1198    mova                    m3, [rsp+gprsize*2+16*2]      ;m3 = in1
1199    mova  [rsp+gprsize*2+16*2], m2
1200    mova                    m2, [rsp+gprsize*2+16*1]      ;m2 = in3
1201    mova  [rsp+gprsize*2+16*1], m4
1202    mova                    m4, [rsp+gprsize*2+16*0]      ;m4 = in7
1203    mova  [rsp+gprsize*2+16*0], m6
1204    IDCT8_1D_ODDHALF         3, 2, 5, 4, 1, 6, 7
1205    mova                    m6, [rsp+gprsize*2+16*0]
1206    psubsw                  m7, m0, m4                    ;out7
1207    paddsw                  m0, m4                        ;out0
1208    mova  [rsp+gprsize*2+16*0], m7                        ;out7 is returned in slot 0
1209    mova                    m1, [rsp+gprsize*2+16*2]
1210    psubsw                  m4, m6, m3                    ;out4
1211    paddsw                  m3, m6                        ;out3
1212    mova                    m7, [rsp+gprsize*2+16*1]
1213    psubsw                  m6, m1, m5                    ;out6
1214    paddsw                  m1, m5                        ;out1
1215    psubsw                  m5, m7, m2                    ;out5
1216    paddsw                  m2, m7                        ;out2
1217    ret
1218
1219
; Instantiate INV_TXFM_8X8_FN with type1 = adst and each of the four type2
; values.
1220INV_TXFM_8X8_FN adst, dct
1221INV_TXFM_8X8_FN adst, adst
1222INV_TXFM_8X8_FN adst, flipadst
1223INV_TXFM_8X8_FN adst, identity
1224
; iadst_8x8_internal
;   entry / .pass1:  loads eight 16-byte rows from coeffq and runs .main
;                    followed by .main_pass1_end.
;   .pass1_end*:     multiply the even registers by pw_16384, then negate the
;                    constant (0 - m7) and let idct_8x8_internal.pass1_end2
;                    scale the odd registers, which the per-line notes in
;                    .main/.main_pass1_end mark as holding negated outputs.
;   .pass2:          points tx2q at idct_8x8_internal.end4, runs .main and
;                    .main_pass2_end.
;   .end:            same scheme with pw_2048, continuing in
;                    idct_8x8_internal.end2.
;   .main:           shared ADST stages, see the notes at the label.
;   .main_pass1_end / .main_pass2_end: two variants of the final 2896 stage.
; The stack slots are addressed as in idct_8x8_internal (gprsize outside the
; helpers, gprsize*2 inside them).
1225cglobal iadst_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
1226    LOAD_8ROWS          coeffq, 16
1227
1228.pass1:
1229    call .main
1230    call .main_pass1_end
1231
1232.pass1_end:
1233    mova                    m7, [o(pw_16384)]
1234
1235.pass1_end1:
1236    REPX      {pmulhrsw x, m7}, m0, m2, m4, m6
1237    mova    [rsp+gprsize+16*1], m6                 ;spill row 6, as pass1_end3 expects
1238    pxor                    m6, m6
1239    psubw                   m6, m7                 ;m6 = -m7
1240    mova                    m7, m6                 ;negated constant for the odd registers
1241    jmp m(idct_8x8_internal).pass1_end2
1242
1243ALIGN function_align
1244.pass2:
1245    lea                   tx2q, [o(m(idct_8x8_internal).end4)]
1246
1247.pass2_main:
1248    call .main
1249    call .main_pass2_end
1250
1251.end:
1252    mova                    m7, [o(pw_2048)]
1253    REPX      {pmulhrsw x, m7}, m0, m2, m4, m6
1254    mova    [rsp+gprsize+16*1], m6                 ;row 6 -> slot 1
1255    pxor                    m6, m6
1256    psubw                   m6, m7                 ;m6 = -m7
1257    mova                    m7, m6                 ;negated constant for the odd registers
1258    jmp m(idct_8x8_internal).end2
1259
; .main: ADST stages shared by both passes.
;   in:  m0-m7 = the eight input rows
;   out (per the notes on the final instructions):
;        m0 = out0, m1 = -out1, m6 = out6, stack slot 0 = -out7,
;        m4 = t2, m3 = t3, m5 = t6, m2 = t7 (inputs of the last stage, which
;        is done by .main_pass1_end or .main_pass2_end)
; m7 holds pd_2048 for the ITX_MULSUB_2W rotations; rows 7, 3 and 4 are
; spilled on entry and the three slots are reused for intermediates.
1260ALIGN function_align
1261.main:
1262    mova  [rsp+gprsize*2+16*0], m7
1263    mova  [rsp+gprsize*2+16*1], m3
1264    mova  [rsp+gprsize*2+16*2], m4
1265    mova                    m7, [o(pd_2048)]
1266    ITX_MULSUB_2W            5, 2, 3, 4, 7, 1931, 3612    ;t3a, t2a
1267    ITX_MULSUB_2W            1, 6, 3, 4, 7, 3920, 1189    ;t7a, t6a
1268    paddsw                  m3, m2, m6                    ;t2
1269    psubsw                  m2, m6                        ;t6
1270    paddsw                  m4, m5, m1                    ;t3
1271    psubsw                  m5, m1                        ;t7
1272    ITX_MULSUB_2W            5, 2, 1, 6, 7, 3784, 1567    ;t6a, t7a
1273
; Swap slots: reload the spilled rows (4, 3, 7) while parking m5, m2, m3.
1274    mova                    m6, [rsp+gprsize*2+16*2]
1275    mova  [rsp+gprsize*2+16*2], m5
1276    mova                    m1, [rsp+gprsize*2+16*1]
1277    mova  [rsp+gprsize*2+16*1], m2
1278    mova                    m5, [rsp+gprsize*2+16*0]
1279    mova  [rsp+gprsize*2+16*0], m3
1280    ITX_MULSUB_2W            5, 0, 2, 3, 7,  401, 4076    ;t1a, t0a
1281    ITX_MULSUB_2W            1, 6, 2, 3, 7, 3166, 2598    ;t5a, t4a
1282    psubsw                  m2, m0, m6                    ;t4
1283    paddsw                  m0, m6                        ;t0
1284    paddsw                  m3, m5, m1                    ;t1
1285    psubsw                  m5, m1                        ;t5
1286    ITX_MULSUB_2W            2, 5, 1, 6, 7, 1567, 3784    ;t5a, t4a
1287
1288    mova                    m7, [rsp+gprsize*2+16*0]
1289    paddsw                  m1, m3, m4                    ;-out7
1290    psubsw                  m3, m4                        ;t3
1291    mova  [rsp+gprsize*2+16*0], m1                        ;-out7 is returned in slot 0
1292    psubsw                  m4, m0, m7                    ;t2
1293    paddsw                  m0, m7                        ;out0
1294    mova                    m6, [rsp+gprsize*2+16*2]
1295    mova                    m7, [rsp+gprsize*2+16*1]
1296    paddsw                  m1, m5, m6                    ;-out1
1297    psubsw                  m5, m6                        ;t6
1298    paddsw                  m6, m2, m7                    ;out6
1299    psubsw                  m2, m7                        ;t7
1300    ret
; .main_pass1_end: last stage for the first pass, done in 32-bit precision.
; The t2/t3 and t6/t7 pairs are interleaved and fed to pmaddwd with
; pw_2896_2896 (sum) and pw_2896_m2896 (difference); each result gets pd_2048
; added and is shifted right by 12 before packing. m1 and m6 are saved to
; stack slots 1 and 2 on entry and restored before returning.
;   out: m2 = out2, m3 = -out3, m4 = out4, m5 = -out5
1301ALIGN function_align
1302.main_pass1_end:
1303    mova  [rsp+gprsize*2+16*1], m1
1304    mova  [rsp+gprsize*2+16*2], m6
1305    punpckhwd               m1, m4, m3
1306    punpcklwd               m4, m3
1307    punpckhwd               m7, m5, m2
1308    punpcklwd               m5, m2
1309    mova                    m2, [o(pw_2896_2896)]
1310    mova                    m6, [o(pd_2048)]
1311    pmaddwd                 m3, m2, m7
1312    pmaddwd                 m2, m5
1313    paddd                   m3, m6
1314    paddd                   m2, m6
1315    psrad                   m3, 12
1316    psrad                   m2, 12
1317    packssdw                m2, m3                        ;out2
1318    mova                    m3, [o(pw_2896_m2896)]
1319    pmaddwd                 m7, m3
1320    pmaddwd                 m5, m3
1321    paddd                   m7, m6
1322    paddd                   m5, m6
1323    psrad                   m7, 12
1324    psrad                   m5, 12
1325    packssdw                m5, m7                        ;-out5
1326    mova                    m3, [o(pw_2896_2896)]
1327    pmaddwd                 m7, m3, m1
1328    pmaddwd                 m3, m4
1329    paddd                   m7, m6
1330    paddd                   m3, m6
1331    psrad                   m7, 12
1332    psrad                   m3, 12
1333    packssdw                m3, m7                        ;-out3
1334    mova                    m7, [o(pw_2896_m2896)]
1335    pmaddwd                 m1, m7
1336    pmaddwd                 m4, m7
1337    paddd                   m1, m6
1338    paddd                   m4, m6
1339    psrad                   m1, 12
1340    psrad                   m4, 12
1341    packssdw                m4, m1                        ;out4
1342    mova                    m1, [rsp+gprsize*2+16*1]
1343    mova                    m6, [rsp+gprsize*2+16*2]
1344    ret
; .main_pass2_end: last stage for the second pass, done in 16 bits: saturating
; sums/differences of the t2/t3 and t6/t7 pairs multiplied by pw_2896x8 with
; pmulhrsw.
;   out: m2 = out2, m3 = -out3, m4 = out4, m5 = -out5 (m7 is clobbered)
1345ALIGN function_align
1346.main_pass2_end:
1347    paddsw                  m7, m4, m3                    ;t2 + t3
1348    psubsw                  m4, m3                        ;t2 - t3
1349    paddsw                  m3, m5, m2                    ;t6 + t7
1350    psubsw                  m5, m2                        ;t6 - t7
1351    mova                    m2, [o(pw_2896x8)]
1352    pmulhrsw                m4, m2                        ;out4
1353    pmulhrsw                m5, m2                        ;-out5
1354    pmulhrsw                m7, m2                        ;-out3
1355    pmulhrsw                m2, m3                        ;out2
1356    mova                    m3, m7
1357    ret
1358
; Instantiate INV_TXFM_8X8_FN with type1 = flipadst and each of the four type2
; values.
1359INV_TXFM_8X8_FN flipadst, dct
1360INV_TXFM_8X8_FN flipadst, adst
1361INV_TXFM_8X8_FN flipadst, flipadst
1362INV_TXFM_8X8_FN flipadst, identity
1363
; iflipadst_8x8_internal
; Reuses the ADST helpers of iadst_8x8_internal and differs only in the
; scaling step, which also reverses the order of the eight rows.
;   .pass1_end1: scales with pw_m16384 / its negation so the negated odd
;                outputs come out positive, leaving
;                m0..m5 = out7..out2, stack slot 1 = out1, m7 = out0,
;                which is the layout idct_8x8_internal.pass1_end3 expects
;                (row 6 in slot 1).
;   .end:        the same idea with pw_2048, leaving
;                m0..m4 = out7..out3, slots 2, 1, 0 = out2, out1, out0,
;                which is the layout idct_8x8_internal.end3 expects.
1364cglobal iflipadst_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
1365    LOAD_8ROWS          coeffq, 16
1366
1367.pass1:
1368    call m(iadst_8x8_internal).main
1369    call m(iadst_8x8_internal).main_pass1_end
1370
1371.pass1_end:
1372    mova                    m7, [o(pw_m16384)]
1373
1374.pass1_end1:
1375    pmulhrsw                m1, m7                 ;(-out1) * negative constant
1376    mova    [rsp+gprsize+16*1], m1                 ;out1 -> slot 1 (row 6 after the flip)
1377    mova                    m1, m6                 ;m1 = out6 (unscaled so far)
1378    mova                    m6, m2
1379    pmulhrsw                m2, m5, m7             ;m2 = scaled out5
1380    mova                    m5, m6                 ;m5 = out2 (unscaled so far)
1381    mova                    m6, m4
1382    pmulhrsw                m4, m3, m7             ;m4 = scaled out3
1383    mova                    m3, m6                 ;m3 = out4 (unscaled so far)
1384    mova                    m6, m0                 ;m6 = out0 (unscaled so far)
1385    mova                    m0, m7
1386    pxor                    m7, m7
1387    psubw                   m7, m0                 ;m7 = -m0, the positive constant
1388    pmulhrsw                m0, [rsp+gprsize+16*0] ;m0 = scaled out7 (slot 0 holds -out7)
1389    REPX      {pmulhrsw x, m7}, m1, m3, m5
1390    pmulhrsw                m7, m6                 ;m7 = scaled out0
1391    jmp m(idct_8x8_internal).pass1_end3
1392
1393ALIGN function_align
1394.pass2:
1395    lea                   tx2q, [o(m(idct_8x8_internal).end4)]
1396
1397.pass2_main:
1398    call m(iadst_8x8_internal).main
1399    call m(iadst_8x8_internal).main_pass2_end
1400
1401.end:
1402    mova                    m7, [o(pw_2048)]
1403    REPX      {pmulhrsw x, m7}, m0, m2, m4, m6
1404    mova    [rsp+gprsize+16*2], m2                 ;out2 -> slot 2
1405    mova                    m2, m0
1406    pxor                    m0, m0
1407    psubw                   m0, m7                 ;m0 = -pw_2048 for the negated outputs
1408    mova                    m7, m2                 ;m7 = out0
1409    pmulhrsw                m1, m0
1410    pmulhrsw                m2, m5, m0             ;m2 = out5
1411    mova    [rsp+gprsize+16*1], m1                 ;out1 -> slot 1
1412    mova                    m5, m4
1413    mova                    m1, m6                 ;m1 = out6
1414    pmulhrsw                m4, m3, m0             ;m4 = out3
1415    pmulhrsw                m0, [rsp+gprsize+16*0] ;m0 = out7 (slot 0 holds -out7)
1416    mova                    m3, m5                 ;m3 = out4
1417    mova    [rsp+gprsize+16*0], m7                 ;out0 -> slot 0
1418    jmp m(idct_8x8_internal).end3
1419
; Instantiate INV_TXFM_8X8_FN with type1 = identity and each of the four type2
; values.
1420INV_TXFM_8X8_FN identity, dct
1421INV_TXFM_8X8_FN identity, adst
1422INV_TXFM_8X8_FN identity, flipadst
1423INV_TXFM_8X8_FN identity, identity
1424
; iidentity_8x8_internal
;   first pass: loads the eight rows and goes straight to the transpose in
;               idct_8x8_internal.pass1_end3 without any arithmetic; row 6 is
;               spilled to stack slot 1 because that is where the transpose
;               reads it from.
;   .pass2/.end: multiplies all eight rows by pw_4096 (pmulhrsw), parks rows
;               7, 5 and 6 in stack slots 0, 2 and 1 and finishes through
;               idct_8x8_internal.end3 with tx2q pointing at .end4.
1425cglobal iidentity_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
1426    LOAD_8ROWS          coeffq, 16
1427    mova    [rsp+gprsize+16*1], m6
1428    jmp   m(idct_8x8_internal).pass1_end3
1429
1430ALIGN function_align
1431.pass2:
1432    lea                   tx2q, [o(m(idct_8x8_internal).end4)]
1433
1434.end:
1435    pmulhrsw                m7, [o(pw_4096)]       ;scale row 7 first so m7 can hold the constant
1436    mova    [rsp+gprsize+16*0], m7
1437    mova                    m7, [o(pw_4096)]
1438    REPX      {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
1439    mova    [rsp+gprsize+16*2], m5
1440    mova    [rsp+gprsize+16*1], m6
1441    jmp m(idct_8x8_internal).end3
1442
1443
; INV_TXFM_4X16_FN type1, type2
; Expands INV_TXFM_FN (defined outside this section) for the 4x16 size with
; the extra argument 8. For the dct_dct pairing it additionally emits a path
; that works from the first coefficient only:
;   - word 0 of [coeffq] is replicated into every word of m0,
;   - m0 is multiplied (pmulhrsw) by pw_2896x8, pw_16384, pw_2896x8 again and
;     pw_2048,
;   - .end issues WRITE_4X4 four times with m0 as both coefficient inputs,
;     adding 4*stride to dstq between the calls, then returns.
1444%macro INV_TXFM_4X16_FN 2 ; type1, type2
1445    INV_TXFM_FN          %1, %2, 4x16, 8
1446%ifidn %1_%2, dct_dct
1447    pshuflw               m0, [coeffq], q0000        ;replicate word 0 into the low four words
1448    punpcklwd             m0, m0                     ;... and into all eight words
1449    mova                  m1, [o(pw_2896x8)]
1450    pmulhrsw              m0, m1
1451    mov             [coeffq], eobd                   ;NOTE(review): presumably eob is 0 on this path, so this clears the coefficient - confirm against INV_TXFM_FN
1452    pmulhrsw              m0, [o(pw_16384)]
1453    pmulhrsw              m0, m1
1454    pmulhrsw              m0, [o(pw_2048)]
1455.end:
1456    WRITE_4X4             0, 0, 1, 2, 3, 0, 1, 2, 3
1457    lea                dstq, [dstq+strideq*4]
1458    WRITE_4X4             0, 0, 1, 2, 3, 0, 1, 2, 3
1459    lea                dstq, [dstq+strideq*4]
1460    WRITE_4X4             0, 0, 1, 2, 3, 0, 1, 2, 3
1461    lea                dstq, [dstq+strideq*4]
1462    WRITE_4X4             0, 0, 1, 2, 3, 0, 1, 2, 3
1463    RET
1464%endif
1465%endmacro
1466
; Instantiate INV_TXFM_4X16_FN with type1 = dct and each of the four type2
; values.
1467INV_TXFM_4X16_FN dct, dct
1468INV_TXFM_4X16_FN dct, adst
1469INV_TXFM_4X16_FN dct, flipadst
1470INV_TXFM_4X16_FN dct, identity
1471
; idct_4x16_internal
;   first pass: reuses a 4x8 first-pass routine (its address is kept in r3)
;               on two groups of four 16-byte rows: the odd-indexed rows
;               (1, 3, 5, 7) first, then the even-indexed rows (0, 2, 4, 6).
;               The routine at r3 is expected to finish with "jmp tx2q", so
;               tx2q is saved on the stack and temporarily pointed at the
;               local continuation labels. .pass1 is also entered by the
;               adst/flipadst 4x16 variants with their own r3.
;   .pass1_end: restores tx2q, gathers the eight result rows (m0-m3 from the
;               second group, m4-m6 and memory row 7 from the first), scales
;               them by pw_16384 and continues at tx2q.
;   .pass2:     runs m(idct_16x4_internal).main, scales by pw_2048 and falls
;               into the write-out.
;   .end1:      writes the block with two WRITE_4X8 calls.
;   .end2:      clears the 8 x 16 bytes of coefficients, ret.
1472cglobal idct_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
1473    lea                  r3, [o(m(idct_4x8_internal).pass1)]
1474
1475.pass1:
1476    mova                 m0, [coeffq+16*1]
1477    mova                 m1, [coeffq+16*3]
1478    mova                 m2, [coeffq+16*5]
1479    mova                 m3, [coeffq+16*7]
1480    push               tx2q                           ;keep the real continuation
1481    lea                tx2q, [o(m(idct_4x16_internal).pass1_2)]
1482    jmp                  r3
1483
1484.pass1_2:
1485    mova      [coeffq+16*1], m0                       ;store the first group's results in place
1486    mova      [coeffq+16*3], m1
1487    mova      [coeffq+16*5], m2
1488    mova      [coeffq+16*7], m3
1489    mova                 m0, [coeffq+16*0]
1490    mova                 m1, [coeffq+16*2]
1491    mova                 m2, [coeffq+16*4]
1492    mova                 m3, [coeffq+16*6]
1493    lea                tx2q, [o(m(idct_4x16_internal).pass1_end)]
1494    jmp                  r3
1495
1496.pass1_end:
1497    pop                tx2q
1498
1499    mova                 m4, [coeffq+16*1]
1500    mova                 m5, [coeffq+16*3]
1501    mova                 m6, [coeffq+16*5]
1502    mova                 m7, [o(pw_16384)]
1503    REPX   {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
1504
1505    pmulhrsw             m7, [coeffq+16*7]            ;the eighth row is scaled and kept in memory
1506    mova       [coeffq+16*7], m7
1507    jmp                tx2q
1508
1509.pass2:
1510    call m(idct_16x4_internal).main
1511
1512.end:
1513    mova                  m7, [o(pw_2048)]
1514    REPX    {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
1515    pmulhrsw              m7, [coeffq+16*7]           ;eighth register's worth of data is read from coefficient row 7
1516    mova       [coeffq+16*4], m4
1517
1518.end1:
1519    mova       [coeffq+16*5], m5
1520    mova       [coeffq+16*6], m6
1521    mov                   r3, coeffq                  ;NOTE(review): presumably WRITE_4X8 uses coeffq as scratch, hence the copy - confirm
1522    WRITE_4X8              0, 1, 3, 2
1523
1524    mova                  m0, [r3+16*4]               ;reload the second half parked above
1525    mova                  m1, [r3+16*5]
1526    mova                  m2, [r3+16*6]
1527    mova                  m3, m7
1528    lea                 dstq, [dstq+strideq*4]
1529    WRITE_4X8              0, 1, 3, 2
1530
1531.end2:
1532    pxor                  m7, m7
1533    REPX     {mova [r3+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7
1534    ret
1535
; Instantiate INV_TXFM_4X16_FN with type1 = adst and each of the four type2
; values.
1536INV_TXFM_4X16_FN adst, dct
1537INV_TXFM_4X16_FN adst, adst
1538INV_TXFM_4X16_FN adst, flipadst
1539INV_TXFM_4X16_FN adst, identity
1540
; iadst_4x16_internal
;   first pass: idct_4x16_internal.pass1 driven with the 4x8 ADST first pass
;               in r3.
;   .pass2:     runs m(iadst_16x4_internal).main and .main_pass2_end, then
;               regroups the 64-bit halves so that, according to the existing
;               per-line notes, each register holds either only positive or
;               only negated outputs.
;   .end1:      multiplies the positive group by m7 and the negated group by
;               -m7 (m7 = pw_2048 here; the flipadst variant enters with
;               pw_m2048), then pairs consecutive outputs per register.
;   .end2:      writes the block with two WRITE_4X8 calls; also used by
;               iidentity_4x16_internal.
;   .end3:      clears the 8 x 16 bytes of coefficients, ret.
; Coefficient rows 4-7 at coeffq double as spill space.
1541cglobal iadst_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
1542    lea                   r3, [o(m(iadst_4x8_internal).pass1)]
1543    jmp   m(idct_4x16_internal).pass1
1544
1545.pass2:
1546    call m(iadst_16x4_internal).main
1547    call m(iadst_16x4_internal).main_pass2_end
1548
1549    punpcklqdq            m6, m5, m4                ;low: -out5  high: -out7
1550    punpckhqdq            m4, m5                    ;low:  out8  high:  out10
1551    punpcklqdq            m5, m7, m2                ;low:  out4  high:  out6
1552    punpckhqdq            m2, m7                    ;low: -out9  high: -out11
1553    mova       [coeffq+16*4], m2
1554    mova       [coeffq+16*5], m6
1555    mova                  m2, [coeffq+16*6]
1556    mova                  m6, [coeffq+16*7]
1557    punpckhqdq            m1, m6, m0                ;low: -out13 high: -out15
1558    punpcklqdq            m0, m6                    ;low:  out0  high:  out2
1559    punpckhqdq            m6, m3, m2                ;low:  out12 high:  out14
1560    punpcklqdq            m2, m3                    ;low: -out1  high: -out3
1561
1562    mova                  m7, [o(pw_2048)]
1563
1564.end1:
1565    REPX    {pmulhrsw x, m7}, m0, m5, m4, m6
1566    pxor                  m3, m3
1567    psubw                 m3, m7                    ;m3 = -m7 for the other group
1568    mova                  m7, [coeffq+16*4]
1569    REPX    {pmulhrsw x, m3}, m2, m7, m1
1570    pmulhrsw              m3, [coeffq+16*5]
1571    mova       [coeffq+16*7], m5
1572
1573    punpckhqdq            m5, m4, m7                ;low:  out10 high:  out11
1574    punpcklqdq            m4, m7                    ;low:  out8  high:  out9
1575    punpckhqdq            m7, m6, m1                ;low:  out14 high:  out15
1576    punpcklqdq            m6, m1                    ;low:  out12 high:  out13
1577    punpckhqdq            m1, m0, m2                ;low:  out2  high:  out3
1578    punpcklqdq            m0, m2                    ;low:  out0  high:  out1
1579    mova       [coeffq+16*4], m4
1580    mova                  m4, [coeffq+16*7]
1581    punpcklqdq            m2, m4, m3                ;low:  out4  high:  out5
1582    punpckhqdq            m4, m3                    ;low:  out6  high:  out7
1583    mova                  m3, m4
1584
; Expects the first half in m0-m3, the second half in [coeffq+16*4], m5, m6
; and m7.
1585.end2:
1586    mova       [coeffq+16*5], m5
1587    mova       [coeffq+16*6], m6
1588    mov                   r3, coeffq                ;NOTE(review): presumably WRITE_4X8 uses coeffq as scratch, hence the copy - confirm
1589    WRITE_4X8              0, 1, 2, 3
1590
1591    mova                  m0, [r3+16*4]
1592    mova                  m1, [r3+16*5]
1593    mova                  m2, [r3+16*6]
1594    mova                  m3, m7
1595    lea                 dstq, [dstq+strideq*4]
1596    WRITE_4X8              0, 1, 2, 3
1597
1598.end3:
1599    pxor                  m7, m7
1600    REPX     {mova [r3+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7
1601    ret
1602
1603
; Instantiate INV_TXFM_4X16_FN with type1 = flipadst and each of the four
; type2 values.
1604INV_TXFM_4X16_FN flipadst, dct
1605INV_TXFM_4X16_FN flipadst, adst
1606INV_TXFM_4X16_FN flipadst, flipadst
1607INV_TXFM_4X16_FN flipadst, identity
1608
; iflipadst_4x16_internal
;   first pass: idct_4x16_internal.pass1 driven with the 4x8 flipadst first
;               pass in r3.
;   .pass2:     the same helper calls as iadst_4x16_internal.pass2, but the
;               64-bit halves are regrouped with the high/low unpacks swapped
;               relative to that routine, and the scale constant is pw_m2048
;               instead of pw_2048; the shared tail at
;               iadst_4x16_internal.end1 does the rest.
1609cglobal iflipadst_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
1610    lea                   r3, [o(m(iflipadst_4x8_internal).pass1)]
1611    jmp   m(idct_4x16_internal).pass1
1612
1613.pass2:
1614    call m(iadst_16x4_internal).main
1615    call m(iadst_16x4_internal).main_pass2_end
1616
1617    punpckhqdq            m6, m5, m4                ;low:  out5  high:  out7
1618    punpcklqdq            m4, m5                    ;low: -out8  high: -out10
1619    punpckhqdq            m5, m7, m2                ;low: -out4  high: -out6
1620    punpcklqdq            m2, m7                    ;low:  out9  high:  out11
1621    mova       [coeffq+16*4], m2
1622    mova       [coeffq+16*5], m6
1623    mova                  m2, [coeffq+16*6]
1624    mova                  m6, [coeffq+16*7]
1625    punpcklqdq            m1, m6, m0                ;low:  out13 high:  out15
1626    punpckhqdq            m0, m6                    ;low: -out0  high: -out2
1627    punpcklqdq            m6, m3, m2                ;low: -out12 high: -out14
1628    punpckhqdq            m2, m3                    ;low:  out1  high:  out3
1629
1630    mova                  m7, [o(pw_m2048)]         ;negative scale: .end1 applies m7 to m0/m4/m5/m6 and -m7 to the rest
1631    jmp   m(iadst_4x16_internal).end1
1632
1633
; Instantiate INV_TXFM_4X16_FN with type1 = identity and each of the four
; type2 values.
1634INV_TXFM_4X16_FN identity, dct
1635INV_TXFM_4X16_FN identity, adst
1636INV_TXFM_4X16_FN identity, flipadst
1637INV_TXFM_4X16_FN identity, identity
1638
; IDTX16 src/dst, tmp, pw_1697x16[, pw_16384]
; All arguments are register numbers.
;   tmp = pmulhrsw(pw_1697x16, src)
;   3 arguments: src = 2*src + tmp          (saturating adds)
;   4 arguments: src = src + pmulhrsw(tmp, %4)
;                (the variant the original note calls "downshifting by 1")
; tmp is clobbered. tmp may name the same register as the constant when the
; constant is not needed afterwards (the macro reads it only once).
1639%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16384]
1640    pmulhrsw            m%2, m%3, m%1
1641%if %0 == 4 ; if downshifting by 1
1642    pmulhrsw            m%2, m%4
1643%else
1644    paddsw              m%1, m%1
1645%endif
1646    paddsw              m%1, m%2
1647%endmacro
1648
; iidentity_4x16_internal
;   first pass: processes the odd-indexed 16-byte rows (1, 3, 5, 7) and then
;               the even-indexed rows (0, 2, 4, 6) through .pass1. The real
;               continuation is kept in r3 while tx2q is pointed at the local
;               labels; m(iadst_4x8_internal).pass1_end (outside this
;               section) is expected to hand control back through tx2q.
;   .pass1:     for each of m0-m3:
;                 t = pavgw(pmulhrsw(pw_1697x8, x), x)
;                 x = (x == 0xFFFF) ? 0 : t       (pcmpeqw + pandn mask)
;               pavgw is an unsigned rounding average.
;   .pass1_end: reloads the first group's results into m4-m6 (the fourth one
;               stays in coefficient row 7) and continues at r3.
;   .pass2:     applies IDTX16 (x = 2*x + pmulhrsw(x, pw_1697x16)) to all
;               eight rows, scales by pw_2048 and finishes through
;               iadst_4x16_internal.end2.
; Coefficient rows 4, 6 and 7 at coeffq double as spill space in .pass2.
1649cglobal iidentity_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
1650    mova                  m0, [coeffq+16*1]
1651    mova                  m6, [o(pw_1697x8)]
1652    mova                  m1, [coeffq+16*3]
1653    mova                  m2, [coeffq+16*5]
1654    mova                  m3, [coeffq+16*7]
1655    pcmpeqw               m7, m7                    ;m7 = all ones (0xFFFF in every word)
1656    mov                   r3, tx2q                  ;keep the real continuation
1657    lea                 tx2q, [o(.pass1_2)]
1658.pass1:
1659    pmulhrsw              m4, m6, m0
1660    pmulhrsw              m5, m6, m1
1661    pavgw                 m4, m0
1662    pcmpeqw               m0, m7                    ;mask of words equal to 0xFFFF
1663    pavgw                 m5, m1
1664    pcmpeqw               m1, m7
1665    pandn                 m0, m4                    ;result, zeroed where the mask is set
1666    pmulhrsw              m4, m6, m2
1667    pandn                 m1, m5
1668    pmulhrsw              m5, m6, m3
1669    pavgw                 m4, m2
1670    pcmpeqw               m2, m7
1671    pavgw                 m5, m3
1672    pcmpeqw               m3, m7
1673    pandn                 m2, m4
1674    pandn                 m3, m5
1675    jmp m(iadst_4x8_internal).pass1_end
1676.pass1_2:
1677    mova       [coeffq+16*1], m0                    ;store the first group's results in place
1678    mova       [coeffq+16*3], m1
1679    mova       [coeffq+16*5], m2
1680    mova       [coeffq+16*7], m3
1681    mova                  m0, [coeffq+16*0]
1682    mova                  m1, [coeffq+16*2]
1683    mova                  m2, [coeffq+16*4]
1684    mova                  m3, [coeffq+16*6]
1685    lea                 tx2q, [o(.pass1_end)]
1686    jmp .pass1
1687.pass1_end:
1688    mova                  m4, [coeffq+16*1]
1689    mova                  m5, [coeffq+16*3]
1690    mova                  m6, [coeffq+16*5]
1691    jmp                   r3
1692.pass2:
1693    mova                  m7, [o(pw_1697x16)]
1694    mova       [coeffq+16*6], m6                    ;spill m6 so it can serve as IDTX16's temporary
1695    REPX    {IDTX16 x, 6, 7}, 0, 1, 2, 3, 4, 5
1696    mova                  m6, [coeffq+16*7]
1697    IDTX16                 6, 7, 7                  ;tmp == constant register: m7 is overwritten here
1698    mova       [coeffq+16*7], m6
1699    mova                  m6, [coeffq+16*6]
; Same computation as IDTX16 for the spilled row, with the constant taken from
; memory because m7 no longer holds it.
1700    pmulhrsw              m7, m6, [o(pw_1697x16)]
1701    paddsw                m6, m6
1702    paddsw                m6, m7
1703    mova                  m7, [o(pw_2048)]
1704    REPX    {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
1705    pmulhrsw              m7, [coeffq+16*7]
1706    mova       [coeffq+16*4], m4                    ;iadst_4x16_internal.end2 reloads this row from memory
1707    jmp m(iadst_4x16_internal).end2
1708
1709
; INV_TXFM_16X4_FN type1, type2
; Expands INV_TXFM_FN for a 16x4 block with the given type pair. For the
; dct_dct pair only, the macro additionally emits the code below, which adds
; one constant value to 16 pixels in each of 4 rows of dst.
; NOTE(review): INV_TXFM_FN is defined outside this excerpt; presumably it
; routes the DC-only case into the code following it - confirm there.
;
; .dconly / .dconly_loop expect:
;   m0  = word 0 holds the value to scale    m1 = pw_2896x8 in the low dword
;   m2  = first multiplier (pw_16384 here)   r2d = loop count, 2 rows per trip
;   tx2q = address to continue at once the loop is finished
; NOTE(review): these labels are named entry points, presumably so that other
; block sizes can jump in with their own r2d/tx2q - confirm against callers.
1710%macro INV_TXFM_16X4_FN 2 ; type1, type2
1711    INV_TXFM_FN          %1, %2, 16x4, 8
1712%ifidn %1_%2, dct_dct
    ; movd loads only the low dword of each constant; only word 0 of m0 is
    ; consumed later (pshuflw q0000 below)
1713    movd                 m1, [o(pw_2896x8)]
1714    pmulhrsw             m0, m1, [coeffq]
1715    movd                 m2, [o(pw_16384)]
    ; overwrite the first coefficient dword with eobd
    ; NOTE(review): presumably eob is 0 on this path so this clears the DC
    ; coefficient - confirm
1716    mov            [coeffq], eobd
1717    mov                 r2d, 2
1718    lea                tx2q, [o(m(inv_txfm_add_dct_dct_16x4).end)]
1719.dconly:
1720    pmulhrsw             m0, m2
1721    movd                 m2, [o(pw_2048)]              ;intentionally rip-relative
1722    pmulhrsw             m0, m1
1723    pmulhrsw             m0, m2
    ; broadcast word 0 to all eight word lanes
1724    pshuflw              m0, m0, q0000
1725    punpcklwd            m0, m0
    ; m5 = 0, used to zero-extend bytes to words
1726    pxor                 m5, m5
    ; per trip: load 16 bytes from two consecutive rows, widen to words, add
    ; m0, pack back to bytes with unsigned saturation, store, advance 2 rows
1727.dconly_loop:
1728    mova                 m1, [dstq]
1729    mova                 m3, [dstq+strideq]
1730    punpckhbw            m2, m1, m5
1731    punpcklbw            m1, m5
1732    punpckhbw            m4, m3, m5
1733    punpcklbw            m3, m5
1734    paddw                m2, m0
1735    paddw                m1, m0
1736    paddw                m4, m0
1737    paddw                m3, m0
1738    packuswb             m1, m2
1739    packuswb             m3, m4
1740    mova             [dstq], m1
1741    mova     [dstq+strideq], m3
1742    lea                dstq, [dstq+strideq*2]
1743    dec                 r2d
1744    jg .dconly_loop
1745    jmp                tx2q
1746.end:
1747    RET
1748%endif
1749%endmacro
1750
; LOAD_7ROWS src, stride
; Loads seven vectors into m0..m6 from src + stride*i for i = 0..6.
; m7 is not touched. mova is used, so every address must be suitably aligned.
1751%macro LOAD_7ROWS 2 ;src, stride
1752    mova                 m0, [%1+%2*0]
1753    mova                 m1, [%1+%2*1]
1754    mova                 m2, [%1+%2*2]
1755    mova                 m3, [%1+%2*3]
1756    mova                 m4, [%1+%2*4]
1757    mova                 m5, [%1+%2*5]
1758    mova                 m6, [%1+%2*6]
1759%endmacro
1760
; SAVE_7ROWS dst, stride
; Stores m0..m6 to dst + stride*i for i = 0..6 (the counterpart of
; LOAD_7ROWS). m7 is not stored. mova is used, so every address must be
; suitably aligned.
1761%macro SAVE_7ROWS 2 ;dst, stride
1762    mova          [%1+%2*0], m0
1763    mova          [%1+%2*1], m1
1764    mova          [%1+%2*2], m2
1765    mova          [%1+%2*3], m3
1766    mova          [%1+%2*4], m4
1767    mova          [%1+%2*5], m5
1768    mova          [%1+%2*6], m6
1769%endmacro
1770
; IDCT16_1D_PACKED_ODDHALF src1, src2, src3, src4, tmp1, tmp2, tmp3
; Computes the t8..t15 terms of a 16-point inverse DCT from the eight
; odd-indexed inputs, two terms packed per register (low / high qword).
;
; Inputs (low | high), as implied by the interleave comments below:
;   m%1 = in1  | in3      m%2 = in7  | in5
;   m%3 = in9  | in11     m%4 = in15 | in13
; Outputs (low | high):
;   m%1 = t8a  | t9       m%2 = t11  | t10a
;   m%3 = t12  | t13a     m%4 = t15a | t14
; Clobbers m%5 and m%6; m%7 is loaded with pd_2048 and handed to every
; ITX_MUL2X_PACK invocation.
; NOTE(review): ITX_MUL2X_PACK is defined outside this excerpt; the meaning of
; its last argument (1 / 4) must be checked there.
1771%macro IDCT16_1D_PACKED_ODDHALF 7  ;src[1-4], tmp[1-3]
1772    punpckhwd            m%5, m%4, m%1                ;packed in13 in3
1773    punpcklwd            m%1, m%4                     ;packed in1  in15
1774    punpcklwd            m%4, m%3, m%2                ;packed in9  in7
1775    punpckhwd            m%2, m%3                     ;packed in5  in11
1776    mova                 m%7, [o(pd_2048)]
1777    ITX_MUL2X_PACK        %1, %6, %7,  401, 4076, 1    ;low: t8a   high: t15a
1778    ITX_MUL2X_PACK        %4, %6, %7, 3166, 2598, 1    ;low: t9a   high: t14a
1779    ITX_MUL2X_PACK        %2, %6, %7, 1931, 3612, 1    ;low: t10a  high: t13a
1780    ITX_MUL2X_PACK        %5, %6, %7, 3920, 1189, 1    ;low: t11a  high: t12a
1781    psubsw               m%6, m%1, m%4                 ;low: t9    high: t14
1782    paddsw               m%1, m%4                      ;low: t8    high: t15
1783    psubsw               m%4, m%5, m%2                 ;low: t10   high: t13
1784    paddsw               m%5, m%2                      ;low: t11   high: t12
    ; m%2 is free from here on and holds the deint_shuf2 byte shuffle that
    ; re-pairs the words before each of the following multiplies
1785    mova                 m%2, [o(deint_shuf2)]
1786    pshufb               m%6, m%2
1787    pshufb               m%4, m%2
1788    ITX_MUL2X_PACK        %6, %3, %7, 1567, 3784, 1    ;low: t9a   high: t14a
1789    ITX_MUL2X_PACK        %4, %3, %7, m3784, 1567, 1   ;low: t10a  high: t13a
1790    psubsw               m%3, m%1, m%5                 ;low: t11a  high: t12a
1791    paddsw               m%1, m%5                      ;low: t8a   high: t15a
1792    psubsw               m%5, m%6, m%4                 ;low: t10   high: t13
1793    paddsw               m%6, m%4                      ;low: t9    high: t14
1794    pshufb               m%3, m%2
1795    pshufb               m%5, m%2
    ; the shuffle constant in m%2 is dead after the pshufb pair above, so m%2
    ; is reused as a work register by the next invocation
1796    ITX_MUL2X_PACK        %3, %2, %7, 2896, 2896, 4    ;t12,  t11
1797    ITX_MUL2X_PACK        %5, %4, %7, 2896, 2896, 4    ;t13a, t10a
1798    packssdw             m%2, m%4                      ;low: t11   high: t10a
1799    packssdw             m%3, m%5                      ;low: t12   high: t13a
1800    punpckhqdq           m%4, m%1, m%6                 ;low: t15a  high: t14
1801    punpcklqdq           m%1, m%6                      ;low: t8a   high: t9
1802%endmacro
1803
; Instantiate INV_TXFM_16X4_FN for the four pairs whose type1 is dct; the
; dct, dct instance also carries the DC-only code emitted by that macro.
1804INV_TXFM_16X4_FN dct, dct
1805INV_TXFM_16X4_FN dct, adst
1806INV_TXFM_16X4_FN dct, flipadst
1807INV_TXFM_16X4_FN dct, identity
1808
; idct_16x4_internal
;
; Entry (first pass): loads coefficient rows 0..6 into m0..m6 (row 7 is read
; from [coeffq+16*7] inside .main), runs .main, and falls through the three
; .pass1_end* stages before jumping to tx2q.
;
; Labels that the other 16x4 routines in this file jump into:
;   .pass1_end3 - expects seven interleaved vectors in m0-m2 and m3-m6 and the
;                 eighth (the "1, 5" pair) in [coeffq+16*6]; finishes the
;                 transpose, leaves one result in [coeffq+16*7] and jumps to
;                 tx2q.
;   .pass2_end  - expects tx2q = address of an 8x4 second-pass routine; runs
;                 it once with "call" for dst and once with "jmp" for dst+8.
;                 NOTE(review): this relies on the 8x4 routine ending in "ret";
;                 that code is outside this excerpt - confirm.
;
; The coefficient buffer rows 4..7 double as spill space throughout.
1809cglobal idct_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
1810    LOAD_7ROWS        coeffq, 16
1811    call .main
1812
    ; .pass1_end: interleave the packed outputs of .main. All eight registers
    ; are live, so the "out1, out5" vector is spilled to [coeffq+16*6] while
    ; the out15/out14 vector is fetched from [coeffq+16*7].
1813.pass1_end:
1814    punpckhwd             m7, m0, m2                 ;packed out1,  out5
1815    punpcklwd             m0, m2                     ;packed out0,  out4
1816    punpcklwd             m2, m1, m3                 ;packed out3,  out7
1817    punpckhwd             m1, m3                     ;packed out2,  out6
1818    mova       [coeffq+16*6], m7
1819    mova                  m7, [coeffq+16*7]
1820    punpckhwd             m3, m4, m6                 ;packed out9,  out13
1821    punpcklwd             m4, m6                     ;packed out8,  out12
1822    punpcklwd             m6, m5, m7                 ;packed out11, out15
1823    punpckhwd             m5, m7                     ;packed out10, out14
1824
    ; .pass1_end2: multiply all eight vectors by pw_16384 with pmulhrsw
    ; (a rounding halving); the spilled vector is scaled in memory via m7.
1825.pass1_end2:
1826    mova                  m7, [o(pw_16384)]
1827    REPX    {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
1828    pmulhrsw              m7, [coeffq+16*6]
1829    mova       [coeffq+16*6], m7
1830
    ; .pass1_end3: two further rounds of word interleaves complete the
    ; transpose. The trailing (n) in the comments numbers the resulting
    ; vectors; vector (3) of columns 8-15 is parked in [coeffq+16*7].
1831.pass1_end3:
1832    punpckhwd             m7, m3, m6                 ;packed 9, 11, 13, 15 high
1833    punpcklwd             m3, m6                     ;packed 9, 11, 13, 15 low
1834    punpckhwd             m6, m4, m5                 ;packed 8, 10, 12, 14 high
1835    punpcklwd             m4, m5                     ;packed 8, 10, 12, 14 low
1836    punpckhwd             m5, m4, m3                 ;8, 9, 10, 11, 12, 13, 14, 15(1)
1837    punpcklwd             m4, m3                     ;8, 9, 10, 11, 12, 13, 14, 15(0)
1838    punpckhwd             m3, m6, m7                 ;8, 9, 10, 11, 12, 13, 14, 15(3)
1839    punpcklwd             m6, m7                     ;8, 9, 10, 11, 12, 13, 14, 15(2)
1840    mova       [coeffq+16*7], m3
1841    mova                  m3, [coeffq+16*6]
1842    punpckhwd             m7, m3, m2                 ;packed 1, 3, 5, 7 high
1843    punpcklwd             m3, m2                     ;packed 1, 3, 5, 7 low
1844    punpckhwd             m2, m0, m1                 ;packed 0, 2, 4, 6 high
1845    punpcklwd             m0, m1                     ;packed 0, 2, 4, 6 low
1846    punpckhwd             m1, m0, m3                 ;0, 1, 2, 3, 4, 5, 6, 7(1)
1847    punpcklwd             m0, m3                     ;0, 1, 2, 3, 4, 5, 6, 7(0)
1848    punpckhwd             m3, m2, m7                 ;0, 1, 2, 3, 4, 5, 6, 7(3)
1849    punpcklwd             m2, m7                     ;0, 1, 2, 3, 4, 5, 6, 7(2)
1850    jmp                 tx2q
1851
    ; .pass2: second pass = the 8x4 DCT second pass applied to each half
1852.pass2:
1853    lea                 tx2q, [o(m(idct_8x4_internal).pass2)]
1854
    ; .pass2_end: save m4-m6 next to the vector .pass1_end3 already left in
    ; [coeffq+16*7], run the 8x4 routine on m0-m3 at dst, then advance coeffq
    ; by four rows, reload m0-m3 from the saved half and run it again at
    ; dst + 8 bytes (kept in r3 across the call).
1855.pass2_end:
1856    mova       [coeffq+16*4], m4
1857    mova       [coeffq+16*5], m5
1858    mova       [coeffq+16*6], m6
1859    lea                   r3, [dstq+8]
1860    call                tx2q
1861
1862    add               coeffq, 16*4
1863    mova                  m0, [coeffq+16*0]
1864    mova                  m1, [coeffq+16*1]
1865    mova                  m2, [coeffq+16*2]
1866    mova                  m3, [coeffq+16*3]
1867    mov                 dstq, r3
1868    jmp                 tx2q
1869
; .main: packed 16-point inverse DCT.
;   In:  m0..m6 = coefficient rows 0..6, row 7 in [coeffq+16*7]
;   Out: m0 = out0 |out1    m1 = out3 |out2    m2 = out4 |out5
;        m3 = out7 |out6    m4 = out8 |out9    m5 = out11|out10
;        m6 = m7 = out12|out13               [coeffq+16*7] = out15|out14
;   Uses [coeffq+16*4..16*6] as scratch.
;   NOTE(review): IDCT8_1D_PACKED is defined outside this excerpt; from its
;   use here it transforms m0-m3 in place and must leave m4 and m7 intact -
;   confirm at its definition.
1870ALIGN function_align
1871.main:
1872    punpckhqdq            m7, m0, m1                 ;low:in1  high:in3
1873    punpcklqdq            m0, m1
1874    punpcklqdq            m1, m2, m3
1875    punpckhqdq            m3, m2                     ;low:in7  high:in5
    ; park the odd inputs that are not needed by the even half
1876    mova       [coeffq+16*4], m7
1877    mova       [coeffq+16*5], m3
1878    mova                  m7, [coeffq+16*7]
1879    punpcklqdq            m2, m4, m5
1880    punpckhqdq            m4, m5                     ;low:in9  high:in11
1881    punpcklqdq            m3, m6, m7
1882    punpckhqdq            m7, m6                     ;low:in15 high:in13
1883    mova       [coeffq+16*6], m4
1884    IDCT8_1D_PACKED
    ; swap: fetch the parked odd inputs, park the even-half results m1-m3
1885    mova                  m6, [coeffq+16*4]
1886    mova                  m4, [coeffq+16*5]
1887    mova                  m5, [coeffq+16*6]
1888    mova       [coeffq+16*4], m1
1889    mova       [coeffq+16*5], m2
1890    mova       [coeffq+16*6], m3
1891
1892    IDCT16_1D_PACKED_ODDHALF 6, 4, 5, 7, 1, 2, 3
1893
    ; final butterflies: even-half result +/- odd-half result
1894    mova                  m1, [coeffq+16*4]
1895    psubsw                m3, m0, m7                 ;low:out15 high:out14
1896    paddsw                m0, m7                     ;low:out0  high:out1
1897    psubsw                m7, m1, m5                 ;low:out12 high:out13
1898    paddsw                m1, m5                     ;low:out3  high:out2
1899    mova       [coeffq+16*7], m3
1900    mova                  m2, [coeffq+16*5]
1901    mova                  m3, [coeffq+16*6]
1902    psubsw                m5, m2, m4                 ;low:out11 high:out10
1903    paddsw                m2, m4                     ;low:out4  high:out5
1904    psubsw                m4, m3, m6                 ;low:out8  high:out9
1905    paddsw                m3, m6                     ;low:out7  high:out6
1906    mova                  m6, m7
1907    ret
1908
; Instantiate INV_TXFM_16X4_FN for the four pairs whose type1 is adst.
1909INV_TXFM_16X4_FN adst, dct
1910INV_TXFM_16X4_FN adst, adst
1911INV_TXFM_16X4_FN adst, flipadst
1912INV_TXFM_16X4_FN adst, identity
1913
; iadst_16x4_internal
;
; Entry (first pass): loads coefficient rows 0..6 into m0..m6 (row 7 is read
; from [coeffq+16*7] inside .main), runs .main and .main_pass1_end,
; interleaves the packed outputs, scales them in .pass1_end and continues in
; m(idct_16x4_internal).pass1_end3.
;
; .main and the two .main_*_end helpers leave several outputs negated (see
; the "-outN" comments). .pass1_end undoes this by multiplying the vectors
; that hold negated outputs with the negated scale factor.
1914cglobal iadst_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
1915    LOAD_7ROWS        coeffq, 16
1916    call .main
1917    call .main_pass1_end
1918
1919    punpckhwd             m6, m7, m0                 ;packed -out11, -out15
1920    punpcklwd             m0, m7                     ;packed   out0,   out4
1921    punpcklwd             m7, m3, m4                 ;packed  -out3,  -out7
1922    punpckhwd             m4, m3                     ;packed   out8,  out12
1923    mova                  m1, [coeffq+16*6]
1924    punpcklwd             m3, m1, m5                 ;packed  -out1,  -out5
1925    punpckhwd             m5, m1                     ;packed  out10,  out14
1926    mova                  m1, [coeffq+16*7]
    ; the two vectors that do not fit in registers go to rows 6 and 7
1927    mova       [coeffq+16*6], m3
1928    mova       [coeffq+16*7], m7
1929    punpckhwd             m3, m2, m1                 ;packed  -out9,  -out13
1930    punpcklwd             m1, m2                     ;packed   out2,   out6
1931
1932    mova                  m7, [o(pw_16384)]
1933
    ; .pass1_end (also entered from iflipadst_16x4_internal with m7 negated):
    ;   m0, m1, m4, m5 and nothing else are multiplied by m7;
    ;   m3, m6 and the vector in [coeffq+16*6] are multiplied by -m7;
    ;   m2 receives -m7 times the vector in [coeffq+16*7].
    ; The register/memory layout on exit is the one .pass1_end3 expects.
1934.pass1_end:
1935    REPX    {pmulhrsw x, m7}, m0, m1, m4, m5
1936    pxor                  m2, m2
1937    psubw                 m2, m7
1938    mova                  m7, [coeffq+16*6]
1939    REPX    {pmulhrsw x, m2}, m7, m3, m6
1940    pmulhrsw              m2, [coeffq+16*7]
1941    mova       [coeffq+16*6], m7
1942    jmp   m(idct_16x4_internal).pass1_end3
1943
    ; .pass2: second pass = the 8x4 ADST second pass applied to each half,
    ; driven by the shared m(idct_16x4_internal).pass2_end code
1944.pass2:
1945    lea                 tx2q, [o(m(iadst_8x4_internal).pass2)]
1946    jmp   m(idct_16x4_internal).pass2_end
1947
; .main: packed 16-point inverse ADST, everything except the last rotation.
;   In:  m0..m6 = coefficient rows 0..6, row 7 in [coeffq+16*7]
;   Out: m0 = out0 |-out15        m3 = -out3|out12
;        [coeffq+16*6] = -out1|out14   [coeffq+16*7] = out2|-out13
;        m1 = t14a|t15a   m2 = t10|t11   m4 = t2a|t3a   m5 = t6|t7
;        (the four t-vectors are finished by .main_pass1_end or
;        .main_pass2_end)
;        m6 = pd_2048 as loaded below (.main_pass1_end adds it as rounding)
;   Uses [coeffq+16*4..16*7] as scratch; m7 is clobbered.
;   NOTE(review): ITX_MUL2X_PACK is defined outside this excerpt; the meaning
;   of its optional last argument must be checked there.
1948ALIGN function_align
1949.main:
1950    mova       [coeffq+16*6], m0
1951    pshufd                m0, m1, q1032
1952    pshufd                m2, m2, q1032
1953    punpckhwd             m1, m6, m0                 ;packed in13,  in2
1954    punpcklwd             m0, m6                     ;packed  in3, in12
1955    punpckhwd             m7, m5, m2                 ;packed in11,  in4
1956    punpcklwd             m2, m5                     ;packed  in5, in10
1957    mova                  m6, [o(pd_2048)]
1958    ITX_MUL2X_PACK         1, 5, 6,  995, 3973       ;low:t2   high:t3
1959    ITX_MUL2X_PACK         7, 5, 6, 1751, 3703       ;low:t4   high:t5
1960    ITX_MUL2X_PACK         2, 5, 6, 3513, 2106       ;low:t10  high:t11
1961    ITX_MUL2X_PACK         0, 5, 6, 3857, 1380       ;low:t12  high:t13
1962    psubsw                m5, m1, m2                 ;low:t10a high:t11a
1963    paddsw                m1, m2                     ;low:t2a  high:t3a
1964    psubsw                m2, m7, m0                 ;low:t12a high:t13a
1965    paddsw                m7, m0                     ;low:t4a  high:t5a
    ; punpcklqdq + punpckhwd pairs the low-qword words of the source with its
    ; high-qword words; the same idiom recurs throughout this routine
1966    punpcklqdq            m0, m5
1967    punpckhwd             m0, m5                     ;packed t10a, t11a
1968    punpcklqdq            m5, m2
1969    punpckhwd             m2, m5                     ;packed t13a, t12a
1970    ITX_MUL2X_PACK         0, 5, 6, 3406, 2276       ;low:t10  high:t11
1971    ITX_MUL2X_PACK         2, 5, 6, 4017,  799, 1    ;low:t12  high:t13
    ; park t2a..t5a and fetch rows 0 and 7 for the second group of inputs
1972    mova       [coeffq+16*4], m1
1973    mova       [coeffq+16*5], m7
1974    mova                  m1, [coeffq+16*6]
1975    mova                  m7, [coeffq+16*7]
1976    pshufd                m1, m1, q1032
1977    pshufd                m3, m3, q1032
1978    punpckhwd             m5, m7, m1                 ;packed in15,  in0
1979    punpcklwd             m1, m7                     ;packed  in1, in14
1980    punpckhwd             m7, m4, m3                 ;packed  in9,  in6
1981    punpcklwd             m3, m4                     ;packed  in7,  in8
1982    ITX_MUL2X_PACK         5, 4, 6,  201, 4091       ;low:t0    high:t1
1983    ITX_MUL2X_PACK         7, 4, 6, 2440, 3290       ;low:t6    high:t7
1984    ITX_MUL2X_PACK         3, 4, 6, 3035, 2751       ;low:t8    high:t9
1985    ITX_MUL2X_PACK         1, 4, 6, 4052,  601       ;low:t14   high:t15
1986    psubsw                m4, m5, m3                 ;low:t8a   high:t9a
1987    paddsw                m5, m3                     ;low:t0a   high:t1a
1988    psubsw                m3, m7, m1                 ;low:t14a  high:t15a
1989    paddsw                m7, m1                     ;low:t6a   high:t7a
1990    punpcklqdq            m1, m4
1991    punpckhwd             m1, m4                     ;packed  t8a,  t9a
1992    punpcklqdq            m4, m3
1993    punpckhwd             m3, m4                     ;packed t15a, t14a
1994    ITX_MUL2X_PACK         1, 4, 6,  799, 4017       ;low:t8    high:t9
1995    ITX_MUL2X_PACK         3, 4, 6, 2276, 3406, 1    ;low:t14   high:t15
    ; the sum (m4) is the t8a/t9a pair consumed by the out1/out14 butterfly
    ; below; the difference (m1) is the t12a/t13a pair that gets rotated
1996    paddsw                m4, m1, m2                 ;low:t8a   high:t9a
1997    psubsw                m1, m2                     ;low:t12a  high:t13a
1998    psubsw                m2, m0, m3                 ;low:t14a  high:t15a
1999    paddsw                m0, m3                     ;low:t10a  high:t11a
2000    punpcklqdq            m3, m1
2001    punpckhwd             m3, m1                     ;packed t12a, t13a
2002    punpcklqdq            m1, m2
2003    punpckhwd             m2, m1                     ;packed t15a, t14a
2004    ITX_MUL2X_PACK         3, 1, 6, 1567, 3784       ;low:t12   high:t13
2005    ITX_MUL2X_PACK         2, 1, 6, 3784, 1567, 1    ;low:t14   high:t15
2006    psubsw                m1, m3, m2                 ;low:t14a  high:t15a
2007    paddsw                m3, m2                     ;low:out2  high:-out13
2008    psubsw                m2, m4, m0                 ;low:t10   high:t11
2009    paddsw                m0, m4                     ;low:-out1 high:out14
    ; two finished output vectors leave through rows 6 and 7
2010    mova       [coeffq+16*6], m0
2011    mova       [coeffq+16*7], m3
2012    mova                  m0, [coeffq+16*4]
2013    mova                  m3, [coeffq+16*5]
2014    psubsw                m4, m5, m3                 ;low:t4    high:t5
2015    paddsw                m5, m3                     ;low:t0    high:t1
2016    psubsw                m3, m0, m7                 ;low:t6    high:t7
2017    paddsw                m0, m7                     ;low:t2    high:t3
2018    punpcklqdq            m7, m4
2019    punpckhwd             m7, m4                     ;packed t4, t5
2020    punpcklqdq            m4, m3
2021    punpckhwd             m3, m4                     ;packed t7, t6
2022    ITX_MUL2X_PACK         7, 4, 6, 1567, 3784       ;low:t4a   high:t5a
2023    ITX_MUL2X_PACK         3, 4, 6, 3784, 1567, 1    ;low:t6a   high:t7a
2024    psubsw                m4, m5, m0                 ;low:t2a   high:t3a
2025    paddsw                m0, m5                     ;low:out0  high:-out15
2026    psubsw                m5, m7, m3                 ;low:t6    high:t7
2027    paddsw                m3, m7                     ;low:-out3 high:out12
2028    ret
; .main_pass1_end: last rotation of the ADST for the first pass, done in
; 32-bit precision (pmaddwd with +/-2896 pairs, + m6, >> 12, saturating pack).
;   In:  m1, m2, m4, m5 as left by .main; m6 = rounding constant from .main
;   Out: m2 = out6|-out9    m4 = -out7|out8
;        m7 = out4|-out11   m5 = -out5|out10
;   m0 and m3 are preserved by spilling them to [coeffq+16*4] / [coeffq+16*5]
;   while the registers hold the two multiplier constants; m1 is clobbered.
2029ALIGN function_align
2030.main_pass1_end:
2031    mova                  m7, [o(deint_shuf1)]
2032    mova       [coeffq+16*4], m0
2033    mova       [coeffq+16*5], m3
2034    mova                  m0, [o(pw_2896_m2896)]
2035    mova                  m3, [o(pw_2896_2896)]
2036    pshufb                m1, m7                     ;t14a t15a
2037    pshufb                m2, m7                     ;t10  t11
2038    pshufb                m4, m7                     ;t2a  t3a
2039    pshufb                m5, m7                     ;t6   t7
2040    pmaddwd               m7, m0, m2
2041    pmaddwd               m2, m3
2042    paddd                 m7, m6
2043    paddd                 m2, m6
2044    psrad                 m7, 12
2045    psrad                 m2, 12
2046    packssdw              m2, m7                     ;low:out6  high:-out9
2047    pmaddwd               m7, m0, m4
2048    pmaddwd               m4, m3
2049    paddd                 m7, m6
2050    paddd                 m4, m6
2051    psrad                 m7, 12
2052    psrad                 m4, 12
2053    packssdw              m4, m7                     ;low:-out7 high:out8
2054    pmaddwd               m7, m3, m5
2055    pmaddwd               m5, m0
2056    paddd                 m7, m6
2057    paddd                 m5, m6
2058    psrad                 m7, 12
2059    psrad                 m5, 12
2060    packssdw              m7, m5                     ;low:out4  high:-out11
2061    pmaddwd               m5, m3, m1
2062    pmaddwd               m1, m0
2063    paddd                 m5, m6
2064    paddd                 m1, m6
2065    psrad                 m5, 12
2066    psrad                 m1, 12
2067    packssdw              m5, m1                     ;low:-out5 high:out10
2068    mova                  m0, [coeffq+16*4]
2069    mova                  m3, [coeffq+16*5]
2070    ret
; .main_pass2_end: same last rotation for the second pass, done in 16-bit
; precision: saturating add/sub of the paired terms followed by pmulhrsw
; with pw_2896x8.
;   In:  m1, m2, m4, m5 as left by .main
;   Out: m7 = out4|-out11   m4 = -out7|out8
;        m5 = -out5|out10   m2 = out6|-out9
;   Clobbers m1 and m6; m0 and m3 are not touched.
2071ALIGN function_align
2072.main_pass2_end:
2073    mova                  m7, [o(pw_2896x8)]
2074    punpckhqdq            m6, m2, m1                 ;low:t11   high:t15a
2075    punpcklqdq            m2, m1                     ;low:t10   high:t14a
2076    psubsw                m1, m2, m6
2077    paddsw                m2, m6
2078    punpckhqdq            m6, m4, m5                 ;low:t3a   high:t7
2079    punpcklqdq            m4, m5                     ;low:t2a   high:t6
2080    psubsw                m5, m4, m6
2081    paddsw                m4, m6
2082    pmulhrsw              m1, m7                     ;low:-out9 high:out10
2083    pmulhrsw              m2, m7                     ;low:out6  high:-out5
2084    pmulhrsw              m5, m7                     ;low:out8  high:-out11
2085    pmulhrsw              m4, m7                     ;low:-out7 high:out4
2086    punpckhqdq            m7, m4, m5                 ;low:out4  high:-out11
2087    punpcklqdq            m4, m5                     ;low:-out7 high:out8
2088    punpckhqdq            m5, m2, m1                 ;low:-out5 high:out10
2089    punpcklqdq            m2, m1                     ;low:out6  high:-out9
2090    ret
2091
2092
; Instantiate INV_TXFM_16X4_FN for the four pairs whose type1 is flipadst.
2093INV_TXFM_16X4_FN flipadst, dct
2094INV_TXFM_16X4_FN flipadst, adst
2095INV_TXFM_16X4_FN flipadst, flipadst
2096INV_TXFM_16X4_FN flipadst, identity
2097
; iflipadst_16x4_internal
;
; First pass: reuses the ADST core (m(iadst_16x4_internal).main and
; .main_pass1_end) and differs from iadst_16x4_internal only in the
; interleave below, which picks the opposite register halves
; (punpckl <-> punpckh), and in the sign of the scale factor: m7 is loaded
; with pw_m16384 before entering the shared .pass1_end, so the vectors
; marked negative in the comments are the ones multiplied by m7 there.
2098cglobal iflipadst_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
2099    LOAD_7ROWS        coeffq, 16
2100    call m(iadst_16x4_internal).main
2101    call m(iadst_16x4_internal).main_pass1_end
2102
2103    punpcklwd             m6, m7, m0                 ;packed  out11,  out15
2104    punpckhwd             m0, m7                     ;packed  -out0,  -out4
2105    punpckhwd             m7, m3, m4                 ;packed   out3,   out7
2106    punpcklwd             m4, m3                     ;packed  -out8, -out12
2107    mova                  m1, [coeffq+16*6]
2108    punpckhwd             m3, m1, m5                 ;packed   out1,   out5
2109    punpcklwd             m5, m1                     ;packed -out10, -out14
2110    mova                  m1, [coeffq+16*7]
    ; the two vectors that do not fit in registers go to rows 6 and 7,
    ; exactly where the shared .pass1_end reads them
2111    mova       [coeffq+16*6], m3
2112    mova       [coeffq+16*7], m7
2113    punpcklwd             m3, m2, m1                 ;packed   out9,  out13
2114    punpckhwd             m1, m2                     ;packed  -out2,  -out6
2115
2116    mova                  m7, [o(pw_m16384)]
2117    jmp   m(iadst_16x4_internal).pass1_end
2118
    ; .pass2: second pass = the 8x4 flipped-ADST second pass applied to each
    ; half, driven by the shared m(idct_16x4_internal).pass2_end code
2119.pass2:
2120    lea                 tx2q, [o(m(iflipadst_8x4_internal).pass2)]
2121    jmp   m(idct_16x4_internal).pass2_end
2122
2123
; Instantiate INV_TXFM_16X4_FN for the four pairs whose type1 is identity.
2124INV_TXFM_16X4_FN identity, dct
2125INV_TXFM_16X4_FN identity, adst
2126INV_TXFM_16X4_FN identity, flipadst
2127INV_TXFM_16X4_FN identity, identity
2128
; iidentity_16x4_internal
;
; First pass: every coefficient row x becomes
;     x + pmulhrsw(pmulhrsw(x, pw_1697x16), pw_16384)      (saturating add)
; which is the 4-argument form of IDTX16 written out by hand. With only eight
; registers (two of them holding the constants) the rows are processed in
; three batches - 5/6/7, then 2/3/4, then 0/1 - and the first batch is
; written back to the coefficient buffer in between.
; The results are then interleaved and handed to
; m(idct_16x4_internal).pass1_end3, i.e. the pw_16384 scaling done in
; .pass1_end2 for the other transform types is skipped.
2129cglobal iidentity_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
2130    mova                  m1, [coeffq+16*6]
2131    mova                  m0, [coeffq+16*5]
2132    mova                  m2, [coeffq+16*7]
2133    mova                  m6, [o(pw_1697x16)]
2134    mova                  m7, [o(pw_16384)]
2135    pmulhrsw              m4, m6, m1
2136    pmulhrsw              m3, m6, m0
2137    pmulhrsw              m5, m6, m2
2138    pmulhrsw              m4, m7
2139    pmulhrsw              m3, m7
2140    pmulhrsw              m5, m7
2141    paddsw                m1, m4
2142    paddsw                m0, m3
    ; note: the row 7 result ends up in m5 (not m2)
2143    paddsw                m5, m2
2144    mova                  m2, [coeffq+16*2]
2145    mova                  m3, [coeffq+16*3]
2146    mova                  m4, [coeffq+16*4]
    ; write the finished rows 5, 6, 7 back to free their registers
2147    mova       [coeffq+16*6], m1
2148    mova       [coeffq+16*5], m0
2149    mova       [coeffq+16*7], m5
2150    pmulhrsw              m0, m6, m2
2151    pmulhrsw              m1, m6, m3
2152    pmulhrsw              m5, m6, m4
2153    pmulhrsw              m0, m7
2154    pmulhrsw              m1, m7
2155    pmulhrsw              m5, m7
2156    paddsw                m2, m0
2157    paddsw                m3, m1
2158    paddsw                m4, m5
2159    mova                  m0, [coeffq+16*0]
2160    mova                  m1, [coeffq+16*1]
2161    pmulhrsw              m5, m6, m0
    ; last use of pw_1697x16: m6 itself becomes the product for row 1
2162    pmulhrsw              m6, m1
2163    pmulhrsw              m5, m7
2164    pmulhrsw              m6, m7
2165    paddsw                m0, m5
2166    paddsw                m1, m6
2167    mova                  m6, [coeffq+16*6]
2168    mova                  m5, [coeffq+16*5]
    ; interleave into the layout .pass1_end3 expects: the "out1, out5" vector
    ; is spilled to [coeffq+16*6], row 7 is fetched from [coeffq+16*7]
2169    punpckhwd             m7, m0, m2                 ;packed out1,  out5
2170    punpcklwd             m0, m2                     ;packed out0,  out4
2171    punpckhwd             m2, m1, m3                 ;packed out3,  out7
2172    punpcklwd             m1, m3                     ;packed out2,  out6
2173    mova       [coeffq+16*6], m7
2174    mova                  m7, [coeffq+16*7]
2175    punpckhwd             m3, m4, m6                 ;packed out9,  out13
2176    punpcklwd             m4, m6                     ;packed out8,  out12
2177    punpckhwd             m6, m5, m7                 ;packed out11, out15
2178    punpcklwd             m5, m7                     ;packed out10, out14
2179    jmp   m(idct_16x4_internal).pass1_end3
2180
    ; .pass2: second pass = the 8x4 identity second pass applied to each
    ; half, driven by the shared m(idct_16x4_internal).pass2_end code
2181.pass2:
2182    lea                 tx2q, [o(m(iidentity_8x4_internal).pass2)]
2183    jmp   m(idct_16x4_internal).pass2_end
2184
2185
; SAVE_8ROWS dst, stride
; Stores m0..m7 to dst + stride*i for i = 0..7. mova is used, so every
; address must be suitably aligned.
2186%macro SAVE_8ROWS 2  ;dst, stride
2187    mova                 [%1+%2*0], m0
2188    mova                 [%1+%2*1], m1
2189    mova                 [%1+%2*2], m2
2190    mova                 [%1+%2*3], m3
2191    mova                 [%1+%2*4], m4
2192    mova                 [%1+%2*5], m5
2193    mova                 [%1+%2*6], m6
2194    mova                 [%1+%2*7], m7
2195%endmacro
2196
; INV_TXFM_8X16_FN type1, type2
; Expands INV_TXFM_FN for an 8x16 block with the given type pair (the last
; argument, 16*16, is passed through to INV_TXFM_FN). For the dct_dct pair
; only, the macro additionally emits the code below, which derives a single
; value from the first coefficient and hands it to the loop of the 8x8
; dct_dct function.
; NOTE(review): INV_TXFM_FN and m(inv_txfm_add_dct_dct_8x8).loop are outside
; this excerpt; presumably INV_TXFM_FN routes the DC-only case here and the
; loop adds m0 to dst using r3d as its trip count before jumping to tx2q -
; confirm at those definitions.
2197%macro INV_TXFM_8X16_FN 2 ; type1, type2
2198    INV_TXFM_FN          %1, %2, 8x16, 8, 16*16
2199%ifidn %1_%2, dct_dct
    ; broadcast coefficient word 0 to all eight word lanes of m0
2200    pshuflw              m0, [coeffq], q0000
2201    punpcklwd            m0, m0
2202    mova                 m1, [o(pw_2896x8)]
2203    pmulhrsw             m0, m1
2204    mova                 m2, [o(pw_16384)]
    ; overwrite the first coefficient dword with eobd
    ; NOTE(review): presumably eob is 0 on this path so this clears the DC
    ; coefficient - confirm
2205    mov            [coeffq], eobd
2206    pmulhrsw             m0, m1
2207    pmulhrsw             m0, m2
2208    psrlw                m2, 3              ; pw_2048
2209    pmulhrsw             m0, m1
2210    pmulhrsw             m0, m2
2211    mov                 r3d, 4
2212    lea                tx2q, [o(m(inv_txfm_add_dct_dct_8x16).end)]
2213    jmp m(inv_txfm_add_dct_dct_8x8).loop
2214.end:
2215    RET
2216%endif
2217%endmacro
2218
; Instantiate INV_TXFM_8X16_FN for the four pairs whose type1 is dct; the
; dct, dct instance also carries the DC-only code emitted by that macro.
2219INV_TXFM_8X16_FN dct, dct
2220INV_TXFM_8X16_FN dct, adst
2221INV_TXFM_8X16_FN dct, flipadst
2222INV_TXFM_8X16_FN dct, identity
2223
; idct_8x16_internal
;
; First pass: the 8x8 first-pass routine whose address is in r3 is run twice,
; once on the odd 16-byte coefficient rows (coeffq+16*1, stride 32) and once
; on the even rows (coeffq+16*0, stride 32). .pass1 / .pass1_end are shared
; with the adst and flipadst 8x16 routines, which only preload a different
; r3. The caller's tx2q is kept in the stack slot [rsp+gprsize+16*11] during
; the first run.
; NOTE(review): LOAD_8ROWS and the 8x8 routines are outside this excerpt; the
; third LOAD_8ROWS argument (1) and the assumption that the 8x8 first pass
; ends in "jmp tx2q" must be confirmed there.
;
; Second pass (.pass2): an 8-point and a 16-point stage
; (m(idct_8x8_internal).main, m(idct_16x8_internal).main) followed by two
; trips through m(idct_8x8_internal).end - first with dst advanced by 8 rows,
; then (.end) with the original dst and the vectors saved on the stack - and
; finally .end1, which zeroes the whole 16x16-byte coefficient buffer.
2224cglobal idct_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
2225    lea                    r3, [o(m(idct_8x8_internal).pass1)]
2226
2227.pass1:
2228    LOAD_8ROWS    coeffq+16*1, 32, 1
    ; save the caller's continuation and come back to .pass1_end instead
2229    mov   [rsp+gprsize+16*11], tx2q
2230    lea                  tx2q, [o(m(idct_8x16_internal).pass1_end)]
2231    jmp                    r3
2232
2233.pass1_end:
    ; store the first run's results, load the remaining rows, restore the
    ; caller's continuation and run the same first-pass routine again
2234    SAVE_8ROWS    coeffq+16*1, 32
2235    LOAD_8ROWS    coeffq+16*0, 32, 1
2236    mov                  tx2q, [rsp+gprsize+16*11]
2237    jmp                    r3
2238
2239.pass2:
2240    lea                  tx2q, [o(m(idct_8x16_internal).end)]
2241
    ; .pass2_pre: m0-m7 hold the results of the last first-pass run. Park
    ; m1, m3, m5, m7 in rows 2, 6, 10, 14, compact m0, m2, m4, m6 into m0-m3
    ; and load rows 1, 5, 9, 13 into m4-m7 for the 8-point stage.
2242.pass2_pre:
2243    mova       [coeffq+16*2 ], m1
2244    mova       [coeffq+16*6 ], m3
2245    mova       [coeffq+16*10], m5
2246    mova       [coeffq+16*14], m7
2247    mova                   m1, m2
2248    mova                   m2, m4
2249    mova                   m3, m6
2250    mova                   m4, [coeffq+16*1 ]
2251    mova                   m5, [coeffq+16*5 ]
2252    mova                   m6, [coeffq+16*9 ]
2253    mova                   m7, [coeffq+16*13]
2254
2255.pass2_main:
2256    call m(idct_8x8_internal).main
2257
    ; keep m0-m6 in stack slots 3..9, then feed the parked rows 2, 6, 10, 14
    ; and rows 3, 7, 11, 15 to the 16-point stage
2258    SAVE_7ROWS   rsp+gprsize+16*3, 16
2259    mova                   m0, [coeffq+16*2 ]
2260    mova                   m1, [coeffq+16*6 ]
2261    mova                   m2, [coeffq+16*10]
2262    mova                   m3, [coeffq+16*14]
2263    mova                   m4, [coeffq+16*3 ]
2264    mova                   m5, [coeffq+16*7 ]
2265    mova                   m6, [coeffq+16*11]
2266    mova                   m7, [coeffq+16*15]
2267    call m(idct_16x8_internal).main
2268
    ; first trip through the 8x8 end code: dst advanced by 8 rows, original
    ; dst remembered in r3
2269    mov                    r3, dstq
2270    lea                  dstq, [dstq+strideq*8]
2271    jmp  m(idct_8x8_internal).end
2272
    ; .end: second trip, using the vectors read back from the stack starting
    ; at slot 3, at the original dst; continues at .end1 afterwards
2273.end:
2274    LOAD_8ROWS   rsp+gprsize+16*3, 16
2275    mova   [rsp+gprsize+16*0], m7
2276    lea                  tx2q, [o(m(idct_8x16_internal).end1)]
2277    mov                  dstq, r3
2278    jmp  m(idct_8x8_internal).end
2279
    ; .end1: clear all 16 rows (256 bytes) of the coefficient buffer; also
    ; used as the final continuation by the adst/flipadst 8x16 routines
2280.end1:
2281    pxor                   m7, m7
2282    REPX  {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15
2283    ret
2284
; Instantiate INV_TXFM_8X16_FN for the four pairs whose type1 is adst.
2285INV_TXFM_8X16_FN adst, dct
2286INV_TXFM_8X16_FN adst, adst
2287INV_TXFM_8X16_FN adst, flipadst
2288INV_TXFM_8X16_FN adst, identity
2289
; iadst_8x16_internal
;
; First pass: the shared m(idct_8x16_internal).pass1 driver with r3 pointing
; at the 8x8 ADST first pass.
;
; Second pass (.pass2): arranges the 16 input vectors in registers and stack
; slots, runs m(iadst_16x8_internal).main and .main_pass2_end, and makes two
; trips through m(iadst_8x8_internal).end - first with dst advanced by 8
; rows, then (.end) with the original dst - before finishing in
; m(idct_8x16_internal).end1.
; NOTE(review): the stack-slot layout used below is whatever
; m(iadst_16x8_internal).main expects; that routine is outside this excerpt.
2290cglobal iadst_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
2291    lea                    r3, [o(m(iadst_8x8_internal).pass1)]
2292    jmp  m(idct_8x16_internal).pass1
2293
2294.pass2:
2295    lea                  tx2q, [o(m(iadst_8x16_internal).end)]
2296
    ; .pass2_pre: m0, m1, m6, m7 move to stack slots 7, 8, 5, 6 and m2-m5
    ; shift down into m0-m3
2297.pass2_pre:
2298    mova    [rsp+gprsize+16*7], m0
2299    mova    [rsp+gprsize+16*8], m1
2300    mova    [rsp+gprsize+16*5], m6
2301    mova    [rsp+gprsize+16*6], m7
2302    mova                    m0, m2
2303    mova                    m1, m3
2304    mova                    m2, m4
2305    mova                    m3, m5
2306
    ; .pass2_main: coefficient rows 1, 3, 13, 15 are copied to stack slots
    ; 3, 4, 9, 10 (32*5 == 16*10); rows 5, 7, 9, 11 are loaded into m4-m7
2307.pass2_main:
2308    mova                    m4, [coeffq+16*1 ]
2309    mova                    m5, [coeffq+16*3 ]
2310    mova                    m6, [coeffq+16*13]
2311    mova                    m7, [coeffq+16*15]
2312    mova    [rsp+gprsize+16*3], m4
2313    mova    [rsp+gprsize+16*4], m5
2314    mova    [rsp+gprsize+16*9], m6
2315    mova    [rsp+gprsize+32*5], m7
2316    mova                    m4, [coeffq+16*5 ]
2317    mova                    m5, [coeffq+16*7 ]
2318    mova                    m6, [coeffq+16*9 ]
2319    mova                    m7, [coeffq+16*11]
2320
2321    call m(iadst_16x8_internal).main
2322    call m(iadst_16x8_internal).main_pass2_end
2323
    ; first trip through the 8x8 end code: dst advanced by 8 rows, original
    ; dst remembered in r3
2324    mov                    r3, dstq
2325    lea                  dstq, [dstq+strideq*8]
2326    jmp m(iadst_8x8_internal).end
2327
    ; .end: second trip, using the vectors read back from the stack starting
    ; at slot 3, at the original dst; the coefficient buffer is cleared by
    ; m(idct_8x16_internal).end1 afterwards
2328.end:
2329    LOAD_8ROWS   rsp+gprsize+16*3, 16
2330    mova   [rsp+gprsize+16*0], m7
2331    lea                  tx2q, [o(m(idct_8x16_internal).end1)]
2332    mov                  dstq, r3
2333    jmp  m(iadst_8x8_internal).end
2334
2335
; Instantiate INV_TXFM_8X16_FN for the four pairs whose type1 is flipadst.
2336INV_TXFM_8X16_FN flipadst, dct
2337INV_TXFM_8X16_FN flipadst, adst
2338INV_TXFM_8X16_FN flipadst, flipadst
2339INV_TXFM_8X16_FN flipadst, identity
2340
; iflipadst_8x16_internal
;
; First pass: the shared m(idct_8x16_internal).pass1 driver with r3 pointing
; at the 8x8 flipped-ADST first pass.
;
; Second pass (.pass2): same register/stack preparation and the same ADST
; core as iadst_8x16_internal, but the two trips through the 8x8 end code go
; to m(iflipadst_8x8_internal).end and visit the halves in the opposite
; order: the first trip uses the original dst, the second (.end) uses
; dst + 8 rows, which is computed up front into r3.
; NOTE(review): the stack-slot layout used below is whatever
; m(iadst_16x8_internal).main expects; that routine is outside this excerpt.
2341cglobal iflipadst_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
2342    lea                    r3, [o(m(iflipadst_8x8_internal).pass1)]
2343    jmp  m(idct_8x16_internal).pass1
2344
2345.pass2:
2346    lea                   tx2q, [o(m(iflipadst_8x16_internal).end)]
2347    lea                     r3, [dstq+strideq*8]
2348
    ; .pass2_pre: m0, m1, m6, m7 move to stack slots 7, 8, 5, 6 and m2-m5
    ; shift down into m0-m3
2349.pass2_pre:
2350    mova    [rsp+gprsize+16*7], m0
2351    mova    [rsp+gprsize+16*8], m1
2352    mova    [rsp+gprsize+16*5], m6
2353    mova    [rsp+gprsize+16*6], m7
2354    mova                    m0, m2
2355    mova                    m1, m3
2356    mova                    m2, m4
2357    mova                    m3, m5
2358
    ; .pass2_main: coefficient rows 1, 3, 13, 15 are copied to stack slots
    ; 3, 4, 9, 10 (32*5 == 16*10); rows 5, 7, 9, 11 are loaded into m4-m7
2359.pass2_main:
2360    mova                    m4, [coeffq+16*1 ]
2361    mova                    m5, [coeffq+16*3 ]
2362    mova                    m6, [coeffq+16*13]
2363    mova                    m7, [coeffq+16*15]
2364    mova    [rsp+gprsize+16*3], m4
2365    mova    [rsp+gprsize+16*4], m5
2366    mova    [rsp+gprsize+16*9], m6
2367    mova    [rsp+gprsize+32*5], m7
2368    mova                    m4, [coeffq+16*5 ]
2369    mova                    m5, [coeffq+16*7 ]
2370    mova                    m6, [coeffq+16*9 ]
2371    mova                    m7, [coeffq+16*11]
2372
2373    call m(iadst_16x8_internal).main
2374    call m(iadst_16x8_internal).main_pass2_end
    ; first trip: dst is left unchanged here
2375    jmp  m(iflipadst_8x8_internal).end
2376
    ; .end: second trip, using the vectors read back from the stack starting
    ; at slot 3, at dst + 8 rows (r3); the coefficient buffer is cleared by
    ; m(idct_8x16_internal).end1 afterwards
2377.end:
2378    LOAD_8ROWS    rsp+gprsize+16*3, 16
2379    mova    [rsp+gprsize+16*0], m7
2380    lea                   tx2q, [o(m(idct_8x16_internal).end1)]
2381    mov                   dstq, r3
2382    jmp  m(iflipadst_8x8_internal).end
2383
2384
; 8x16 inverse transform with "identity" as type1.
; INV_TXFM_8X16_FN is defined earlier in the file (not visible here); it is
; invoked once per type2. The cglobal below supplies the pass-1 entry and the
; .pass1_end / .pass2 / .end / .end1 labels.
2385INV_TXFM_8X16_FN identity, dct
2386INV_TXFM_8X16_FN identity, adst
2387INV_TXFM_8X16_FN identity, flipadst
2388INV_TXFM_8X16_FN identity, identity
2389
2390cglobal iidentity_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    ; Pass 1, first half: rows from coeffq+16*1 with a 32-byte stride.
    ; The caller's tx2q is parked in r3 while tx2q temporarily points at
    ; .pass1_end; m6 is copied to stack slot 1 before jumping into the shared
    ; 8x8 pass-1 tail (idct_8x8_internal.pass1_end3, outside this view).
    ; NOTE(review): the third LOAD_8ROWS argument (1) is not explained by the
    ; visible code -- confirm its meaning in the macro definition.
2391    LOAD_8ROWS    coeffq+16*1, 32, 1
2392    mov                    r3, tx2q
2393    lea                  tx2q, [o(m(iidentity_8x16_internal).pass1_end)]
2394    mova   [rsp+gprsize+16*1], m6
2395    jmp  m(idct_8x8_internal).pass1_end3
2396
2397.pass1_end:
    ; Pass 1, second half: write the first half back to the slots it was
    ; read from, load the rows at coeffq+16*0 (32-byte stride), restore the
    ; caller's tx2q and run the same shared tail again.
2398    SAVE_8ROWS    coeffq+16*1, 32
2399    LOAD_8ROWS    coeffq+16*0, 32, 1
2400    mov                  tx2q, r3
2401    mova   [rsp+gprsize+16*1], m6
2402    jmp  m(idct_8x8_internal).pass1_end3
2403
2404.pass2:
    ; Continuation after the first eight rows is .end1; falls through to .end.
2405    lea                  tx2q, [o(m(iidentity_8x16_internal).end1)]
2406
2407.end:
    ; m7 and m6 are spilled to slots 0 and 1 so m7 can hold pw_1697x16 and
    ; m6 can serve as the second IDTX16 operand for rows 0..5. Rows 6 and 7
    ; are then processed with m5 / m7 in that role, row 5 waiting in slot 2.
    ; (IDTX16 is defined elsewhere; its exact arithmetic is not visible here.)
2408    mova   [rsp+gprsize+16*0], m7
2409    mova   [rsp+gprsize+16*1], m6
2410    mova                   m7, [o(pw_1697x16)]
2411    REPX     {IDTX16 x, 6, 7}, 0, 1, 2, 3, 4, 5
2412    mova                   m6, [rsp+gprsize+16*1]
2413    mova   [rsp+gprsize+16*2], m5
2414    IDTX16                  6, 5, 7
2415    mova                   m5, [rsp+gprsize+16*0]
2416    IDTX16                  5, 7, 7
    ; Multiply every row by pw_2048 (pmulhrsw). After this, m5 holds what
    ; was row 7, m6 row 6 and m7 row 5; the same three values are written to
    ; stack slots 0, 1, 2 before the jump to idct_8x8_internal.end3.
2417    mova                   m7, [o(pw_2048)]
2418    REPX     {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
2419    pmulhrsw               m7, [rsp+gprsize+16*2]
2420    mova   [rsp+gprsize+16*0], m5
2421    mova   [rsp+gprsize+16*1], m6
2422    mova   [rsp+gprsize+16*2], m7
2423    jmp  m(idct_8x8_internal).end3
2424
2425.end1:
    ; Second eight rows: load from coeffq+16*1 (32-byte stride), make
    ; idct_8x16_internal.end1 the final continuation, advance dstq by two
    ; rows and run .end again.
    ; NOTE(review): the +2*stride step implies the code reached through
    ; end3 has already advanced dstq -- confirm in idct_8x8_internal.
2426    LOAD_8ROWS    coeffq+16*1, 32
2427    lea                  tx2q, [o(m(idct_8x16_internal).end1)]
2428    lea                  dstq, [dstq+strideq*2]
2429    jmp .end
2430
2431
; INV_TXFM_16X8_FN type1, type2
; Expands INV_TXFM_FN (defined elsewhere) for the 16x8 block size with the
; arguments 8 and 16*16 (NOTE(review): their meaning is set by INV_TXFM_FN;
; 16*16 looks like a stack size -- confirm). For the dct_dct pair only, it
; additionally emits the code below, which hands a single coefficient to the
; 16x4 .dconly code.
2432%macro INV_TXFM_16X8_FN 2 ; type1, type2
2433    INV_TXFM_FN          %1, %2, 16x8, 8, 16*16
2434%ifidn %1_%2, dct_dct
    ; m0 = [coeffq] scaled by pw_2896x8 twice (pmulhrsw, once on load and
    ; once more below); m2 = pw_16384; r2d = 4; tx2q = .end (the RET below).
    ; eobd is stored over the first coefficient.
    ; NOTE(review): this only clears the coefficient if eob is 0 on this
    ; path -- confirm against INV_TXFM_FN.
2435    movd                 m1, [o(pw_2896x8)]
2436    pmulhrsw             m0, m1, [coeffq]
2437    movd                 m2, [o(pw_16384)]
2438    mov            [coeffq], eobd
2439    pmulhrsw             m0, m1
2440    mov                 r2d, 4
2441    lea                tx2q, [o(m(inv_txfm_add_dct_dct_16x8).end)]
2442    jmp m(inv_txfm_add_dct_dct_16x4).dconly
2443.end:
2444    RET
2445%endif
2446%endmacro
2447
; 16x8 inverse transform with "dct" as type1: one INV_TXFM_16X8_FN expansion
; per type2, followed by the internal routine holding the pass-1 entry and
; the .pass1_end / .pass2 / .end continuations.
2448INV_TXFM_16X8_FN dct, dct
2449INV_TXFM_16X8_FN dct, adst
2450INV_TXFM_16X8_FN dct, flipadst
2451INV_TXFM_16X8_FN dct, identity
2452
2453cglobal idct_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    ; Pass 1, even half: vectors at coeffq+16*0 with a 32-byte stride go
    ; through the 8-point core (idct_8x8_internal.main); SAVE_7ROWS then
    ; parks the result in stack slots 3.. for .main to combine with.
    ; NOTE(review): the third LOAD_8ROWS argument (1) presumably requests the
    ; pw_2896x8 pre-scale that iadst_16x8_internal applies explicitly --
    ; confirm in the macro definition.
2454    LOAD_8ROWS    coeffq+16*0, 32, 1
2455    call m(idct_8x8_internal).main
2456    SAVE_7ROWS   rsp+gprsize+16*3, 16
2457
    ; Pass 1, odd half: vectors at coeffq+16*1, 32-byte stride, through
    ; .main. The caller's tx2q is parked in r3 while tx2q points at
    ; .pass1_end for the shared 8x8 pass-1 tail.
2458    LOAD_8ROWS    coeffq+16*1, 32, 1
2459    call  .main
2460    mov                    r3, tx2q
2461    lea                  tx2q, [o(m(idct_16x8_internal).pass1_end)]
2462    jmp  m(idct_8x8_internal).pass1_end
2463
2464.pass1_end:
    ; Store the rows just finished at coeffq+16*1 (32-byte stride), reload
    ; the other eight from stack slots 3.., put m7 in slot 0, restore the
    ; caller's tx2q and run the shared tail a second time.
2465    SAVE_8ROWS    coeffq+16*1, 32
2466    LOAD_8ROWS   rsp+gprsize+16*3, 16
2467    mova   [rsp+gprsize+16*0], m7
2468    mov                  tx2q, r3
2469    jmp  m(idct_8x8_internal).pass1_end
2470
2471.pass2:
    ; Pass 2 is done as two 8x8 blocks; r3 = dst + 8 is the destination
    ; installed by .end for the second one.
2472    lea                  tx2q, [o(m(idct_16x8_internal).end)]
2473    lea                    r3, [dstq+8]
2474    jmp  m(idct_8x8_internal).pass2_main
2475
2476.end:
    ; Second 8x8 block: rows from coeffq+16*1 (32-byte stride), final
    ; continuation idct_8x16_internal.end1, dstq = r3.
2477    LOAD_8ROWS    coeffq+16*1, 32
2478    lea                  tx2q, [o(m(idct_8x16_internal).end1)]
2479    mov                  dstq, r3
2480    jmp  m(idct_8x8_internal).pass2_main
2481
2482
; idct_16x8_internal.main -- second half of a 16-point inverse DCT.
; Reached via call, so stack offsets use gprsize*2 where the callers use
; gprsize for the same slot.
; In:  m0..m7 = the eight input vectors for this half (t8..t15 chain);
;      stack slots 3..9 and slot 0 = values read below as the eight partial
;      results this half is combined with (slot 3 pairs with out0/out15,
;      slot 4 with out1/out14, ..., slot 9 with out6/out9, slot 0 with
;      out7/out8).
; Out: m0..m7 = out8..out15 (m7 is also written to slot 0);
;      slots 3..9 = out0..out6, slot 10 (32*5) = out7.
; Scratch: stack slots 1 and 2.
2483ALIGN function_align
2484.main:
    ; m2, m6, m5 are parked so those registers can serve as temporaries.
2485    mova [rsp+gprsize*2+16*1], m2
2486    mova [rsp+gprsize*2+16*2], m6
2487    mova [rsp+gprsize*2+32*5], m5
2488
    ; m6 holds pd_2048 and is passed as the fifth ITX_MULSUB_2W argument
    ; until it is overwritten with t11a.
2489    mova                   m6, [o(pd_2048)]
2490    ITX_MULSUB_2W           0, 7, 2, 5, 6,  401, 4076   ;t8a, t15a
2491    ITX_MULSUB_2W           4, 3, 2, 5, 6, 3166, 2598   ;t9a, t14a
2492    psubsw                 m2, m0, m4                   ;t9
2493    paddsw                 m0, m4                       ;t8
2494    psubsw                 m4, m7, m3                   ;t14
2495    paddsw                 m7, m3                       ;t15
2496    ITX_MULSUB_2W           4, 2, 3, 5, 6, 1567, 3784   ;t9a, t14a
    ; Swap the parked inputs back in while parking t14a (slot 1),
    ; t9a (slot 10) and t15 (slot 2).
2497    mova                   m3, [rsp+gprsize*2+16*1]
2498    mova                   m5, [rsp+gprsize*2+32*5]
2499    mova [rsp+gprsize*2+16*1], m2
2500    mova [rsp+gprsize*2+32*5], m4
2501    mova                   m2, [rsp+gprsize*2+16*2]
2502    mova [rsp+gprsize*2+16*2], m7
2503    ITX_MULSUB_2W           3, 5, 7, 4, 6, 1931, 3612   ;t10a, t13a
2504    ITX_MULSUB_2W           2, 1, 7, 4, 6, 3920, 1189   ;t11a, t12a
2505    psubsw                 m4, m2, m3                   ;t10
2506    paddsw                 m2, m3                       ;t11
2507    psubsw                 m3, m1, m5                   ;t13
2508    paddsw                 m1, m5                       ;t12
2509    ITX_MULSUB_2W           3, 4, 7, 5, 6, m3784, 1567  ;t10a, t13a
2510    mova                   m7, [rsp+gprsize*2+32*5]
2511    psubsw                 m6, m0, m2                   ;t11a
2512    paddsw                 m0, m2                       ;t8a
2513    paddsw                 m2, m7, m3                   ;t9
2514    psubsw                 m7, m3                       ;t10
    ; From here on each pair out(k) / out(15-k) is formed as the sum and
    ; difference of a stack-resident partial result and a t-value.
2515    mova                   m5, [rsp+gprsize*2+16*0]
2516    psubsw                 m3, m5, m0                   ;out8
2517    paddsw                 m0, m5                       ;out7
2518    mova [rsp+gprsize*2+32*5], m0
2519    mova                   m5, [rsp+gprsize*2+16*9]
2520    psubsw                 m0, m5, m2                   ;out9
2521    paddsw                 m2, m5                       ;out6
2522    mova [rsp+gprsize*2+16*0], m0
2523    mova [rsp+gprsize*2+16*9], m2
2524    mova                   m0, [rsp+gprsize*2+16*1]
2525    mova                   m2, [rsp+gprsize*2+16*2]
2526    mova [rsp+gprsize*2+16*1], m3
2527    psubsw                 m5, m0, m4                   ;t13
2528    paddsw                 m0, m4                       ;t14
    ; pd_2048 is reloaded into m3 because m6 now holds t11a.
2529    mova                   m3, [o(pd_2048)]
2530    psubsw                 m4, m2, m1                   ;t12a
2531    paddsw                 m1, m2                       ;t15a
2532    mova [rsp+gprsize*2+16*2], m1
2533    ITX_MULSUB_2W           5, 7, 1, 2, 3, 2896, 2896   ;t10a, t13a
2534    ITX_MULSUB_2W           4, 6, 1, 2, 3, 2896, 2896   ;t11,  t12
2535    mova                   m3, [rsp+gprsize*2+16*8]
2536    psubsw                 m2, m3, m5                   ;out10
2537    paddsw                 m3, m5                       ;out5
2538    mova                   m5, [rsp+gprsize*2+16*7]
2539    mova [rsp+gprsize*2+16*8], m3
2540    psubsw                 m3, m5, m4                   ;out11
2541    paddsw                 m5, m4                       ;out4
2542    mova                   m4, [rsp+gprsize*2+16*6]
2543    mova [rsp+gprsize*2+16*7], m5
2544    paddsw                 m5, m4, m6                   ;out3
2545    psubsw                 m4, m6                       ;out12
2546    mova                   m6, [rsp+gprsize*2+16*5]
2547    mova [rsp+gprsize*2+16*6], m5
2548    psubsw                 m5, m6, m7                   ;out13
2549    paddsw                 m6, m7                       ;out2
2550    mova                   m7, [rsp+gprsize*2+16*4]
2551    mova [rsp+gprsize*2+16*5], m6
2552    psubsw                 m6, m7, m0                   ;out14
2553    paddsw                 m7, m0                       ;out1
2554    mova                   m1, [rsp+gprsize*2+16*2]
2555    mova                   m0, [rsp+gprsize*2+16*3]
2556    mova [rsp+gprsize*2+16*4], m7
2557    psubsw                 m7, m0, m1                   ;out15
2558    paddsw                 m0, m1                       ;out0
2559    mova [rsp+gprsize*2+16*3], m0
    ; Fetch out9 / out8 from their temporary slots into m1 / m0 and leave a
    ; copy of out15 in slot 0.
2560    mova                   m1, [rsp+gprsize*2+16*0]
2561    mova                   m0, [rsp+gprsize*2+16*1]
2562    mova [rsp+gprsize*2+16*0], m7
2563    ret
2564
; 16x8 inverse transform with "adst" as type1: one INV_TXFM_16X8_FN expansion
; per type2, followed by the internal routine holding the pass-1 entry and
; the .pass1_end / .pass2 / .end continuations. The .main, .main_pass1_end
; and .main_pass2_end subroutines that follow are also called from the
; flipadst and 8x16 variants.
2565INV_TXFM_16X8_FN adst, dct
2566INV_TXFM_16X8_FN adst, adst
2567INV_TXFM_16X8_FN adst, flipadst
2568INV_TXFM_16X8_FN adst, identity
2569
2570cglobal iadst_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    ; All sixteen coefficient vectors are multiplied by pw_2896x8 (pmulhrsw)
    ; while being distributed into the layout read by .main:
    ;   vectors  0,  1, 14, 15 -> stack slots 7, 8, 9, 10 (32*5 == 16*10)
    ;   vectors  8,  9,  6,  7 -> stack slots 3, 4, 5, 6
    ;   vectors  2,  3,  4,  5, 10, 11, 12, 13 -> m0..m7
    ; m7 holds the constant until the very last multiply overwrites it.
2571    mova                    m7, [o(pw_2896x8)]
2572    pmulhrsw                m0, m7, [coeffq+16*0 ]
2573    pmulhrsw                m1, m7, [coeffq+16*1 ]
2574    pmulhrsw                m2, m7, [coeffq+16*14]
2575    pmulhrsw                m3, m7, [coeffq+16*15]
2576    mova    [rsp+gprsize+16*7], m0
2577    mova    [rsp+gprsize+16*8], m1
2578    mova    [rsp+gprsize+16*9], m2
2579    mova    [rsp+gprsize+32*5], m3
2580    pmulhrsw                m0, m7, [coeffq+16*6 ]
2581    pmulhrsw                m1, m7, [coeffq+16*7 ]
2582    pmulhrsw                m2, m7, [coeffq+16*8 ]
2583    pmulhrsw                m3, m7, [coeffq+16*9 ]
2584    mova    [rsp+gprsize+16*3], m2
2585    mova    [rsp+gprsize+16*4], m3
2586    mova    [rsp+gprsize+16*5], m0
2587    mova    [rsp+gprsize+16*6], m1
2588    pmulhrsw                m0, m7, [coeffq+16*2 ]
2589    pmulhrsw                m1, m7, [coeffq+16*3 ]
2590    pmulhrsw                m2, m7, [coeffq+16*4 ]
2591    pmulhrsw                m3, m7, [coeffq+16*5 ]
2592    pmulhrsw                m4, m7, [coeffq+16*10]
2593    pmulhrsw                m5, m7, [coeffq+16*11]
2594    pmulhrsw                m6, m7, [coeffq+16*12]
2595    pmulhrsw                m7,     [coeffq+16*13]
2596
    ; 16-point ADST core plus the pass-1 finishing step; then the caller's
    ; tx2q is parked in r3 and the shared 8x8 adst pass-1 tail (outside this
    ; view) is run with .pass1_end as its continuation.
2597    call .main
2598    call .main_pass1_end
2599    mov                    r3, tx2q
2600    lea                  tx2q, [o(m(iadst_16x8_internal).pass1_end)]
2601    jmp m(iadst_8x8_internal).pass1_end
2602
2603.pass1_end:
    ; Store the finished rows at coeffq+16*1 (32-byte stride), reload the
    ; other eight from stack slots 3.., put m7 in slot 0, restore the
    ; caller's tx2q and run the shared tail again.
2604    SAVE_8ROWS    coeffq+16*1, 32
2605    LOAD_8ROWS   rsp+gprsize+16*3, 16
2606    mova   [rsp+gprsize+16*0], m7
2607    mov                  tx2q, r3
2608    jmp m(iadst_8x8_internal).pass1_end
2609
2610.pass2:
    ; Pass 2 is done as two 8x8 blocks; r3 = dst + 8 is the destination
    ; installed by .end for the second one.
2611    lea                  tx2q, [o(m(iadst_16x8_internal).end)]
2612    lea                    r3, [dstq+8]
2613    jmp m(iadst_8x8_internal).pass2_main
2614
2615.end:
    ; Second 8x8 block: rows from coeffq+16*1 (32-byte stride), final
    ; continuation idct_8x16_internal.end1, dstq = r3.
2616    LOAD_8ROWS    coeffq+16*1, 32
2617    lea                  tx2q, [o(m(idct_8x16_internal).end1)]
2618    mov                  dstq, r3
2619    jmp m(iadst_8x8_internal).pass2_main
2620
; iadst_16x8_internal.main -- core of the 16-point inverse ADST.
; Reached via call, so stack offsets use gprsize*2 where the callers use
; gprsize for the same slot.
; In (per the load comments below):
;      m0..m7 = in2, in3, in4, in5, in10, in11, in12, in13
;      slot 3 = in8,  slot 4 = in9,  slot 5 = in6,  slot 6 = in7
;      slot 7 = in0,  slot 8 = in1,  slot 9 = in14, slot 10 (32*5) = in15
; Out: slot 3 = out0, slot 4 = -out1, slot 5 = out2, slot 6 = -out3,
;      slot 0 = -out15, slot 2 = out12, m5 = -out13, m6 = out14.
;      The remaining eight outputs are left as four unfinished operand pairs
;      for .main_pass1_end / .main_pass2_end to combine:
;      (m3, m4), (slot 9, slot 14), (slot 7, slot 12), (slot 10, slot 11).
;      The "out4", "-out7", "out6" comments on those stores name what the
;      slot will hold after the finishing step, not its value at that point.
; Uses stack slots 0..12 and 14.
2621ALIGN function_align
2622.main:
    ; in3, in4, in12 are parked so m1, m2, m6 can serve as temporaries.
2623    mova  [rsp+gprsize*2+16*0], m1
2624    mova  [rsp+gprsize*2+16*1], m2
2625    mova  [rsp+gprsize*2+16*2], m6
2626
    ; m6 holds pd_2048 and is passed as the fifth ITX_MULSUB_2W argument
    ; throughout; it is only overwritten near the end (t10a load).
2627    mova                    m6, [o(pd_2048)]
2628    ITX_MULSUB_2W            7, 0, 1, 2, 6,  995, 3973   ;t3,  t2
2629    ITX_MULSUB_2W            3, 4, 1, 2, 6, 3513, 2106   ;t11, t10
2630    psubsw                  m1, m0, m4                   ;t10a
2631    paddsw                  m0, m4                       ;t2a
2632    psubsw                  m4, m7, m3                   ;t11a
2633    paddsw                  m3, m7                       ;t3a
2634    ITX_MULSUB_2W            1, 4, 7, 2, 6, 3406, 2276   ;t11, t10
2635    mova                    m2, [rsp+gprsize*2+16*0]     ;in3
2636    mova                    m7, [rsp+gprsize*2+16*1]     ;in4
2637    mova  [rsp+gprsize*2+16*0], m1                       ;t11
2638    mova  [rsp+gprsize*2+16*1], m4                       ;t10
2639    mova                    m1, [rsp+gprsize*2+16*2]     ;in12
2640    mova  [rsp+gprsize*2+16*2], m0                       ;t2a
2641    ITX_MULSUB_2W            5, 7, 0, 4, 6, 1751, 3703   ;t5,  t4
2642    ITX_MULSUB_2W            2, 1, 0, 4, 6, 3857, 1380   ;t13, t12
2643    psubsw                  m0, m7, m1                   ;t12a
2644    paddsw                  m1, m7                       ;t4a
2645    psubsw                  m4, m5, m2                   ;t13a
2646    paddsw                  m5, m2                       ;t5a
2647    ITX_MULSUB_2W            4, 0, 7, 2, 6, 4017,  799   ;t12, t13
2648    mova                    m2, [rsp+gprsize*2+16*8]     ;in1
2649    mova                    m7, [rsp+gprsize*2+16*9]     ;in14
2650    mova  [rsp+gprsize*2+16*8], m4                       ;t12
2651    mova  [rsp+gprsize*2+16*9], m0                       ;t13
2652    mova                    m4, [rsp+gprsize*2+16*4]     ;in9
2653    mova                    m0, [rsp+gprsize*2+16*5]     ;in6
2654    mova  [rsp+gprsize*2+16*4], m1                       ;t4a
2655    mova  [rsp+gprsize*2+16*5], m5                       ;t5a
2656    ITX_MULSUB_2W            2, 7, 1, 5, 6, 4052,  601   ;t15, t14
2657    ITX_MULSUB_2W            4, 0, 1, 5, 6, 2440, 3290   ;t7,  t6
2658    psubsw                  m1, m0, m7                   ;t14a
2659    paddsw                  m0, m7                       ;t6a
2660    psubsw                  m5, m4, m2                   ;t15a
2661    paddsw                  m4, m2                       ;t7a
2662    ITX_MULSUB_2W            5, 1, 7, 2, 6, 2276, 3406   ;t14, t15
2663    mova                    m2, [rsp+gprsize*2+16*2]     ;t2a
2664    mova  [rsp+gprsize*2+16*2], m5                       ;t14
2665    psubsw                  m7, m2, m0                   ;t6
2666    paddsw                  m2, m0                       ;t2
2667    psubsw                  m0, m3, m4                   ;t7
2668    paddsw                  m3, m4                       ;t3
2669    ITX_MULSUB_2W            0, 7, 4, 5, 6, 3784, 1567   ;t6a, t7a
2670    mova                    m4, [rsp+gprsize*2+16*7]     ;in0
2671    mova                    m5, [rsp+gprsize*2+32*5]     ;in15
2672    mova  [rsp+gprsize*2+16*7], m3                       ;t3
2673    mova  [rsp+gprsize*2+32*5], m1                       ;t15
2674    mova                    m1, [rsp+gprsize*2+16*6]     ;in7
2675    mova                    m3, [rsp+gprsize*2+16*3]     ;in8
2676    mova  [rsp+gprsize*2+16*6], m7                       ;t7a
2677    mova  [rsp+gprsize*2+16*3], m0                       ;t6a
2678    ITX_MULSUB_2W            5, 4, 0, 7, 6,  201, 4091   ;t1,  t0
2679    ITX_MULSUB_2W            1, 3, 0, 7, 6, 3035, 2751   ;t9,  t8
2680    psubsw                  m0, m4, m3                   ;t8a
2681    paddsw                  m4, m3                       ;t0a
2682    psubsw                  m3, m5, m1                   ;t9a
2683    paddsw                  m5, m1                       ;t1a
2684    ITX_MULSUB_2W            0, 3, 1, 7, 6,  799, 4017   ;t9,  t8
2685    mova                    m1, [rsp+gprsize*2+16*4]     ;t4a
2686    mova                    m7, [rsp+gprsize*2+16*5]     ;t5a
2687    mova  [rsp+gprsize*2+16*4], m3                       ;t8
2688    mova  [rsp+gprsize*2+16*5], m0                       ;t9
2689    psubsw                  m0, m4, m1                   ;t4
2690    paddsw                  m4, m1                       ;t0
2691    psubsw                  m3, m5, m7                   ;t5
2692    paddsw                  m5, m7                       ;t1
2693    ITX_MULSUB_2W            0, 3, 1, 7, 6, 1567, 3784   ;t5a, t4a
    ; First finished outputs: out0, -out3, out12 and -out15 are stored;
    ; t6/t7 (slots 7/12) and t2a/t3a (slots 10/11) are left as pairs for the
    ; finishing routines.
2694    mova                    m7, [rsp+gprsize*2+16*3]     ;t6a
2695    psubsw                  m1, m4, m2                   ;t2a
2696    paddsw                  m4, m2                       ;out0
2697    mova  [rsp+gprsize*2+16*3], m4                       ;out0
2698    mova                    m4, [rsp+gprsize*2+16*6]     ;t7a
2699    psubsw                  m2, m3, m7                   ;t6
2700    paddsw                  m3, m7                       ;-out3
2701    mova  [rsp+gprsize*2+16*6], m3                       ;-out3
2702    psubsw                  m3, m0, m4                   ;t7
2703    paddsw                  m0, m4                       ;out12
2704    mova [rsp+gprsize*2+16*12], m3
2705    mova                    m3, [rsp+gprsize*2+16*7]     ;t3
2706    mova [rsp+gprsize*2+16* 7], m2                       ;out4
2707    psubsw                  m2, m5, m3                   ;t3a
2708    paddsw                  m5, m3                       ;-out15
2709    mova [rsp+gprsize*2+16*11], m2
2710    mova                    m2, [rsp+gprsize*2+32*5]     ;t15
2711    mova [rsp+gprsize*2+16*10], m1                       ;-out7
2712    mova                    m1, [rsp+gprsize*2+16*0]     ;t11
2713    mova [rsp+gprsize*2+16*0 ], m5                       ;-out15
2714    mova                    m3, [rsp+gprsize*2+16*1]     ;t10
    ; NOTE(review): m4 still holds the t7a value loaded above when it is
    ; stored here, despite the "-out11" label; slot 1 is not read again in
    ; this routine or in the two finishing routines.
2715    mova [rsp+gprsize*2+16*1 ], m4                       ;-out11
2716    mova                    m4, [rsp+gprsize*2+16*2]     ;t14
2717    mova [rsp+gprsize*2+16*2 ], m0                       ;out12
2718    psubsw                  m0, m3, m4                   ;t14a
2719    paddsw                  m3, m4                       ;t10a
2720    psubsw                  m5, m1, m2                   ;t15a
2721    paddsw                  m1, m2                       ;t11a
2722    ITX_MULSUB_2W            5, 0, 2, 4, 6, 3784, 1567   ;t14, t15
2723    mova                    m2, [rsp+gprsize*2+16*4]     ;t8
2724    mova                    m4, [rsp+gprsize*2+16*5]     ;t9
2725    mova  [rsp+gprsize*2+16*4], m3                       ;t10a
2726    mova  [rsp+gprsize*2+16*5], m1                       ;t11a
2727    mova                    m3, [rsp+gprsize*2+16*8]     ;t12
2728    mova                    m1, [rsp+gprsize*2+16*9]     ;t13
2729    mova  [rsp+gprsize*2+16*8], m5                       ;t14
2730    mova  [rsp+gprsize*2+16*9], m0                       ;t15
2731    psubsw                  m5, m2, m3                   ;t12a
2732    paddsw                  m2, m3                       ;t8a
2733    psubsw                  m0, m4, m1                   ;t13a
2734    paddsw                  m4, m1                       ;t9a
2735    ITX_MULSUB_2W            5, 0, 1, 3, 6, 1567, 3784   ;t13, t12
    ; m6 (pd_2048) is no longer needed and is reused for t10a.
2736    mova                    m6, [rsp+gprsize*2+16*4]     ;t10a
2737    mova                    m1, [rsp+gprsize*2+16*5]     ;t11a
2738    psubsw                  m3, m2, m6                   ;t10
2739    paddsw                  m2, m6                       ;-out1
2740    paddsw                  m6, m4, m1                   ;out14
2741    psubsw                  m4, m1                       ;t11
2742    mova [rsp+gprsize*2+16*14], m4
2743    mova [rsp+gprsize*2+16* 4], m2                       ;-out1
2744    mova                    m4, [rsp+gprsize*2+16*8]     ;t14
2745    mova                    m2, [rsp+gprsize*2+16*9]     ;t15
2746    mova [rsp+gprsize*2+16* 9], m3                       ;out6
2747    psubsw                  m3, m0, m4                   ;t14a
2748    paddsw                  m0, m4                       ;out2
2749    psubsw                  m4, m5, m2                   ;t15a
2750    paddsw                  m5, m2                       ;-out13
2751    mova [rsp+gprsize*2+16* 5], m0                       ;out2
2752    ret
; iadst_16x8_internal.main_pass1_end -- finishing step used after .main in
; pass 1. Each of the four operand pairs left by .main is interleaved
; (punpckl/hwd) and turned into a scaled sum and a scaled difference with
; pmaddwd against pw_2896_2896 / pw_2896_m2896, rounded with pd_2048,
; shifted right by 12 and packed back to words with saturation.
; In:  m3/m4, slots 9/14, slots 7/12, slots 10/11 = the four pairs;
;      m5 = -out13, m6 = out14 (parked in slots 14/15 while m5/m6 hold the
;      multiplier constants).
; Out: m0 = out8, m1 = -out9, m2 = out10, m3 = -out11, m4 = out12,
;      m5 = -out13, m6 = out14, m7 = pd_2048;
;      slot 7 = out4, slot 8 = -out5, slot 9 = out6, slot 10 = -out7.
2753ALIGN function_align
2754.main_pass1_end:
    ; Slot 14 is read (second operand of pair 2) before being reused to park
    ; m5.
2755    mova                    m0, [rsp+gprsize*2+16*14]
2756    mova [rsp+gprsize*2+16*14], m5
2757    mova [rsp+gprsize*2+16*15], m6
2758    mova                    m5, [o(pw_2896_2896)]
2759    mova                    m6, [o(pw_2896_m2896)]
2760    mova                    m7, [o(pd_2048)]
    ; Pair 1: m3, m4.
2761    punpcklwd               m2, m3, m4
2762    punpckhwd               m3, m4
2763    pmaddwd                 m4, m5, m2
2764    pmaddwd                 m2, m6
2765    pmaddwd                 m1, m5, m3
2766    pmaddwd                 m3, m6
2767    REPX         {paddd x, m7}, m4, m2, m1, m3
2768    REPX         {psrad x, 12}, m4, m1, m2, m3
2769    packssdw                m4, m1                       ;-out5
2770    packssdw                m2, m3                       ;out10
2771    mova [rsp+gprsize*2+16* 8], m4
    ; Pair 2: slot 9 and the former slot-14 value now in m0.
2772    mova                    m3, [rsp+gprsize*2+16* 9]
2773    punpcklwd               m1, m3, m0
2774    punpckhwd               m3, m0
2775    pmaddwd                 m0, m5, m1
2776    pmaddwd                 m1, m6
2777    pmaddwd                 m4, m5, m3
2778    pmaddwd                 m3, m6
2779    REPX         {paddd x, m7}, m0, m1, m4, m3
2780    REPX         {psrad x, 12}, m0, m4, m1, m3
2781    packssdw                m0, m4                       ;out6
2782    packssdw                m1, m3                       ;-out9
2783    mova [rsp+gprsize*2+16* 9], m0
    ; Pair 3: slots 7 and 12. m5 (pw_2896_2896) is consumed as a destination
    ; here, which is why pair 4 takes that constant from memory.
2784    mova                    m0, [rsp+gprsize*2+16* 7]
2785    mova                    m4, [rsp+gprsize*2+16*12]
2786    punpcklwd               m3, m0, m4
2787    punpckhwd               m0, m4
2788    pmaddwd                 m4, m5, m3
2789    pmaddwd                 m3, m6
2790    pmaddwd                 m5, m0
2791    pmaddwd                 m0, m6
2792    REPX         {paddd x, m7}, m4, m3, m5, m0
2793    REPX         {psrad x, 12}, m4, m5, m3, m0
2794    packssdw                m4, m5                       ;out4
2795    packssdw                m3, m0                       ;-out11
2796    mova [rsp+gprsize*2+16* 7], m4
    ; Pair 4: slots 10 and 11. m6 (pw_2896_m2896) is consumed as a
    ; destination for the high-half difference.
2797    mova                    m4, [rsp+gprsize*2+16*10]
2798    mova                    m5, [rsp+gprsize*2+16*11]
2799    punpcklwd               m0, m4, m5
2800    punpckhwd               m4, m5
2801    pmaddwd                 m5, m0, [o(pw_2896_2896)]
2802    pmaddwd                 m0, m6
2803    pmaddwd                 m6, m4
2804    pmaddwd                 m4, [o(pw_2896_2896)]
2805    REPX         {paddd x, m7}, m5, m0, m6, m4
2806    REPX         {psrad x, 12}, m0, m6, m5, m4
2807    packssdw                m0, m6                       ;out8
2808    packssdw                m5, m4                       ;-out7
2809    mova [rsp+gprsize*2+16*10], m5
    ; Bring out12 and the parked -out13 / out14 back into m4..m6.
2810    mova                    m4, [rsp+gprsize*2+16* 2]    ;out12
2811    mova                    m5, [rsp+gprsize*2+16*14]    ;-out13
2812    mova                    m6, [rsp+gprsize*2+16*15]    ;out14
2813    ret
; iadst_16x8_internal.main_pass2_end -- finishing step used after .main in
; pass 2. It combines the same four operand pairs as .main_pass1_end, but
; with saturating word add/subtract followed by pmulhrsw with pw_2896x8
; instead of the pmaddwd / round / shift sequence.
; In:  m3/m4, slots 9/14, slots 7/12, slots 10/11 = the four pairs.
; Out: m0 = out8, m1 = -out9, m2 = out10, m3 = -out11, m4 = out12,
;      m7 = pw_2896x8; m5 and m6 are not written (they keep the -out13 /
;      out14 values produced by .main);
;      slot 7 = out4, slot 8 = -out5, slot 9 = out6, slot 10 = -out7.
2814ALIGN function_align
2815.main_pass2_end:
2816    mova                    m7, [o(pw_2896x8)]
    ; Pair from slots 9 / 14.
2817    mova                    m1, [rsp+gprsize*2+16* 9]
2818    mova                    m2, [rsp+gprsize*2+16*14]
2819    paddsw                  m0, m1, m2
2820    psubsw                  m1, m2
2821    pmulhrsw                m0, m7                       ;out6
2822    pmulhrsw                m1, m7                       ;-out9
2823    mova [rsp+gprsize*2+16* 9], m0
    ; Pair in m3 / m4.
2824    psubsw                  m2, m3, m4
2825    paddsw                  m3, m4
2826    pmulhrsw                m2, m7                       ;out10
2827    pmulhrsw                m3, m7                       ;-out5
2828    mova [rsp+gprsize*2+16* 8], m3
    ; Pair from slots 7 / 12.
2829    mova                    m3, [rsp+gprsize*2+16* 7]
2830    mova                    m4, [rsp+gprsize*2+16*12]
2831    paddsw                  m0, m3, m4
2832    psubsw                  m3, m4
2833    pmulhrsw                m0, m7                       ;out4
2834    pmulhrsw                m3, m7                       ;-out11
2835    mova [rsp+gprsize*2+16* 7], m0
    ; Pair from slots 10 / 11 (slot 11 is used directly as a memory operand).
2836    mova                    m0, [rsp+gprsize*2+16*10]
2837    paddsw                  m4, m0, [rsp+gprsize*2+16*11]
2838    psubsw                  m0, [rsp+gprsize*2+16*11]
2839    pmulhrsw                m4, m7                       ;-out7
2840    pmulhrsw                m0, m7                       ;out8
2841    mova [rsp+gprsize*2+16*10], m4
2842    mova                    m4, [rsp+gprsize*2+16*2 ]    ;out12
2843    ret
2844
; 16x8 inverse transform with "flipadst" as type1: one INV_TXFM_16X8_FN
; expansion per type2, followed by the internal routine. It reuses the ADST
; core of iadst_16x8_internal and differs from it in which half is handed to
; the 8x8 flipadst tail first.
2845INV_TXFM_16X8_FN flipadst, dct
2846INV_TXFM_16X8_FN flipadst, adst
2847INV_TXFM_16X8_FN flipadst, flipadst
2848INV_TXFM_16X8_FN flipadst, identity
2849
2850cglobal iflipadst_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    ; Same load / pre-scale / distribution sequence as iadst_16x8_internal:
    ;   vectors  0,  1, 14, 15 -> stack slots 7, 8, 9, 10 (32*5 == 16*10)
    ;   vectors  8,  9,  6,  7 -> stack slots 3, 4, 5, 6
    ;   vectors  2,  3,  4,  5, 10, 11, 12, 13 -> m0..m7
    ; every vector being multiplied by pw_2896x8 (pmulhrsw) on the way.
2851    mova                    m7, [o(pw_2896x8)]
2852    pmulhrsw                m0, m7, [coeffq+16*0 ]
2853    pmulhrsw                m1, m7, [coeffq+16*1 ]
2854    pmulhrsw                m2, m7, [coeffq+16*14]
2855    pmulhrsw                m3, m7, [coeffq+16*15]
2856    mova    [rsp+gprsize+16*7], m0
2857    mova    [rsp+gprsize+16*8], m1
2858    mova    [rsp+gprsize+16*9], m2
2859    mova    [rsp+gprsize+32*5], m3
2860    pmulhrsw                m0, m7, [coeffq+16*6 ]
2861    pmulhrsw                m1, m7, [coeffq+16*7 ]
2862    pmulhrsw                m2, m7, [coeffq+16*8 ]
2863    pmulhrsw                m3, m7, [coeffq+16*9 ]
2864    mova    [rsp+gprsize+16*3], m2
2865    mova    [rsp+gprsize+16*4], m3
2866    mova    [rsp+gprsize+16*5], m0
2867    mova    [rsp+gprsize+16*6], m1
2868    pmulhrsw                m0, m7, [coeffq+16*2 ]
2869    pmulhrsw                m1, m7, [coeffq+16*3 ]
2870    pmulhrsw                m2, m7, [coeffq+16*4 ]
2871    pmulhrsw                m3, m7, [coeffq+16*5 ]
2872    pmulhrsw                m4, m7, [coeffq+16*10]
2873    pmulhrsw                m5, m7, [coeffq+16*11]
2874    pmulhrsw                m6, m7, [coeffq+16*12]
2875    pmulhrsw                m7,     [coeffq+16*13]
2876
2877    call m(iadst_16x8_internal).main
2878    call m(iadst_16x8_internal).main_pass1_end
2879
    ; m7 is refilled from stack slot 0 (written by .main), the register half
    ; is parked at coeffq+16*0 (32-byte stride), and the stack-resident half
    ; (slots 3..) is loaded and sent through the 8x8 flipadst pass-1 tail
    ; first. The caller's tx2q is kept in r3 meanwhile.
2880    mova                    m7, [rsp+gprsize+16*0]
2881    SAVE_8ROWS     coeffq+16*0, 32
2882    LOAD_8ROWS    rsp+gprsize+16*3, 16
2883    mova    [rsp+gprsize+16*0], m7
2884    mov                     r3, tx2q
2885    lea                   tx2q, [o(m(iflipadst_16x8_internal).pass1_end)]
2886    jmp m(iflipadst_8x8_internal).pass1_end
2887
2888.pass1_end:
    ; Store the finished rows at coeffq+16*1, fetch the half parked at
    ; coeffq+16*0, put m7 in slot 0, restore the caller's tx2q and run the
    ; tail a second time.
2889    SAVE_8ROWS     coeffq+16*1, 32
2890    LOAD_8ROWS     coeffq+16*0, 32
2891    mova    [rsp+gprsize+16*0], m7
2892    mov                   tx2q, r3
2893    jmp m(iflipadst_8x8_internal).pass1_end
2894
2895.pass2:
    ; Pass 2 is done as two 8x8 blocks; r3 = dst + 8 is the destination
    ; installed by .end for the second one.
2896    lea                   tx2q, [o(m(iflipadst_16x8_internal).end)]
2897    lea                     r3, [dstq+8]
2898    jmp m(iflipadst_8x8_internal).pass2_main
2899
2900.end:
    ; Second 8x8 block: rows from coeffq+16*1 (32-byte stride), final
    ; continuation idct_8x16_internal.end1, dstq = r3.
2901    LOAD_8ROWS     coeffq+16*1, 32
2902    lea                   tx2q, [o(m(idct_8x16_internal).end1)]
2903    mov                   dstq, r3
2904    jmp m(iflipadst_8x8_internal).pass2_main
2905
2906
; 16x8 inverse transform with "identity" as type1: one INV_TXFM_16X8_FN
; expansion per type2, followed by the internal routine.
; Pass 1 runs the .pass1 body twice, once per group of eight coefficient
; vectors. For each vector x it computes
;     x  = pmulhrsw(x, pw_2896x8)
;     x += pmulhrsw(pmulhrsw(x, pw_1697x16), pw_16384)   (saturating add)
; and then jumps into the shared 8x8 pass-1 tail (outside this view).
2907INV_TXFM_16X8_FN identity, dct
2908INV_TXFM_16X8_FN identity, adst
2909INV_TXFM_16X8_FN identity, flipadst
2910INV_TXFM_16X8_FN identity, identity
2911
2912cglobal iidentity_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    ; coeffq is advanced by 16 vectors so that the negative offsets below
    ; address vectors 9, 11, 13, 15 of the original buffer. The caller's
    ; tx2q is parked in r3 while tx2q points at .pass1_end.
2913    add                coeffq, 16*16
2914    mova                   m4, [coeffq-16*7]
2915    mova                   m5, [coeffq-16*5]
2916    mova                   m6, [coeffq-16*3]
2917    mova                   m7, [coeffq-16*1]
2918    mov                    r3, tx2q
2919    lea                  tx2q, [o(m(iidentity_16x8_internal).pass1_end)]
2920
2921.pass1:
    ; In: m4..m7 = the four odd-indexed vectors of the current group.
    ; coeffq steps back by eight vectors on every entry, so +16*6, +16*4,
    ; +16*2, +16*0 address vectors 14, 12, 10, 8 on the first run and
    ; 6, 4, 2, 0 on the second.
    ; Out: m0..m7 = the eight scaled vectors in index order, with index 6
    ; held in stack slot 1 rather than m6.
2922    mova                   m0, [o(pw_2896x8)]
2923    mova                   m2, [o(pw_1697x16)]
2924    mova                   m3, [o(pw_16384)]
2925    sub                coeffq, 8*16
2926    REPX     {pmulhrsw x, m0}, m4, m5, m6, m7
2927    pmulhrsw               m1, m2, m4
2928    pmulhrsw               m1, m3
2929    paddsw                 m1, m4 ; 1
2930    pmulhrsw               m4, m2, m5
2931    pmulhrsw               m4, m3
2932    paddsw                 m4, m5 ; 3
2933    pmulhrsw               m5, m2, m6
2934    pmulhrsw               m5, m3
2935    paddsw                 m5, m6 ; 5
2936    pmulhrsw               m6, m2, m7
2937    pmulhrsw               m6, m3
2938    paddsw                 m7, m6 ; 7
2939    pmulhrsw               m6, m0, [coeffq+16*6]
    ; Result 3 waits in stack slot 0 until m3 (pw_16384) is free.
2940    mova   [rsp+gprsize+16*0], m4
2941    pmulhrsw               m4, m2, m6
2942    pmulhrsw               m4, m3
2943    paddsw                 m6, m4 ; 6
2944    pmulhrsw               m4, m0, [coeffq+16*4]
    ; Result 6 goes to stack slot 1.
    ; NOTE(review): presumably where idct_8x8_internal.pass1_end3 expects
    ; it (iidentity_8x16_internal stores m6 there before the same jump).
2945    mova   [rsp+gprsize+16*1], m6
2946    pmulhrsw               m6, m2, m4
2947    pmulhrsw               m6, m3
2948    paddsw                 m4, m6 ; 4
2949    pmulhrsw               m6, m0, [coeffq+16*2]
2950    pmulhrsw               m0,     [coeffq+16*0]
2951    pmulhrsw               m2, m6
2952    pmulhrsw               m2, m3
2953    paddsw                 m2, m6 ; 2
    ; m2 no longer holds pw_1697x16, so the constant is taken from memory
    ; for the last vector.
2954    pmulhrsw               m6, m0, [o(pw_1697x16)]
2955    pmulhrsw               m6, m3
2956    mova                   m3, [rsp+gprsize+16*0]
2957    paddsw                 m0, m6
2958    jmp   m(idct_8x8_internal).pass1_end3
2959
2960.pass1_end:
    ; After the first run: m4..m7 are written over vectors 9, 11, 13, 15;
    ; vectors 1, 3, 5, 7 are fetched as the next odd inputs and their slots
    ; are overwritten with m0..m3. The caller's tx2q is restored, so the
    ; second run of .pass1 continues wherever the caller asked.
2961    mova        [coeffq+16*1], m4
2962    mova        [coeffq+16*3], m5
2963    mova        [coeffq+16*5], m6
2964    mova        [coeffq+16*7], m7
2965    mova                   m4, [coeffq-16*7]
2966    mova                   m5, [coeffq-16*5]
2967    mova                   m6, [coeffq-16*3]
2968    mova                   m7, [coeffq-16*1]
2969    mova        [coeffq-16*7], m0
2970    mova        [coeffq-16*5], m1
2971    mova        [coeffq-16*3], m2
2972    mova        [coeffq-16*1], m3
2973    mov                  tx2q, r3
2974    jmp .pass1
2975
2976.pass2:
    ; Pass 2 is done as two 8x8 blocks through the 8x8 identity .end code;
    ; r3 = dst + 8 is the destination installed by .end for the second one.
2977    lea                  tx2q, [o(m(iidentity_16x8_internal).end)]
2978    lea                    r3, [dstq+8]
2979    jmp  m(iidentity_8x8_internal).end
2980
2981.end:
    ; Second 8x8 block: rows from coeffq+16*1 (32-byte stride), final
    ; continuation idct_8x16_internal.end1, dstq = r3.
2982    LOAD_8ROWS    coeffq+16*1, 32
2983    lea                  tx2q, [o(m(idct_8x16_internal).end1)]
2984    mov                  dstq, r3
2985    jmp  m(iidentity_8x8_internal).end
2986
2987
; INV_TXFM_16X16_FN type1, type2
; Expands INV_TXFM_FN (defined elsewhere) for the 16x16 block size with the
; arguments 8 and 16*16 (NOTE(review): their meaning is set by INV_TXFM_FN;
; 16*16 looks like a stack size -- confirm). For the dct_dct pair only, it
; additionally emits the code below, which hands a single coefficient to the
; 16x4 .dconly code.
2988%macro INV_TXFM_16X16_FN 2 ; type1, type2
2989    INV_TXFM_FN          %1, %2, 16x16, 8, 16*16
2990%ifidn %1_%2, dct_dct
    ; m0 = [coeffq] scaled once by pw_2896x8 (the 16x8 variant scales
    ; twice); m2 = pw_8192; r2d = 8; tx2q = .end (the RET below).
    ; eobd is stored over the first coefficient.
    ; NOTE(review): this only clears the coefficient if eob is 0 on this
    ; path -- confirm against INV_TXFM_FN.
2991    movd                   m1, [o(pw_2896x8)]
2992    pmulhrsw               m0, m1, [coeffq]
2993    movd                   m2, [o(pw_8192)]
2994    mov              [coeffq], eobd
2995    mov                   r2d, 8
2996    lea                  tx2q, [o(m(inv_txfm_add_dct_dct_16x16).end)]
2997    jmp m(inv_txfm_add_dct_dct_16x4).dconly
2998.end:
2999    RET
3000%endif
3001%endmacro
3002
; 16x16 inverse transform with "dct" as type1: one INV_TXFM_16X16_FN
; expansion per type2, followed by the internal routine.
; Pass 1 runs the 16-point DCT twice (coefficient vectors at odd 16-byte
; offsets first, then those at even offsets). Each run produces two groups
; of eight rows, and each group goes through idct_8x8_internal.pass1_end1
; with m7 = pw_8192, so four continuation labels are chained through tx2q.
; The caller's tx2q is kept in r3 until the last one.
3003INV_TXFM_16X16_FN dct, dct
3004INV_TXFM_16X16_FN dct, adst
3005INV_TXFM_16X16_FN dct, flipadst
3006INV_TXFM_16X16_FN dct, identity
3007
3008cglobal idct_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    ; Run 1: 8-point core on vectors coeffq+16*1 (64-byte stride), result
    ; parked in stack slots 3..; then the second-half core
    ; (idct_16x8_internal.main) on vectors coeffq+16*3 (64-byte stride).
3009    LOAD_8ROWS     coeffq+16*1, 64
3010    call  m(idct_8x8_internal).main
3011    SAVE_7ROWS    rsp+gprsize+16*3, 16
3012    LOAD_8ROWS     coeffq+16*3, 64
3013    call m(idct_16x8_internal).main
3014    mov                     r3, tx2q
3015    lea                   tx2q, [o(m(idct_16x16_internal).pass1_end)]
3016    mova                    m7, [o(pw_8192)]
3017    jmp   m(idct_8x8_internal).pass1_end1
3018
3019.pass1_end:
    ; Store the first finished group at coeffq+16*17 (32-byte stride) and
    ; send the group held in stack slots 3.. through the same tail.
3020    SAVE_8ROWS    coeffq+16*17, 32
3021    LOAD_8ROWS    rsp+gprsize+16*3, 16
3022    mova    [rsp+gprsize+16*0], m7
3023    lea                   tx2q, [o(m(idct_16x16_internal).pass1_end1)]
3024    mova                    m7, [o(pw_8192)]
3025    jmp   m(idct_8x8_internal).pass1_end1
3026
3027.pass1_end1:
    ; Store that group at coeffq+16*1, then start run 2: vectors coeffq+16*0
    ; and coeffq+16*2 (64-byte stride) through the same two cores.
3028    SAVE_8ROWS     coeffq+16*1, 32
3029    LOAD_8ROWS     coeffq+16*0, 64
3030    call  m(idct_8x8_internal).main
3031    SAVE_7ROWS    rsp+gprsize+16*3, 16
3032    LOAD_8ROWS     coeffq+16*2, 64
3033    call m(idct_16x8_internal).main
3034    lea                   tx2q, [o(m(idct_16x16_internal).pass1_end2)]
3035    mova                    m7, [o(pw_8192)]
3036    jmp   m(idct_8x8_internal).pass1_end1
3037
3038.pass1_end2:
    ; Store at coeffq+16*16, reload the last group from the stack and restore
    ; the caller's tx2q so the final tail continues there.
3039    SAVE_8ROWS    coeffq+16*16, 32
3040    LOAD_8ROWS    rsp+gprsize+16*3, 16
3041    mova    [rsp+gprsize+16*0], m7
3042    mov                   tx2q, r3
3043    mova                    m7, [o(pw_8192)]
3044    jmp   m(idct_8x8_internal).pass1_end1
3045
3046.pass2:
    ; Pass 2 reuses the 8x16 DCT pass-2 code (outside this view) with .end
    ; as its continuation.
3047    lea                   tx2q, [o(m(idct_16x16_internal).end)]
3048    jmp  m(idct_8x16_internal).pass2_pre
3049
3050.end:
    ; Reload eight rows from stack slots 3.., put m7 in slot 0, take dstq
    ; from r3 and leave r3 = dstq + 8 for .end1, then run the 8x8 .end code.
    ; NOTE(review): this relies on the code reached via pass2_pre having set
    ; r3 -- confirm in idct_8x16_internal.
3051    LOAD_8ROWS    rsp+gprsize+16*3, 16
3052    mova    [rsp+gprsize+16*0], m7
3053    lea                   tx2q, [o(m(idct_16x16_internal).end1)]
3054    mov                   dstq, r3
3055    lea                     r3, [dstq+8]
3056    jmp   m(idct_8x8_internal).end
3057
3058.end1:
    ; Zero the first sixteen coefficient vectors, advance coeffq by sixteen
    ; vectors (32*8 bytes), set dstq from r3, and hand the next group of
    ; vectors to the 8x16 pass-2 main code: offsets 0, 4, 8, 12 -> m0..m3 and
    ; 1, 5, 9, 13 -> m4..m7, with idct_8x16_internal.end as continuation.
3059    pxor                    m7, m7
3060    REPX   {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15
3061
3062    add                 coeffq, 32*8
3063    mov                   dstq, r3
3064
3065    mova                    m0, [coeffq+16*0 ]
3066    mova                    m1, [coeffq+16*4 ]
3067    mova                    m2, [coeffq+16*8 ]
3068    mova                    m3, [coeffq+16*12]
3069    mova                    m4, [coeffq+16*1 ]
3070    mova                    m5, [coeffq+16*5 ]
3071    mova                    m6, [coeffq+16*9 ]
3072    mova                    m7, [coeffq+16*13]
3073    lea                   tx2q, [o(m(idct_8x16_internal).end)]
3074    jmp  m(idct_8x16_internal).pass2_main
3075
3076
; ITX_16X16_ADST_LOAD_ODD_COEFS (no arguments)
;
; Loads the sixteen odd-indexed 16-byte vectors coeffq+16*{1,3,...,31}.
; Eight of them are spilled to stack slots, eight stay in registers:
;   [rsp+gprsize+16*3] = vec 17     [rsp+gprsize+16*7]  = vec 1
;   [rsp+gprsize+16*4] = vec 19     [rsp+gprsize+16*8]  = vec 3
;   [rsp+gprsize+16*5] = vec 13     [rsp+gprsize+16*9]  = vec 29
;   [rsp+gprsize+16*6] = vec 15     [rsp+gprsize+32*5]  = vec 31
;   m0-m3 = vec 5, 7, 9, 11         m4-m7 = vec 21, 23, 25, 27
; Every use in this chunk is followed directly by
; "call m(iadst_16x8_internal).main", so this is the input layout for that
; routine. Clobbers m0-m7.
3077%macro ITX_16X16_ADST_LOAD_ODD_COEFS 0
3078    mova                    m0, [coeffq+16*1 ]
3079    mova                    m1, [coeffq+16*3 ]
3080    mova                    m2, [coeffq+16*29]
3081    mova                    m3, [coeffq+16*31]
3082    mova    [rsp+gprsize+16*7], m0
3083    mova    [rsp+gprsize+16*8], m1
3084    mova    [rsp+gprsize+16*9], m2
; 32*5 == 16*10: this is the slot right after 16*9
3085    mova    [rsp+gprsize+32*5], m3
3086    mova                    m0, [coeffq+16*13]
3087    mova                    m1, [coeffq+16*15]
3088    mova                    m2, [coeffq+16*17]
3089    mova                    m3, [coeffq+16*19]
3090    mova    [rsp+gprsize+16*3], m2
3091    mova    [rsp+gprsize+16*4], m3
3092    mova    [rsp+gprsize+16*5], m0
3093    mova    [rsp+gprsize+16*6], m1
; the remaining eight vectors are handed over in registers
3094    mova                    m0, [coeffq+16*5 ]
3095    mova                    m1, [coeffq+16*7 ]
3096    mova                    m2, [coeffq+16*9 ]
3097    mova                    m3, [coeffq+16*11]
3098    mova                    m4, [coeffq+16*21]
3099    mova                    m5, [coeffq+16*23]
3100    mova                    m6, [coeffq+16*25]
3101    mova                    m7, [coeffq+16*27]
3102%endmacro
3103
; ITX_16X16_ADST_LOAD_EVEN_COEFS (no arguments)
;
; Loads the sixteen even-indexed 16-byte vectors coeffq+16*{0,2,...,30}.
; Eight of them are spilled to stack slots, eight stay in registers:
;   [rsp+gprsize+16*3] = vec 16     [rsp+gprsize+16*7]  = vec 0
;   [rsp+gprsize+16*4] = vec 18     [rsp+gprsize+16*8]  = vec 2
;   [rsp+gprsize+16*5] = vec 12     [rsp+gprsize+16*9]  = vec 28
;   [rsp+gprsize+16*6] = vec 14     [rsp+gprsize+32*5]  = vec 30
;   m0-m3 = vec 4, 6, 8, 10         m4-m7 = vec 20, 22, 24, 26
; Every use in this chunk is followed directly by
; "call m(iadst_16x8_internal).main", so this is the input layout for that
; routine. Clobbers m0-m7.
3104%macro ITX_16X16_ADST_LOAD_EVEN_COEFS 0
3105    mova                    m0, [coeffq+16*0 ]
3106    mova                    m1, [coeffq+16*2 ]
3107    mova                    m2, [coeffq+16*28]
3108    mova                    m3, [coeffq+16*30]
3109    mova    [rsp+gprsize+16*7], m0
3110    mova    [rsp+gprsize+16*8], m1
3111    mova    [rsp+gprsize+16*9], m2
; 32*5 == 16*10: this is the slot right after 16*9
3112    mova    [rsp+gprsize+32*5], m3
3113    mova                    m0, [coeffq+16*12]
3114    mova                    m1, [coeffq+16*14]
3115    mova                    m2, [coeffq+16*16]
3116    mova                    m3, [coeffq+16*18]
3117    mova    [rsp+gprsize+16*3], m2
3118    mova    [rsp+gprsize+16*4], m3
3119    mova    [rsp+gprsize+16*5], m0
3120    mova    [rsp+gprsize+16*6], m1
; the remaining eight vectors are handed over in registers
3121    mova                    m0, [coeffq+16*4 ]
3122    mova                    m1, [coeffq+16*6 ]
3123    mova                    m2, [coeffq+16*8 ]
3124    mova                    m3, [coeffq+16*10]
3125    mova                    m4, [coeffq+16*20]
3126    mova                    m5, [coeffq+16*22]
3127    mova                    m6, [coeffq+16*24]
3128    mova                    m7, [coeffq+16*26]
3129%endmacro
3130
; Instantiate INV_TXFM_16X16_FN for the three (adst, <type2>) pairs.
; NOTE(review): the macro body is not visible in this chunk; presumably each
; line emits a 16x16 entry point whose first pass is iadst_16x16_internal
; below - confirm against the macro definition.
3131INV_TXFM_16X16_FN adst, dct
3132INV_TXFM_16X16_FN adst, adst
3133INV_TXFM_16X16_FN adst, flipadst
3134
; iadst_16x16_internal - ADST stage of the 16x16 inverse transform.
;
; Same skeleton as idct_16x16_internal: declared with 0 args / 0 GPRs /
; 0 XMM registers, stages chained through tx2q, incoming tx2q kept in r3
; during pass 1.
; NOTE(review): the shared helpers jumped to here live outside this chunk and
; are assumed to finish with "jmp tx2q".
;
; Pass 1 runs iadst_16x8_internal.main + .main_pass1_end first on the
; odd-indexed coefficient vectors, then on the even-indexed ones, writing
; results back to coeffq (32-byte step) at offsets 16*17, 16*1 and 16*16.
3135cglobal iadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
; --- pass 1, odd-indexed vectors ---
3136    ITX_16X16_ADST_LOAD_ODD_COEFS
3137    call m(iadst_16x8_internal).main
3138    call m(iadst_16x8_internal).main_pass1_end
3139
; keep the incoming continuation in r3; tx2q is reused for local hops
3140    mov                     r3, tx2q
3141    lea                   tx2q, [o(m(iadst_16x16_internal).pass1_end)]
3142    mova                    m7, [o(pw_8192)]
3143    jmp  m(iadst_8x8_internal).pass1_end1
3144
3145.pass1_end:
; store the rows in m0-m7, then run the same helper on the rows that are
; read back from stack slots 16*3 onward
3146    SAVE_8ROWS    coeffq+16*17, 32
3147    LOAD_8ROWS    rsp+gprsize+16*3, 16
; m7 is about to be overwritten with the constant; its row goes to slot 0
3148    mova    [rsp+gprsize+16*0], m7
3149    lea                   tx2q, [o(m(iadst_16x16_internal).pass1_end1)]
3150    mova                    m7, [o(pw_8192)]
3151    jmp  m(iadst_8x8_internal).pass1_end1
3152
3153.pass1_end1:
3154    SAVE_8ROWS     coeffq+16*1, 32
; --- pass 1, even-indexed vectors (same sequence) ---
3155    ITX_16X16_ADST_LOAD_EVEN_COEFS
3156    call m(iadst_16x8_internal).main
3157    call m(iadst_16x8_internal).main_pass1_end
3158
3159    lea                   tx2q, [o(m(iadst_16x16_internal).pass1_end2)]
3160    mova                    m7, [o(pw_8192)]
3161    jmp  m(iadst_8x8_internal).pass1_end1
3162
3163.pass1_end2:
3164    SAVE_8ROWS    coeffq+16*16, 32
3165    LOAD_8ROWS    rsp+gprsize+16*3, 16
3166    mova    [rsp+gprsize+16*0], m7
; last hop of pass 1: restore the continuation that was saved on entry
3167    mov                   tx2q, r3
3168    mova                    m7, [o(pw_8192)]
3169    jmp  m(iadst_8x8_internal).pass1_end1
3170
3171.pass2:
; pass 2 is delegated to the 8x16 ADST code, resuming at .end below
3172    lea                   tx2q, [o(m(iadst_16x16_internal).end)]
3173    jmp m(iadst_8x16_internal).pass2_pre
3174
3175.end:
3176    LOAD_8ROWS    rsp+gprsize+16*3, 16
3177    mova    [rsp+gprsize+16*0], m7
3178    lea                   tx2q, [o(m(iadst_16x16_internal).end1)]
; NOTE(review): r3 is not written between .pass2 and this point within this
; chunk; presumably the 8x16 pass-2 code stores a destination pointer in it -
; confirm against iadst_8x16_internal.
3179    mov                   dstq, r3
; remember dstq+8 (8 bytes further along the row) for the second half
3180    lea                     r3, [dstq+8]
3181    jmp  m(iadst_8x8_internal).end
3182
3183.end1:
; clear the 16 consumed 16-byte coefficient vectors at coeffq+16*0..16*15
3184    pxor                    m7, m7
3185    REPX   {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15
3186
; step to the next 256 bytes of coefficients and to the pointer saved in r3
3187    add                 coeffq, 32*8
3188    mov                   dstq, r3
3189
; preload for iadst_8x16_internal.pass2_main: vectors 4/6/8/10 stay in m0-m3,
; vectors 0/2 go to stack slots 7/8 and vectors 12/14 to stack slots 5/6
3190    mova                    m4, [coeffq+16*0 ]
3191    mova                    m5, [coeffq+16*2 ]
3192    mova                    m0, [coeffq+16*4 ]
3193    mova                    m1, [coeffq+16*6 ]
3194    mova                    m2, [coeffq+16*8 ]
3195    mova                    m3, [coeffq+16*10]
3196    mova                    m6, [coeffq+16*12]
3197    mova                    m7, [coeffq+16*14]
3198    mova    [rsp+gprsize+16*7], m4
3199    mova    [rsp+gprsize+16*8], m5
3200    mova    [rsp+gprsize+16*5], m6
3201    mova    [rsp+gprsize+16*6], m7
; the second half finishes through the 8x16 routine's own .end label
3202    lea                   tx2q, [o(m(iadst_8x16_internal).end)]
3203    jmp m(iadst_8x16_internal).pass2_main
3204
3205
; Instantiate INV_TXFM_16X16_FN for the three (flipadst, <type2>) pairs.
; NOTE(review): the macro body is not visible in this chunk; presumably each
; line emits a 16x16 entry point whose first pass is
; iflipadst_16x16_internal below - confirm against the macro definition.
3206INV_TXFM_16X16_FN flipadst, dct
3207INV_TXFM_16X16_FN flipadst, adst
3208INV_TXFM_16X16_FN flipadst, flipadst
3209
; iflipadst_16x16_internal - flipped-ADST stage of the 16x16 inverse
; transform.
;
; Reuses the iadst_16x8_internal kernels and the same tx2q/r3 continuation
; scheme as iadst_16x16_internal. Visible differences from that routine:
;   * the constant passed in m7 is pw_m8192 instead of pw_8192;
;   * the helpers are the iflipadst_8x8/8x16 variants;
;   * the order in which the two 8-row halves are stored is swapped
;     (16*1 before 16*17), and one half is parked in coeffq+16*0 before the
;     last two hops;
;   * pass 2 saves dstq+8 in r3 up front, and each half is written in two
;     steps with dstq advanced by strideq*2 in between.
; NOTE(review): the shared helpers live outside this chunk and are assumed to
; finish with "jmp tx2q".
3210cglobal iflipadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
; --- pass 1, odd-indexed vectors ---
3211    ITX_16X16_ADST_LOAD_ODD_COEFS
3212    call m(iadst_16x8_internal).main
3213    call m(iadst_16x8_internal).main_pass1_end
3214
; keep the incoming continuation in r3; tx2q is reused for local hops
3215    mov                     r3, tx2q
3216    lea                   tx2q, [o(m(iflipadst_16x16_internal).pass1_end)]
3217    mova                    m7, [o(pw_m8192)]
3218    jmp  m(iflipadst_8x8_internal).pass1_end1
3219
3220.pass1_end:
3221    SAVE_8ROWS     coeffq+16*1, 32
3222    LOAD_8ROWS    rsp+gprsize+16*3, 16
; m7 is about to be overwritten with the constant; its row goes to slot 0
3223    mova    [rsp+gprsize+16*0], m7
3224    lea                   tx2q, [o(m(iflipadst_16x16_internal).pass1_end1)]
3225    mova                    m7, [o(pw_m8192)]
3226    jmp  m(iflipadst_8x8_internal).pass1_end1
3227
3228.pass1_end1:
3229    SAVE_8ROWS    coeffq+16*17, 32
; --- pass 1, even-indexed vectors ---
3230    ITX_16X16_ADST_LOAD_EVEN_COEFS
3231    call m(iadst_16x8_internal).main
3232    call m(iadst_16x8_internal).main_pass1_end
3233
; reload m7 from stack slot 0, then park the rows held in m0-m7 at
; coeffq+16*0 (32-byte step) so the rows kept on the stack can be processed
; first
3234    mova                    m7, [rsp+gprsize+16*0]
3235    SAVE_8ROWS     coeffq+16*0, 32
3236    LOAD_8ROWS    rsp+gprsize+16*3, 16
3237    mova    [rsp+gprsize+16*0], m7
3238    lea                   tx2q, [o(m(iflipadst_16x16_internal).pass1_end2)]
3239    mova                    m7, [o(pw_m8192)]
3240    jmp  m(iflipadst_8x8_internal).pass1_end1
3241
3242.pass1_end2:
3243    SAVE_8ROWS    coeffq+16*16, 32
; fetch the rows parked at coeffq+16*0 above
3244    LOAD_8ROWS    coeffq+16* 0, 32
3245    mova    [rsp+gprsize+16*0], m7
; last hop of pass 1: restore the continuation that was saved on entry
3246    mov                   tx2q, r3
3247    mova                    m7, [o(pw_m8192)]
3248    jmp m(iflipadst_8x8_internal).pass1_end1
3249
3250.pass2:
3251    lea                   tx2q, [o(m(iflipadst_16x16_internal).end)]
; remember dstq+8 (8 bytes further along the row) for the second half
3252    lea                     r3, [dstq+8]
3253    jmp m(iflipadst_8x16_internal).pass2_pre
3254
3255.end:
3256    LOAD_8ROWS    rsp+gprsize+16*3, 16
3257    mova    [rsp+gprsize+16*0], m7
3258    lea                   tx2q, [o(m(iflipadst_16x16_internal).end1)]
; advance dstq by two strides before the next write-out
3259    lea                   dstq, [dstq+strideq*2]
3260    jmp  m(iflipadst_8x8_internal).end
3261
3262.end1:
; clear the 16 consumed 16-byte coefficient vectors at coeffq+16*0..16*15
3263    pxor                    m7, m7
3264    REPX   {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15
3265
; step to the next 256 bytes of coefficients
3266    add                 coeffq, 32*8
3267
; preload for iflipadst_8x16_internal.pass2_main: vectors 4/6/8/10 stay in
; m0-m3, vectors 0/2 go to stack slots 7/8 and vectors 12/14 to slots 5/6
3268    mova                    m4, [coeffq+16*0 ]
3269    mova                    m5, [coeffq+16*2 ]
3270    mova                    m0, [coeffq+16*4 ]
3271    mova                    m1, [coeffq+16*6 ]
3272    mova                    m2, [coeffq+16*8 ]
3273    mova                    m3, [coeffq+16*10]
3274    mova                    m6, [coeffq+16*12]
3275    mova                    m7, [coeffq+16*14]
3276    mova    [rsp+gprsize+16*7], m4
3277    mova    [rsp+gprsize+16*8], m5
3278    mova    [rsp+gprsize+16*5], m6
3279    mova    [rsp+gprsize+16*6], m7
3280
3281    lea                   tx2q, [o(m(iflipadst_16x16_internal).end2)]
; switch to the second-half pointer saved in r3
3282    mov                   dstq, r3
3283    jmp m(iflipadst_8x16_internal).pass2_main
3284
3285.end2:
3286    LOAD_8ROWS    rsp+gprsize+16*3, 16
3287    mova    [rsp+gprsize+16*0], m7
; final continuation is a label of the 8x16 DCT routine (outside this chunk)
3288    lea                   tx2q, [o(m(idct_8x16_internal).end1)]
3289    lea                   dstq, [dstq+strideq*2]
3290    jmp  m(iflipadst_8x8_internal).end
3291
3292
; IDTX16B src/dst, tmp, const - scale one vector of 16-bit words in place.
;   %1  register index holding the input; receives the result
;   %2  scratch register index (overwritten)
;   %3  register index holding the multiplier (caller loads pw_1697x16)
; Computes m%1 = pavgw(m%1, pmulhrsw(m%3, m%1) >> 1).
; %2 may alias %3 when the multiplier is no longer needed afterwards; the
; multiplier register is then destroyed.
; NOTE(review): assuming pw_1697x16 holds 1697*16 as its name suggests, the
; net effect is roughly x * 0.707 - confirm against the constant table.
3293%macro IDTX16B 3 ; src/dst, tmp, pw_1697x16
; tmp = rounded high half of (const * src), i.e. (const*src + 0x4000) >> 15
3294    pmulhrsw            m%2, m%3, m%1
; halve it with an arithmetic shift
3295    psraw               m%2, 1
; src/dst = rounded average of the original value and the scaled term
3296    pavgw               m%1, m%2
3297%endmacro
3298
; Instantiate INV_TXFM_16X16_FN for the two (identity, <type2>) pairs.
; NOTE(review): the macro body is not visible in this chunk; presumably each
; line emits a 16x16 entry point whose first pass is
; iidentity_16x16_internal below - confirm against the macro definition.
3299INV_TXFM_16X16_FN identity, dct
3300INV_TXFM_16X16_FN identity, identity
3301
; iidentity_16x16_internal - identity stage of the 16x16 inverse transform.
;
; Declared with 0 args / 0 GPRs / 0 XMM registers; stages are chained through
; tx2q and the incoming tx2q is kept in r3 during pass 1.
; NOTE(review): idct_8x8_internal.pass1_end3 / .end3 are outside this chunk
; and are assumed to finish with "jmp tx2q".
;
; Pass 1 runs the .pass1 body four times, each on 8 vectors taken at a
; 32-byte step from coeffq. coeffq starts at +16*17 and is moved to +16*16,
; +16*1 and +16*0 (relative to its entry value) between runs; the first three
; results are stored back in place by .pass1_end/.pass1_end1/.pass1_end2.
;
; Pass 2 runs the .end body four times (two per 8-column half), with dstq
; advanced by strideq*2 inside a half and switched to r3 = dstq+8 for the
; second half.
3302cglobal iidentity_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
3303    add                 coeffq, 16*17
; keep the incoming continuation in r3; tx2q is reused for local hops
3304    mov                     r3, tx2q
3305    lea                   tx2q, [o(m(iidentity_16x16_internal).pass1_end)]
3306
3307.pass1:
; m6 = multiplier for IDTX16B. Rows 0-4 and 6 are scaled first with m5 as
; scratch; row 6 is loaded into m7 so that m5 stays free.
3308    mova                    m6, [o(pw_1697x16)]
3309    mova                    m7, [coeffq+32*6]
3310    mova                    m0, [coeffq+32*0]
3311    mova                    m1, [coeffq+32*1]
3312    mova                    m2, [coeffq+32*2]
3313    mova                    m3, [coeffq+32*3]
3314    mova                    m4, [coeffq+32*4]
3315    REPX     {IDTX16B x, 5, 6}, 7, 0, 1, 2, 3, 4
3316    mova                    m5, [coeffq+32*5]
; spill scaled row 6 to stack slot 1 so m7 can serve as scratch for row 5
3317    mova    [rsp+gprsize+16*1], m7
3318    IDTX16B                  5, 7, 6
3319    mova                    m7, [coeffq+32*7]
; last use of the multiplier: m6 doubles as scratch and is destroyed
3320    IDTX16B                  7, 6, 6
3321    jmp   m(idct_8x8_internal).pass1_end3
3322
3323.pass1_end:
; results of the run at +16*17 are stored in place; next run at +16*16
3324    SAVE_8ROWS          coeffq, 32
3325    sub                 coeffq, 16
3326    lea                   tx2q, [o(m(iidentity_16x16_internal).pass1_end1)]
3327    jmp .pass1
3328
3329.pass1_end1:
; next run at +16*1
3330    SAVE_8ROWS          coeffq, 32
3331    sub                 coeffq, 15*16
3332    lea                   tx2q, [o(m(iidentity_16x16_internal).pass1_end2)]
3333    jmp .pass1
3334
3335.pass1_end2:
; final run at +16*0; it leaves through the continuation saved on entry
3336    SAVE_8ROWS          coeffq, 32
3337    sub                 coeffq, 16
3338    mov                   tx2q, r3
3339    jmp .pass1
3340
3341.pass2:
; remember dstq+8 (8 bytes further along the row) for the second half
3342    lea                     r3, [dstq+8]
3343    lea                   tx2q, [o(m(iidentity_16x16_internal).end1)]
3344
3345.end:
; Scale the 8 rows in m0-m7 with IDTX16 (macro defined outside this chunk)
; and then with pmulhrsw by pw_2048. Only 8 registers are available, so rows
; are rotated through stack slots 0-2.
; On exit: m0-m4 = rows 0-4, slot 2 = row 5, slot 1 = row 6, slot 0 = row 7.
3346    mova    [rsp+gprsize+16*0], m7
3347    mova    [rsp+gprsize+16*1], m4
3348    mova                    m7, [o(pw_1697x16)]
3349    REPX      {IDTX16 x, 4, 7}, 5, 6, 0, 1, 2, 3
3350    mova                    m4, [o(pw_2048)]
3351    pmulhrsw                m5, m4
3352    pmulhrsw                m6, m4
; row 5 is finished -> slot 2; fetch the original row 4 from slot 1, which
; then receives the finished row 6
3353    mova    [rsp+gprsize+16*2], m5
3354    mova                    m5, [rsp+gprsize+16*1]
3355    mova    [rsp+gprsize+16*1], m6
3356    IDTX16                   5, 6, 7
; fetch the original row 7 from slot 0; the multiplier in m7 doubles as
; scratch here and is destroyed
3357    mova                    m6, [rsp+gprsize+16*0]
3358    IDTX16                   6, 7, 7
3359    REPX      {pmulhrsw x, m4}, m0, m1, m2, m3, m6
; m4 = pw_2048 * row 4: the product lands in m4, where row 4 belongs
3360    pmulhrsw                m4, m5
3361    mova    [rsp+gprsize+16*0], m6
3362    jmp   m(idct_8x8_internal).end3
3363
3364.end1:
; second group of 8 rows for the current half (vectors at +16*1, step 32)
3365    LOAD_8ROWS     coeffq+16*1, 32
3366    lea                   tx2q, [o(m(iidentity_16x16_internal).end2)]
3367    lea                   dstq, [dstq+strideq*2]
3368    jmp .end
3369
3370.end2:
; clear the 16 consumed 16-byte coefficient vectors at coeffq+16*0..16*15
3371    pxor                    m7, m7
3372    REPX   {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15
3373
; move to the next 256 bytes of coefficients and to the pointer saved in r3
3374    add                 coeffq, 32*8
3375    LOAD_8ROWS          coeffq, 32
3376    lea                   tx2q, [o(m(iidentity_16x16_internal).end3)]
3377    mov                   dstq, r3
3378    jmp .end
3379
3380.end3:
3381    LOAD_8ROWS     coeffq+16*1, 32
; final continuation is a label of the 8x16 DCT routine (outside this chunk)
3382    lea                   tx2q, [o(m(idct_8x16_internal).end1)]
3383    lea                   dstq, [dstq+strideq*2]
3384    jmp .end
3385
3386
; inv_txfm_add_dct_dct_8x32 - entry point for the 8x32 DCT/DCT inverse
; transform.
;
; cglobal parameters: 4 arguments loaded, 6 GPRs, 8 XMM registers and
; 16*36 bytes of stack; named registers dst, stride, coeff, eob, tx2.
;
; eob == 0 takes the .dconly shortcut; any other value runs the full
; idct_8x32_internal routine.
3387cglobal inv_txfm_add_dct_dct_8x32, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
3388%if ARCH_X86_32
; 32-bit builds: r5 = section start ($$).
; NOTE(review): presumably the base that the o() macro addresses constants
; from - o() is defined outside this chunk, confirm there.
3389    LEA                     r5, $$
3390%endif
3391    test                  eobd, eobd
3392    jz .dconly
3393    call  m(idct_8x32_internal)
3394    RET
3395
3396.dconly:
; DC-only path: scale the coefficient at [coeffq] by
; pw_2896x8 -> pw_8192 -> pw_2896x8 -> pw_2048 (each a pmulhrsw).
; movd loads only the low dword of each constant, so only the low words of
; m0 are meaningful until the broadcast below.
3397    movd                 m1, [o(pw_2896x8)]
3398    pmulhrsw             m0, m1, [coeffq]
3399    movd                 m2, [o(pw_8192)]
; eobd is zero on this path, so this store clears the DC coefficient
3400    mov            [coeffq], eobd
3401    pmulhrsw             m0, m2
3402    psrlw                m2, 2            ;pw_2048
3403    pmulhrsw             m0, m1
3404    pmulhrsw             m0, m2
; broadcast word 0 to all eight words of m0
3405    pshuflw              m0, m0, q0000
3406    punpcklwd            m0, m0
; NOTE(review): r3d = 8 is presumably the iteration count consumed by the
; shared 8x8 .loop (outside this chunk) - confirm there.
3407    mov                 r3d, 8
; the shared loop is expected to resume at .end through tx2q
3408    lea                tx2q, [o(m(inv_txfm_add_dct_dct_8x32).end)]
3409    jmp m(inv_txfm_add_dct_dct_8x8).loop
3410
3411.end:
3412    RET
3413
3414
3415
3416cglobal idct_8x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
3417    %undef cmp
3418    cmp                   eobd, 106
3419    jle .fast
3420
3421    LOAD_8ROWS     coeffq+16*3, 64
3422    call  m(idct_8x8_internal).main
3423    mova                    m7, [o(pw_8192)]
3424    lea                   tx2q, [o(m(idct_8x32_internal).pass1)]
3425    jmp   m(idct_8x8_internal).pass1_end1
3426
3427.pass1:
3428    mova   [rsp+gprsize+16*9 ], m0                        ;in24
3429    mova   [rsp+gprsize+16*10], m4                        ;in28
3430    mova   [rsp+gprsize+16*17], m2                        ;in26
3431    mova   [rsp+gprsize+16*18], m6                        ;in30
3432    mova   [rsp+gprsize+16*31], m1                        ;in25
3433    mova   [rsp+gprsize+16*30], m3                        ;in27
3434    mova   [rsp+gprsize+16*27], m5                        ;in29
3435    mova   [rsp+gprsize+16*34], m7                        ;in31
3436    LOAD_8ROWS     coeffq+16*2, 64
3437    call  m(idct_8x8_internal).main
3438    mova                    m7, [o(pw_8192)]
3439    lea                   tx2q, [o(m(idct_8x32_internal).pass1_1)]
3440    jmp   m(idct_8x8_internal).pass1_end1
3441
3442.pass1_1:
3443    mova   [rsp+gprsize+16*7 ], m0                        ;in16
3444    mova   [rsp+gprsize+16*8 ], m4                        ;in20
3445    mova   [rsp+gprsize+16*15], m2                        ;in18
3446    mova   [rsp+gprsize+16*16], m6                        ;in22
3447    mova   [rsp+gprsize+16*33], m1                        ;in17
3448    mova   [rsp+gprsize+16*28], m3                        ;in19
3449    mova   [rsp+gprsize+16*29], m5                        ;in21
3450    mova   [rsp+gprsize+16*32], m7                        ;in23
3451
3452.fast:
3453    LOAD_8ROWS     coeffq+16*1, 64
3454    call  m(idct_8x8_internal).main
3455    mova                    m7, [o(pw_8192)]
3456    lea                   tx2q, [o(m(idct_8x32_internal).pass1_end)]
3457    jmp   m(idct_8x8_internal).pass1_end1
3458
3459.pass1_end:
3460    mova   [rsp+gprsize+16*5 ], m0                        ;in8
3461    mova   [rsp+gprsize+16*6 ], m4                        ;in12
3462    mova   [rsp+gprsize+16*13], m2                        ;in10
3463    mova   [rsp+gprsize+16*14], m6                        ;in14
3464    mova   [rsp+gprsize+16*21], m1                        ;in9
3465    mova   [rsp+gprsize+16*24], m3                        ;in11
3466    mova   [rsp+gprsize+16*25], m5                        ;in13
3467    mova   [rsp+gprsize+16*20], m7                        ;in15
3468    LOAD_8ROWS     coeffq+16*0, 64
3469    call  m(idct_8x8_internal).main
3470    mova                    m7, [o(pw_8192)]
3471    lea                   tx2q, [o(m(idct_8x32_internal).pass1_end1)]
3472    jmp   m(idct_8x8_internal).pass1_end1
3473
3474.pass1_end1:
3475    mova   [rsp+gprsize+16*11], m2                        ;in2
3476    mova   [rsp+gprsize+16*12], m6                        ;in6
3477    mova   [rsp+gprsize+16*19], m1                        ;in1
3478    mova   [rsp+gprsize+16*26], m3                        ;in3
3479    mova   [rsp+gprsize+16*23], m5                        ;in5
3480    mova   [rsp+gprsize+16*22], m7                        ;in7
3481    mova                    m1, m4                        ;in4
3482    mova                    m2, [rsp+gprsize+16*5 ]       ;in8
3483    mova                    m3, [rsp+gprsize+16*6 ]       ;in12
3484
3485    cmp                   eobd, 106
3486    jg .full
3487
3488    pxor                    m4, m4
3489    REPX          {mova x, m4}, m5, m6, m7
3490    call  m(idct_8x8_internal).main
3491    SAVE_7ROWS   rsp+gprsize+16*3 , 16
3492    mova                    m0, [rsp+gprsize+16*11]
3493    mova                    m1, [rsp+gprsize+16*12]
3494    mova                    m2, [rsp+gprsize+16*13]
3495    mova                    m3, [rsp+gprsize+16*14]
3496    pxor                    m4, m4
3497    REPX          {mova x, m4}, m5, m6, m7
3498    call m(idct_16x8_internal).main
3499    mova                    m7, [rsp+gprsize+16*0]
3500    SAVE_8ROWS   rsp+gprsize+16*11, 16
3501
3502    call .main_fast
3503    jmp  .pass2
3504
3505.full:
3506    mova                    m4, [rsp+gprsize+16*7 ]       ;in16
3507    mova                    m5, [rsp+gprsize+16*8 ]       ;in20
3508    mova                    m6, [rsp+gprsize+16*9 ]       ;in24
3509    mova                    m7, [rsp+gprsize+16*10]       ;in28
3510    call  m(idct_8x8_internal).main
3511    SAVE_7ROWS   rsp+gprsize+16*3 , 16
3512    LOAD_8ROWS   rsp+gprsize+16*11, 16
3513    call m(idct_16x8_internal).main
3514    mova                    m7, [rsp+gprsize+16*0]
3515    SAVE_8ROWS   rsp+gprsize+16*11, 16
3516    call .main
3517
3518.pass2:
3519    lea                     r3, [o(m(idct_8x32_internal).end6)]
3520
3521.end:
3522    mova   [rsp+gprsize+16*0 ], m7
3523    lea                   tx2q, [o(m(idct_8x32_internal).end2)]
3524
3525.end1:
3526    pxor                    m7, m7
3527    REPX   {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  \
3528                                     8,  9,  10, 11, 12, 13, 14, 15, \
3529                                     16, 17, 18, 19, 20, 21, 22, 23, \
3530                                     24, 25, 26, 27, 28, 29, 30, 31
3531
3532    jmp                   tx2q
3533
3534.end2:
3535    lea                   tx2q, [o(m(idct_8x32_internal).end3)]
3536    jmp   m(idct_8x8_internal).end
3537
3538.end3:
3539    LOAD_8ROWS   rsp+gprsize+16*11, 16
3540    mova   [rsp+gprsize+16*0 ], m7
3541    lea                   dstq, [dstq+strideq*2]
3542    lea                   tx2q, [o(m(idct_8x32_internal).end4)]
3543    jmp   m(idct_8x8_internal).end
3544
3545.end4:
3546    LOAD_8ROWS   rsp+gprsize+16*19, 16
3547    mova   [rsp+gprsize+16*0 ], m7
3548    lea                   dstq, [dstq+strideq*2]
3549    lea                   tx2q, [o(m(idct_8x32_internal).end5)]
3550    jmp   m(idct_8x8_internal).end
3551
3552.end5:
3553    LOAD_8ROWS   rsp+gprsize+16*27, 16
3554    mova   [rsp+gprsize+16*0 ], m7
3555    lea                   dstq, [dstq+strideq*2]
3556    mov                   tx2q, r3
3557    jmp   m(idct_8x8_internal).end
3558
3559.end6:
3560    ret
3561
3562ALIGN function_align
3563.main_veryfast:
3564    mova                    m0, [rsp+gprsize*2+16*19]     ;in1
3565    pmulhrsw                m3, m0, [o(pw_4091x8)]        ;t30,t31
3566    pmulhrsw                m0, [o(pw_201x8)]             ;t16,t17
3567    mova                    m7, [o(pd_2048)]
3568    mova [rsp+gprsize*2+16*19], m0                        ;t16
3569    mova [rsp+gprsize*2+16*34], m3                        ;t31
3570    ITX_MULSUB_2W            3, 0, 1, 2, 7,  799, 4017    ;t17a, t30a
3571    mova [rsp+gprsize*2+16*20], m3                        ;t17a
3572    mova [rsp+gprsize*2+16*33], m0                        ;t30a
3573    mova                    m1, [rsp+gprsize*2+16*22]     ;in7
3574    pmulhrsw                m2, m1, [o(pw_3857x8)]        ;t28,t29
3575    pmulhrsw                m1, [o(pw_m1380x8)]           ;t18,t19
3576    mova [rsp+gprsize*2+16*22], m1                        ;t19
3577    mova [rsp+gprsize*2+16*31], m2                        ;t28
3578    ITX_MULSUB_2W            2, 1, 0, 3, 7, m4017, 799    ;t18a, t29a
3579    mova [rsp+gprsize*2+16*21], m2                        ;t18a
3580    mova [rsp+gprsize*2+16*32], m1                        ;t29a
3581    mova                    m0, [rsp+gprsize*2+16*23]     ;in5
3582    pmulhrsw                m3, m0, [o(pw_3973x8)]        ;t26, t27
3583    pmulhrsw                m0, [o(pw_995x8)]             ;t20, t21
3584    mova [rsp+gprsize*2+16*23], m0                        ;t20
3585    mova [rsp+gprsize*2+16*30], m3                        ;t27
3586    ITX_MULSUB_2W            3, 0, 1, 2, 7, 3406, 2276    ;t21a, t26a
3587    mova [rsp+gprsize*2+16*24], m3                        ;t21a
3588    mova [rsp+gprsize*2+16*29], m0                        ;t26a
3589    mova                    m2, [rsp+gprsize*2+16*26]     ;in3
3590    pxor                    m0, m0
3591    mova                    m3, m0
3592    pmulhrsw                m1, m2, [o(pw_4052x8)]
3593    pmulhrsw                m2, [o(pw_m601x8)]
3594    jmp .main2
3595
3596ALIGN function_align
3597.main_fast: ;bottom half is zero
3598    mova                    m0, [rsp+gprsize*2+16*19]     ;in1
3599    mova                    m1, [rsp+gprsize*2+16*20]     ;in15
3600    pmulhrsw                m3, m0, [o(pw_4091x8)]        ;t31a
3601    pmulhrsw                m0, [o(pw_201x8)]             ;t16a
3602    pmulhrsw                m2, m1, [o(pw_3035x8)]        ;t30a
3603    pmulhrsw                m1, [o(pw_m2751x8)]           ;t17a
3604    mova                    m7, [o(pd_2048)]
3605    psubsw                  m4, m0, m1                    ;t17
3606    paddsw                  m0, m1                        ;t16
3607    psubsw                  m5, m3, m2                    ;t30
3608    paddsw                  m3, m2                        ;t31
3609    ITX_MULSUB_2W            5, 4, 1, 2, 7,  799, 4017    ;t17a, t30a
3610    mova [rsp+gprsize*2+16*19], m0                        ;t16
3611    mova [rsp+gprsize*2+16*20], m5                        ;t17a
3612    mova [rsp+gprsize*2+16*33], m4                        ;t30a
3613    mova [rsp+gprsize*2+16*34], m3                        ;t31
3614    mova                    m0, [rsp+gprsize*2+16*21]     ;in9
3615    mova                    m1, [rsp+gprsize*2+16*22]     ;in7
3616    pmulhrsw                m3, m0, [o(pw_3703x8)]
3617    pmulhrsw                m0, [o(pw_1751x8)]
3618    pmulhrsw                m2, m1, [o(pw_3857x8)]
3619    pmulhrsw                m1, [o(pw_m1380x8)]
3620    psubsw                  m4, m1, m0                    ;t18
3621    paddsw                  m0, m1                        ;t19
3622    psubsw                  m5, m2, m3                    ;t29
3623    paddsw                  m3, m2                        ;t28
3624    ITX_MULSUB_2W            5, 4, 1, 2, 7, m4017, 799    ;t18a, t29a
3625    mova [rsp+gprsize*2+16*21], m5                        ;t18a
3626    mova [rsp+gprsize*2+16*22], m0                        ;t19
3627    mova [rsp+gprsize*2+16*31], m3                        ;t28
3628    mova [rsp+gprsize*2+16*32], m4                        ;t29a
3629    mova                    m0, [rsp+gprsize*2+16*23]     ;in5
3630    mova                    m1, [rsp+gprsize*2+16*24]     ;in11
3631    pmulhrsw                m3, m0, [o(pw_3973x8)]
3632    pmulhrsw                m0, [o(pw_995x8)]
3633    pmulhrsw                m2, m1, [o(pw_3513x8)]
3634    pmulhrsw                m1, [o(pw_m2106x8)]
3635    psubsw                  m4, m0, m1                    ;t21
3636    paddsw                  m0, m1                        ;t20
3637    psubsw                  m5, m3, m2                    ;t26
3638    paddsw                  m3, m2                        ;t27
3639    ITX_MULSUB_2W            5, 4, 1, 2, 7, 3406, 2276    ;t21a, t26a
3640    mova [rsp+gprsize*2+16*23], m0                        ;t20
3641    mova [rsp+gprsize*2+16*24], m5                        ;t21a
3642    mova [rsp+gprsize*2+16*29], m4                        ;t26a
3643    mova [rsp+gprsize*2+16*30], m3                        ;t27
3644    mova                    m0, [rsp+gprsize*2+16*25]     ;in13
3645    mova                    m2, [rsp+gprsize*2+16*26]     ;in3
3646    pmulhrsw                m3, m0, [o(pw_3290x8)]
3647    pmulhrsw                m0, [o(pw_2440x8)]
3648    pmulhrsw                m1, m2, [o(pw_4052x8)]
3649    pmulhrsw                m2, [o(pw_m601x8)]
3650    jmp .main2
3651
3652ALIGN function_align
3653.main:
3654    mova                    m7, [o(pd_2048)]
3655    mova                    m0, [rsp+gprsize*2+16*19]     ;in1
3656    mova                    m1, [rsp+gprsize*2+16*20]     ;in15
3657    mova                    m2, [rsp+gprsize*2+16*33]     ;in17
3658    mova                    m3, [rsp+gprsize*2+16*34]     ;in31
3659    ITX_MULSUB_2W            0, 3, 4, 5, 7,  201, 4091    ;t16a, t31a
3660    ITX_MULSUB_2W            2, 1, 4, 5, 7, 3035, 2751    ;t17a, t30a
3661    psubsw                  m4, m0, m2                    ;t17
3662    paddsw                  m0, m2                        ;t16
3663    psubsw                  m5, m3, m1                    ;t30
3664    paddsw                  m3, m1                        ;t31
3665    ITX_MULSUB_2W            5, 4, 1, 2, 7,  799, 4017    ;t17a, t30a
3666    mova [rsp+gprsize*2+16*19], m0                        ;t16
3667    mova [rsp+gprsize*2+16*20], m5                        ;t17a
3668    mova [rsp+gprsize*2+16*33], m4                        ;t30a
3669    mova [rsp+gprsize*2+16*34], m3                        ;t31
3670    mova                    m0, [rsp+gprsize*2+16*21]     ;in9
3671    mova                    m1, [rsp+gprsize*2+16*22]     ;in7
3672    mova                    m2, [rsp+gprsize*2+16*31]     ;in25
3673    mova                    m3, [rsp+gprsize*2+16*32]     ;in23
3674    ITX_MULSUB_2W            0, 3, 4, 5, 7, 1751, 3703    ;t18a, t29a
3675    ITX_MULSUB_2W            2, 1, 4, 5, 7, 3857, 1380    ;t19a, t28a
3676    psubsw                  m4, m2, m0                    ;t18
3677    paddsw                  m0, m2                        ;t19
3678    psubsw                  m5, m1, m3                    ;t29
3679    paddsw                  m3, m1                        ;t28
3680    ITX_MULSUB_2W            5, 4, 1, 2, 7, m4017, 799    ;t18a, t29a
3681    mova [rsp+gprsize*2+16*21], m5                        ;t18a
3682    mova [rsp+gprsize*2+16*22], m0                        ;t19
3683    mova [rsp+gprsize*2+16*31], m3                        ;t28
3684    mova [rsp+gprsize*2+16*32], m4                        ;t29a
3685    mova                    m0, [rsp+gprsize*2+16*23]     ;in5
3686    mova                    m1, [rsp+gprsize*2+16*24]     ;in11
3687    mova                    m2, [rsp+gprsize*2+16*29]     ;in21
3688    mova                    m3, [rsp+gprsize*2+16*30]     ;in27
3689    ITX_MULSUB_2W            0, 3, 4, 5, 7,  995, 3973    ;t20a, t27a
3690    ITX_MULSUB_2W            2, 1, 4, 5, 7, 3513, 2106    ;t21a, t26a
3691    psubsw                  m4, m0, m2                    ;t21
3692    paddsw                  m0, m2                        ;t20
3693    psubsw                  m5, m3, m1                    ;t26
3694    paddsw                  m3, m1                        ;t27
3695    ITX_MULSUB_2W            5, 4, 1, 2, 7, 3406, 2276    ;t21a, t26a
3696    mova [rsp+gprsize*2+16*23], m0                        ;t20
3697    mova [rsp+gprsize*2+16*24], m5                        ;t21a
3698    mova [rsp+gprsize*2+16*29], m4                        ;t26a
3699    mova [rsp+gprsize*2+16*30], m3                        ;t27
3700    mova                    m0, [rsp+gprsize*2+16*25]     ;in13
3701    mova                    m1, [rsp+gprsize*2+16*26]     ;in3
3702    mova                    m2, [rsp+gprsize*2+16*27]     ;in29
3703    mova                    m3, [rsp+gprsize*2+16*28]     ;in19
3704    ITX_MULSUB_2W            0, 3, 4, 5, 7, 2440, 3290    ;t22a, t25a
3705    ITX_MULSUB_2W            2, 1, 4, 5, 7, 4052,  601    ;t23a, t24a
3706
3707.main2:
3708    psubsw                  m4, m2, m0                    ;t22
3709    paddsw                  m0, m2                        ;t23
3710    psubsw                  m5, m1, m3                    ;t25
3711    paddsw                  m3, m1                        ;t24
3712    ITX_MULSUB_2W            5, 4, 1, 2, 7, m2276, 3406   ;t22a, t25a
3713    mova                    m2, [rsp+gprsize*2+16*24]     ;t21a
3714    psubsw                  m1, m5, m2                    ;t21
3715    paddsw                  m5, m2                        ;t22
3716    mova [rsp+gprsize*2+16*25], m5                        ;t22
3717    mova                    m2, [rsp+gprsize*2+16*29]     ;t26a
3718    psubsw                  m5, m4, m2                    ;t26
3719    paddsw                  m4, m2                        ;t25
3720    mova [rsp+gprsize*2+16*28], m4                        ;t25
3721    ITX_MULSUB_2W            5, 1, 2, 4, 7, m3784, 1567   ;t21a, t26a
3722    mova [rsp+gprsize*2+16*24], m5                        ;t21a
3723    mova [rsp+gprsize*2+16*29], m1                        ;t26a
3724
3725    mova                    m1, [rsp+gprsize*2+16*23]     ;t20
3726    mova                    m5, [rsp+gprsize*2+16*30]     ;t27
3727    psubsw                  m2, m0, m1                    ;t20a
3728    paddsw                  m0, m1                        ;t23a
3729    psubsw                  m6, m3, m5                    ;t27a
3730    paddsw                  m3, m5                        ;t24a
3731    ITX_MULSUB_2W            6, 2, 1, 5, 7, m3784, 1567   ;t20, t27
3732    mova [rsp+gprsize*2+16*26], m0                        ;t23a
3733    mova [rsp+gprsize*2+16*27], m3                        ;t24a
3734    mova [rsp+gprsize*2+16*30], m2                        ;t27
3735
3736    mova                    m0, [rsp+gprsize*2+16*20]     ;t17a
3737    mova                    m1, [rsp+gprsize*2+16*21]     ;t18a
3738    mova                    m2, [rsp+gprsize*2+16*32]     ;t29a
3739    mova                    m3, [rsp+gprsize*2+16*33]     ;t30a
3740    psubsw                  m4, m0, m1                    ;t18
3741    paddsw                  m0, m1                        ;t17
3742    psubsw                  m5, m3, m2                    ;t29
3743    paddsw                  m3, m2                        ;t30
3744    ITX_MULSUB_2W            5, 4, 1, 2, 7, 1567, 3784    ;t18a, t29a
3745    mova [rsp+gprsize*2+16*20], m0                        ;t17
3746    mova [rsp+gprsize*2+16*21], m5                        ;t18a
3747    mova [rsp+gprsize*2+16*32], m4                        ;t29a
3748    mova [rsp+gprsize*2+16*33], m3                        ;t30
3749    mova                    m0, [rsp+gprsize*2+16*19]     ;t16
3750    mova                    m1, [rsp+gprsize*2+16*22]     ;t19
3751    mova                    m2, [rsp+gprsize*2+16*31]     ;t28
3752    mova                    m3, [rsp+gprsize*2+16*34]     ;t31
3753    psubsw                  m4, m0, m1                    ;t19a
3754    paddsw                  m0, m1                        ;t16a
3755    psubsw                  m5, m3, m2                    ;t28a
3756    paddsw                  m3, m2                        ;t31a
3757    ITX_MULSUB_2W            5, 4, 1, 2, 7, 1567, 3784    ;t19, t28
3758    mova                    m2, [rsp+gprsize*2+16*15]     ;tmp12
3759    psubsw                  m1, m5, m6                    ;t20a
3760    paddsw                  m5, m6                        ;t19a
3761    psubsw                  m6, m2, m5                    ;out19
3762    paddsw                  m2, m5                        ;out12
3763    mova                    m5, [rsp+gprsize*2+16*30]     ;t27
3764    mova [rsp+gprsize*2+16*22], m6                        ;out19
3765    mova [rsp+gprsize*2+16*15], m2                        ;out12
3766    psubsw                  m6, m4, m5                    ;t27a
3767    paddsw                  m4, m5                        ;t28a
3768    ITX_MULSUB_2W            6, 1, 2, 5, 7, 2896, 2896    ;t20, t27
3769    mova                    m2, [rsp+gprsize*2+16*6 ]     ;tmp3
3770    psubsw                  m5, m2, m4                    ;out28
3771    paddsw                  m2, m4                        ;out3
3772    mova                    m4, [rsp+gprsize*2+16*14]     ;tmp11
3773    mova [rsp+gprsize*2+16*31], m5                        ;out28
3774    mova [rsp+gprsize*2+16*6 ], m2                        ;out3
3775    psubsw                  m5, m4, m6                    ;out20
3776    paddsw                  m4, m6                        ;out11
3777    mova                    m2, [rsp+gprsize*2+16*7 ]     ;tmp4
3778    mova [rsp+gprsize*2+16*23], m5                        ;out20
3779    mova [rsp+gprsize*2+16*14], m4                        ;out11
3780    psubsw                  m5, m2, m1                    ;out27
3781    paddsw                  m2, m1                        ;out4
3782    mova                    m1, [rsp+gprsize*2+16*26]     ;t23a
3783    mova                    m4, [rsp+gprsize*2+16*27]     ;t24a
3784    mova [rsp+gprsize*2+16*30], m5                        ;out27
3785    mova [rsp+gprsize*2+16*7 ], m2                        ;out4
3786    psubsw                  m5, m0, m1                    ;t23
3787    paddsw                  m0, m1                        ;t16
3788    psubsw                  m2, m3, m4                    ;t24
3789    paddsw                  m3, m4                        ;t31
3790    ITX_MULSUB_2W            2, 5, 4, 6, 7, 2896, 2896    ;t23a, t24a
3791    mova                    m6, [rsp+gprsize*2+16*18]     ;tmp15
3792    psubsw                  m4, m6, m0                    ;out16
3793    paddsw                  m6, m0                        ;out15
3794    mova                    m0, [rsp+gprsize*2+16*3 ]     ;tmp0
3795    mova                    m1, [rsp+gprsize*2+16*11]     ;tmp8
3796    mova [rsp+gprsize*2+16*18], m6                        ;out15
3797    mova [rsp+gprsize*2+16*19], m4                        ;out16
3798    psubsw                  m6, m0, m3                    ;out31
3799    paddsw                  m0, m3                        ;out0
3800    psubsw                  m4, m1, m2                    ;out23
3801    paddsw                  m1, m2                        ;out8
3802    mova                    m3, [rsp+gprsize*2+16*10]     ;tmp7
3803    mova [rsp+gprsize*2+16*34], m6                        ;out31
3804    mova [rsp+gprsize*2+16*11], m1                        ;out8
3805    mova [rsp+gprsize*2+16*26], m4                        ;out23
3806    paddsw                  m6, m3, m5                    ;out7
3807    psubsw                  m3, m5                        ;out24
3808    mova                    m1, [rsp+gprsize*2+16*20]     ;t17
3809    mova                    m5, [rsp+gprsize*2+16*25]     ;t22
3810    mova                    m2, [rsp+gprsize*2+16*17]     ;tmp14
3811    mova [rsp+gprsize*2+16*27], m3                        ;out24
3812    psubsw                  m4, m1, m5                    ;t22a
3813    paddsw                  m1, m5                        ;t17a
3814    psubsw                  m3, m2, m1                    ;out17
3815    paddsw                  m2, m1                        ;out14
3816    mova                    m5, [rsp+gprsize*2+16*28]     ;t25
3817    mova                    m1, [rsp+gprsize*2+16*33]     ;t30
3818    mova [rsp+gprsize*2+16*17], m2                        ;out14
3819    mova [rsp+gprsize*2+16*20], m3                        ;out17
3820    psubsw                  m2, m1, m5                    ;t25a
3821    paddsw                  m1, m5                        ;t30a
3822    ITX_MULSUB_2W            2, 4, 3, 5, 7, 2896, 2896    ;t22, t25
3823    mova                    m5, [rsp+gprsize*2+16*4 ]     ;tmp1
3824    psubsw                  m3, m5, m1                    ;out30
3825    paddsw                  m5, m1                        ;out1
3826    mova                    m1, [rsp+gprsize*2+16*12]     ;tmp9
3827    mova [rsp+gprsize*2+16*33], m3                        ;out30
3828    mova [rsp+gprsize*2+16*4 ], m5                        ;out1
3829    psubsw                  m3, m1, m2                    ;out22
3830    paddsw                  m1, m2                        ;out9
3831    mova                    m5, [rsp+gprsize*2+16*9 ]     ;tmp6
3832    mova [rsp+gprsize*2+16*25], m3                        ;out22
3833    mova [rsp+gprsize*2+16*12], m1                        ;out9
3834    psubsw                  m3, m5, m4                    ;out25
3835    paddsw                  m5, m4                        ;out6
3836    mova                    m4, [rsp+gprsize*2+16*21]     ;t18a
3837    mova                    m1, [rsp+gprsize*2+16*24]     ;t21a
3838    mova                    m2, [rsp+gprsize*2+16*16]     ;tmp13
3839    mova [rsp+gprsize*2+16*28], m3                        ;out25
3840    mova [rsp+gprsize*2+16*9 ], m5                        ;out6
3841    paddsw                  m3, m4, m1                    ;t18
3842    psubsw                  m4, m1                        ;t21
3843    psubsw                  m5, m2, m3                    ;out18
3844    paddsw                  m2, m3                        ;out13
3845    mova                    m1, [rsp+gprsize*2+16*29]     ;t26a
3846    mova                    m3, [rsp+gprsize*2+16*32]     ;t29a
3847    mova [rsp+gprsize*2+16*21], m5                        ;out18
3848    mova [rsp+gprsize*2+16*16], m2                        ;out13
3849    psubsw                  m5, m3, m1                    ;t26
3850    paddsw                  m3, m1                        ;t29
3851    ITX_MULSUB_2W            5, 4, 1, 2, 7, 2896, 2896    ;t21a, t26a
3852    mova                    m2, [rsp+gprsize*2+16*5 ]     ;tmp2
3853    psubsw                  m1, m2, m3                    ;out29
3854    paddsw                  m2, m3                        ;out2
3855    mova                    m3, [rsp+gprsize*2+16*13]     ;tmp10
3856    mova [rsp+gprsize*2+16*32], m1                        ;out29
3857    psubsw                  m7, m3, m5                    ;out21
3858    paddsw                  m3, m5                        ;out10
3859    mova                    m5, [rsp+gprsize*2+16*8 ]     ;tmp5
3860    mova [rsp+gprsize*2+16*24], m7                        ;out21
3861    mova [rsp+gprsize*2+16*13], m3                        ;out10
3862    psubsw                  m1, m5, m4                    ;out26
3863    paddsw                  m5, m4                        ;out5
3864    mova                    m7, m6                        ;out7
3865    mova                    m3, [rsp+gprsize*2+16*6 ]     ;out3
3866    mova                    m4, [rsp+gprsize*2+16*7 ]     ;out4
3867    mova [rsp+gprsize*2+16*29], m1                        ;out26
3868    mova                    m6, [rsp+gprsize*2+16*9 ]     ;out6
3869    mova                    m1, [rsp+gprsize*2+16*4 ]     ;out1
3870    ret
3871
3872
; inv_txfm_add_dct_dct_32x8: entry point for the dst/stride/coeff/eob
; arguments named on the cglobal line. A zero eob takes the .dconly path,
; which adds one broadcast 16-bit value to every byte of 8 rows of 32 bytes
; in dst; any other eob is forwarded to idct_32x8_internal.
; .body and .end are also used from outside this function: the 32x16
; DC-only path jumps to .body with m0/m1/m2 and r3d (row count) already
; prepared and tx2q holding the label to continue at after the row loop.
3873cglobal inv_txfm_add_dct_dct_32x8, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
3874%if ARCH_X86_32
3875    LEA                     r5, $$
3876%endif
3877    test                  eobd, eobd
3878    jz .dconly
3879    call  m(idct_32x8_internal)
3880    RET
3881
    ; eob == 0 on this path. movd fills only the low dword of m1/m2, so only
    ; the low lanes of m0 carry a meaningful product; word 0 is what .body
    ; broadcasts later.
3882.dconly:
3883    movd                    m1, [o(pw_2896x8)]
3884    pmulhrsw                m0, m1, [coeffq]
3885    movd                    m2, [o(pw_8192)]
    ; eobd is zero here, so this store clears the first four bytes of coeff
3886    mov               [coeffq], eobd
    ; r3d becomes the row counter for .loop (eob is no longer needed)
3887    mov                    r3d, 8
3888    lea                   tx2q, [o(m(inv_txfm_add_dct_dct_32x8).end)]
3889
    ; Shared tail: expects m0 = value so far, m1 = pw_2896x8, m2 = first
    ; rounding factor, r3d = number of rows, tx2q = continuation address.
3890.body:
3891    pmulhrsw                m0, m2
3892    movd                    m2, [o(pw_2048)]  ;intentionally rip-relative
3893    pmulhrsw                m0, m1
3894    pmulhrsw                m0, m2
    ; replicate word 0 of m0 into all eight words
3895    pshuflw                 m0, m0, q0000
3896    punpcklwd               m0, m0
    ; m5 = 0, used to zero-extend dst bytes to words
3897    pxor                    m5, m5
3898
    ; One row per iteration: load 32 bytes of dst, widen to words, add m0,
    ; pack back to bytes with unsigned saturation, store, step by stride.
3899.loop:
3900    mova                    m1, [dstq+16*0]
3901    mova                    m3, [dstq+16*1]
3902    punpckhbw               m2, m1, m5
3903    punpcklbw               m1, m5
3904    punpckhbw               m4, m3, m5
3905    punpcklbw               m3, m5
3906    paddw                   m2, m0
3907    paddw                   m1, m0
3908    paddw                   m4, m0
3909    paddw                   m3, m0
3910    packuswb                m1, m2
3911    packuswb                m3, m4
3912    mova           [dstq+16*0], m1
3913    mova           [dstq+16*1], m3
3914    add                   dstq, strideq
3915    dec                    r3d
3916    jg .loop
    ; continue at whatever label the code that entered .body put in tx2q
3917    jmp                   tx2q
3918
3919.end:
3920    RET
3921
3922
; idct_32x8_internal: worker called from inv_txfm_add_dct_dct_32x8 when
; eob != 0. Declared with 0 args / 0 regs / 0 xmm, so it runs on the
; caller's registers and stack area.
; NOTE(review): LOAD_8ROWS / SAVE_8ROWS / SAVE_7ROWS and every m(...) target
; are defined outside this chunk. The notes below assume (base, stride)
; macro arguments, which is consistent with the inN annotations, and assume
; the jumped-to helpers finish with "jmp tx2q" - confirm against their
; definitions.
; Flow: inputs are staged into stack slots (units of 16 bytes above
; rsp+gprsize), the 8x8 / 16x8 / 8x32 .main helpers are called, and then
; .end .. .end8 form a continuation chain through tx2q that handles four
; groups of eight registers, moving dst forward by 8 bytes between groups.
3923cglobal idct_32x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
3924    %undef cmp
    ; rows at coeffq+16*0 with stride 64 -> idct_8x8 .main, kept in slots 3..
3925    LOAD_8ROWS     coeffq+16*0, 64
3926    call  m(idct_8x8_internal).main
3927    SAVE_7ROWS    rsp+gprsize+16*3, 16
3928
    ; rows at coeffq+16*2 with stride 64 -> idct_16x8 .main, kept in slots 11..18
3929    LOAD_8ROWS     coeffq+16*2, 64
3930    call m(idct_16x8_internal).main
    ; m7 is reloaded from slot 0 before all eight registers are saved
3931    mova                    m7, [rsp+gprsize+16*0]
3932    SAVE_8ROWS   rsp+gprsize+16*11, 16
3933
    ; odd inputs in1..in15 are scattered into the slot order used by
    ; idct_8x32_internal.main / .main_fast
3934    LOAD_8ROWS     coeffq+16*1, 32
3935    mova   [rsp+gprsize+16*19], m0                        ;in1
3936    mova   [rsp+gprsize+16*26], m1                        ;in3
3937    mova   [rsp+gprsize+16*23], m2                        ;in5
3938    mova   [rsp+gprsize+16*22], m3                        ;in7
3939    mova   [rsp+gprsize+16*21], m4                        ;in9
3940    mova   [rsp+gprsize+16*24], m5                        ;in11
3941    mova   [rsp+gprsize+16*25], m6                        ;in13
3942    mova   [rsp+gprsize+16*20], m7                        ;in15
3943
    ; eob <= 106: in17..in31 are not loaded and .main_fast is used instead
3944    cmp                   eobd, 106
3945    jg  .full
3946    call m(idct_8x32_internal).main_fast
3947    jmp .pass2
3948
3949.full:
3950    LOAD_8ROWS    coeffq+16*17, 32
3951    mova   [rsp+gprsize+16*33], m0                        ;in17
3952    mova   [rsp+gprsize+16*28], m1                        ;in19
3953    mova   [rsp+gprsize+16*29], m2                        ;in21
3954    mova   [rsp+gprsize+16*32], m3                        ;in23
3955    mova   [rsp+gprsize+16*31], m4                        ;in25
3956    mova   [rsp+gprsize+16*30], m5                        ;in27
3957    mova   [rsp+gprsize+16*27], m6                        ;in29
3958    mova   [rsp+gprsize+16*34], m7                        ;in31
3959    call m(idct_8x32_internal).main
3960
    ; park m7 in slot 0 and hand over to idct_8x32_internal.end1 with .end as
    ; the continuation (what .end1 does is not visible here)
3961.pass2:
3962    mova   [rsp+gprsize+16*0 ], m7
3963    lea                   tx2q, [o(m(idct_32x8_internal).end)]
3964    jmp  m(idct_8x32_internal).end1
3965
    ; group 1: registers as left by the previous step, m7 = pw_8192
3966.end:
3967    mova                    m7, [o(pw_8192)]
3968    lea                   tx2q, [o(m(idct_32x8_internal).end1)]
3969    jmp   m(idct_8x8_internal).pass1_end1
3970
    ; r3 remembers where the next group starts in dst (8 bytes further)
3971.end1:
3972    lea                     r3, [dstq+8]
3973    lea                   tx2q, [o(m(idct_32x8_internal).end2)]
3974    jmp   m(idct_8x8_internal).pass2_main
3975
    ; group 2: reload slots 11..18, park m7 in slot 0, m7 = pw_8192
3976.end2:
3977    LOAD_8ROWS   rsp+gprsize+16*11, 16
3978    mova   [rsp+gprsize+16*0 ], m7
3979    mova                    m7, [o(pw_8192)]
3980    lea                   tx2q, [o(m(idct_32x8_internal).end3)]
3981    jmp   m(idct_8x8_internal).pass1_end1
3982
3983.end3:
3984    mov                   dstq, r3
3985    add                     r3, 8
3986    lea                   tx2q, [o(m(idct_32x8_internal).end4)]
3987    jmp   m(idct_8x8_internal).pass2_main
3988
    ; group 3: slots 19..26
3989.end4:
3990    LOAD_8ROWS   rsp+gprsize+16*19, 16
3991    mova   [rsp+gprsize+16*0 ], m7
3992    mova                    m7, [o(pw_8192)]
3993    lea                   tx2q, [o(m(idct_32x8_internal).end5)]
3994    jmp   m(idct_8x8_internal).pass1_end1
3995
3996.end5:
3997    mov                   dstq, r3
3998    add                     r3, 8
3999    lea                   tx2q, [o(m(idct_32x8_internal).end6)]
4000    jmp   m(idct_8x8_internal).pass2_main
4001
    ; group 4: slots 27..34
4002.end6:
4003    LOAD_8ROWS   rsp+gprsize+16*27, 16
4004    mova   [rsp+gprsize+16*0 ], m7
4005    mova                    m7, [o(pw_8192)]
4006    lea                   tx2q, [o(m(idct_32x8_internal).end7)]
4007    jmp   m(idct_8x8_internal).pass1_end1
4008
    ; last group: r3 is not advanced any further
4009.end7:
4010    mov                   dstq, r3
4011    lea                   tx2q, [o(m(idct_32x8_internal).end8)]
4012    jmp   m(idct_8x8_internal).pass2_main
4013
    ; plain return; other functions in this file also point tx2q here when
    ; they only need a helper to come back to its caller
4014.end8:
4015    ret
4016
4017
; inv_txfm_add_identity_identity_8x32: loops over 16-byte column slices of
; coeff (eight rows per slice, 64 bytes apart). Per iteration it adds pw_5
; with signed saturation, calls idct_8x8_internal.pass1_end3, shifts right
; arithmetically by 3, calls idct_8x8_internal.end3, and then zeroes the
; eight coefficient rows it consumed.
; Iteration count: 2 when (eob - 107) is negative, otherwise 4.
; NOTE(review): the two idct_8x8_internal labels are outside this chunk;
; presumably they transpose and then add to dst, each finishing with
; "jmp tx2q" - confirm.
4018cglobal inv_txfm_add_identity_identity_8x32, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
    ; r5d is only a temporary for the constant 4 here; on x86-32 it is
    ; reloaded with $$ below
4019    mov                    r5d, 4
4020    mov                   tx2d, 2
4021    cmp                   eobd, 107
4022    cmovns                tx2d, r5d
    ; r3d (the eob register) becomes the loop counter
4023    mov                    r3d, tx2d
4024%if ARCH_X86_32
4025    LEA                     r5, $$
4026%endif
    ; idct_32x8_internal.end8 is a bare "ret", so a helper that ends by
    ; jumping through tx2q returns straight to the call site in this loop
4027    lea                   tx2q, [o(m(idct_32x8_internal).end8)]
4028.loop:
4029    LOAD_8ROWS     coeffq+16*0, 64
    ; m6 is biased first and parked in stack slot 1 so that the register can
    ; hold pw_5 for the remaining seven additions
4030    paddsw                  m6, [o(pw_5)]
4031    mova            [rsp+16*1], m6
4032    mova                    m6, [o(pw_5)]
4033    REPX        {paddsw x, m6}, m0, m1, m2, m3, m4, m5, m7
4034    call  m(idct_8x8_internal).pass1_end3
4035    REPX        {psraw  x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7
    ; m5..m7 are spilled to slots 2..0 before entering .end3
4036    mova            [rsp+16*2], m5
4037    mova            [rsp+16*1], m6
4038    mova            [rsp+16*0], m7
4039    call  m(idct_8x8_internal).end3
    ; NOTE(review): the extra two-row step suggests .end3 leaves dstq short
    ; of the next slice by two rows - confirm against its definition
4040    lea                   dstq, [dstq+strideq*2]
    ; clear the eight 16-byte coefficient rows that were just consumed
4041    pxor                    m7, m7
4042    REPX   {mova [coeffq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
4043    add                 coeffq, 16
4044    dec                    r3d
4045    jg .loop
4046    RET
4047
; inv_txfm_add_identity_identity_32x8: loops over blocks of eight
; consecutive 16-byte coefficient rows (128 bytes per iteration). Each
; iteration scales the rows with pmulhrsw by pw_4096, calls
; idct_8x8_internal.pass1_end3 and idct_8x8_internal.end3, and then moves
; dst 8 bytes to the right of where the iteration started.
; Iteration count: 2 when (eob - 107) is negative, otherwise 4.
; NOTE(review): the idct_8x8_internal labels are outside this chunk;
; presumably .end4 (installed in tx2q before the second call) also clears
; the consumed coefficients - confirm.
4048cglobal inv_txfm_add_identity_identity_32x8, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
    ; r5d is only a temporary for the constant 4 here; on x86-32 it is
    ; reloaded with $$ below
4049    mov                    r5d, 4
4050    mov                   tx2d, 2
4051    cmp                   eobd, 107
4052    cmovns                tx2d, r5d
    ; r3d (the eob register) becomes the loop counter
4053    mov                    r3d, tx2d
4054%if ARCH_X86_32
4055    LEA                     r5, $$
4056%endif
4057
4058.loop:
4059    LOAD_8ROWS     coeffq+16*0, 16
    ; m6 is scaled first and parked in stack slot 1 so that the register can
    ; hold pw_4096 for the remaining seven multiplies
4060    pmulhrsw                m6, [o(pw_4096)]
4061    mova            [rsp+16*1], m6
4062    mova                    m6, [o(pw_4096)]
4063    REPX      {pmulhrsw x, m6}, m0, m1, m2, m3, m4, m5, m7
    ; tx2q must be re-armed every iteration because it is overwritten below;
    ; idct_32x8_internal.end8 is a bare "ret"
4064    lea                   tx2q, [o(m(idct_32x8_internal).end8)]
4065    call  m(idct_8x8_internal).pass1_end3
4066
    ; remember the start of this strip; the helper below changes dstq
4067    mov             [rsp+16*3], dstq
4068    mova            [rsp+16*2], m5
4069    mova            [rsp+16*1], m6
4070    mova            [rsp+16*0], m7
4071    lea                   tx2q, [o(m(idct_8x8_internal).end4)]
4072    call  m(idct_8x8_internal).end3
4073
4074    add                 coeffq, 16*8
4075    mov                   dstq, [rsp+16*3]
4076    lea                   dstq, [dstq+8]
    ; The counter is the only loop condition. dec does not write CF, so a
    ; carry-based branch here would test the stale carry of the add above
    ; (clear for any valid pointer) and re-enter the loop forever.
4077    dec                    r3d
4078    jg .loop
4080    RET
4081
4082
; inv_txfm_add_dct_dct_16x32: entry point for the dst/stride/coeff/eob
; arguments named on the cglobal line. A zero eob takes the DC-only path,
; which prepares m0/m1/m2 and a count in r2d and then reuses the .dconly
; code of inv_txfm_add_dct_dct_16x4; any other eob is forwarded to
; idct_16x32_internal.
; NOTE(review): inv_txfm_add_dct_dct_16x4.dconly is outside this chunk;
; the register contract (m0, m1, m2, r2d, tx2q) is inferred from what is
; set up here - confirm against its definition.
4083cglobal inv_txfm_add_dct_dct_16x32, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
4084%if ARCH_X86_32
4085    LEA                     r5, $$
4086%endif
4087    test                  eobd, eobd
4088    jz .dconly
4089    call  m(idct_16x32_internal)
4090    RET
4091
    ; eob == 0 on this path; movd fills only the low dword of m1/m2
4092.dconly:
4093    movd                    m1, [o(pw_2896x8)]
4094    pmulhrsw                m0, m1, [coeffq]
4095    movd                    m2, [o(pw_16384)]
    ; eobd is zero here, so this store clears the first four bytes of coeff
4096    mov               [coeffq], eobd
    ; second multiply by pw_2896x8 (the 32x8 DC path has no counterpart
    ; before its jump-in point)
4097    pmulhrsw                m0, m1
    ; r2 is the coeff register; it is not needed after the store above
4098    mov                    r2d, 16
4099    lea                   tx2q, [o(m(inv_txfm_add_dct_dct_16x32).end)]
4100    jmp m(inv_txfm_add_dct_dct_16x4).dconly
4101
4102.end:
4103    RET
4104
; idct_16x32_internal: worker called from inv_txfm_add_dct_dct_16x32 when
; eob != 0. Declared with 0 args / 0 regs / 0 xmm, so it runs on the
; caller's registers and stack area.
; NOTE(review): LOAD_8ROWS / SAVE_8ROWS / SAVE_7ROWS and every m(...) target
; are defined outside this chunk. The notes below assume (base, stride[,
; flag]) macro arguments and helpers that finish with "jmp tx2q" (or
; "jmp r3" for idct_8x32_internal.end) - confirm against their definitions.
;
; Outline as far as this block shows it:
;   first stage  - for each 16-byte slice k of the coefficient rows
;                  (k = 1, 0 and, when eob > 150, 2, 3) run idct_8x8 .main
;                  on coeffq+16*k and idct_16x8 .main on coeffq+16*(k+4),
;                  both with stride 128. One group of eight results is
;                  saved to coeffq+16*(32+k), stride 64, for the second
;                  half; the other group is scattered into stack slots
;                  (and a few already-consumed coeff slots) as the inN
;                  inputs of the first half.
;   first half   - idct_8x8 / idct_16x8 / idct_8x32 .main (or .main_fast
;                  when eob <= 150), then .pass2 hands over to
;                  idct_8x32_internal.end with .end as continuation in r3.
;   second half  - .end reloads the inputs saved at coeffq+16*32.. and
;                  repeats the same three helpers, then jumps to
;                  idct_8x32_internal.pass2.
4105cglobal idct_16x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
4106    %undef cmp
4107
    ; slice 1
4108    LOAD_8ROWS     coeffq+16*1, 128, 1
4109    call  m(idct_8x8_internal).main
4110    SAVE_7ROWS    rsp+gprsize+16*3, 16
4111    LOAD_8ROWS     coeffq+16*5, 128, 1
4112    call m(idct_16x8_internal).main
4113    lea                   tx2q, [o(m(idct_16x32_internal).pass1_end)]
4114    jmp   m(idct_8x8_internal).pass1_end
4115
4116.pass1_end:
4117    SAVE_8ROWS    coeffq+16*33, 64               ;in8~in15
    ; bring back the group kept in slots 3.. and run it through the same
    ; idct_8x8 continuation
4118    LOAD_8ROWS    rsp+gprsize+16*3, 16
4119    mova    [rsp+gprsize+16*0], m7
4120    lea                   tx2q, [o(m(idct_16x32_internal).pass1_end1)]
4121    jmp   m(idct_8x8_internal).pass1_end
4122
4123.pass1_end1:
    ; coeffq+16*1 and +16*5 were already consumed above and are reused as
    ; scratch for in8 / in12
4124    mova        [coeffq+16*1 ], m0                        ;in8
4125    mova        [coeffq+16*5 ], m4                        ;in12
4126    mova   [rsp+gprsize+16*13], m2                        ;in10
4127    mova   [rsp+gprsize+16*14], m6                        ;in14
4128    mova   [rsp+gprsize+16*21], m1                        ;in9
4129    mova   [rsp+gprsize+16*24], m3                        ;in11
4130    mova   [rsp+gprsize+16*25], m5                        ;in13
4131    mova   [rsp+gprsize+16*20], m7                        ;in15
    ; slice 0
4132    LOAD_8ROWS     coeffq+16*0, 128, 1
4133    call  m(idct_8x8_internal).main
4134    SAVE_7ROWS    rsp+gprsize+16*3, 16
4135    LOAD_8ROWS     coeffq+16*4, 128, 1
4136    call m(idct_16x8_internal).main
4137    lea                   tx2q, [o(m(idct_16x32_internal).pass1_end2)]
4138    jmp   m(idct_8x8_internal).pass1_end
4139
4140.pass1_end2:
4141    SAVE_8ROWS    coeffq+16*32, 64               ;in0~in7
4142    LOAD_8ROWS    rsp+gprsize+16*3, 16
4143    mova    [rsp+gprsize+16*0], m7
4144    lea                   tx2q, [o(m(idct_16x32_internal).pass1_end3)]
4145    jmp   m(idct_8x8_internal).pass1_end
4146
4147.pass1_end3:
    ; m0 (in0) and m4 (in4) stay in registers for the code below
4148    mova   [rsp+gprsize+16*11], m2                        ;in2
4149    mova   [rsp+gprsize+16*12], m6                        ;in6
4150    mova   [rsp+gprsize+16*19], m1                        ;in1
4151    mova   [rsp+gprsize+16*26], m3                        ;in3
4152    mova   [rsp+gprsize+16*23], m5                        ;in5
4153    mova   [rsp+gprsize+16*22], m7                        ;in7
4154
    ; eob <= 150: slices 2 and 3 are not processed; the upper inputs of both
    ; helper calls are passed as zero and .main_fast is used
4155    cmp                   eobd, 150
4156    jg .full
4157
4158    mova                    m1, m4                        ;in4
4159    mova                    m2, [coeffq+16*1 ]            ;in8
4160    mova                    m3, [coeffq+16*5 ]            ;in12
4161    pxor                    m4, m4
4162    REPX          {mova x, m4}, m5, m6, m7
4163    call  m(idct_8x8_internal).main
4164    SAVE_7ROWS    rsp+gprsize+16*3, 16
4165    mova                    m0, [rsp+gprsize+16*11]       ;in2
4166    mova                    m1, [rsp+gprsize+16*12]       ;in6
4167    mova                    m2, [rsp+gprsize+16*13]       ;in10
4168    mova                    m3, [rsp+gprsize+16*14]       ;in14
4169    pxor                    m4, m4
4170    REPX          {mova x, m4}, m5, m6, m7
4171    call m(idct_16x8_internal).main
4172    mova                    m7, [rsp+gprsize+16*0]
4173    SAVE_8ROWS   rsp+gprsize+16*11, 16
4174
4175    call m(idct_8x32_internal).main_fast
4176    jmp  .pass2
4177
4178.full:
    ; park in0 / in4 in coeff slots that were already consumed
4179    mova        [coeffq+16*0 ], m0                        ;in0
4180    mova        [coeffq+16*4 ], m4                        ;in4
4181
    ; slice 2
4182    LOAD_8ROWS     coeffq+16*2, 128, 1
4183    call  m(idct_8x8_internal).main
4184    SAVE_7ROWS    rsp+gprsize+16*3, 16
4185    LOAD_8ROWS     coeffq+16*6, 128, 1
4186    call m(idct_16x8_internal).main
4187    lea                   tx2q, [o(m(idct_16x32_internal).pass1_end4)]
4188    jmp   m(idct_8x8_internal).pass1_end
4189
4190.pass1_end4:
4191    SAVE_8ROWS    coeffq+16*34, 64               ;in16~in23
4192    LOAD_8ROWS    rsp+gprsize+16*3, 16
4193    mova    [rsp+gprsize+16*0], m7
4194    lea                   tx2q, [o(m(idct_16x32_internal).pass1_end5)]
4195    jmp   m(idct_8x8_internal).pass1_end
4196
4197.pass1_end5:
4198    mova        [coeffq+16*2 ], m0                        ;in16
4199    mova        [coeffq+16*6 ], m4                        ;in20
4200    mova   [rsp+gprsize+16*15], m2                        ;in18
4201    mova   [rsp+gprsize+16*16], m6                        ;in22
4202    mova   [rsp+gprsize+16*33], m1                        ;in17
4203    mova   [rsp+gprsize+16*28], m3                        ;in19
4204    mova   [rsp+gprsize+16*29], m5                        ;in21
4205    mova   [rsp+gprsize+16*32], m7                        ;in23
4206
    ; slice 3
4207    LOAD_8ROWS     coeffq+16*3, 128, 1
4208    call  m(idct_8x8_internal).main
4209    SAVE_7ROWS    rsp+gprsize+16*3, 16
4210    LOAD_8ROWS     coeffq+16*7, 128, 1
4211    call m(idct_16x8_internal).main
4212    lea                   tx2q, [o(m(idct_16x32_internal).pass1_end6)]
4213    jmp   m(idct_8x8_internal).pass1_end
4214
4215.pass1_end6:
4216    SAVE_8ROWS    coeffq+16*35, 64                        ;in24~in31
4217    LOAD_8ROWS    rsp+gprsize+16*3, 16
4218    mova    [rsp+gprsize+16*0], m7
4219    lea                   tx2q, [o(m(idct_16x32_internal).pass1_end7)]
4220    jmp   m(idct_8x8_internal).pass1_end
4221
4222.pass1_end7:
4223    mova   [rsp+gprsize+16*17], m2                        ;in26
4224    mova   [rsp+gprsize+16*18], m6                        ;in30
4225    mova   [rsp+gprsize+16*31], m1                        ;in25
4226    mova   [rsp+gprsize+16*30], m3                        ;in27
4227    mova   [rsp+gprsize+16*27], m5                        ;in29
4228    mova   [rsp+gprsize+16*34], m7                        ;in31
4229
    ; assemble in0,in4,...,in28 in m0..m7 for idct_8x8 .main; m0/m4 must be
    ; copied out before they are overwritten
4230    mova                    m6, m0                        ;in24
4231    mova                    m7, m4                        ;in28
4232    mova                    m0, [coeffq+16*0 ]            ;in0
4233    mova                    m1, [coeffq+16*4 ]            ;in4
4234    mova                    m2, [coeffq+16*1 ]            ;in8
4235    mova                    m3, [coeffq+16*5 ]            ;in12
4236    mova                    m4, [coeffq+16*2 ]            ;in16
4237    mova                    m5, [coeffq+16*6 ]            ;in20
4238    call  m(idct_8x8_internal).main
4239    SAVE_7ROWS   rsp+gprsize+16*3 , 16
    ; slots 11..18 hold in2,in6,...,in30 (stored above)
4240    LOAD_8ROWS   rsp+gprsize+16*11, 16
4241    call m(idct_16x8_internal).main
4242    mova                    m7, [rsp+gprsize+16*0]
4243    SAVE_8ROWS   rsp+gprsize+16*11, 16
4244
4245    call m(idct_8x32_internal).main
4246
    ; keep eob and the dst position of the second half (dst+8) in stack
    ; slot 35, then let idct_8x32_internal.end take over; r3 holds the label
    ; to come back to
4247.pass2:
4248    mov  [rsp+gprsize*1+16*35], eobd
4249    lea                     r3, [dstq+8]
4250    mov  [rsp+gprsize*2+16*35], r3
4251    lea                     r3, [o(m(idct_16x32_internal).end)]
4252    jmp  m(idct_8x32_internal).end
4253
    ; second half: restore dst/eob and move coeffq to the area written by
    ; the SAVE_8ROWS coeffq+16*32.. calls above
4254.end:
4255    mov                   dstq, [rsp+gprsize*2+16*35]
4256    mov                   eobd, [rsp+gprsize*1+16*35]
4257    add                 coeffq, 16*32
4258
4259    mova                    m0, [coeffq+16*4 ]            ;in1
4260    mova                    m1, [coeffq+16*12]            ;in3
4261    mova                    m2, [coeffq+16*20]            ;in5
4262    mova                    m3, [coeffq+16*28]            ;in7
4263    mova                    m4, [coeffq+16*5 ]            ;in9
4264    mova                    m5, [coeffq+16*13]            ;in11
4265    mova                    m6, [coeffq+16*21]            ;in13
4266    mova                    m7, [coeffq+16*29]            ;in15
4267
4268    mova   [rsp+gprsize+16*19], m0                        ;in1
4269    mova   [rsp+gprsize+16*26], m1                        ;in3
4270    mova   [rsp+gprsize+16*23], m2                        ;in5
4271    mova   [rsp+gprsize+16*22], m3                        ;in7
4272    mova   [rsp+gprsize+16*21], m4                        ;in9
4273    mova   [rsp+gprsize+16*24], m5                        ;in11
4274    mova   [rsp+gprsize+16*25], m6                        ;in13
4275    mova   [rsp+gprsize+16*20], m7                        ;in15
4276
4277    mova                    m0, [coeffq+16*0 ]            ;in0
4278    mova                    m1, [coeffq+16*16]            ;in4
4279    mova                    m2, [coeffq+16*1 ]            ;in8
4280    mova                    m3, [coeffq+16*17]            ;in12
4281
    ; same eob threshold as in the first half
4282    cmp                   eobd, 150
4283    jg .full1
4284
4285    pxor                    m4, m4
4286    REPX          {mova x, m4}, m5, m6, m7
4287    call  m(idct_8x8_internal).main
4288    SAVE_7ROWS    rsp+gprsize+16*3, 16
4289
4290    mova                    m0, [coeffq+16*8 ]            ;in2
4291    mova                    m1, [coeffq+16*24]            ;in6
4292    mova                    m2, [coeffq+16*9 ]            ;in10
4293    mova                    m3, [coeffq+16*25]            ;in14
4294    pxor                    m4, m4
4295    REPX          {mova x, m4}, m5, m6, m7
4296    call m(idct_16x8_internal).main
4297    mova                    m7, [rsp+gprsize+16*0]
4298    SAVE_8ROWS   rsp+gprsize+16*11, 16
4299
4300    call m(idct_8x32_internal).main_fast
4301    jmp  .end1
4302
4303.full1:
4304    mova                    m4, [coeffq+16*2 ]            ;in16
4305    mova                    m5, [coeffq+16*18]            ;in20
4306    mova                    m6, [coeffq+16*3 ]            ;in24
4307    mova                    m7, [coeffq+16*19]            ;in28
4308    call  m(idct_8x8_internal).main
4309    SAVE_7ROWS    rsp+gprsize+16*3, 16
4310
4311    mova                    m0, [coeffq+16*8 ]            ;in2
4312    mova                    m1, [coeffq+16*24]            ;in6
4313    mova                    m2, [coeffq+16*9 ]            ;in10
4314    mova                    m3, [coeffq+16*25]            ;in14
4315    mova                    m4, [coeffq+16*10]            ;in18
4316    mova                    m5, [coeffq+16*26]            ;in22
4317    mova                    m6, [coeffq+16*11]            ;in26
4318    mova                    m7, [coeffq+16*27]            ;in30
4319    call m(idct_16x8_internal).main
4320    mova                    m7, [rsp+gprsize+16*0]
4321    SAVE_8ROWS   rsp+gprsize+16*11, 16
4322
4323    mova                    m0, [coeffq+16*6 ]            ;in17
4324    mova                    m1, [coeffq+16*14]            ;in19
4325    mova                    m2, [coeffq+16*22]            ;in21
4326    mova                    m3, [coeffq+16*30]            ;in23
4327    mova                    m4, [coeffq+16*7 ]            ;in25
4328    mova                    m5, [coeffq+16*15]            ;in27
4329    mova                    m6, [coeffq+16*23]            ;in29
4330    mova                    m7, [coeffq+16*31]            ;in31
4331
4332    mova   [rsp+gprsize+16*33], m0                        ;in17
4333    mova   [rsp+gprsize+16*28], m1                        ;in19
4334    mova   [rsp+gprsize+16*29], m2                        ;in21
4335    mova   [rsp+gprsize+16*32], m3                        ;in23
4336    mova   [rsp+gprsize+16*31], m4                        ;in25
4337    mova   [rsp+gprsize+16*30], m5                        ;in27
4338    mova   [rsp+gprsize+16*27], m6                        ;in29
4339    mova   [rsp+gprsize+16*34], m7                        ;in31
4340
4341    call m(idct_8x32_internal).main
4342
    ; the rest of the second half is handled entirely by idct_8x32_internal
4343.end1:
4344    jmp m(idct_8x32_internal).pass2
4345
4346
4347
; inv_txfm_add_dct_dct_32x16: entry point for the dst/stride/coeff/eob
; arguments named on the cglobal line.
; eob == 0: DC-only path that prepares m0/m1/m2, sets a row count of 16 and
;           reuses the .body row loop of inv_txfm_add_dct_dct_32x8.
; otherwise: idct_32x16_internal is called once, then idct_8x16_internal
;           .pass2 is invoked four times. Before each of the last three
;           invocations coeffq moves on by 16*16 bytes, dst is set to r3+8,
;           and eight registers are reloaded from stack slots 11.., 19..,
;           27.. and sent through idct_8x8_internal.pass1_end.
; NOTE(review): idct_8x16_internal.pass2 and idct_8x8_internal.pass1_end are
; outside this chunk; r3 is presumably left by .pass2 at the start of the
; strip it just wrote - confirm.
4348cglobal inv_txfm_add_dct_dct_32x16, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
4349%if ARCH_X86_32
4350    LEA                     r5, $$
4351%endif
4352    test                  eobd, eobd
4353    jz .dconly
4354
4355    call m(idct_32x16_internal)
4356    call m(idct_8x16_internal).pass2
4357
    ; second group: stack slots 11..18
4358    add                 coeffq, 16*16
4359    lea                   dstq, [r3+8]
4360    LOAD_8ROWS       rsp+16*11, 16
4361    mova            [rsp+16*0], m7
    ; idct_32x16_internal.end is a bare "ret", so pass1_end comes back here
4362    lea                   tx2q, [o(m(idct_32x16_internal).end)]
4363    call  m(idct_8x8_internal).pass1_end
4364    call m(idct_8x16_internal).pass2
4365
    ; third group: stack slots 19..26
4366    add                 coeffq, 16*16
4367    lea                   dstq, [r3+8]
4368    LOAD_8ROWS       rsp+16*19, 16
4369    mova            [rsp+16*0], m7
4370    lea                   tx2q, [o(m(idct_32x16_internal).end)]
4371    call  m(idct_8x8_internal).pass1_end
4372    call m(idct_8x16_internal).pass2
4373
    ; fourth group: stack slots 27..34
4374    add                 coeffq, 16*16
4375    lea                   dstq, [r3+8]
4376    LOAD_8ROWS       rsp+16*27, 16
4377    mova            [rsp+16*0], m7
4378    lea                   tx2q, [o(m(idct_32x16_internal).end)]
4379    call  m(idct_8x8_internal).pass1_end
4380    call m(idct_8x16_internal).pass2
4381    RET
4382
    ; eob == 0 on this path; movd fills only the low dword of m1/m2
4383.dconly:
4384    movd                    m1, [o(pw_2896x8)]
4385    pmulhrsw                m0, m1, [coeffq]
4386    movd                    m2, [o(pw_16384)]
    ; eobd is zero here, so this store clears the first four bytes of coeff
4387    mov               [coeffq], eobd
    ; extra multiply by pw_2896x8 compared with the 32x8 DC path
4388    pmulhrsw                m0, m1
    ; 16 rows for the shared row loop
4389    mov                    r3d, 16
    ; the 32x8 .end label is a plain RET and serves as the continuation
4390    lea                   tx2q, [o(m(inv_txfm_add_dct_dct_32x8).end)]
4391    jmp m(inv_txfm_add_dct_dct_32x8).body
4392
4393
; idct_32x16_internal: worker called from inv_txfm_add_dct_dct_32x16 when
; eob != 0. Declared with 0 args / 0 regs / 0 xmm, so it runs on the
; caller's registers and stack area.
; The .pass1 body is executed twice:
;   1st run - coeffq advanced by 16, r3 = .pass1_end1: the four groups of
;             eight results are sent through idct_8x8_internal.pass1_end and
;             written back to coeff at +16*0, +16*16, +16*32 and +16*48
;             (stride 32).
;   2nd run - coeffq restored, r3 = .end: only the first group goes through
;             pass1_end, which then lands on the bare "ret" at .end and so
;             returns to the caller. The other groups stay in stack slots
;             11.., 19.. and 27.., where the caller picks them up.
; NOTE(review): LOAD_8ROWS / SAVE_8ROWS / SAVE_7ROWS and the m(...) targets
; are defined outside this chunk; helpers are assumed to end with
; "jmp tx2q" - confirm.
4394cglobal idct_32x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
4395    %undef cmp
4396
4397    add                 coeffq, 16
4398    lea                     r3, [o(m(idct_32x16_internal).pass1_end1)]
4399.pass1:
    ; rows at +16*0, stride 128 -> idct_8x8 .main, kept in slots 3..
4400    LOAD_8ROWS     coeffq+16*0, 128, 1
4401    call  m(idct_8x8_internal).main
4402    SAVE_7ROWS    rsp+gprsize+16*3, 16
4403
    ; rows at +16*4, stride 128 -> idct_16x8 .main, kept in slots 11..18
4404    LOAD_8ROWS     coeffq+16*4, 128, 1
4405    call m(idct_16x8_internal).main
4406    mova                    m7, [rsp+gprsize+16*0]
4407    SAVE_8ROWS   rsp+gprsize+16*11, 16
4408
    ; odd inputs, scattered into the slot order used by idct_8x32 .main
4409    LOAD_8ROWS     coeffq+16*2, 64, 1
4410    mova   [rsp+gprsize+16*19], m0                        ;in1
4411    mova   [rsp+gprsize+16*26], m1                        ;in3
4412    mova   [rsp+gprsize+16*23], m2                        ;in5
4413    mova   [rsp+gprsize+16*22], m3                        ;in7
4414    mova   [rsp+gprsize+16*21], m4                        ;in9
4415    mova   [rsp+gprsize+16*24], m5                        ;in11
4416    mova   [rsp+gprsize+16*25], m6                        ;in13
4417    mova   [rsp+gprsize+16*20], m7                        ;in15
4418
4419    LOAD_8ROWS    coeffq+16*34, 64, 1
4420    mova   [rsp+gprsize+16*33], m0                        ;in17
4421    mova   [rsp+gprsize+16*28], m1                        ;in19
4422    mova   [rsp+gprsize+16*29], m2                        ;in21
4423    mova   [rsp+gprsize+16*32], m3                        ;in23
4424    mova   [rsp+gprsize+16*31], m4                        ;in25
4425    mova   [rsp+gprsize+16*30], m5                        ;in27
4426    mova   [rsp+gprsize+16*27], m6                        ;in29
4427    mova   [rsp+gprsize+16*34], m7                        ;in31
    ; unlike the 32x8 / 16x32 workers there is no eob-based fast path here
4428    call m(idct_8x32_internal).main
4429
    ; r3 selects where the first group continues: .pass1_end1 on the first
    ; run, .end on the second
4430.pass1_end:
4431    mova   [rsp+gprsize+16*0 ], m7
4432    mov                   tx2q, r3
4433    jmp   m(idct_8x8_internal).pass1_end
4434
4435.pass1_end1:
4436    SAVE_8ROWS     coeffq+16*0, 32
4437    LOAD_8ROWS   rsp+gprsize+16*11, 16
4438    mova   [rsp+gprsize+16*0 ], m7
4439    lea                   tx2q, [o(m(idct_32x16_internal).pass1_end2)]
4440    jmp   m(idct_8x8_internal).pass1_end
4441
4442.pass1_end2:
4443    SAVE_8ROWS    coeffq+16*16, 32
4444    LOAD_8ROWS   rsp+gprsize+16*19, 16
4445    mova   [rsp+gprsize+16*0 ], m7
4446    lea                   tx2q, [o(m(idct_32x16_internal).pass1_end3)]
4447    jmp   m(idct_8x8_internal).pass1_end
4448
4449.pass1_end3:
4450    SAVE_8ROWS    coeffq+16*32, 32
4451    LOAD_8ROWS   rsp+gprsize+16*27, 16
4452    mova   [rsp+gprsize+16*0 ], m7
4453    lea                   tx2q, [o(m(idct_32x16_internal).pass1_end4)]
4454    jmp   m(idct_8x8_internal).pass1_end
4455
4456.pass1_end4:
4457    SAVE_8ROWS    coeffq+16*48, 32
4458
    ; undo the +16 from the top and run .pass1 once more, this time ending
    ; at .end
4459    sub                 coeffq, 16
4460    lea                     r3, [o(m(idct_32x16_internal).end)]
4461    jmp .pass1
4462
4463.end:
4464    ret
4465
4466
; ---------------------------------------------------------------------------
; inv_txfm_add_identity_identity_16x32(dst, stride, coeff, eob)
;
; Identity/identity inverse transform entry point for 16x32 blocks.
; cglobal parameters: 4 arguments, 6 GPRs, 8 XMM registers, 16*4 bytes of
; stack.
;
; The coefficient buffer is walked in two passes. The second pass starts
; 64*8 bytes after the first and uses dst+8 as its destination pointer. Each
; pass runs
;     r3d = 4 - (eob < 43) - (eob < 150) - (eob < 278)
; iterations; one iteration loads 8 vectors spaced 64 bytes apart, clears
; them in the coefficient buffer, and advances coeffq by 16 bytes.
;
; Stack slots:
;   [rsp+16*0 .. rsp+16*2]   XMM spill space shared with the helpers
;   [rsp+16*3]               dst+8 (destination of the second pass)
;   [rsp+gprsize+16*3]       iterations per pass (zeroed after it is reloaded)
;   [rsp+gprsize*2+16*3]     coeff pointer at entry
;
; NOTE(review): the actual pixel store happens inside
; idct_8x8_internal.end3, which is not part of this chunk.
; ---------------------------------------------------------------------------
4467cglobal inv_txfm_add_identity_identity_16x32, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
4468    %undef cmp
4469
    ; eob is the 4th named argument and therefore lives in r3, which the
    ; sbb sequence below overwrites; keep a copy in r4d for the later compares.
4470    mov                    r4d, eobd
4471    cmp                   eobd, 43                ;CF = (eob < 43)
4472    sbb                    r3d, r3d               ;r3d  = -(eob < 43)
4473    cmp                    r4d, 150               ;CF = (eob < 150)
4474    sbb                    r3d, 0                 ;r3d -= (eob < 150)
4475    cmp                    r4d, 278               ;CF = (eob < 278)
4476    sbb                    r3d, -4                ;r3d += 4 - (eob < 278)  -> 1..4
4477
4478%if ARCH_X86_32
4479    LEA                     r5, $$                ;base register for the o() constant addressing
4480%endif
4481    lea                     r4, [dstq+8]
4482    mov             [rsp+16*3], r4                ;destination of the second pass
4483    mov     [rsp+gprsize+16*3], r3d               ;iterations per pass
4484    mov   [rsp+gprsize*2+16*3], coeffq            ;coeff base
4485
4486.loop:
4487    LOAD_8ROWS          coeffq, 64, 1
4488    mova            [rsp+16*1], m6                ;spill m6 so it can be used as the zero register
4489    pxor                    m6, m6
4490    REPX   {mova [coeffq+64*x], m6}, 0,  1,  2,  3,  4,  5,  6,  7  ;clear the coefficients just read
    ; tx2q points at idct_32x16_internal.end, which is a bare `ret`.
    ; NOTE(review): presumably pass1_end3 finishes with `jmp tx2q`, so this
    ; makes the helper return straight to the call site -- confirm.
4491    lea                   tx2q, [o(m(idct_32x16_internal).end)]
4492    call  m(idct_8x8_internal).pass1_end3
    ; Only 8 XMM registers are available, so m2/m3/m4 are parked on the stack
    ; and reused as IDTX16 scratch/constant registers for the first five rows.
4493    mova            [rsp+16*0], m2
4494    mova            [rsp+16*1], m3
4495    mova            [rsp+16*2], m4
4496    mova                    m3, [o(pw_1697x16)]
4497    mova                    m4, [o(pw_16384)]
4498    REPX   {IDTX16 x, 2, 3, 4}, 5, 6, 7, 0, 1
4499    mova                    m2, [o(pw_8192)]
4500    REPX      {pmulhrsw x, m2}, m5, m6, m7, m0, m1
    ; Process the three parked rows, swapping finished rows (m7, m5, m6) into
    ; the stack slots as the parked ones are brought back.
4501    mova                    m2, [rsp+16*0]
4502    mova            [rsp+16*0], m7
4503    IDTX16                   2, 7, 3, 4
4504    mova                    m7, [rsp+16*2]
4505    mova            [rsp+16*2], m5
4506    IDTX16                   7, 5, 3, 4
4507    mova                    m5, [rsp+16*1]
4508    mova            [rsp+16*1], m6
    ; Last row is handled inline: m3 (the pw_1697x16 constant) is consumed
    ; as the destination, so no further scratch register is needed.
4509    pmulhrsw                m3, m5
4510    pmulhrsw                m3, m4
4511    psrlw                   m4, 1 ; pw_8192
4512    paddsw                  m3, m5
4513    pmulhrsw                m2, m4
4514    pmulhrsw                m3, m4
4515    pmulhrsw                m4, m7
4516    call  m(idct_8x8_internal).end3
4517    lea                   dstq, [dstq+strideq*2]
4518    add                 coeffq, 16
4519    dec                    r3d
4520    jg .loop
    ; End of a pass: rewind to the coeff base, step 64*8 bytes forward and
    ; switch to the dst+8 pointer saved above. The stored iteration count is
    ; replaced by zero (dstq is used as a scratch zero before being reloaded),
    ; so after the second pass r3d reloads as 0 and execution reaches RET.
4521    mov                 coeffq, [rsp+gprsize*2+16*3]
4522    add                 coeffq, 64*8
4523    mov                    r3d, [rsp+gprsize+16*3]
4524    xor                   dstq, dstq
4525    mov     [rsp+gprsize+16*3], dstq
4526    mov                   dstq, [rsp+16*3]
4527    test                   r3d, r3d
4528    jnz .loop
4529    RET
4530
4531
; ---------------------------------------------------------------------------
; inv_txfm_add_identity_identity_32x16(dst, stride, coeff, eob)
;
; Identity/identity inverse transform entry point for 32x16 blocks.
; cglobal parameters: 4 arguments, 6 GPRs, 8 XMM registers, 16*4 bytes of
; stack.
;
; Loop control is a bitmask in r3d that is shifted right by 2 after every
; iteration (see .loop_end):
;   - mask becomes 0          -> done
;   - bit 1 of the mask set   -> next iteration continues with coeffq+16
;   - bit 1 clear             -> move to the next dst column (saved dst+8) and
;                                skip coeffq forward by a further 32*7 bytes
; Masks selected from eob:
;   eob <  44          12    (1100b)                  -> 2 iterations
;   44 <= eob < 151    136   (1000 1000b)             -> 4 iterations
;   eob >= 151         34952 (1000 1000 1000 1000b)   -> 8 iterations
;
; Stack slots:
;   [rsp+16*0 .. rsp+16*2]   XMM spill space shared with the helpers
;   [rsp+16*3]               dst pointer for the next column (dst+8, +16, ...)
; ---------------------------------------------------------------------------
4532cglobal inv_txfm_add_identity_identity_32x16, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
4533    %undef cmp
4534
4535    mov                    r4d, 12                ;1100b
4536    mov                    r5d, 136               ;1000 1000b
4537    cmp                   eobd, 44                ;if (eob > 43)
4538    cmovns                 r4d, r5d               ;  iteration_count+2
4539    cmp                   eobd, 151               ;if (eob > 150)
4540    mov                    r3d, 34952             ;1000 1000 1000 1000b (mov leaves the cmp flags intact)
4541    cmovs                  r3d, r4d               ;  iteration_count += 4
4542
4543%if ARCH_X86_32
4544    LEA                     r5, $$                ;base register for the o() constant addressing (r5d mask no longer needed)
4545%endif
4546    lea                     r4, [dstq+8]
4547    mov             [rsp+16*3], r4                ;dst for the next column
4548
4549.loop:
4550    LOAD_8ROWS          coeffq, 32, 1
4551    REPX         {paddsw x, x}, m0, m1, m2, m3, m4, m5, m6, m7  ;double every input (saturating)
4552    mova            [rsp+16*1], m6
    ; tx2q points at idct_32x16_internal.end, which is a bare `ret`.
    ; NOTE(review): presumably pass1_end3 finishes with `jmp tx2q`, so this
    ; makes the helper return straight to the call site -- confirm.
4553    lea                   tx2q, [o(m(idct_32x16_internal).end)]
4554    call  m(idct_8x8_internal).pass1_end3
    ; m5/m6 are parked on the stack so they can serve as the IDTX16 scratch
    ; register and the pw_1697x16 constant for the other six rows.
4555    mova            [rsp+16*1], m5
4556    mova            [rsp+16*2], m6
4557    mova                    m6, [o(pw_1697x16)]
4558    REPX      {IDTX16 x, 5, 6}, 7, 0, 1, 2, 3, 4
4559    pmulhrsw                m7, [o(pw_2048)]
4560    mova                    m5, [rsp+16*1]
4561    mova            [rsp+16*0], m7                ;row 7 is finished; park it and reuse m7
4562    IDTX16                   5, 7, 6
4563    mova                    m7, [rsp+16*2]
4564    IDTX16                   7, 6, 6              ;last use of the constant: m6 doubles as scratch
4565    mova                    m6, [o(pw_2048)]
4566    REPX      {pmulhrsw x, m6}, m0, m1, m2, m3, m4, m5, m7
4567    mova            [rsp+16*2], m5
4568    mova            [rsp+16*1], m7
4569    call  m(idct_8x8_internal).end3
4570    lea                   dstq, [dstq+strideq*2]
4571    pxor                    m7, m7
4572    REPX {mova [coeffq+32*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7  ;clear the coefficients just processed
4573
4574.loop_end:
4575    add                 coeffq, 16
4576    shr                    r3d, 2                 ;consume two mask bits per iteration
4577    jz .ret
4578    test                   r3d, 2
4579    jnz .loop                                     ;bit 1 set: keep going in the current column
    ; Bit 1 clear: switch to the next dst column. For the three masks loaded
    ; above, bit 0 is also clear whenever this path is reached, so the r4*8
    ; term adds nothing here and coeffq simply advances by 32*7.
4580    mov                    r4d, r3d
4581    and                    r4d, 1
4582    lea                 coeffq, [coeffq+r4*8+32*7]
4583    mov                   dstq, [rsp+16*3]
4584    lea                     r4, [dstq+8]
4585    mov             [rsp+16*3], r4                ;pre-compute dst for the column after this one
4586    jmp .loop
4587
4588.ret:
4589    RET
4590
4591
; ---------------------------------------------------------------------------
; inv_txfm_add_dct_dct_32x32(dst, stride, coeff, eob)
;
; DCT/DCT inverse transform entry point for 32x32 blocks.
; cglobal parameters: 4 arguments, 6 GPRs, 8 XMM registers, 16*36 bytes of
; stack (the scratch area used by idct_32x32_internal).
;
;   eob != 0 : full transform via idct_32x32_internal
;   eob == 0 : DC-only shortcut, finished by the 32x8 function's .body code
; ---------------------------------------------------------------------------
4592cglobal inv_txfm_add_dct_dct_32x32, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
4593%if ARCH_X86_32
4594    LEA                     r5, $$                ;base register for the o() constant addressing
4595%endif
4596    test                  eobd, eobd
4597    jz .dconly
4598
4599    call m(idct_32x32_internal)
4600    RET
4601
4602.dconly:
4603    movd                    m1, [o(pw_2896x8)]
4604    pmulhrsw                m0, m1, [coeffq]      ;m0 = DC coefficient scaled by pw_2896x8
4605    movd                    m2, [o(pw_8192)]
    ; eobd is known to be 0 on this path, so this clears the DC coefficient.
    ; It must come after the load above and before r3d (same register as
    ; eobd) is overwritten below.
4606    mov               [coeffq], eobd
4607    mov                    r3d, 32                ;NOTE(review): presumably the row count consumed by .body -- confirm
4608    lea                   tx2q, [o(m(inv_txfm_add_dct_dct_32x8).end)]
4609    jmp m(inv_txfm_add_dct_dct_32x8).body
4610
4611
; ---------------------------------------------------------------------------
; idct_32x32_internal
;
; Two-pass 32x32 inverse DCT, entered with `call` from
; inv_txfm_add_dct_dct_32x32. Declared with 0 args / 0 GPRs / 0 XMM
; registers, so it runs on the caller's registers and on the caller's 16*36
; byte stack area; every stack offset includes gprsize to step over the
; return address.
;
; eob - 136 is stored once; its sign selects the reduced-input ".fast" paths
; in both passes and limits pass 1 to 2 iterations instead of 4.
;
; Stack slots (relative to rsp inside this routine):
;   [rsp+gprsize+16*0]         XMM spill slot shared with the helpers
;   [rsp+gprsize+16*3  ..]     output of idct_8x8_internal.main   (7 rows)
;   [rsp+gprsize+16*11 ..]     output of idct_16x8_internal.main  (8 rows)
;   [rsp+gprsize+16*19 .. 34]  odd inputs in1..in31 for idct_8x32_internal.main
;   [rsp+gprsize*1+16*35]      eob - 136
;   [rsp+gprsize*2+16*35]      pass 1: coeff base; pass 2: dst of next column
;   [rsp+gprsize*3+16*35]      pass 2 loop counter
; ---------------------------------------------------------------------------
4612cglobal idct_32x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
4613    %undef cmp
4614
4615    mov                    r4d, 2
4616    sub                   eobd, 136
4617    mov  [rsp+gprsize*1+16*35], eobd              ;keep eob-136; only its sign is tested later
4618    mov                    r3d, 4                 ;mov leaves the flags from the sub intact
4619    cmovs                  r3d, r4d               ;pass-1 iterations: 2 if eob < 136, else 4
4620
4621%if ARCH_X86_32
4622    LEA                     r5, $$
4623%endif
4624
4625    mov  [rsp+gprsize*2+16*35], coeffq
4626
; ---- pass 1: one iteration per 16-byte step of coeffq ----------------------
4627.pass1_loop:
4628    LOAD_8ROWS     coeffq+64*1, 64*2
4629    mova   [rsp+gprsize+16*19], m0                        ;in1
4630    mova   [rsp+gprsize+16*26], m1                        ;in3
4631    mova   [rsp+gprsize+16*23], m2                        ;in5
4632    mova   [rsp+gprsize+16*22], m3                        ;in7
4633    mova   [rsp+gprsize+16*21], m4                        ;in9
4634    mova   [rsp+gprsize+16*24], m5                        ;in11
4635    mova   [rsp+gprsize+16*25], m6                        ;in13
4636    mova   [rsp+gprsize+16*20], m7                        ;in15
4637
4638    mov                   tx2d, [rsp+gprsize*1+16*35]     ;tx2 used as scratch: eob-136
4639    test                  tx2d, tx2d
4640    jl .fast                                              ;eob < 136: upper inputs treated as zero
4641
4642.full:
4643    LOAD_8ROWS     coeffq+64*0, 64*4
4644    call  m(idct_8x8_internal).main
4645    SAVE_7ROWS    rsp+gprsize+16*3, 16
4646    LOAD_8ROWS     coeffq+64*2, 64*4
4647    call m(idct_16x8_internal).main
4648    mova                    m7, [rsp+gprsize+16*0]
4649    SAVE_8ROWS   rsp+gprsize+16*11, 16
4650
4651    LOAD_8ROWS    coeffq+64*17, 64*2
4652    mova   [rsp+gprsize+16*33], m0                        ;in17
4653    mova   [rsp+gprsize+16*28], m1                        ;in19
4654    mova   [rsp+gprsize+16*29], m2                        ;in21
4655    mova   [rsp+gprsize+16*32], m3                        ;in23
4656    mova   [rsp+gprsize+16*31], m4                        ;in25
4657    mova   [rsp+gprsize+16*30], m5                        ;in27
4658    mova   [rsp+gprsize+16*27], m6                        ;in29
4659    mova   [rsp+gprsize+16*34], m7                        ;in31
4660
4661    call m(idct_8x32_internal).main
4662    jmp .pass1_end
4663
; Reduced path: same loads as .full restricted to the first four inputs of
; each stage (offsets 256*k == 64*4*k and 128*(2k+1) == 64*2 + 64*4*k); the
; remaining four registers are zeroed, and in17..in31 are not loaded at all.
4664.fast:
4665    mova                    m0, [coeffq+256*0]
4666    mova                    m1, [coeffq+256*1]
4667    mova                    m2, [coeffq+256*2]
4668    mova                    m3, [coeffq+256*3]
4669    pxor                    m4, m4
4670    REPX          {mova x, m4}, m5, m6, m7
4671    call  m(idct_8x8_internal).main
4672
4673    SAVE_7ROWS    rsp+gprsize+16*3, 16
4674    mova                    m0, [coeffq+128*1]
4675    mova                    m1, [coeffq+128*3]
4676    mova                    m2, [coeffq+128*5]
4677    mova                    m3, [coeffq+128*7]
4678    pxor                    m4, m4
4679    REPX          {mova x, m4}, m5, m6, m7
4680    call m(idct_16x8_internal).main
4681    mova                    m7, [rsp+gprsize+16*0]
4682    SAVE_8ROWS   rsp+gprsize+16*11, 16
4683
4684    call m(idct_8x32_internal).main_fast
4685
; The pass-1 tail runs idct_8x8_internal.pass1_end1 four times, once per
; group of 8 result rows. tx2q carries the address to continue at
; (NOTE(review): presumably pass1_end1 ends with `jmp tx2q` -- confirm).
; Each step stores the previous group into the coefficient buffer and loads
; the next one from the stack.
4686.pass1_end:
4687    mova    [rsp+gprsize+16*0], m7
4688    mova                    m7, [o(pw_8192)]
4689    lea                   tx2q, [o(m(idct_32x32_internal).pass1_end1)]
4690    jmp   m(idct_8x8_internal).pass1_end1
4691
4692.pass1_end1:
4693    SAVE_8ROWS     coeffq+64*0, 64
4694    LOAD_8ROWS   rsp+gprsize+16*11, 16
4695    mova    [rsp+gprsize+16*0], m7
4696    mova                    m7, [o(pw_8192)]
4697    lea                   tx2q, [o(m(idct_32x32_internal).pass1_end2)]
4698    jmp   m(idct_8x8_internal).pass1_end1
4699
4700.pass1_end2:
4701    SAVE_8ROWS     coeffq+64*8, 64
4702    LOAD_8ROWS   rsp+gprsize+16*19, 16
4703    mova    [rsp+gprsize+16*0], m7
4704    mova                    m7, [o(pw_8192)]
4705    lea                   tx2q, [o(m(idct_32x32_internal).pass1_end3)]
4706    jmp   m(idct_8x8_internal).pass1_end1
4707
4708.pass1_end3:
4709    SAVE_8ROWS    coeffq+64*16, 64
4710    LOAD_8ROWS   rsp+gprsize+16*27, 16
4711    mova    [rsp+gprsize+16*0], m7
4712    mova                    m7, [o(pw_8192)]
4713    lea                   tx2q, [o(m(idct_32x32_internal).pass1_end4)]
4714    jmp   m(idct_8x8_internal).pass1_end1
4715
4716.pass1_end4:
4717    SAVE_8ROWS    coeffq+64*24, 64
4718
4719    add                 coeffq, 16
4720    dec                    r3d
4721    jg .pass1_loop
4722
4723
; ---- pass 2: always 4 iterations; each advances coeffq by 16*32 bytes and
; ---- dst by 8 ---------------------------------------------------------------
4724.pass2:
4725    mov                 coeffq, [rsp+gprsize*2+16*35]     ;rewind to the coeff base
4726    mov                    r3d, 4
4727    lea                   tx2q, [o(m(idct_32x32_internal).pass2_end)]
4728
4729.pass2_loop:
4730    mov  [rsp+gprsize*3+16*35], r3d                       ;save loop counter (r3 is reused below)
4731    lea                     r3, [dstq+8]
4732    mov  [rsp+gprsize*2+16*35], r3                        ;dst for the next iteration (slot no longer holds coeff)
4733
4734    mova                    m0, [coeffq+16*4 ]
4735    mova                    m1, [coeffq+16*12]
4736    mova                    m2, [coeffq+16*20]
4737    mova                    m3, [coeffq+16*28]
4738    mova                    m4, [coeffq+16*5 ]
4739    mova                    m5, [coeffq+16*13]
4740    mova                    m6, [coeffq+16*21]
4741    mova                    m7, [coeffq+16*29]
4742    mova   [rsp+gprsize+16*19], m0                        ;in1
4743    mova   [rsp+gprsize+16*26], m1                        ;in3
4744    mova   [rsp+gprsize+16*23], m2                        ;in5
4745    mova   [rsp+gprsize+16*22], m3                        ;in7
4746    mova   [rsp+gprsize+16*21], m4                        ;in9
4747    mova   [rsp+gprsize+16*24], m5                        ;in11
4748    mova   [rsp+gprsize+16*25], m6                        ;in13
4749    mova   [rsp+gprsize+16*20], m7                        ;in15
4750
4751    mov                   eobd, [rsp+gprsize*1+16*35]     ;eob-136 saved at entry
4752    test                  eobd, eobd
4753    jl .fast1
4754
4755.full1:
4756    mova                    m0, [coeffq+16*0 ]
4757    mova                    m1, [coeffq+16*16]
4758    mova                    m2, [coeffq+16*1 ]
4759    mova                    m3, [coeffq+16*17]
4760    mova                    m4, [coeffq+16*2 ]
4761    mova                    m5, [coeffq+16*18]
4762    mova                    m6, [coeffq+16*3 ]
4763    mova                    m7, [coeffq+16*19]
4764    call  m(idct_8x8_internal).main
4765    SAVE_7ROWS    rsp+gprsize+16*3, 16
4766
4767    mova                    m0, [coeffq+16*8 ]
4768    mova                    m1, [coeffq+16*24]
4769    mova                    m2, [coeffq+16*9 ]
4770    mova                    m3, [coeffq+16*25]
4771    mova                    m4, [coeffq+16*10]
4772    mova                    m5, [coeffq+16*26]
4773    mova                    m6, [coeffq+16*11]
4774    mova                    m7, [coeffq+16*27]
4775    call m(idct_16x8_internal).main
4776    mova                    m7, [rsp+gprsize+16*0]
4777    SAVE_8ROWS   rsp+gprsize+16*11, 16
4778
4779    mova                    m0, [coeffq+16*6 ]
4780    mova                    m1, [coeffq+16*14]
4781    mova                    m2, [coeffq+16*22]
4782    mova                    m3, [coeffq+16*30]
4783    mova                    m4, [coeffq+16*7 ]
4784    mova                    m5, [coeffq+16*15]
4785    mova                    m6, [coeffq+16*23]
4786    mova                    m7, [coeffq+16*31]
4787    mova   [rsp+gprsize+16*33], m0                        ;in17
4788    mova   [rsp+gprsize+16*28], m1                        ;in19
4789    mova   [rsp+gprsize+16*29], m2                        ;in21
4790    mova   [rsp+gprsize+16*32], m3                        ;in23
4791    mova   [rsp+gprsize+16*31], m4                        ;in25
4792    mova   [rsp+gprsize+16*30], m5                        ;in27
4793    mova   [rsp+gprsize+16*27], m6                        ;in29
4794    mova   [rsp+gprsize+16*34], m7                        ;in31
4795
4796    call m(idct_8x32_internal).main
4797    jmp                   tx2q                            ;-> .pass2_end (set in .pass2 / .pass2_end1)
4798
; Reduced pass-2 path: first four inputs of each stage, rest zeroed,
; in17..in31 skipped.
4799.fast1:
4800    mova                    m0, [coeffq+16*0 ]
4801    mova                    m1, [coeffq+16*16]
4802    mova                    m2, [coeffq+16*1 ]
4803    mova                    m3, [coeffq+16*17]
4804    pxor                    m4, m4
4805    REPX          {mova x, m4}, m5, m6, m7
4806    call  m(idct_8x8_internal).main
4807    SAVE_7ROWS    rsp+gprsize+16*3, 16
4808
4809    mova                    m0, [coeffq+16*8 ]
4810    mova                    m1, [coeffq+16*24]
4811    mova                    m2, [coeffq+16*9 ]
4812    mova                    m3, [coeffq+16*25]
4813    pxor                    m4, m4
4814    REPX          {mova x, m4}, m5, m6, m7
4815    call m(idct_16x8_internal).main
4816    mova                    m7, [rsp+gprsize+16*0]
4817    SAVE_8ROWS   rsp+gprsize+16*11, 16
4818
4819    call m(idct_8x32_internal).main_fast
4820    jmp                   tx2q                            ;-> .pass2_end
4821
4822.pass2_end:
    ; NOTE(review): r3 presumably is the continuation address that
    ; idct_8x32_internal.end jumps to when it is done -- confirm.
4823    lea                     r3, [o(m(idct_32x32_internal).pass2_end1)]
4824    jmp  m(idct_8x32_internal).end
4825
4826.pass2_end1:
4827    lea                   tx2q, [o(m(idct_32x32_internal).pass2_end)]  ;re-arm tx2q for the next iteration
4828    add                 coeffq, 16*32
4829    mov                   dstq, [rsp+gprsize*2+16*35]     ;dst+8 saved at the top of this iteration
4830    mov                    r3d, [rsp+gprsize*3+16*35]
4831    dec                    r3d
4832    jg .pass2_loop
4833
4834    ret
4835
4836
; ---------------------------------------------------------------------------
; inv_txfm_add_identity_identity_32x32(dst, stride, coeff, eob)
;
; Identity/identity inverse transform entry point for 32x32 blocks.
; cglobal parameters: 4 arguments, 6 GPRs, 8 XMM registers, 16*5 bytes of
; stack.
;
; n = (eob < 136) ? 2 : 4. The code runs n outer iterations of n inner
; iterations each. An inner iteration handles 8 vectors spaced 64 bytes apart
; and steps coeffq by 16; an outer iteration steps the coeff base by 64*8
; bytes and dst by 8.
;
; Stack slots:
;   [rsp+16*0 .. rsp+16*2]   XMM spill space shared with the helpers
;   [rsp+gprsize*0+16*3]     dst for the next outer iteration
;   [rsp+gprsize*1+16*3]     n (inner iteration count, reloaded per outer step)
;   [rsp+gprsize*2+16*3]     remaining outer iterations
;   [rsp+gprsize*3+16*3]     coeff base of the current outer iteration
; ---------------------------------------------------------------------------
4837cglobal inv_txfm_add_identity_identity_32x32, 4, 6, 8, 16*5, dst, stride, coeff, eob, tx2
4838    %undef cmp
4839
4840    mov                    r4d, 2
4841    cmp                   eobd, 136
4842    mov                    r3d, 4                 ;mov leaves the cmp flags intact
4843    cmovs                  r3d, r4d               ;r3d = 2 if eob < 136, else 4
4844
4845%if ARCH_X86_32
4846    LEA                     r5, $$                ;base register for the o() constant addressing
4847%endif
4848
4849    lea                     r4, [dstq+8]
4850    mov   [rsp+gprsize*0+16*3], r4
4851    mov   [rsp+gprsize*1+16*3], r3d
4852    mov   [rsp+gprsize*2+16*3], r3d
4853    mov   [rsp+gprsize*3+16*3], coeffq
4854
4855.loop:
4856    LOAD_8ROWS          coeffq, 64
4857    mova            [rsp+16*1], m6
    ; tx2q points at idct_32x16_internal.end, which is a bare `ret`.
    ; NOTE(review): presumably pass1_end3 finishes with `jmp tx2q`, so this
    ; makes the helper return straight to the call site -- confirm.
4858    lea                   tx2q, [o(m(idct_32x16_internal).end)]
4859    call  m(idct_8x8_internal).pass1_end3
    ; Scale all eight rows by pw_8192. m7 is scaled first from memory and
    ; parked so the register can hold the constant for the other seven.
4860    pmulhrsw                m7, [o(pw_8192)]
4861    mova            [rsp+16*0], m7
4862    mova                    m7, [o(pw_8192)]
4863    REPX      {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
4864    mova            [rsp+16*1], m6
4865    mova            [rsp+16*2], m5
4866    call  m(idct_8x8_internal).end3
4867    lea                   dstq, [dstq+strideq*2]
4868
4869    pxor                    m7, m7
4870    REPX   {mova [coeffq+64*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7  ;clear the coefficients just processed
4871
4872    add                 coeffq, 16
4873    dec                    r3d
4874    jg .loop
4875
    ; Inner loop done: count down the outer iterations.
4876    mov                    r4d, [rsp+gprsize*2+16*3]
4877    dec                    r4d
4878    jle .ret
4879
    ; Set up the next outer iteration. coeffq is reloaded before the
    ; full-width store of r4 below and written back afterwards, so the
    ; neighbouring slot ends up with the intended value.
4880    mov                   dstq, [rsp+gprsize*0+16*3]
4881    mov                 coeffq, [rsp+gprsize*3+16*3]
4882    mov   [rsp+gprsize*2+16*3], r4
4883    lea                     r3, [dstq+8]
4884    add                 coeffq, 64*8
4885    mov   [rsp+gprsize*0+16*3], r3
4886    mov                    r3d, [rsp+gprsize*1+16*3]      ;reload inner iteration count
4887    mov   [rsp+gprsize*3+16*3], coeffq
4888    jmp .loop
4889
4890.ret:
4891    RET
4892
4893
; ---------------------------------------------------------------------------
; inv_txfm_add_dct_dct_16x64(dst, stride, coeff, eob)
;
; DCT/DCT inverse transform entry point for 16x64 blocks.
; cglobal parameters: 4 arguments, 6 GPRs, 8 XMM registers, 16*68 bytes of
; stack (the scratch area used by idct_16x64_internal).
;
;   eob != 0 : full transform via idct_16x64_internal
;   eob == 0 : DC-only shortcut, finished by the 16x4 function's .dconly code
; ---------------------------------------------------------------------------
4894cglobal inv_txfm_add_dct_dct_16x64, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2
4895%if ARCH_X86_32
4896    LEA                     r5, $$                ;base register for the o() constant addressing
4897%endif
4898    test                  eobd, eobd
4899    jz .dconly
4900
4901    call m(idct_16x64_internal)
4902    RET
4903
4904.dconly:
4905    movd                    m1, [o(pw_2896x8)]
4906    pmulhrsw                m0, m1, [coeffq]      ;m0 = DC coefficient scaled by pw_2896x8
4907    movd                    m2, [o(pw_8192)]
4908    mov               [coeffq], eobd              ;eobd is 0 on this path: clears the DC coefficient
    ; r2 is the coeff register; it is no longer needed as a pointer after
    ; the store above and is reused as a counter.
    ; NOTE(review): presumably the number of row pairs for the shared
    ; .dconly loop -- confirm against inv_txfm_add_dct_dct_16x4.
4909    mov                    r2d, 32
4910    lea                   tx2q, [o(m(inv_txfm_add_dct_dct_16x64).end)]
4911    jmp m(inv_txfm_add_dct_dct_16x4).dconly
4912
; Reached through tx2q once the shared .dconly code has finished
; (NOTE(review): assuming that code ends with `jmp tx2q`).
4913.end:
4914    RET
4915
4916
; ---------------------------------------------------------------------------
; idct_16x64_internal
;
; Two-pass 16x64 inverse DCT, entered with `call` from
; inv_txfm_add_dct_dct_16x64. Declared with 0 args / 0 GPRs / 0 XMM
; registers, so it runs on the caller's registers and on the caller's 16*68
; byte stack area; every stack offset includes gprsize to step over the
; return address.
;
; eob - 151 is stored once. Its sign limits pass 1 to 2 iterations instead
; of 4 and selects the reduced ".fast" path in pass 2.
;
; Stack slots (relative to rsp inside this routine):
;   [rsp+gprsize+16*0]         XMM spill slot shared with the helpers
;   [rsp+gprsize+16*3  ..]     even-part results / first 8 output rows
;   [rsp+gprsize+16*11 ..]     output of idct_16x8_internal.main
;   [rsp+gprsize+16*19 .. 26]  inputs for idct_8x32_internal.main_fast/_veryfast
;   [rsp+gprsize+16*35 .. 66]  odd inputs in1..in31, replaced by the results
;                              of .main / .main_fast
;   [rsp+gprsize*1+16*67]      eob - 151
;   [rsp+gprsize*2+16*67]      pass 1: coeff base; pass 2: dst of next column
;   [rsp+gprsize*3+16*67]      pass 2 loop counter
; ---------------------------------------------------------------------------
4917cglobal idct_16x64_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
4918    %undef cmp
4919
4920    mov                    r4d, 2
4921    sub                   eobd, 151
4922    mov  [rsp+gprsize*1+16*67], eobd              ;keep eob-151; only its sign is tested later
4923    mov                    r3d, 4                 ;mov leaves the flags from the sub intact
4924    cmovs                  r3d, r4d               ;pass-1 iterations: 2 if eob < 151, else 4
4925
4926%if ARCH_X86_32
4927    LEA                     r5, $$
4928%endif
4929
4930    mov  [rsp+gprsize*2+16*67], coeffq
4931
; ---- pass 1: 16-point transform, one iteration per 16-byte step of coeffq --
4932.pass1_loop:
4933    LOAD_8ROWS     coeffq+64*0, 64*2
4934    call  m(idct_8x8_internal).main
4935    SAVE_7ROWS    rsp+gprsize+16*3, 16
4936    LOAD_8ROWS     coeffq+64*1, 64*2
4937    call m(idct_16x8_internal).main
4938    mova                    m7, [o(pw_8192)]
    ; tx2q carries the address to continue at (NOTE(review): presumably
    ; pass1_end1 ends with `jmp tx2q` -- confirm).
4939    lea                   tx2q, [o(m(idct_16x64_internal).pass1_end)]
4940    jmp   m(idct_8x8_internal).pass1_end1
4941
4942.pass1_end:
4943    SAVE_8ROWS     coeffq+64*8, 64
4944    LOAD_8ROWS    rsp+gprsize+16*3, 16
4945    mova    [rsp+gprsize+16*0], m7
4946    mova                    m7, [o(pw_8192)]
4947    lea                   tx2q, [o(m(idct_16x64_internal).pass1_end1)]
4948    jmp   m(idct_8x8_internal).pass1_end1
4949
4950.pass1_end1:
4951    SAVE_8ROWS     coeffq+64*0, 64
4952
4953    add                 coeffq, 16
4954    dec                    r3d
4955    jg .pass1_loop
4956
; ---- pass 2: 2 iterations; each advances coeffq by 16*32 bytes and dst by 8
4957    mov                 coeffq, [rsp+gprsize*2+16*67]     ;rewind to the coeff base
4958    mov                    r3d, 2
4959    lea                     r4, [dstq+8]
4960    mov  [rsp+gprsize*2+16*67], r4                        ;dst for the second iteration
4961    lea                     r4, [o(m(idct_16x64_internal).end1)]  ;continuation used by .end
4962
4963.pass2_loop:
4964    mov  [rsp+gprsize*3+16*67], r3d
4965    mov                   eobd, [rsp+gprsize*1+16*67]     ;eob-151 saved at entry (r3d counter is saved above)
4966
4967    mova                    m0, [coeffq+16*4 ]            ;in1
4968    mova                    m1, [coeffq+16*12]            ;in3
4969    mova                    m2, [coeffq+16*20]            ;in5
4970    mova                    m3, [coeffq+16*28]            ;in7
4971    mova                    m4, [coeffq+16*5 ]            ;in9
4972    mova                    m5, [coeffq+16*13]            ;in11
4973    mova                    m6, [coeffq+16*21]            ;in13
4974    mova                    m7, [coeffq+16*29]            ;in15
4975    mova   [rsp+gprsize+16*35], m0                        ;in1
4976    mova   [rsp+gprsize+16*49], m1                        ;in3
4977    mova   [rsp+gprsize+16*43], m2                        ;in5
4978    mova   [rsp+gprsize+16*41], m3                        ;in7
4979    mova   [rsp+gprsize+16*39], m4                        ;in9
4980    mova   [rsp+gprsize+16*45], m5                        ;in11
4981    mova   [rsp+gprsize+16*47], m6                        ;in13
4982    mova   [rsp+gprsize+16*37], m7                        ;in15
4983
    ; Common to both paths: zero in m4, first two inputs in m0/m1.
4984    pxor                    m4, m4
4985    mova                    m0, [coeffq+16*0]
4986    mova                    m1, [coeffq+16*1]
4987
4988    test                  eobd, eobd
4989    jl .fast
4990
4991.full:
4992    mova                    m2, [coeffq+16*2]
4993    mova                    m3, [coeffq+16*3]
4994
4995    REPX          {mova x, m4}, m5, m6, m7                ;upper four inputs are zero
4996    call  m(idct_8x8_internal).main
4997    SAVE_7ROWS    rsp+gprsize+16*3, 16
4998
4999    pxor                    m4, m4
5000    mova                    m0, [coeffq+16*16]
5001    mova                    m1, [coeffq+16*17]
5002    mova                    m2, [coeffq+16*18]
5003    mova                    m3, [coeffq+16*19]
5004
5005    REPX          {mova x, m4}, m5, m6, m7
5006    call m(idct_16x8_internal).main
5007    mova                    m7, [rsp+gprsize+16*0]
5008    SAVE_8ROWS   rsp+gprsize+16*11, 16
5009
5010    mova                    m0, [coeffq+16*8 ]
5011    mova                    m1, [coeffq+16*24]
5012    mova                    m2, [coeffq+16*9 ]
5013    mova                    m3, [coeffq+16*25]
5014    mova                    m4, [coeffq+16*10]
5015    mova                    m5, [coeffq+16*26]
5016    mova                    m6, [coeffq+16*11]
5017    mova                    m7, [coeffq+16*27]
    ; Same slot order that idct_32x32_internal uses for in1..in15.
5018    mova   [rsp+gprsize+16*19], m0
5019    mova   [rsp+gprsize+16*26], m1
5020    mova   [rsp+gprsize+16*23], m2
5021    mova   [rsp+gprsize+16*22], m3
5022    mova   [rsp+gprsize+16*21], m4
5023    mova   [rsp+gprsize+16*24], m5
5024    mova   [rsp+gprsize+16*25], m6
5025    mova   [rsp+gprsize+16*20], m7
5026
5027    call m(idct_8x32_internal).main_fast
5028    SAVE_8ROWS    rsp+gprsize+16*3, 16
5029
5030    mova                    m0, [coeffq+16*6 ]            ;in17
5031    mova                    m1, [coeffq+16*14]            ;in19
5032    mova                    m2, [coeffq+16*22]            ;in21
5033    mova                    m3, [coeffq+16*30]            ;in23
5034    mova                    m4, [coeffq+16*7 ]            ;in25
5035    mova                    m5, [coeffq+16*15]            ;in27
5036    mova                    m6, [coeffq+16*23]            ;in29
5037    mova                    m7, [coeffq+16*31]            ;in31
5038    mova   [rsp+gprsize+16*63], m0                        ;in17
5039    mova   [rsp+gprsize+16*53], m1                        ;in19
5040    mova   [rsp+gprsize+16*55], m2                        ;in21
5041    mova   [rsp+gprsize+16*61], m3                        ;in23
5042    mova   [rsp+gprsize+16*59], m4                        ;in25
5043    mova   [rsp+gprsize+16*57], m5                        ;in27
5044    mova   [rsp+gprsize+16*51], m6                        ;in29
5045    mova   [rsp+gprsize+16*65], m7                        ;in31
5046
5047    call .main
5048    jmp  .end
5049
; Reduced pass-2 path (eob < 151): half as many inputs per stage as .full,
; and only in1..in15 feed the odd part (.main_fast).
5050.fast:
5051    REPX          {mova x, m4}, m2, m3, m5, m6, m7
5052    call  m(idct_8x8_internal).main
5053    SAVE_7ROWS    rsp+gprsize+16*3, 16
5054
5055    pxor                    m4, m4
5056    mova                    m0, [coeffq+16*16]
5057    mova                    m1, [coeffq+16*17]
5058
5059    REPX          {mova x, m4}, m2, m3, m5, m6, m7
5060    call m(idct_16x8_internal).main
5061    mova                    m7, [rsp+gprsize+16*0]
5062    SAVE_8ROWS   rsp+gprsize+16*11, 16
5063
5064    mova                    m0, [coeffq+16*8 ]
5065    mova                    m1, [coeffq+16*24]
5066    mova                    m2, [coeffq+16*9 ]
5067    mova                    m3, [coeffq+16*25]
5068    mova   [rsp+gprsize+16*19], m0                        ;in1
5069    mova   [rsp+gprsize+16*26], m1                        ;in3
5070    mova   [rsp+gprsize+16*23], m2                        ;in5
5071    mova   [rsp+gprsize+16*22], m3                        ;in7
5072
5073    call m(idct_8x32_internal).main_veryfast
5074    SAVE_8ROWS    rsp+gprsize+16*3, 16
5075
5076    call .main_fast
5077
; Output stage. The first step hands the rows at slots 3.. to
; idct_8x32_internal.end2 with r3 = r4 = address of .end1.
; NOTE(review): presumably end/end2 jump through r3 when done -- confirm.
5078.end:
5079    LOAD_8ROWS   rsp+gprsize+16*3, 16
5080    mova    [rsp+gprsize+16*0], m7
5081    mov                     r3, r4
5082    jmp  m(idct_8x32_internal).end2
5083
; Second step: rsp is temporarily raised by 16*32 bytes, so stack offsets
; used by idct_8x32_internal.end land 32 slots higher than they would from
; this routine's normal frame; .end2 undoes the adjustment.
5084.end1:
5085    LOAD_8ROWS   rsp+gprsize+16*35, 16
5086    lea                   dstq, [dstq+strideq*2]
5087    add                    rsp, 16*32
5088    lea                     r3, [o(m(idct_16x64_internal).end2)]
5089    jmp  m(idct_8x32_internal).end
5090
5091.end2:
5092    add                 coeffq, 16*32
5093    sub                    rsp, 16*32
5094
5095    mov                   dstq, [rsp+gprsize*2+16*67]     ;dst saved for this next iteration
5096    mov                    r3d, [rsp+gprsize*3+16*67]
5097    lea                     r4, [dstq+8]
5098    mov  [rsp+gprsize*2+16*67], r4
5099    lea                     r4, [o(m(idct_16x64_internal).end1)]  ;re-arm the .end continuation
5100
5101    dec                    r3d
5102    jg .pass2_loop
5103    ret
5104
5105
; ---------------------------------------------------------------------------
; idct_16x64_internal.main_fast
;
; Reduced first stage of the odd half (t32..t63), used when only in1..in15
; were stored by the caller. Called from idct_16x64_internal, hence the
; gprsize*2 in every stack offset (one extra return address on the stack).
;
; Each input is read from its stack slot and multiplied by two constants
; with pmulhrsw, giving the low-index and high-index members of a t-pair
; directly; there is no paddsw/psubsw butterfly as in .main, which combines
; two inputs per group. One result pair is stored as-is, and the other is
; derived from it with ITX_MULSUB_2W, using pd_2048 in m7.
;
; For the last two inputs the results are left in registers instead:
;   m6 = t44/t45 value, m4 = t48/t49 value, m5 = t46/t47 value
; and control continues at .main2 (outside this chunk).
; ---------------------------------------------------------------------------
5106ALIGN function_align
5107.main_fast:
5108    mova                    m0, [rsp+gprsize*2+16*35]     ;in1
5109    pmulhrsw                m3, m0, [o(pw_4095x8)]        ;t62,t63
5110    pmulhrsw                m0, [o(pw_101x8)]             ;t32,t33
5111    mova                    m7, [o(pd_2048)]
5112    mova [rsp+gprsize*2+16*35], m0                        ;t32
5113    mova [rsp+gprsize*2+16*66], m3                        ;t63
5114    ITX_MULSUB_2W            3, 0, 1, 2, 7,  401, 4076    ;t33a, t62a
5115    mova [rsp+gprsize*2+16*36], m3                        ;t33a
5116    mova [rsp+gprsize*2+16*65], m0                        ;t62a
5117
5118    mova                    m1, [rsp+gprsize*2+16*37]     ;in15
5119    pmulhrsw                m2, m1, [o(pw_3822x8)]        ;t60,t61
5120    pmulhrsw                m1, [o(pw_m1474x8)]           ;t34,t35
5121    mova [rsp+gprsize*2+16*38], m1                        ;t35
5122    mova [rsp+gprsize*2+16*63], m2                        ;t60
5123    ITX_MULSUB_2W            2, 1, 0, 3, 7, m4076, 401    ;t34a, t61a
5124    mova [rsp+gprsize*2+16*37], m2                        ;t34a
5125    mova [rsp+gprsize*2+16*64], m1                        ;t61a
5126
5127    mova                    m0, [rsp+gprsize*2+16*39]     ;in9
5128    pmulhrsw                m3, m0, [o(pw_3996x8)]        ;t58,t59
5129    pmulhrsw                m0, [o(pw_897x8)]             ;t36,t37
5130    mova [rsp+gprsize*2+16*39], m0                        ;t36
5131    mova [rsp+gprsize*2+16*62], m3                        ;t59
5132    ITX_MULSUB_2W            3, 0, 1, 2, 7, 3166, 2598    ;t37a, t58a
5133    mova [rsp+gprsize*2+16*40], m3                        ;t37a
5134    mova [rsp+gprsize*2+16*61], m0                        ;t58a
5135
5136    mova                    m1, [rsp+gprsize*2+16*41]     ;in7
5137    pmulhrsw                m2, m1, [o(pw_4036x8)]        ;t56,t57
5138    pmulhrsw                m1, [o(pw_m700x8)]            ;t38,t39
5139    mova [rsp+gprsize*2+16*42], m1                        ;t39
5140    mova [rsp+gprsize*2+16*59], m2                        ;t56
5141    ITX_MULSUB_2W            2, 1, 0, 3, 7, m2598, 3166   ;t38a, t57a
5142    mova [rsp+gprsize*2+16*41], m2                        ;t38a
5143    mova [rsp+gprsize*2+16*60], m1                        ;t57a
5144
5145    mova                    m0, [rsp+gprsize*2+16*43]     ;in5
5146    pmulhrsw                m3, m0, [o(pw_4065x8)]        ;t54,t55
5147    pmulhrsw                m0, [o(pw_501x8)]             ;t40,t41
5148    mova [rsp+gprsize*2+16*43], m0                        ;t40
5149    mova [rsp+gprsize*2+16*58], m3                        ;t55
5150    ITX_MULSUB_2W            3, 0, 1, 2, 7, 1931, 3612    ;t41a, t54a
5151    mova [rsp+gprsize*2+16*44], m3                        ;t41a
5152    mova [rsp+gprsize*2+16*57], m0                        ;t54a
5153
5154    mova                    m1, [rsp+gprsize*2+16*45]     ;in11
5155    pmulhrsw                m2, m1, [o(pw_3948x8)]        ;t52,t53
5156    pmulhrsw                m1, [o(pw_m1092x8)]           ;t42,t43
5157    mova [rsp+gprsize*2+16*46], m1                        ;t43
5158    mova [rsp+gprsize*2+16*55], m2                        ;t52
5159    ITX_MULSUB_2W            2, 1, 0, 3, 7, m3612, 1931   ;t42a, t53a
5160    mova [rsp+gprsize*2+16*45], m2                        ;t42a
5161    mova [rsp+gprsize*2+16*56], m1                        ;t53a
5162
5163    mova                    m0, [rsp+gprsize*2+16*47]     ;in13
5164    pmulhrsw                m3, m0, [o(pw_3889x8)]        ;t50,t51
5165    pmulhrsw                m0, [o(pw_1285x8)]            ;t44,t45
5166    mova                    m6, m0                        ;t44 stays in a register for .main2
5167    mova [rsp+gprsize*2+16*54], m3                        ;t51
5168    ITX_MULSUB_2W            3, 0, 1, 2, 7, 3920, 1189    ;t45a, t50a
5169    mova [rsp+gprsize*2+16*48], m3                        ;t45a
5170    mova [rsp+gprsize*2+16*53], m0                        ;t50a
5171
5172    mova                    m0, [rsp+gprsize*2+16*49]     ;in3
5173    pmulhrsw                m3, m0, [o(pw_4085x8)]        ;t48,t49
5174    pmulhrsw                m0, [o(pw_m301x8)]            ;t46,t47
5175    mova                    m4, m3                        ;copy handed to .main2 in m4
5176    mova                    m5, m0                        ;copy handed to .main2 in m5
5177
5178    jmp .main2
5179
5180ALIGN function_align
5181.main:
5182    mova                    m0, [rsp+gprsize*2+16*35]     ;in1
5183    mova                    m1, [rsp+gprsize*2+16*65]     ;in31
5184    pmulhrsw                m3, m0, [o(pw_4095x8)]        ;t63a
5185    pmulhrsw                m0, [o(pw_101x8)]             ;t32a
5186    pmulhrsw                m2, m1, [o(pw_2967x8)]        ;t62a
5187    pmulhrsw                m1, [o(pw_m2824x8)]           ;t33a
5188    mova                    m7, [o(pd_2048)]
5189    psubsw                  m4, m0, m1                    ;t33
5190    paddsw                  m0, m1                        ;t32
5191    psubsw                  m5, m3, m2                    ;t62
5192    paddsw                  m3, m2                        ;t63
5193    ITX_MULSUB_2W            5, 4, 1, 2, 7,  401, 4076    ;t33a, t62a
5194    mova [rsp+gprsize*2+16*35], m0                        ;t32
5195    mova [rsp+gprsize*2+16*36], m5                        ;t33a
5196    mova [rsp+gprsize*2+16*65], m4                        ;t62a
5197    mova [rsp+gprsize*2+16*66], m3                        ;t63
5198
5199    mova                    m0, [rsp+gprsize*2+16*63]     ;in17
5200    mova                    m1, [rsp+gprsize*2+16*37]     ;in15
5201    pmulhrsw                m3, m0, [o(pw_3745x8)]        ;t61a
5202    pmulhrsw                m0, [o(pw_1660x8)]            ;t34a
5203    pmulhrsw                m2, m1, [o(pw_3822x8)]        ;t60a
5204    pmulhrsw                m1, [o(pw_m1474x8)]           ;t35a
5205    psubsw                  m4, m1, m0                    ;t34
5206    paddsw                  m0, m1                        ;t35
5207    psubsw                  m5, m2, m3                    ;t61
5208    paddsw                  m3, m2                        ;t60
5209    ITX_MULSUB_2W            5, 4, 1, 2, 7, m4076, 401    ;t34a, t61a
5210    mova [rsp+gprsize*2+16*37], m5                        ;t34a
5211    mova [rsp+gprsize*2+16*38], m0                        ;t35
5212    mova [rsp+gprsize*2+16*63], m3                        ;t60
5213    mova [rsp+gprsize*2+16*64], m4                        ;t61a
5214
5215    mova                    m0, [rsp+gprsize*2+16*39]     ;in9
5216    mova                    m1, [rsp+gprsize*2+16*61]     ;in23
5217    pmulhrsw                m3, m0, [o(pw_3996x8)]        ;t59a
5218    pmulhrsw                m0, [o(pw_897x8)]             ;t36a
5219    pmulhrsw                m2, m1, [o(pw_3461x8)]        ;t58a
5220    pmulhrsw                m1, [o(pw_m2191x8)]           ;t37a
5221    psubsw                  m4, m0, m1                    ;t37
5222    paddsw                  m0, m1                        ;t36
5223    psubsw                  m5, m3, m2                    ;t58
5224    paddsw                  m3, m2                        ;t59
5225    ITX_MULSUB_2W            5, 4, 1, 2, 7, 3166, 2598    ;t37a, t58a
5226    mova [rsp+gprsize*2+16*39], m0                        ;t36
5227    mova [rsp+gprsize*2+16*40], m5                        ;t37a
5228    mova [rsp+gprsize*2+16*61], m4                        ;t58a
5229    mova [rsp+gprsize*2+16*62], m3                        ;t59
5230
5231    mova                    m0, [rsp+gprsize*2+16*59]     ;in25
5232    mova                    m1, [rsp+gprsize*2+16*41]     ;in7
5233    pmulhrsw                m3, m0, [o(pw_3349x8)]        ;t57a
5234    pmulhrsw                m0, [o(pw_2359x8)]            ;t38a
5235    pmulhrsw                m2, m1, [o(pw_4036x8)]        ;t56a
5236    pmulhrsw                m1, [o(pw_m700x8)]            ;t39a
5237    psubsw                  m4, m1, m0                    ;t38
5238    paddsw                  m0, m1                        ;t39
5239    psubsw                  m5, m2, m3                    ;t57
5240    paddsw                  m3, m2                        ;t56
5241    ITX_MULSUB_2W            5, 4, 1, 2, 7, m2598, 3166   ;t38a, t57a
5242    mova [rsp+gprsize*2+16*41], m5                        ;t38a
5243    mova [rsp+gprsize*2+16*42], m0                        ;t39
5244    mova [rsp+gprsize*2+16*59], m3                        ;t56
5245    mova [rsp+gprsize*2+16*60], m4                        ;t57a
5246
5247    mova                    m0, [rsp+gprsize*2+16*43]     ;in5
5248    mova                    m1, [rsp+gprsize*2+16*57]     ;in27
5249    pmulhrsw                m3, m0, [o(pw_4065x8)]        ;t55a
5250    pmulhrsw                m0, [o(pw_501x8)]             ;t40a
5251    pmulhrsw                m2, m1, [o(pw_3229x8)]        ;t54a
5252    pmulhrsw                m1, [o(pw_m2520x8)]           ;t41a
5253    psubsw                  m4, m0, m1                    ;t41
5254    paddsw                  m0, m1                        ;t40
5255    psubsw                  m5, m3, m2                    ;t54
5256    paddsw                  m3, m2                        ;t55
5257    ITX_MULSUB_2W            5, 4, 1, 2, 7, 1931, 3612    ;t41a, t54a
5258    mova [rsp+gprsize*2+16*43], m0                        ;t40
5259    mova [rsp+gprsize*2+16*44], m5                        ;t41a
5260    mova [rsp+gprsize*2+16*57], m4                        ;t54a
5261    mova [rsp+gprsize*2+16*58], m3                        ;t55
5262
5263    mova                    m0, [rsp+gprsize*2+16*55]     ;in21
5264    mova                    m1, [rsp+gprsize*2+16*45]     ;in11
5265    pmulhrsw                m3, m0, [o(pw_3564x8)]        ;t53a
5266    pmulhrsw                m0, [o(pw_2019x8)]            ;t42a
5267    pmulhrsw                m2, m1, [o(pw_3948x8)]        ;t52a
5268    pmulhrsw                m1, [o(pw_m1092x8)]           ;t43a
5269    psubsw                  m4, m1, m0                    ;t42
5270    paddsw                  m0, m1                        ;t43
5271    psubsw                  m5, m2, m3                    ;t53
5272    paddsw                  m3, m2                        ;t52
5273    ITX_MULSUB_2W            5, 4, 1, 2, 7, m3612, 1931   ;t42a, t53a
5274    mova [rsp+gprsize*2+16*45], m5                        ;t42a
5275    mova [rsp+gprsize*2+16*46], m0                        ;t43
5276    mova [rsp+gprsize*2+16*55], m3                        ;t52
5277    mova [rsp+gprsize*2+16*56], m4                        ;t53a
5278
5279    mova                    m0, [rsp+gprsize*2+16*47]     ;in13
5280    mova                    m1, [rsp+gprsize*2+16*53]     ;in19
5281    pmulhrsw                m3, m0, [o(pw_3889x8)]        ;t51a
5282    pmulhrsw                m0, [o(pw_1285x8)]            ;t44a
5283    pmulhrsw                m2, m1, [o(pw_3659x8)]        ;t50a
5284    pmulhrsw                m1, [o(pw_m1842x8)]           ;t45a
5285    psubsw                  m4, m0, m1                    ;t45
5286    paddsw                  m0, m1                        ;t44
5287    psubsw                  m5, m3, m2                    ;t50
5288    paddsw                  m3, m2                        ;t51
5289    ITX_MULSUB_2W            5, 4, 1, 2, 7, 3920, 1189    ;t45a, t50a
5290    mova                    m6, m0
5291    mova [rsp+gprsize*2+16*48], m5                        ;t45a
5292    mova [rsp+gprsize*2+16*53], m4                        ;t50a
5293    mova [rsp+gprsize*2+16*54], m3                        ;t51
5294
5295    mova                    m0, [rsp+gprsize*2+16*51]     ;in29
5296    mova                    m1, [rsp+gprsize*2+16*49]     ;in3
5297    pmulhrsw                m3, m0, [o(pw_3102x8)]        ;t49a
5298    pmulhrsw                m0, [o(pw_2675x8)]            ;t46a
5299    pmulhrsw                m2, m1, [o(pw_4085x8)]        ;t48a
5300    pmulhrsw                m1, [o(pw_m301x8)]            ;t47a
5301    psubsw                  m5, m1, m0                    ;t46
5302    paddsw                  m0, m1                        ;t47
5303    psubsw                  m4, m2, m3                    ;t49
5304    paddsw                  m3, m2                        ;t48
5305
5306ALIGN function_align
; .main2: second entry into the butterfly network that combines the t32..t63
; intermediates with tmp[0..31] to form the outputs. The code directly above
; falls through into this label.
; NOTE(review): presumably also entered directly by callers that produce the
; entry state in some other way -- confirm against the call sites.
;
; State expected on entry (as left by the fall-through path above):
;   m0 = t47, m3 = t48, m4 = t49, m5 = t46, m6 = t44
;   m7 = [pd_2048]; it is handed to every ITX_MULSUB_2W below as 5th argument
;   t32..t43, t45 and t50..t63 in stack slots [rsp+gprsize*2+16*(index+3)]
;   tmp[k] in stack slot [rsp+gprsize*2+16*(k+3)]
;
; Every result out[k] is written to [rsp+gprsize*2+16*(k+3)], i.e. over the
; tmp[] or t value that previously lived in that slot.
; All additions/subtractions use the saturating forms paddsw/psubsw.
; m0-m6 are used as scratch throughout.
; NOTE(review): ITX_MULSUB_2W is defined elsewhere; from its use here it looks
; like a rounded rotation of its first two register operands by the given
; coefficient pair, with operands 3-4 as temporaries -- confirm in the macro.
5307.main2:
    ; t44..t51: rotate t46/t49, then butterflies t44/t47 and t48/t51
5308    ITX_MULSUB_2W            4, 5, 1, 2, 7, m1189, 3920   ;t46a, t49a
5309    mova                    m1, [rsp+gprsize*2+16*54]     ;t51
5310    psubsw                  m2, m0, m6                    ;t44a
5311    paddsw                  m0, m6                        ;t47a
5312    psubsw                  m6, m3, m1                    ;t51a
5313    paddsw                  m3, m1                        ;t48a
5314    mova [rsp+gprsize*2+16*50], m0                        ;t47a
5315    mova [rsp+gprsize*2+16*51], m3                        ;t48a
5316    ITX_MULSUB_2W            6, 2, 0, 3, 7, m2276, 3406   ;t44, t51
5317    mova [rsp+gprsize*2+16*47], m6                        ;t44
5318    mova [rsp+gprsize*2+16*54], m2                        ;t51
5319
    ; butterflies t45/t46 and t49/t50 (m4/m5 still hold t46a/t49a)
5320    mova                    m0, [rsp+gprsize*2+16*48]     ;t45a
5321    mova                    m3, [rsp+gprsize*2+16*53]     ;t50a
5322    psubsw                  m2, m4, m0                    ;t45
5323    paddsw                  m4, m0                        ;t46
5324    psubsw                  m6, m5, m3                    ;t50
5325    paddsw                  m5, m3                        ;t49
5326    ITX_MULSUB_2W            6, 2, 0, 3, 7, m2276, 3406   ;t45a, t50a
5327    mova [rsp+gprsize*2+16*48], m6                        ;t45a
5328    mova [rsp+gprsize*2+16*49], m4                        ;t46
5329    mova [rsp+gprsize*2+16*52], m5                        ;t49
5330    mova [rsp+gprsize*2+16*53], m2                        ;t50a
5331
    ; t40..t43 with t52..t55, coefficient pair (3406, 2276)
5332    mova                    m0, [rsp+gprsize*2+16*43]     ;t40
5333    mova                    m2, [rsp+gprsize*2+16*46]     ;t43
5334    mova                    m3, [rsp+gprsize*2+16*55]     ;t52
5335    mova                    m1, [rsp+gprsize*2+16*58]     ;t55
5336    psubsw                  m4, m0, m2                    ;t43a
5337    paddsw                  m0, m2                        ;t40a
5338    psubsw                  m5, m1, m3                    ;t52a
5339    paddsw                  m1, m3                        ;t55a
5340    ITX_MULSUB_2W            5, 4, 2, 3, 7, 3406, 2276    ;t43, t52
5341    mova [rsp+gprsize*2+16*43], m0                        ;t40a
5342    mova [rsp+gprsize*2+16*46], m5                        ;t43
5343    mova [rsp+gprsize*2+16*55], m4                        ;t52
5344    mova [rsp+gprsize*2+16*58], m1                        ;t55a
5345
5346    mova                    m0, [rsp+gprsize*2+16*44]     ;t41a
5347    mova                    m2, [rsp+gprsize*2+16*45]     ;t42a
5348    mova                    m3, [rsp+gprsize*2+16*56]     ;t53a
5349    mova                    m1, [rsp+gprsize*2+16*57]     ;t54a
5350    psubsw                  m4, m0, m2                    ;t42
5351    paddsw                  m0, m2                        ;t41
5352    psubsw                  m5, m1, m3                    ;t53
5353    paddsw                  m1, m3                        ;t54
5354    ITX_MULSUB_2W            5, 4, 2, 3, 7, 3406, 2276    ;t42a, t53a
5355    mova [rsp+gprsize*2+16*44], m0                        ;t41
5356    mova [rsp+gprsize*2+16*45], m5                        ;t42a
5357    mova [rsp+gprsize*2+16*56], m4                        ;t53a
5358    mova [rsp+gprsize*2+16*57], m1                        ;t54
5359
    ; t36..t39 with t56..t59, coefficient pair (m4017, 799)
5360    mova                    m0, [rsp+gprsize*2+16*41]     ;t38a
5361    mova                    m2, [rsp+gprsize*2+16*40]     ;t37a
5362    mova                    m3, [rsp+gprsize*2+16*61]     ;t58a
5363    mova                    m1, [rsp+gprsize*2+16*60]     ;t57a
5364    psubsw                  m4, m0, m2                    ;t37
5365    paddsw                  m0, m2                        ;t38
5366    psubsw                  m5, m1, m3                    ;t58
5367    paddsw                  m1, m3                        ;t57
5368    ITX_MULSUB_2W            5, 4, 2, 3, 7, m4017, 799    ;t37a, t58a
5369    mova [rsp+gprsize*2+16*41], m0                        ;t38
5370    mova [rsp+gprsize*2+16*40], m5                        ;t37a
5371    mova [rsp+gprsize*2+16*61], m4                        ;t58a
5372    mova [rsp+gprsize*2+16*60], m1                        ;t57
5373
5374    mova                    m0, [rsp+gprsize*2+16*42]     ;t39
5375    mova                    m2, [rsp+gprsize*2+16*39]     ;t36
5376    mova                    m3, [rsp+gprsize*2+16*62]     ;t59
5377    mova                    m1, [rsp+gprsize*2+16*59]     ;t56
5378    psubsw                  m4, m0, m2                    ;t36a
5379    paddsw                  m0, m2                        ;t39a
5380    psubsw                  m5, m1, m3                    ;t59a
5381    paddsw                  m1, m3                        ;t56a
5382    ITX_MULSUB_2W            5, 4, 2, 3, 7, m4017, 799    ;t36, t59
5383    mova [rsp+gprsize*2+16*42], m0                        ;t39a
5384    mova [rsp+gprsize*2+16*39], m5                        ;t36
5385    mova [rsp+gprsize*2+16*62], m4                        ;t59
5386    mova [rsp+gprsize*2+16*59], m1                        ;t56a
5387
    ; t32..t35 with t60..t63, coefficient pair (799, 4017)
5388    mova                    m0, [rsp+gprsize*2+16*35]     ;t32
5389    mova                    m2, [rsp+gprsize*2+16*38]     ;t35
5390    mova                    m3, [rsp+gprsize*2+16*63]     ;t60
5391    mova                    m1, [rsp+gprsize*2+16*66]     ;t63
5392    psubsw                  m4, m0, m2                    ;t35a
5393    paddsw                  m0, m2                        ;t32a
5394    psubsw                  m5, m1, m3                    ;t60a
5395    paddsw                  m1, m3                        ;t63a
5396    ITX_MULSUB_2W            5, 4, 2, 3, 7,  799, 4017    ;t35, t60
5397    mova [rsp+gprsize*2+16*35], m0                        ;t32a
5398    mova [rsp+gprsize*2+16*38], m5                        ;t35
5399    mova [rsp+gprsize*2+16*63], m4                        ;t60
5400    mova [rsp+gprsize*2+16*66], m1                        ;t63a
5401
    ; the results of this group stay in m0/m1/m4/m5 and feed the next two groups
5402    mova                    m0, [rsp+gprsize*2+16*36]     ;t33a
5403    mova                    m2, [rsp+gprsize*2+16*37]     ;t34a
5404    mova                    m3, [rsp+gprsize*2+16*64]     ;t61a
5405    mova                    m1, [rsp+gprsize*2+16*65]     ;t62a
5406    psubsw                  m4, m0, m2                    ;t34
5407    paddsw                  m0, m2                        ;t33
5408    psubsw                  m5, m1, m3                    ;t61
5409    paddsw                  m1, m3                        ;t62
5410    ITX_MULSUB_2W            5, 4, 2, 3, 7,  799, 4017    ;t34a, t61a
5411
    ; t32..t39 with t56..t63, coefficient pair (1567, 3784)
5412    mova                    m2, [rsp+gprsize*2+16*41]     ;t38
5413    mova                    m3, [rsp+gprsize*2+16*60]     ;t57
5414    psubsw                  m6, m0, m2                    ;t38a
5415    paddsw                  m0, m2                        ;t33a
5416    psubsw                  m2, m1, m3                    ;t57a
5417    paddsw                  m1, m3                        ;t62a
5418    mova [rsp+gprsize*2+16*36], m0                        ;t33a
5419    mova [rsp+gprsize*2+16*65], m1                        ;t62a
5420    ITX_MULSUB_2W            2, 6, 0, 3, 7, 1567, 3784    ;t38, t57
5421    mova [rsp+gprsize*2+16*41], m2                        ;t38
5422    mova [rsp+gprsize*2+16*60], m6                        ;t57
5423
5424    mova                    m2, [rsp+gprsize*2+16*40]     ;t37a
5425    mova                    m3, [rsp+gprsize*2+16*61]     ;t58a
5426    psubsw                  m0, m5, m2                    ;t37
5427    paddsw                  m5, m2                        ;t34
5428    psubsw                  m1, m4, m3                    ;t58
5429    paddsw                  m4, m3                        ;t61
5430    ITX_MULSUB_2W            1, 0, 2, 3, 7, 1567, 3784    ;t37a, t58a
5431    mova [rsp+gprsize*2+16*37], m5                        ;t34
5432    mova [rsp+gprsize*2+16*64], m4                        ;t61
5433    mova [rsp+gprsize*2+16*40], m1                        ;t37a
5434    mova [rsp+gprsize*2+16*61], m0                        ;t58a
5435
5436    mova                    m0, [rsp+gprsize*2+16*38]     ;t35
5437    mova                    m2, [rsp+gprsize*2+16*39]     ;t36
5438    mova                    m3, [rsp+gprsize*2+16*62]     ;t59
5439    mova                    m1, [rsp+gprsize*2+16*63]     ;t60
5440    psubsw                  m4, m0, m2                    ;t36a
5441    paddsw                  m0, m2                        ;t35a
5442    psubsw                  m5, m1, m3                    ;t59a
5443    paddsw                  m1, m3                        ;t60a
5444    ITX_MULSUB_2W            5, 4, 2, 3, 7, 1567, 3784    ;t36, t59
5445    mova [rsp+gprsize*2+16*38], m0                        ;t35a
5446    mova [rsp+gprsize*2+16*39], m5                        ;t36
5447    mova [rsp+gprsize*2+16*62], m4                        ;t59
5448    mova [rsp+gprsize*2+16*63], m1                        ;t60a
5449
5450    mova                    m0, [rsp+gprsize*2+16*35]     ;t32a
5451    mova                    m2, [rsp+gprsize*2+16*42]     ;t39a
5452    mova                    m3, [rsp+gprsize*2+16*59]     ;t56a
5453    mova                    m1, [rsp+gprsize*2+16*66]     ;t63a
5454    psubsw                  m4, m0, m2                    ;t39
5455    paddsw                  m0, m2                        ;t32
5456    psubsw                  m5, m1, m3                    ;t56
5457    paddsw                  m1, m3                        ;t63
5458    ITX_MULSUB_2W            5, 4, 2, 3, 7, 1567, 3784    ;t39a, t56a
5459    mova [rsp+gprsize*2+16*35], m0                        ;t32
5460    mova [rsp+gprsize*2+16*42], m5                        ;t39a
5461    mova [rsp+gprsize*2+16*59], m4                        ;t56a
5462    mova [rsp+gprsize*2+16*66], m1                        ;t63
5463
    ; t40..t47 with t48..t55, coefficient pair (m3784, 1567)
5464    mova                    m0, [rsp+gprsize*2+16*50]     ;t47a
5465    mova                    m2, [rsp+gprsize*2+16*43]     ;t40a
5466    mova                    m3, [rsp+gprsize*2+16*58]     ;t55a
5467    mova                    m1, [rsp+gprsize*2+16*51]     ;t48a
5468    psubsw                  m4, m0, m2                    ;t40
5469    paddsw                  m0, m2                        ;t47
5470    psubsw                  m5, m1, m3                    ;t55
5471    paddsw                  m1, m3                        ;t48
5472    ITX_MULSUB_2W            5, 4, 2, 3, 7, m3784, 1567   ;t40a, t55a
5473    mova [rsp+gprsize*2+16*50], m0                        ;t47
5474    mova [rsp+gprsize*2+16*43], m5                        ;t40a
5475    mova [rsp+gprsize*2+16*58], m4                        ;t55a
5476    mova [rsp+gprsize*2+16*51], m1                        ;t48
5477
5478    mova                    m0, [rsp+gprsize*2+16*49]     ;t46
5479    mova                    m2, [rsp+gprsize*2+16*44]     ;t41
5480    mova                    m3, [rsp+gprsize*2+16*57]     ;t54
5481    mova                    m1, [rsp+gprsize*2+16*52]     ;t49
5482    psubsw                  m4, m0, m2                    ;t41a
5483    paddsw                  m0, m2                        ;t46a
5484    psubsw                  m5, m1, m3                    ;t54a
5485    paddsw                  m1, m3                        ;t49a
5486    ITX_MULSUB_2W            5, 4, 2, 3, 7, m3784, 1567   ;t41, t54
5487    mova [rsp+gprsize*2+16*49], m0                        ;t46a
5488    mova [rsp+gprsize*2+16*44], m5                        ;t41
5489    mova [rsp+gprsize*2+16*57], m4                        ;t54
5490    mova [rsp+gprsize*2+16*52], m1                        ;t49a
5491
5492    mova                    m0, [rsp+gprsize*2+16*48]     ;t45a
5493    mova                    m2, [rsp+gprsize*2+16*45]     ;t42a
5494    mova                    m3, [rsp+gprsize*2+16*56]     ;t53a
5495    mova                    m1, [rsp+gprsize*2+16*53]     ;t50a
5496    psubsw                  m4, m0, m2                    ;t42
5497    paddsw                  m0, m2                        ;t45
5498    psubsw                  m5, m1, m3                    ;t53
5499    paddsw                  m1, m3                        ;t50
5500    ITX_MULSUB_2W            5, 4, 2, 3, 7, m3784, 1567   ;t42a, t53a
5501    mova [rsp+gprsize*2+16*48], m0                        ;t45
5502    mova [rsp+gprsize*2+16*45], m5                        ;t42a
5503    mova [rsp+gprsize*2+16*56], m4                        ;t53a
5504    mova [rsp+gprsize*2+16*53], m1                        ;t50
5505
    ; results of this group are not stored: m0 = t44a, m1 = t51a, m5 = t43,
    ; m4 = t52 are consumed directly by the output code below
5506    mova                    m0, [rsp+gprsize*2+16*47]     ;t44
5507    mova                    m2, [rsp+gprsize*2+16*46]     ;t43
5508    mova                    m3, [rsp+gprsize*2+16*55]     ;t52
5509    mova                    m1, [rsp+gprsize*2+16*54]     ;t51
5510    psubsw                  m4, m0, m2                    ;t43a
5511    paddsw                  m0, m2                        ;t44a
5512    psubsw                  m5, m1, m3                    ;t52a
5513    paddsw                  m1, m3                        ;t51a
5514    ITX_MULSUB_2W            5, 4, 2, 3, 7, m3784, 1567   ;t43, t52
5515
    ; last butterflies: the sums are combined with tmp[] straight away, the
    ; differences first pass through ITX_MULSUB_2W with (2896, 2896).
    ; out[k] and out[63-k] are formed as tmp[k] + t[63-k] and tmp[k] - t[63-k].
    ; NOTE(review): 2896 ~= 4096/sqrt(2), presumably a 12-bit fixed-point
    ; 1/sqrt(2) factor -- confirm against the macro's rounding/shift.
5516    mova                    m2, [rsp+gprsize*2+16*38]     ;t35a
5517    mova                    m3, [rsp+gprsize*2+16*31]     ;tmp[28]
5518    psubsw                  m6, m2, m0                    ;t44
5519    paddsw                  m2, m0                        ;t35
5520    psubsw                  m0, m3, m2                    ;out35
5521    paddsw                  m2, m3                        ;out28
5522    mova                    m3, [rsp+gprsize*2+16*63]     ;t60a
5523    mova [rsp+gprsize*2+16*38], m0                        ;out35
5524    mova [rsp+gprsize*2+16*31], m2                        ;out28
5525    psubsw                  m0, m3, m1                    ;t51
5526    paddsw                  m3, m1                        ;t60
5527    ITX_MULSUB_2W            0, 6, 1, 2, 7, 2896, 2896    ;t44a, t51a
5528    mova                    m2, [rsp+gprsize*2+16*6 ]     ;tmp[3]
5529    psubsw                  m1, m2, m3                    ;out60
5530    paddsw                  m2, m3                        ;out3
5531    mova                    m3, [rsp+gprsize*2+16*22]     ;tmp[19]
5532    mova [rsp+gprsize*2+16*63], m1                        ;out60
5533    mova [rsp+gprsize*2+16*6 ], m2                        ;out3
5534    psubsw                  m1, m3, m0                    ;out44
5535    paddsw                  m3, m0                        ;out19
5536    mova                    m2, [rsp+gprsize*2+16*15]     ;tmp[12]
5537
    ; stores of out44/out19/out51/out12 are interleaved with the loads of the
    ; next group (t36/t59 with the t43/t52 still held in m5/m4)
5538    mova                    m0, [rsp+gprsize*2+16*39]     ;t36
5539    mova [rsp+gprsize*2+16*47], m1                        ;out44
5540    mova [rsp+gprsize*2+16*22], m3                        ;out19
5541    mova                    m1, [rsp+gprsize*2+16*62]     ;t59
5542    psubsw                  m3, m2, m6                    ;out51
5543    paddsw                  m2, m6                        ;out12
5544    mova [rsp+gprsize*2+16*54], m3                        ;out51
5545    mova [rsp+gprsize*2+16*15], m2                        ;out12
5546    psubsw                  m2, m0, m5                    ;t43a
5547    paddsw                  m0, m5                        ;t36a
5548    mova                    m5, [rsp+gprsize*2+16*30]     ;tmp[27]
5549    psubsw                  m3, m1, m4                    ;t52a
5550    paddsw                  m1, m4                        ;t59a
5551    ITX_MULSUB_2W            3, 2, 4, 6, 7, 2896, 2896    ;t43, t52
5552    mova                    m4, [rsp+gprsize*2+16*7 ]     ;tmp[4 ]
5553    psubsw                  m6, m5, m0                    ;out36
5554    paddsw                  m5, m0                        ;out27
5555    psubsw                  m0, m4, m1                    ;out59
5556    paddsw                  m4, m1                        ;out4
5557    mova [rsp+gprsize*2+16*39], m6                        ;out36
5558    mova [rsp+gprsize*2+16*30], m5                        ;out27
5559    mova [rsp+gprsize*2+16*62], m0                        ;out59
5560    mova [rsp+gprsize*2+16*7 ], m4                        ;out4
5561    mova                    m0, [rsp+gprsize*2+16*23]     ;tmp[20]
5562    mova                    m5, [rsp+gprsize*2+16*14]     ;tmp[11]
5563    psubsw                  m4, m0, m3                    ;out43
5564    paddsw                  m0, m3                        ;out20
5565    psubsw                  m6, m5, m2                    ;out52
5566    paddsw                  m5, m2                        ;out11
5567    mova [rsp+gprsize*2+16*46], m4                        ;out43
5568    mova [rsp+gprsize*2+16*23], m0                        ;out20
5569    mova [rsp+gprsize*2+16*55], m6                        ;out52
5570    mova [rsp+gprsize*2+16*14], m5                        ;out11
5571
    ; the remaining groups all follow one pattern: load four t values and one
    ; tmp[], butterfly, rotate the differences, then emit eight outputs
5572    mova                    m0, [rsp+gprsize*2+16*40]     ;t37a
5573    mova                    m5, [rsp+gprsize*2+16*45]     ;t42a
5574    mova                    m3, [rsp+gprsize*2+16*56]     ;t53a
5575    mova                    m1, [rsp+gprsize*2+16*61]     ;t58a
5576    mova                    m2, [rsp+gprsize*2+16*29]     ;tmp[26]
5577    psubsw                  m4, m0, m5                    ;t42
5578    paddsw                  m0, m5                        ;t37
5579    psubsw                  m5, m1, m3                    ;t53
5580    paddsw                  m1, m3                        ;t58
5581    ITX_MULSUB_2W            5, 4, 3, 6, 7, 2896, 2896    ;t42a, t53a
5582    mova                    m3, [rsp+gprsize*2+16*8 ]     ;tmp[5 ]
5583    psubsw                  m6, m2, m0                    ;out37
5584    paddsw                  m2, m0                        ;out26
5585    psubsw                  m0, m3, m1                    ;out58
5586    paddsw                  m3, m1                        ;out5
5587    mova [rsp+gprsize*2+16*40], m6                        ;out37
5588    mova [rsp+gprsize*2+16*29], m2                        ;out26
5589    mova [rsp+gprsize*2+16*61], m0                        ;out58
5590    mova [rsp+gprsize*2+16*8 ], m3                        ;out5
5591    mova                    m0, [rsp+gprsize*2+16*24]     ;tmp[21]
5592    mova                    m1, [rsp+gprsize*2+16*13]     ;tmp[10]
5593    psubsw                  m2, m0, m5                    ;out42
5594    paddsw                  m0, m5                        ;out21
5595    psubsw                  m3, m1, m4                    ;out53
5596    paddsw                  m1, m4                        ;out10
5597    mova [rsp+gprsize*2+16*45], m2                        ;out42
5598    mova [rsp+gprsize*2+16*24], m0                        ;out21
5599    mova [rsp+gprsize*2+16*56], m3                        ;out53
5600    mova [rsp+gprsize*2+16*13], m1                        ;out10
5601
5602    mova                    m0, [rsp+gprsize*2+16*41]     ;t38
5603    mova                    m5, [rsp+gprsize*2+16*44]     ;t41
5604    mova                    m3, [rsp+gprsize*2+16*57]     ;t54
5605    mova                    m1, [rsp+gprsize*2+16*60]     ;t57
5606    mova                    m2, [rsp+gprsize*2+16*28]     ;tmp[25]
5607    psubsw                  m4, m0, m5                    ;t41a
5608    paddsw                  m0, m5                        ;t38a
5609    psubsw                  m5, m1, m3                    ;t54a
5610    paddsw                  m1, m3                        ;t57a
5611    ITX_MULSUB_2W            5, 4, 3, 6, 7, 2896, 2896    ;t41, t54
5612    mova                    m3, [rsp+gprsize*2+16*9 ]     ;tmp[6 ]
5613    psubsw                  m6, m2, m0                    ;out38
5614    paddsw                  m2, m0                        ;out25
5615    psubsw                  m0, m3, m1                    ;out57
5616    paddsw                  m3, m1                        ;out6
5617    mova [rsp+gprsize*2+16*41], m6                        ;out38
5618    mova [rsp+gprsize*2+16*28], m2                        ;out25
5619    mova [rsp+gprsize*2+16*60], m0                        ;out57
5620    mova [rsp+gprsize*2+16*9 ], m3                        ;out6
5621    mova                    m0, [rsp+gprsize*2+16*25]     ;tmp[22]
5622    mova                    m1, [rsp+gprsize*2+16*12]     ;tmp[9 ]
5623    psubsw                  m2, m0, m5                    ;out41
5624    paddsw                  m0, m5                        ;out22
5625    psubsw                  m3, m1, m4                    ;out54
5626    paddsw                  m1, m4                        ;out9
5627    mova [rsp+gprsize*2+16*44], m2                        ;out41
5628    mova [rsp+gprsize*2+16*25], m0                        ;out22
5629    mova [rsp+gprsize*2+16*57], m3                        ;out54
5630    mova [rsp+gprsize*2+16*12], m1                        ;out9
5631
5632    mova                    m0, [rsp+gprsize*2+16*42]     ;t39a
5633    mova                    m5, [rsp+gprsize*2+16*43]     ;t40a
5634    mova                    m3, [rsp+gprsize*2+16*58]     ;t55a
5635    mova                    m1, [rsp+gprsize*2+16*59]     ;t56a
5636    mova                    m2, [rsp+gprsize*2+16*27]     ;tmp[24]
5637    psubsw                  m4, m0, m5                    ;t40
5638    paddsw                  m0, m5                        ;t39
5639    psubsw                  m5, m1, m3                    ;t55
5640    paddsw                  m1, m3                        ;t56
5641    ITX_MULSUB_2W            5, 4, 3, 6, 7, 2896, 2896    ;t40a, t55a
5642    mova                    m3, [rsp+gprsize*2+16*10]     ;tmp[7 ]
5643    psubsw                  m6, m2, m0                    ;out39
5644    paddsw                  m2, m0                        ;out24
5645    psubsw                  m0, m3, m1                    ;out56
5646    paddsw                  m3, m1                        ;out7
5647    mova [rsp+gprsize*2+16*42], m6                        ;out39
5648    mova [rsp+gprsize*2+16*27], m2                        ;out24
5649    mova [rsp+gprsize*2+16*59], m0                        ;out56
5650    mova [rsp+gprsize*2+16*10], m3                        ;out7
5651    mova                    m0, [rsp+gprsize*2+16*26]     ;tmp[23]
5652    mova                    m1, [rsp+gprsize*2+16*11]     ;tmp[8 ]
5653    psubsw                  m2, m0, m5                    ;out40
5654    paddsw                  m0, m5                        ;out23
5655    psubsw                  m3, m1, m4                    ;out55
5656    paddsw                  m1, m4                        ;out8
5657    mova [rsp+gprsize*2+16*43], m2                        ;out40
5658    mova [rsp+gprsize*2+16*26], m0                        ;out23
5659    mova [rsp+gprsize*2+16*58], m3                        ;out55
5660    mova [rsp+gprsize*2+16*11], m1                        ;out8
5661
5662    mova                    m0, [rsp+gprsize*2+16*37]     ;t34
5663    mova                    m5, [rsp+gprsize*2+16*48]     ;t45
5664    mova                    m3, [rsp+gprsize*2+16*53]     ;t50
5665    mova                    m1, [rsp+gprsize*2+16*64]     ;t61
5666    mova                    m2, [rsp+gprsize*2+16*32]     ;tmp[29]
5667    psubsw                  m4, m0, m5                    ;t45a
5668    paddsw                  m0, m5                        ;t34a
5669    psubsw                  m5, m1, m3                    ;t50a
5670    paddsw                  m1, m3                        ;t61a
5671    ITX_MULSUB_2W            5, 4, 3, 6, 7, 2896, 2896    ;t45, t50
5672    mova                    m3, [rsp+gprsize*2+16*5 ]     ;tmp[2 ]
5673    psubsw                  m6, m2, m0                    ;out34
5674    paddsw                  m2, m0                        ;out29
5675    psubsw                  m0, m3, m1                    ;out61
5676    paddsw                  m3, m1                        ;out2
5677    mova [rsp+gprsize*2+16*37], m6                        ;out34
5678    mova [rsp+gprsize*2+16*32], m2                        ;out29
5679    mova [rsp+gprsize*2+16*64], m0                        ;out61
5680    mova [rsp+gprsize*2+16*5 ], m3                        ;out2
5681    mova                    m0, [rsp+gprsize*2+16*21]     ;tmp[18]
5682    mova                    m1, [rsp+gprsize*2+16*16]     ;tmp[13]
5683    psubsw                  m2, m0, m5                    ;out45
5684    paddsw                  m0, m5                        ;out18
5685    psubsw                  m3, m1, m4                    ;out50
5686    paddsw                  m1, m4                        ;out13
5687    mova [rsp+gprsize*2+16*48], m2                        ;out45
5688    mova [rsp+gprsize*2+16*21], m0                        ;out18
5689    mova [rsp+gprsize*2+16*53], m3                        ;out50
5690    mova [rsp+gprsize*2+16*16], m1                        ;out13
5691
5692    mova                    m0, [rsp+gprsize*2+16*36]     ;t33a
5693    mova                    m5, [rsp+gprsize*2+16*49]     ;t46a
5694    mova                    m3, [rsp+gprsize*2+16*52]     ;t49a
5695    mova                    m1, [rsp+gprsize*2+16*65]     ;t62a
5696    mova                    m2, [rsp+gprsize*2+16*33]     ;tmp[30]
5697    psubsw                  m4, m0, m5                    ;t46
5698    paddsw                  m0, m5                        ;t33
5699    psubsw                  m5, m1, m3                    ;t49
5700    paddsw                  m1, m3                        ;t62
5701    ITX_MULSUB_2W            5, 4, 3, 6, 7, 2896, 2896    ;t46a, t49a
5702    mova                    m3, [rsp+gprsize*2+16*4 ]     ;tmp[1 ]
5703    psubsw                  m6, m2, m0                    ;out33
5704    paddsw                  m2, m0                        ;out30
5705    psubsw                  m0, m3, m1                    ;out62
5706    paddsw                  m3, m1                        ;out1
5707    mova [rsp+gprsize*2+16*36], m6                        ;out33
5708    mova [rsp+gprsize*2+16*33], m2                        ;out30
5709    mova [rsp+gprsize*2+16*65], m0                        ;out62
5710    mova [rsp+gprsize*2+16*4 ], m3                        ;out1
5711    mova                    m0, [rsp+gprsize*2+16*20]     ;tmp[17]
5712    mova                    m1, [rsp+gprsize*2+16*17]     ;tmp[14]
5713    psubsw                  m2, m0, m5                    ;out46
5714    paddsw                  m0, m5                        ;out17
5715    psubsw                  m3, m1, m4                    ;out49
5716    paddsw                  m1, m4                        ;out14
5717    mova [rsp+gprsize*2+16*49], m2                        ;out46
5718    mova [rsp+gprsize*2+16*20], m0                        ;out17
5719    mova [rsp+gprsize*2+16*52], m3                        ;out49
5720    mova [rsp+gprsize*2+16*17], m1                        ;out14
5721
5722    mova                    m0, [rsp+gprsize*2+16*35]     ;t32
5723    mova                    m5, [rsp+gprsize*2+16*50]     ;t47
5724    mova                    m3, [rsp+gprsize*2+16*51]     ;t48
5725    mova                    m1, [rsp+gprsize*2+16*66]     ;t63
5726    mova                    m2, [rsp+gprsize*2+16*34]     ;tmp[31]
5727    psubsw                  m4, m0, m5                    ;t47a
5728    paddsw                  m0, m5                        ;t32a
5729    psubsw                  m5, m1, m3                    ;t48a
5730    paddsw                  m1, m3                        ;t63a
5731    ITX_MULSUB_2W            5, 4, 3, 6, 7, 2896, 2896    ;t47, t48
5732    mova                    m3, [rsp+gprsize*2+16*3 ]     ;tmp[0 ]
5733    psubsw                  m6, m2, m0                    ;out32
5734    paddsw                  m2, m0                        ;out31
5735    psubsw                  m0, m3, m1                    ;out63
5736    paddsw                  m3, m1                        ;out0
5737    mova [rsp+gprsize*2+16*35], m6                        ;out32
5738    mova [rsp+gprsize*2+16*34], m2                        ;out31
5739    mova [rsp+gprsize*2+16*66], m0                        ;out63
5740    mova [rsp+gprsize*2+16*3 ], m3                        ;out0
    ; m5 = t47 and m4 = t48 are still live at this point.
    ; NOTE(review): presumably the code that follows combines them with the
    ; two tmp[] entries not consumed yet (tmp[15]/tmp[16]) -- confirm.
5741    mova                    m0, [rsp+gprsize*2+16*19]     ;tmp[16]
5742    mova                    m1, [rsp+gprsize*2+16*18]     ;tmp[15]
5743    psubsw                  m2, m0, m5                    ;out47
5744    paddsw                  m0, m5                        ;out16
5745    psubsw                  m3, m1, m4                    ;out48
5746    paddsw                  m1, m4                        ;out15
5747    mova [rsp+gprsize*2+16*50], m2                        ;out47
5748    mova [rsp+gprsize*2+16*19], m0                        ;out16
5749    mova [rsp+gprsize*2+16*51], m3                        ;out48
5750    mova [rsp+gprsize*2+16*18], m1                        ;out15
5751    ret
5752
5753
cglobal inv_txfm_add_dct_dct_64x16, 4, 6, 8, 16*132, dst, stride, coeff, eob, tx2
%if ARCH_X86_32
    LEA                     r5, $$
%endif
    test                  eobd, eobd
    jz .dconly                                            ;eob == 0 -> DC-only shortcut

    call m(idct_64x16_internal)
    RET

    ; DC-only path: scale the first coefficient, clear it in memory, then
    ; fall into .body to add the result to 16 rows of 64 pixels.
.dconly:
    lea                   tx2q, [o(m(inv_txfm_add_dct_dct_64x16).end)]
    movd                    m1, [o(pw_2896x8)]
    movd                    m2, [o(pw_8192)]
    pmulhrsw                m0, m1, [coeffq]
    mov               [coeffq], eobd                      ;eobd is 0 on this path: clears the DC coefficient
    mov                    r3d, 16                        ;row count; r3 backs eob, so set it only after the store

    ; Shared tail, also entered by jumps from other wrappers in this file.
    ; Inputs: m0 = DC value, m1 = pw_2896x8, m2 = first rounding factor,
    ;         r3d = number of rows, tx2q = address to jump to when done.
.body:
    pxor                    m7, m7                        ;zero register for byte -> word unpacking
    pmulhrsw                m0, m2
    movd                    m2, [o(pw_2048)]  ;intentionally rip-relative
    pmulhrsw                m0, m1
    pmulhrsw                m0, m2
    pshuflw                 m0, m0, q0000                 ;replicate word 0 across the low four words
    punpcklwd               m0, m0                        ;... and across all eight words

    ; One iteration = one row of 64 pixels: widen to words, add the DC
    ; term, pack back to bytes with unsigned saturation.
.loop:
    mova                    m1, [dstq+16*0]
    mova                    m3, [dstq+16*1]
    mova                    m5, [dstq+16*2]
    mova                    m6, [dstq+16*3]
    punpckhbw               m2, m1, m7
    punpckhbw               m4, m3, m7
    REPX    {punpcklbw x, m7}, m1, m3
    REPX        {paddw x, m0}, m2, m1, m4, m3
    packuswb                m1, m2
    packuswb                m3, m4
    punpckhbw               m2, m5, m7
    punpckhbw               m4, m6, m7
    REPX    {punpcklbw x, m7}, m5, m6
    REPX        {paddw x, m0}, m2, m5, m4, m6
    packuswb                m5, m2
    packuswb                m6, m4
    mova           [dstq+16*0], m1
    mova           [dstq+16*1], m3
    mova           [dstq+16*2], m5
    mova           [dstq+16*3], m6
    add                   dstq, strideq
    dec                    r3d
    jg .loop
    jmp                   tx2q

.end:
    RET
5817
5818
%macro LOAD_4ROWS 2-3 0 ;src, stride, is_rect2
    ; Loads the four rows at [src + stride*0 .. src + stride*3] into m0-m3.
    ; With is_rect2 != 0 every row is multiplied by pw_2896x8 (pmulhrsw)
    ; as it is loaded.

%if %3 == 0
    mova                 m0, [%1+%2*0]
    mova                 m1, [%1+%2*1]
    mova                 m2, [%1+%2*2]
    mova                 m3, [%1+%2*3]
%else
    ; m3 holds the constant while m0-m2 are produced, so it is scaled last.
    mova                 m3, [o(pw_2896x8)]
    pmulhrsw             m0, m3, [%1+%2*0]
    pmulhrsw             m1, m3, [%1+%2*1]
    pmulhrsw             m2, m3, [%1+%2*2]
    pmulhrsw             m3, [%1+%2*3]
%endif
%endmacro
5834
%macro LOAD_4ROWS_H 2 ;src, stride
    ; Upper-half counterpart of LOAD_4ROWS: fills m4-m7 from
    ; [src + stride*0 .. src + stride*3]. The loads are independent.
    mova                 m7, [%1+%2*3]
    mova                 m6, [%1+%2*2]
    mova                 m5, [%1+%2*1]
    mova                 m4, [%1+%2*0]
%endmacro
5841
cglobal idct_64x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    ; Two-pass driver for the 64x16 inverse DCT, entered by `call` from
    ; inv_txfm_add_dct_dct_64x16.
    ;   pass 1: two iterations (r3d = 2); coeffq and the scratch pointer
    ;           both advance by 16 bytes per iteration.
    ;   pass 2: four iterations over coeffq followed by four iterations
    ;           over the stack scratch buffer; dst advances by 8 bytes
    ;           per iteration.
    ; The stack offsets below must agree with the frame reserved by the
    ; caller and with the idct_* helpers, which are defined outside this
    ; block -- do not change them in isolation.
    mov                    r3d, 2
    mov  [rsp+gprsize*2+16*67], dstq                      ;save the real destination pointer
    lea                   dstq, [rsp+gprsize+16*68]       ;dstq now addresses a stack scratch buffer

.pass1_loop:
    ; Inputs 0, 8, 16, 24 (same numbering as the in1..in31 comments
    ; below); m4-m7 are zeroed before the call.
    LOAD_4ROWS     coeffq+32*0, 32*8
    pxor                    m4, m4
    REPX          {mova x, m4}, m5, m6, m7
    call  m(idct_8x8_internal).main
    SAVE_7ROWS    rsp+gprsize+16*3, 16

    ; Inputs 4, 12, 20, 28; again with m4-m7 zeroed.
    pxor                    m4, m4
    LOAD_4ROWS     coeffq+32*4, 32*8

    REPX          {mova x, m4}, m5, m6, m7
    call m(idct_16x8_internal).main
    mova                    m7, [rsp+gprsize+16*0]
    SAVE_8ROWS   rsp+gprsize+16*11, 16

    ; Inputs 2, 6, ..., 30 are placed in stack slots 19-26 before
    ; idct_8x32_internal.main_fast runs.
    LOAD_8ROWS     coeffq+32*2, 32*4
    mova   [rsp+gprsize+16*19], m0
    mova   [rsp+gprsize+16*26], m1
    mova   [rsp+gprsize+16*23], m2
    mova   [rsp+gprsize+16*22], m3
    mova   [rsp+gprsize+16*21], m4
    mova   [rsp+gprsize+16*24], m5
    mova   [rsp+gprsize+16*25], m6
    mova   [rsp+gprsize+16*20], m7

    call m(idct_8x32_internal).main_fast
    SAVE_8ROWS    rsp+gprsize+16*3, 16

    ; Odd inputs are scattered into stack slots 35-65 before
    ; idct_16x64_internal.main runs.
    LOAD_8ROWS     coeffq+32*1, 32*2
    mova   [rsp+gprsize+16*35], m0                        ;in1
    mova   [rsp+gprsize+16*49], m1                        ;in3
    mova   [rsp+gprsize+16*43], m2                        ;in5
    mova   [rsp+gprsize+16*41], m3                        ;in7
    mova   [rsp+gprsize+16*39], m4                        ;in9
    mova   [rsp+gprsize+16*45], m5                        ;in11
    mova   [rsp+gprsize+16*47], m6                        ;in13
    mova   [rsp+gprsize+16*37], m7                        ;in15

    LOAD_8ROWS    coeffq+32*17, 32*2
    mova   [rsp+gprsize+16*63], m0                        ;in17
    mova   [rsp+gprsize+16*53], m1                        ;in19
    mova   [rsp+gprsize+16*55], m2                        ;in21
    mova   [rsp+gprsize+16*61], m3                        ;in23
    mova   [rsp+gprsize+16*59], m4                        ;in25
    mova   [rsp+gprsize+16*57], m5                        ;in27
    mova   [rsp+gprsize+16*51], m6                        ;in29
    mova   [rsp+gprsize+16*65], m7                        ;in31

    call m(idct_16x64_internal).main

    ; The 64 result rows sit in stack slots 3..66. They are pushed through
    ; idct_8x8_internal.pass1_end1 eight rows at a time with m7 = pw_8192
    ; (the original m7 is spilled to slot 0). tx2q carries the label to
    ; continue at; the helper is outside this block and presumably jumps
    ; back through tx2q.
    LOAD_8ROWS    rsp+gprsize+16*3, 16
    mova    [rsp+gprsize+16*0], m7
    mova                    m7, [o(pw_8192)]
    lea                   tx2q, [o(m(idct_64x16_internal).pass1_end)]
    jmp   m(idct_8x8_internal).pass1_end1

.pass1_end:
    ; Groups 0-3 are written back over the coefficient buffer.
    SAVE_8ROWS     coeffq+32*0, 32
    LOAD_8ROWS   rsp+gprsize+16*11, 16
    mova    [rsp+gprsize+16*0], m7
    mova                    m7, [o(pw_8192)]
    lea                   tx2q, [o(m(idct_64x16_internal).pass1_end1)]
    jmp   m(idct_8x8_internal).pass1_end1

.pass1_end1:
    SAVE_8ROWS     coeffq+32*8, 32
    LOAD_8ROWS   rsp+gprsize+16*19, 16
    mova    [rsp+gprsize+16*0], m7
    mova                    m7, [o(pw_8192)]
    lea                   tx2q, [o(m(idct_64x16_internal).pass1_end2)]
    jmp   m(idct_8x8_internal).pass1_end1

.pass1_end2:
    SAVE_8ROWS    coeffq+32*16, 32
    LOAD_8ROWS   rsp+gprsize+16*27, 16
    mova    [rsp+gprsize+16*0], m7
    mova                    m7, [o(pw_8192)]
    lea                   tx2q, [o(m(idct_64x16_internal).pass1_end3)]
    jmp   m(idct_8x8_internal).pass1_end1

.pass1_end3:
    SAVE_8ROWS    coeffq+32*24, 32
    LOAD_8ROWS   rsp+gprsize+16*35, 16
    mova    [rsp+gprsize+16*0], m7
    mova                    m7, [o(pw_8192)]
    lea                   tx2q, [o(m(idct_64x16_internal).pass1_end4)]
    jmp   m(idct_8x8_internal).pass1_end1

.pass1_end4:
    ; Groups 4-7 go to the stack scratch buffer that dstq points at.
    SAVE_8ROWS       dstq+32*0, 32
    LOAD_8ROWS   rsp+gprsize+16*43, 16
    mova    [rsp+gprsize+16*0], m7
    mova                    m7, [o(pw_8192)]
    lea                   tx2q, [o(m(idct_64x16_internal).pass1_end5)]
    jmp   m(idct_8x8_internal).pass1_end1

.pass1_end5:
    SAVE_8ROWS       dstq+32*8, 32
    LOAD_8ROWS   rsp+gprsize+16*51, 16
    mova    [rsp+gprsize+16*0], m7
    mova                    m7, [o(pw_8192)]
    lea                   tx2q, [o(m(idct_64x16_internal).pass1_end6)]
    jmp   m(idct_8x8_internal).pass1_end1

.pass1_end6:
    SAVE_8ROWS      dstq+32*16, 32
    LOAD_8ROWS   rsp+gprsize+16*59, 16
    mova    [rsp+gprsize+16*0], m7
    mova                    m7, [o(pw_8192)]
    lea                   tx2q, [o(m(idct_64x16_internal).pass1_end7)]
    jmp   m(idct_8x8_internal).pass1_end1

.pass1_end7:
    SAVE_8ROWS      dstq+32*24, 32

    add                 coeffq, 16
    add                   dstq, 16
    dec                    r3d
    jg .pass1_loop

.pass2:
    mov                   dstq, [rsp+gprsize*2+16*67]     ;restore the real destination pointer
    sub                 coeffq, 32                        ;undo the two 16-byte steps taken in pass 1
    mov                    r3d, 4

.pass2_loop:
    ; First half of pass 2: input is read from the coefficient buffer.
    mov  [rsp+gprsize*1+16*67], r3d                       ;spill the loop counter; r3 is reused below

    LOAD_4ROWS     coeffq+16*0, 32*2
    LOAD_4ROWS_H   coeffq+16*1, 32*2
    call  m(idct_8x8_internal).main
    SAVE_7ROWS    rsp+gprsize+16*3, 16
    LOAD_4ROWS     coeffq+16*2, 32*2
    LOAD_4ROWS_H   coeffq+16*3, 32*2
    call m(idct_16x8_internal).main

    ; idct_8x8_internal.end (outside this block) is entered twice: first
    ; with dstq advanced by 8 rows, then (at .end) with the original dstq,
    ; which is kept in r3 meanwhile. tx2q again carries the continuation.
    mov                    r3, dstq
    lea                  tx2q, [o(m(idct_64x16_internal).end)]
    lea                  dstq, [dstq+strideq*8]
    jmp  m(idct_8x8_internal).end

.end:
    LOAD_8ROWS   rsp+gprsize+16*3, 16
    mova   [rsp+gprsize+16*0], m7
    lea                  tx2q, [o(m(idct_64x16_internal).end1)]
    mov                  dstq, r3
    jmp  m(idct_8x8_internal).end

.end1:
    ; Zero the 16*16 bytes of coefficients consumed by this iteration.
    pxor                   m7, m7
    REPX  {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15

    add                 coeffq, 16*16
    mov                    r3d, [rsp+gprsize*1+16*67]
    mov                   dstq, [rsp+gprsize*2+16*67]
    add                   dstq, 8                         ;advance the destination by 8 bytes
    mov  [rsp+gprsize*2+16*67], dstq
    dec                    r3d
    jg .pass2_loop

    ; Second half of pass 2: same processing, but the input comes from the
    ; stack scratch buffer filled in pass 1, and nothing is cleared
    ; afterwards.
    mov                    r3d, 4
    lea                 coeffq, [rsp+gprsize+16*68]
.pass2_loop2:
    mov  [rsp+gprsize*1+16*67], r3d

    LOAD_4ROWS     coeffq+16*0, 32*2
    LOAD_4ROWS_H   coeffq+16*1, 32*2
    call  m(idct_8x8_internal).main
    SAVE_7ROWS    rsp+gprsize+16*3, 16
    LOAD_4ROWS     coeffq+16*2, 32*2
    LOAD_4ROWS_H   coeffq+16*3, 32*2
    call m(idct_16x8_internal).main

    mov                    r3, dstq
    lea                  tx2q, [o(m(idct_64x16_internal).end2)]
    lea                  dstq, [dstq+strideq*8]
    jmp  m(idct_8x8_internal).end

.end2:
    LOAD_8ROWS   rsp+gprsize+16*3, 16
    mova   [rsp+gprsize+16*0], m7
    lea                  tx2q, [o(m(idct_64x16_internal).end3)]
    mov                  dstq, r3
    jmp  m(idct_8x8_internal).end

.end3:

    add                 coeffq, 16*16
    mov                    r3d, [rsp+gprsize*1+16*67]
    mov                   dstq, [rsp+gprsize*2+16*67]
    add                   dstq, 8
    mov  [rsp+gprsize*2+16*67], dstq
    dec                    r3d
    jg .pass2_loop2
    ret
6042
6043
cglobal inv_txfm_add_dct_dct_32x64, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2
%if ARCH_X86_32
    LEA                     r5, $$
%endif
    test                  eobd, eobd
    jz .dconly                                            ;eob == 0 -> DC-only shortcut

    call m(idct_32x64_internal)
    RET

    ; DC-only path: the coefficient is scaled by pw_2896x8 twice here,
    ; then the 32-pixel-wide row loop of the 32x8 wrapper (.body) is
    ; reused for 64 rows and returns through tx2q.
.dconly:
    lea                   tx2q, [o(m(inv_txfm_add_dct_dct_32x64).end)]
    movd                    m1, [o(pw_2896x8)]
    movd                    m2, [o(pw_16384)]
    pmulhrsw                m0, m1, [coeffq]
    pmulhrsw                m0, m1
    mov               [coeffq], eobd                      ;eobd is 0 on this path: clears the DC coefficient
    mov                    r3d, 64                        ;row count; r3 backs eob, so set it only after the store
    jmp m(inv_txfm_add_dct_dct_32x8).body

.end:
    RET
6066
6067
cglobal idct_32x64_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    ; Pass-1 driver for the 32x64 inverse DCT; pass 2 is delegated to
    ; idct_16x64_internal.pass2_loop (outside this block).
    ; The sign of (eob - 136) selects both the number of pass-1
    ; iterations (2 if negative, otherwise 4) and, inside the loop, the
    ; .fast versus .full code path.
    %undef cmp

    mov                    r4d, 2
    sub                   eobd, 136
    mov  [rsp+gprsize*1+16*67], eobd                      ;keep eob - 136 for the per-iteration test
    mov                    r3d, 4
    cmovs                  r3d, r4d                       ;flags still come from the sub (mov leaves them intact)

%if ARCH_X86_32
    LEA                     r5, $$
%endif

    mov  [rsp+gprsize*2+16*67], coeffq                    ;remember the start of the coefficient buffer

.pass1_loop:
    ; All coefficient loads in pass 1 pass 1 as the third macro argument
    ; (for LOAD_4ROWS this means scaling by pw_2896x8; LOAD_8ROWS is
    ; defined elsewhere and presumably behaves the same way).
    LOAD_8ROWS     coeffq+64*1, 64*2, 1
    mova   [rsp+gprsize+16*19], m0                        ;in1
    mova   [rsp+gprsize+16*26], m1                        ;in3
    mova   [rsp+gprsize+16*23], m2                        ;in5
    mova   [rsp+gprsize+16*22], m3                        ;in7
    mova   [rsp+gprsize+16*21], m4                        ;in9
    mova   [rsp+gprsize+16*24], m5                        ;in11
    mova   [rsp+gprsize+16*25], m6                        ;in13
    mova   [rsp+gprsize+16*20], m7                        ;in15

    mov                   tx2d, [rsp+gprsize*1+16*67]
    test                  tx2d, tx2d
    jl .fast                                              ;eob - 136 < 0

.full:
    ; Full path: all even inputs and in17..in31 are used.
    LOAD_8ROWS     coeffq+64*0, 64*4, 1
    call  m(idct_8x8_internal).main
    SAVE_7ROWS    rsp+gprsize+16*3, 16
    LOAD_8ROWS     coeffq+64*2, 64*4, 1
    call m(idct_16x8_internal).main
    mova                    m7, [rsp+gprsize+16*0]
    SAVE_8ROWS   rsp+gprsize+16*11, 16

    LOAD_8ROWS    coeffq+64*17, 64*2, 1
    mova   [rsp+gprsize+16*33], m0                        ;in17
    mova   [rsp+gprsize+16*28], m1                        ;in19
    mova   [rsp+gprsize+16*29], m2                        ;in21
    mova   [rsp+gprsize+16*32], m3                        ;in23
    mova   [rsp+gprsize+16*31], m4                        ;in25
    mova   [rsp+gprsize+16*30], m5                        ;in27
    mova   [rsp+gprsize+16*27], m6                        ;in29
    mova   [rsp+gprsize+16*34], m7                        ;in31

    call m(idct_8x32_internal).main
    jmp .pass1_end

.fast:
    ; Fast path: only four rows are loaded per 8-point stage, m4-m7 are
    ; zeroed instead, and in17..in31 are not loaded at all.
    LOAD_4ROWS          coeffq, 256, 1
    pxor                    m4, m4
    REPX          {mova x, m4}, m5, m6, m7
    call  m(idct_8x8_internal).main

    SAVE_7ROWS    rsp+gprsize+16*3, 16
    LOAD_4ROWS    coeffq+128*1, 256, 1
    pxor                    m4, m4
    REPX          {mova x, m4}, m5, m6, m7
    call m(idct_16x8_internal).main
    mova                    m7, [rsp+gprsize+16*0]
    SAVE_8ROWS   rsp+gprsize+16*11, 16

    call m(idct_8x32_internal).main_fast

.pass1_end:
    ; The 32 result rows are pushed through idct_8x8_internal.pass1_end
    ; eight at a time and written back over the coefficient buffer.
    ; tx2q carries the label to continue at; the helper is outside this
    ; block and presumably jumps back through tx2q.
    mova    [rsp+gprsize+16*0], m7
    lea                   tx2q, [o(m(idct_32x64_internal).pass1_end1)]
    jmp   m(idct_8x8_internal).pass1_end

.pass1_end1:
    SAVE_8ROWS     coeffq+64*0, 64
    LOAD_8ROWS   rsp+gprsize+16*11, 16
    mova    [rsp+gprsize+16*0], m7
    lea                   tx2q, [o(m(idct_32x64_internal).pass1_end2)]
    jmp   m(idct_8x8_internal).pass1_end

.pass1_end2:
    SAVE_8ROWS     coeffq+64*8, 64
    LOAD_8ROWS   rsp+gprsize+16*19, 16
    mova    [rsp+gprsize+16*0], m7
    lea                   tx2q, [o(m(idct_32x64_internal).pass1_end3)]
    jmp   m(idct_8x8_internal).pass1_end

.pass1_end3:
    SAVE_8ROWS    coeffq+64*16, 64
    LOAD_8ROWS   rsp+gprsize+16*27, 16
    mova    [rsp+gprsize+16*0], m7
    lea                   tx2q, [o(m(idct_32x64_internal).pass1_end4)]
    jmp   m(idct_8x8_internal).pass1_end

.pass1_end4:
    SAVE_8ROWS    coeffq+64*24, 64

    add                 coeffq, 16
    dec                    r3d
    jg .pass1_loop

.pass2:
    ; Hand over to the 16x64 pass-2 loop: coeffq is rewound to the start
    ; of the buffer, r3d = 4 iterations, the stack slot receives dst + 8,
    ; and r4 holds the address of idct_16x64_internal.end1.
    ; NOTE(review): how pass2_loop consumes r4 and the stack slot is not
    ; visible here -- confirm against idct_16x64_internal.
    mov                 coeffq, [rsp+gprsize*2+16*67]
    mov                    r3d, 4
    lea                     r4, [dstq+8]
    mov  [rsp+gprsize*2+16*67], r4
    lea                     r4, [o(m(idct_16x64_internal).end1)]
    jmp m(idct_16x64_internal).pass2_loop
6176
6177
cglobal inv_txfm_add_dct_dct_64x32, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx2
%if ARCH_X86_32
    LEA                     r5, $$
%endif
    test                  eobd, eobd
    jz .dconly                                            ;eob == 0 -> DC-only shortcut

    call m(idct_64x32_internal)
    RET

    ; DC-only path: the coefficient is scaled by pw_2896x8 twice here,
    ; then the 64-pixel-wide row loop of the 64x16 wrapper (.body) is
    ; reused for 32 rows and returns through tx2q.
.dconly:
    lea                   tx2q, [o(m(inv_txfm_add_dct_dct_64x32).end)]
    movd                    m1, [o(pw_2896x8)]
    movd                    m2, [o(pw_16384)]
    pmulhrsw                m0, m1, [coeffq]
    pmulhrsw                m0, m1
    mov               [coeffq], eobd                      ;eobd is 0 on this path: clears the DC coefficient
    mov                    r3d, 32                        ;row count; r3 backs eob, so set it only after the store
    jmp m(inv_txfm_add_dct_dct_64x16).body

.end:
    RET
6200
cglobal idct_64x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    ; Pass-1 driver for the 64x32 inverse DCT; pass 2 is delegated to
    ; idct_32x32_internal.pass2_loop (outside this block), once for the
    ; stack scratch buffer and once for the coefficient buffer.
    ; The sign of (eob - 136) selects 2 or 4 pass-1 iterations.
    %undef cmp

    mov                    r4d, 2
    sub                   eobd, 136
    mov  [rsp+gprsize*1+16*67], eobd                      ;keep eob - 136 for pass 2
    mov                    r3d, 4
    cmovs                  r3d, r4d                       ;flags still come from the sub (mov leaves them intact)

%if ARCH_X86_32
    LEA                     r5, $$
%endif

    mov  [rsp+gprsize*2+16*67], coeffq                    ;start of the coefficient buffer
    mov  [rsp+gprsize*3+16*67], dstq                      ;real destination pointer
    lea                   dstq, [rsp+gprsize+16*69]       ;dstq now addresses a stack scratch buffer
    mov  [rsp+gprsize*4+16*67], dstq                      ;start of the scratch buffer

.pass1_loop:
    ; All coefficient loads in pass 1 pass 1 as the third macro argument
    ; (for LOAD_4ROWS this means scaling by pw_2896x8; LOAD_8ROWS is
    ; defined elsewhere and presumably behaves the same way).
    ; Inputs 0, 8, 16, 24; m4-m7 are zeroed before the call.
    LOAD_4ROWS     coeffq+64*0, 64*8, 1
    pxor                    m4, m4
    REPX          {mova x, m4}, m5, m6, m7
    call  m(idct_8x8_internal).main
    SAVE_7ROWS    rsp+gprsize+16*3, 16

    ; Inputs 4, 12, 20, 28; again with m4-m7 zeroed.
    pxor                    m4, m4
    LOAD_4ROWS     coeffq+64*4, 64*8, 1

    REPX          {mova x, m4}, m5, m6, m7
    call m(idct_16x8_internal).main
    mova                    m7, [rsp+gprsize+16*0]
    SAVE_8ROWS   rsp+gprsize+16*11, 16

    ; Inputs 2, 6, ..., 30 go to stack slots 19-26 for
    ; idct_8x32_internal.main_fast.
    LOAD_8ROWS     coeffq+64*2, 64*4, 1
    mova   [rsp+gprsize+16*19], m0
    mova   [rsp+gprsize+16*26], m1
    mova   [rsp+gprsize+16*23], m2
    mova   [rsp+gprsize+16*22], m3
    mova   [rsp+gprsize+16*21], m4
    mova   [rsp+gprsize+16*24], m5
    mova   [rsp+gprsize+16*25], m6
    mova   [rsp+gprsize+16*20], m7

    call m(idct_8x32_internal).main_fast
    SAVE_8ROWS    rsp+gprsize+16*3, 16

    ; Odd inputs are scattered into stack slots 35-65 for
    ; idct_16x64_internal.main.
    LOAD_8ROWS     coeffq+64*1, 64*2, 1
    mova   [rsp+gprsize+16*35], m0                        ;in1
    mova   [rsp+gprsize+16*49], m1                        ;in3
    mova   [rsp+gprsize+16*43], m2                        ;in5
    mova   [rsp+gprsize+16*41], m3                        ;in7
    mova   [rsp+gprsize+16*39], m4                        ;in9
    mova   [rsp+gprsize+16*45], m5                        ;in11
    mova   [rsp+gprsize+16*47], m6                        ;in13
    mova   [rsp+gprsize+16*37], m7                        ;in15

    LOAD_8ROWS    coeffq+64*17, 64*2, 1
    mova   [rsp+gprsize+16*63], m0                        ;in17
    mova   [rsp+gprsize+16*53], m1                        ;in19
    mova   [rsp+gprsize+16*55], m2                        ;in21
    mova   [rsp+gprsize+16*61], m3                        ;in23
    mova   [rsp+gprsize+16*59], m4                        ;in25
    mova   [rsp+gprsize+16*57], m5                        ;in27
    mova   [rsp+gprsize+16*51], m6                        ;in29
    mova   [rsp+gprsize+16*65], m7                        ;in31

    call m(idct_16x64_internal).main

    ; The 64 result rows (stack slots 3..66) are pushed through
    ; idct_8x8_internal.pass1_end eight at a time. tx2q carries the label
    ; to continue at; the helper is outside this block and presumably
    ; jumps back through tx2q.
    LOAD_8ROWS    rsp+gprsize+16*3, 16
    mova    [rsp+gprsize+16*0], m7
    lea                   tx2q, [o(m(idct_64x32_internal).pass1_end)]
    jmp   m(idct_8x8_internal).pass1_end

.pass1_end:
    ; Groups 0-3 are written back over the coefficient buffer.
    SAVE_8ROWS     coeffq+64*0, 64
    LOAD_8ROWS   rsp+gprsize+16*11, 16
    mova    [rsp+gprsize+16*0], m7
    lea                   tx2q, [o(m(idct_64x32_internal).pass1_end1)]
    jmp   m(idct_8x8_internal).pass1_end

.pass1_end1:
    SAVE_8ROWS     coeffq+64*8, 64
    LOAD_8ROWS   rsp+gprsize+16*19, 16
    mova    [rsp+gprsize+16*0], m7
    lea                   tx2q, [o(m(idct_64x32_internal).pass1_end2)]
    jmp   m(idct_8x8_internal).pass1_end

.pass1_end2:
    SAVE_8ROWS    coeffq+64*16, 64
    LOAD_8ROWS   rsp+gprsize+16*27, 16
    mova    [rsp+gprsize+16*0], m7
    lea                   tx2q, [o(m(idct_64x32_internal).pass1_end3)]
    jmp   m(idct_8x8_internal).pass1_end

.pass1_end3:
    SAVE_8ROWS    coeffq+64*24, 64
    LOAD_8ROWS   rsp+gprsize+16*35, 16
    mova    [rsp+gprsize+16*0], m7
    lea                   tx2q, [o(m(idct_64x32_internal).pass1_end4)]
    jmp   m(idct_8x8_internal).pass1_end

.pass1_end4:
    ; Groups 4-7 go to the stack scratch buffer that dstq points at.
    SAVE_8ROWS       dstq+64*0, 64
    LOAD_8ROWS   rsp+gprsize+16*43, 16
    mova    [rsp+gprsize+16*0], m7
    lea                   tx2q, [o(m(idct_64x32_internal).pass1_end5)]
    jmp   m(idct_8x8_internal).pass1_end

.pass1_end5:
    SAVE_8ROWS       dstq+64*8, 64
    LOAD_8ROWS   rsp+gprsize+16*51, 16
    mova    [rsp+gprsize+16*0], m7
    lea                   tx2q, [o(m(idct_64x32_internal).pass1_end6)]
    jmp   m(idct_8x8_internal).pass1_end

.pass1_end6:
    SAVE_8ROWS      dstq+64*16, 64
    LOAD_8ROWS   rsp+gprsize+16*59, 16
    mova    [rsp+gprsize+16*0], m7
    lea                   tx2q, [o(m(idct_64x32_internal).pass1_end7)]
    jmp   m(idct_8x8_internal).pass1_end

.pass1_end7:
    SAVE_8ROWS      dstq+64*24, 64

    add                 coeffq, 16
    add                   dstq, 16
    dec                    r3d
    jg .pass1_loop

.pass2:
    ; First pass-2 run: input is the stack scratch buffer, output starts
    ; at dst + 32. eob - 136 is copied to the slot at gprsize*1+16*35.
    ; NOTE(review): the slots at 16*35 are presumably the ones
    ; idct_32x32_internal.pass2_loop reads and writes -- confirm there.
    mov                 coeffq, [rsp+gprsize*4+16*67]
    mov                   dstq, [rsp+gprsize*3+16*67]
    mov                   eobd, [rsp+gprsize*1+16*67]
    lea                   dstq, [dstq+32]
    mov  [rsp+gprsize*1+16*35], eobd
    lea                   tx2q, [o(m(idct_64x32_internal).pass2_end)]
    mov                    r3d, 4
    jmp m(idct_32x32_internal).pass2_loop

.pass2_end:
    mova    [rsp+gprsize+16*0], m7
    lea                     r3, [o(m(idct_64x32_internal).pass2_end1)]
    jmp  m(idct_8x32_internal).end2

.pass2_end1:
    ; Loop bookkeeping for the first run: dstq and the counter are
    ; reloaded from stack slots that are not written in this block.
    lea                   tx2q, [o(m(idct_64x32_internal).pass2_end)]
    add                 coeffq, 16*32
    mov                   dstq, [rsp+gprsize*2+16*35]
    mov                    r3d, [rsp+gprsize*3+16*35]
    dec                    r3d
    jg m(idct_32x32_internal).pass2_loop

.pass2_end2:
    ; Second pass-2 run: input is the coefficient buffer, output starts
    ; at the original dst, and control is handed entirely to
    ; idct_32x32_internal (tx2q points at its own .pass2_end).
    mov                   dstq, [rsp+gprsize*3+16*67]
    mov                 coeffq, [rsp+gprsize*2+16*67]
    lea                   tx2q, [o(m(idct_32x32_internal).pass2_end)]
    mov                    r3d, 4
    jmp m(idct_32x32_internal).pass2_loop
6360
6361
cglobal inv_txfm_add_dct_dct_64x64, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx2
%if ARCH_X86_32
    LEA                     r5, $$
%endif
    test                  eobd, eobd
    jz .dconly                                            ;eob == 0 -> DC-only shortcut

    call m(idct_64x64_internal)
    RET

    ; DC-only path: reuses the 64-pixel-wide row loop of the 64x16 wrapper
    ; (.body) for 64 rows. This function has no .end label of its own; it
    ; returns through the 64x32 wrapper's .end, which is declared with the
    ; same register count and stack size as this function.
.dconly:
    lea                   tx2q, [o(m(inv_txfm_add_dct_dct_64x32).end)]
    movd                    m1, [o(pw_2896x8)]
    movd                    m2, [o(pw_8192)]
    pmulhrsw                m0, m1, [coeffq]
    mov               [coeffq], eobd                      ;eobd is 0 on this path: clears the DC coefficient
    mov                    r3d, 64                        ;row count; r3 backs eob, so set it only after the store
    jmp m(inv_txfm_add_dct_dct_64x16).body
6380
cglobal idct_64x64_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
    ; Pass-1 driver for the 64x64 inverse DCT; pass 2 is delegated to
    ; idct_16x64_internal.pass2_loop (outside this block), once for the
    ; stack scratch buffer and once for the coefficient buffer.
    ; The sign of (eob - 136) selects 2 or 4 pass-1 iterations.
    %undef cmp

    mov                    r5d, 4
    mov                    r4d, 2
    sub                   eobd, 136
    cmovns                 r4d, r5d                       ;r4d = 4 when eob - 136 >= 0, else 2

%if ARCH_X86_32
    ; r5 was used as a temporary above, so it is reloaded here.
    LEA                     r5, $$
%endif

    mov  [rsp+gprsize*1+16*67], eobd                      ;keep eob - 136
    mov                    r3d, r4d                       ;pass-1 iteration count
    mov  [rsp+gprsize*4+16*67], coeffq                    ;start of the coefficient buffer
    mov  [rsp+gprsize*3+16*67], dstq                      ;real destination pointer
    lea                   dstq, [rsp+gprsize+16*69]       ;dstq now addresses a stack scratch buffer
    mov  [rsp+gprsize*2+16*67], dstq                      ;start of the scratch buffer

.pass1_loop:
    ; Inputs 0, 8, 16, 24 (same numbering as the in1..in31 comments
    ; below); m4-m7 are zeroed before the call.
    LOAD_4ROWS     coeffq+64*0, 64*8
    pxor                    m4, m4
    REPX          {mova x, m4}, m5, m6, m7
    call  m(idct_8x8_internal).main
    SAVE_7ROWS    rsp+gprsize+16*3, 16

    ; Inputs 4, 12, 20, 28; again with m4-m7 zeroed.
    pxor                    m4, m4
    LOAD_4ROWS     coeffq+64*4, 64*8

    REPX          {mova x, m4}, m5, m6, m7
    call m(idct_16x8_internal).main
    mova                    m7, [rsp+gprsize+16*0]
    SAVE_8ROWS   rsp+gprsize+16*11, 16

    ; Inputs 2, 6, ..., 30 go to stack slots 19-26 for
    ; idct_8x32_internal.main_fast.
    LOAD_8ROWS     coeffq+64*2, 64*4
    mova   [rsp+gprsize+16*19], m0
    mova   [rsp+gprsize+16*26], m1
    mova   [rsp+gprsize+16*23], m2
    mova   [rsp+gprsize+16*22], m3
    mova   [rsp+gprsize+16*21], m4
    mova   [rsp+gprsize+16*24], m5
    mova   [rsp+gprsize+16*25], m6
    mova   [rsp+gprsize+16*20], m7

    call m(idct_8x32_internal).main_fast
    SAVE_8ROWS    rsp+gprsize+16*3, 16

    ; Odd inputs are scattered into stack slots 35-65 for
    ; idct_16x64_internal.main.
    LOAD_8ROWS     coeffq+64*1, 64*2
    mova   [rsp+gprsize+16*35], m0                        ;in1
    mova   [rsp+gprsize+16*49], m1                        ;in3
    mova   [rsp+gprsize+16*43], m2                        ;in5
    mova   [rsp+gprsize+16*41], m3                        ;in7
    mova   [rsp+gprsize+16*39], m4                        ;in9
    mova   [rsp+gprsize+16*45], m5                        ;in11
    mova   [rsp+gprsize+16*47], m6                        ;in13
    mova   [rsp+gprsize+16*37], m7                        ;in15

    LOAD_8ROWS    coeffq+64*17, 64*2
    mova   [rsp+gprsize+16*63], m0                        ;in17
    mova   [rsp+gprsize+16*53], m1                        ;in19
    mova   [rsp+gprsize+16*55], m2                        ;in21
    mova   [rsp+gprsize+16*61], m3                        ;in23
    mova   [rsp+gprsize+16*59], m4                        ;in25
    mova   [rsp+gprsize+16*57], m5                        ;in27
    mova   [rsp+gprsize+16*51], m6                        ;in29
    mova   [rsp+gprsize+16*65], m7                        ;in31

    call m(idct_16x64_internal).main

    ; The 64 result rows (stack slots 3..66) are pushed through
    ; idct_8x8_internal.pass1_end1 eight at a time with m7 = pw_8192 (the
    ; original m7 is spilled to slot 0). tx2q carries the label to
    ; continue at; the helper is outside this block and presumably jumps
    ; back through tx2q.
    LOAD_8ROWS    rsp+gprsize+16*3, 16
    mova    [rsp+gprsize+16*0], m7
    mova                    m7, [o(pw_8192)]
    lea                   tx2q, [o(m(idct_64x64_internal).pass1_end)]
    jmp   m(idct_8x8_internal).pass1_end1

.pass1_end:
    ; Groups 0-3 are written back over the coefficient buffer.
    SAVE_8ROWS     coeffq+64*0, 64
    LOAD_8ROWS   rsp+gprsize+16*11, 16
    mova    [rsp+gprsize+16*0], m7
    mova                    m7, [o(pw_8192)]
    lea                   tx2q, [o(m(idct_64x64_internal).pass1_end1)]
    jmp   m(idct_8x8_internal).pass1_end1

.pass1_end1:
    SAVE_8ROWS     coeffq+64*8, 64
    LOAD_8ROWS   rsp+gprsize+16*19, 16
    mova    [rsp+gprsize+16*0], m7
    mova                    m7, [o(pw_8192)]
    lea                   tx2q, [o(m(idct_64x64_internal).pass1_end2)]
    jmp   m(idct_8x8_internal).pass1_end1

.pass1_end2:
    SAVE_8ROWS    coeffq+64*16, 64
    LOAD_8ROWS   rsp+gprsize+16*27, 16
    mova    [rsp+gprsize+16*0], m7
    mova                    m7, [o(pw_8192)]
    lea                   tx2q, [o(m(idct_64x64_internal).pass1_end3)]
    jmp   m(idct_8x8_internal).pass1_end1

.pass1_end3:
    SAVE_8ROWS    coeffq+64*24, 64
    LOAD_8ROWS   rsp+gprsize+16*35, 16
    mova    [rsp+gprsize+16*0], m7
    mova                    m7, [o(pw_8192)]
    lea                   tx2q, [o(m(idct_64x64_internal).pass1_end4)]
    jmp   m(idct_8x8_internal).pass1_end1

.pass1_end4:
    ; Groups 4-7 go to the stack scratch buffer that dstq points at.
    SAVE_8ROWS       dstq+64*0, 64
    LOAD_8ROWS   rsp+gprsize+16*43, 16
    mova    [rsp+gprsize+16*0], m7
    mova                    m7, [o(pw_8192)]
    lea                   tx2q, [o(m(idct_64x64_internal).pass1_end5)]
    jmp   m(idct_8x8_internal).pass1_end1

.pass1_end5:
    SAVE_8ROWS       dstq+64*8, 64
    LOAD_8ROWS   rsp+gprsize+16*51, 16
    mova    [rsp+gprsize+16*0], m7
    mova                    m7, [o(pw_8192)]
    lea                   tx2q, [o(m(idct_64x64_internal).pass1_end6)]
    jmp   m(idct_8x8_internal).pass1_end1

.pass1_end6:
    SAVE_8ROWS      dstq+64*16, 64
    LOAD_8ROWS   rsp+gprsize+16*59, 16
    mova    [rsp+gprsize+16*0], m7
    mova                    m7, [o(pw_8192)]
    lea                   tx2q, [o(m(idct_64x64_internal).pass1_end7)]
    jmp   m(idct_8x8_internal).pass1_end1

.pass1_end7:
    SAVE_8ROWS      dstq+64*24, 64

    add                 coeffq, 16
    add                   dstq, 16
    dec                    r3d
    jg .pass1_loop

.pass2:
    ; First pass-2 run: input is the stack scratch buffer, output starts
    ; at dst + 32. The slot at gprsize*2+16*67 is reused to hold the next
    ; destination (current + 8); r4 holds the label this block wants
    ; control returned to.
    ; NOTE(review): how pass2_loop consumes r4 and the stack slots is not
    ; visible here -- confirm against idct_16x64_internal.
    mov                   dstq, [rsp+gprsize*3+16*67]
    mov                 coeffq, [rsp+gprsize*2+16*67]
    lea                   dstq, [dstq+32]
    mov                    r3d, 4
    lea                     r4, [dstq+8]
    mov  [rsp+gprsize*2+16*67], r4
    lea                     r4, [o(m(idct_64x64_internal).pass2_end)]
    jmp m(idct_16x64_internal).pass2_loop

.pass2_end:
    ; rsp is moved up by 16*32 around the jump into
    ; idct_8x32_internal.end2 and restored at .pass2_end1, presumably so
    ; that helper's rsp-relative offsets land on the right slots.
    LOAD_8ROWS   rsp+gprsize+16*35, 16
    lea                   dstq, [dstq+strideq*2]
    add                    rsp, 16*32
    mova    [rsp+gprsize+16*0], m7
    lea                     r3, [o(m(idct_64x64_internal).pass2_end1)]
    jmp  m(idct_8x32_internal).end2

.pass2_end1:
    add                 coeffq, 16*32
    sub                    rsp, 16*32                     ;undo the rsp adjustment from .pass2_end

    ; Loop bookkeeping: take the next destination from the slot, store
    ; the one after it, and reload the counter from the slot at
    ; gprsize*3+16*67 (not written with a counter in this block --
    ; presumably by pass2_loop).
    mov                   dstq, [rsp+gprsize*2+16*67]
    mov                    r3d, [rsp+gprsize*3+16*67]
    lea                     r4, [dstq+8]
    mov  [rsp+gprsize*2+16*67], r4
    lea                     r4, [o(m(idct_64x64_internal).pass2_end)]

    dec                    r3d
    jg  m(idct_16x64_internal).pass2_loop

.pass2_end2:
    ; Second pass-2 run: input is the coefficient buffer. The slot holds
    ; dst + 72 at this point (32 + 8 initially, plus 8 per completed
    ; iteration), so subtracting 72 yields the original dst. Control is
    ; handed entirely to idct_16x64_internal (r4 points at its .end1).
    mov                 coeffq, [rsp+gprsize*4+16*67]
    mov                   dstq, [rsp+gprsize*2+16*67]
    mov                    r3d, 4
    sub                   dstq, 72
    lea                     r4, [dstq+8]
    mov  [rsp+gprsize*2+16*67], r4
    lea                     r4, [o(m(idct_16x64_internal).end1)]
    jmp m(idct_16x64_internal).pass2_loop
6560