1; Copyright © 2018-2021, VideoLAN and dav1d authors
2; Copyright © 2018, Two Orioles, LLC
3; All rights reserved.
4;
5; Redistribution and use in source and binary forms, with or without
6; modification, are permitted provided that the following conditions are met:
7;
8; 1. Redistributions of source code must retain the above copyright notice, this
9;    list of conditions and the following disclaimer.
10;
11; 2. Redistributions in binary form must reproduce the above copyright notice,
12;    this list of conditions and the following disclaimer in the documentation
13;    and/or other materials provided with the distribution.
14;
15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
26%include "config.asm"
27%include "ext/x86/x86inc.asm"
28
29
; Read-only constant data. SECTION_RODATA comes from x86inc.asm (not shown
; here); its argument is presumably the section alignment in bytes.
;
; deint_shuf* are byte-index masks used with pshufb in this file. Each pair
; of byte indices (2k, 2k+1) moves one 16-bit word:
;   deint_shuf : words 0,2,4,6 to the low half, words 1,3,5,7 to the high half
;   deint_shuf1: word order 0,4,1,5,2,6,3,7
;   deint_shuf2: word order 4,0,5,1,6,2,7,3
30SECTION_RODATA 16
31
32deint_shuf:  db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
33
34deint_shuf1: db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
35deint_shuf2: db  8,  9,  0,  1, 10, 11,  2,  3, 12, 13,  4,  5, 14, 15,  6,  7
36
; COEF_PAIR a, b [, flag=0]
; Emits 16-byte tables of interleaved word pairs (pair repeated 4 times),
; suitable as pmaddwd operands:
;   pw_<a>_m<b>  = ( a, -b)   always
;   pw_<b>_<a>   = ( b,  a)   unless flag == 2
;   pw_m<a>_m<b> = (-a, -b)   whenever flag is non-zero (1 or 2)
37%macro COEF_PAIR 2-3 0 ; !0 = m%1_m%2, 2 = no %2_%1
38pw_%1_m%2:  times 4 dw  %1, -%2
39%if %3 != 2
40pw_%2_%1:   times 4 dw  %2,  %1
41%endif
42%if %3
43pw_m%1_m%2: times 4 dw -%1, -%2
44%endif
45%endmacro
46
; Word-pair multiplier tables for the 4-point ADST. Each table repeats one
; (c0, c1) pair four times so that pmaddwd on interleaved inputs (x, y)
; yields c0*x + c1*y in every dword lane.
; pw_0_3344 now carries the trailing colon like every other label in this
; section, so it cannot be mistaken for an instruction or macro name.
47;adst4
48pw_1321_3803:   times 4 dw  1321,  3803
49pw_2482_m1321:  times 4 dw  2482, -1321
50pw_3344_2482:   times 4 dw  3344,  2482
51pw_3344_m3803:  times 4 dw  3344, -3803
52pw_3344_m3344:  times 4 dw  3344, -3344
53pw_0_3344:      times 4 dw     0,  3344
54pw_m6688_m3803: times 4 dw -6688, -3803
55
; Instantiate the coefficient-pair tables via COEF_PAIR (defined above).
; No third argument: emits pw_<a>_m<b> and pw_<b>_<a>.
; Third argument 1 : additionally emits pw_m<a>_m<b>.
; Third argument 2 : emits pw_<a>_m<b> and pw_m<a>_m<b> only.
56COEF_PAIR 2896, 2896
57COEF_PAIR 1567, 3784
58COEF_PAIR  799, 4017
59COEF_PAIR 3406, 2276
60COEF_PAIR  401, 4076
61COEF_PAIR 1931, 3612
62COEF_PAIR 3166, 2598
63COEF_PAIR 3920, 1189
64COEF_PAIR 3784, 1567, 1
65COEF_PAIR  995, 3973
66COEF_PAIR 1751, 3703
67COEF_PAIR 3513, 2106
68COEF_PAIR 3857, 1380
69COEF_PAIR 4017,  799, 1
70COEF_PAIR  201, 4091
71COEF_PAIR 2440, 3290
72COEF_PAIR 3035, 2751
73COEF_PAIR 4052,  601
74COEF_PAIR 2276, 3406, 1
75COEF_PAIR 4076,  401, 2
76COEF_PAIR 2598, 3166, 2
77COEF_PAIR 3612, 1931, 2
78COEF_PAIR 1189, 3920, 2
79
; Broadcast constants, 16 bytes each.
; pd_2048 (4 dwords) is the rounding term added before the `psrad x, 12`
; steps in this file.
; The pw_* tables (8 words) are pmulhrsw multipliers where this file uses
; them. pmulhrsw computes (a*b + 0x4000) >> 15 per word, so a multiplier of
; n*8 gives (a*n + 2048) >> 12, and pw_2048 / pw_4096 give (a + 8) >> 4 and
; (a + 4) >> 3 respectively.
; Many of the pw_<n>x8 entries below are not referenced in the part of the
; file visible here; presumably they serve larger transform sizes.
80pd_2048:        times 4 dd  2048
81pw_2048:        times 8 dw  2048
82pw_m2048:       times 8 dw -2048
83pw_4096:        times 8 dw  4096
84pw_16384:       times 8 dw  16384
85pw_m16384:      times 8 dw  -16384
86pw_1697x16:     times 8 dw  1697*16
87pw_1697x8:      times 8 dw  1697*8
88pw_2896x8:      times 8 dw  2896*8
89pw_3344x8:      times 8 dw  3344*8
90pw_8192:        times 8 dw  8192
91pw_m8192:       times 8 dw -8192
92pw_5:           times 8 dw  5
93pw_201x8:       times 8 dw   201*8
94pw_4091x8:      times 8 dw  4091*8
95pw_m2751x8:     times 8 dw -2751*8
96pw_3035x8:      times 8 dw  3035*8
97pw_1751x8:      times 8 dw  1751*8
98pw_3703x8:      times 8 dw  3703*8
99pw_m1380x8:     times 8 dw -1380*8
100pw_3857x8:      times 8 dw  3857*8
101pw_995x8:       times 8 dw   995*8
102pw_3973x8:      times 8 dw  3973*8
103pw_m2106x8:     times 8 dw -2106*8
104pw_3513x8:      times 8 dw  3513*8
105pw_2440x8:      times 8 dw  2440*8
106pw_3290x8:      times 8 dw  3290*8
107pw_m601x8:      times 8 dw  -601*8
108pw_4052x8:      times 8 dw  4052*8
109
110pw_4095x8:      times 8 dw  4095*8
111pw_101x8:       times 8 dw   101*8
112pw_2967x8:      times 8 dw  2967*8
113pw_m2824x8:     times 8 dw -2824*8
114pw_3745x8:      times 8 dw  3745*8
115pw_1660x8:      times 8 dw  1660*8
116pw_3822x8:      times 8 dw  3822*8
117pw_m1474x8:     times 8 dw -1474*8
118pw_3996x8:      times 8 dw  3996*8
119pw_897x8:       times 8 dw   897*8
120pw_3461x8:      times 8 dw  3461*8
121pw_m2191x8:     times 8 dw -2191*8
122pw_3349x8:      times 8 dw  3349*8
123pw_2359x8:      times 8 dw  2359*8
124pw_4036x8:      times 8 dw  4036*8
125pw_m700x8:      times 8 dw  -700*8
126pw_4065x8:      times 8 dw  4065*8
127pw_501x8:       times 8 dw   501*8
128pw_3229x8:      times 8 dw  3229*8
129pw_m2520x8:     times 8 dw -2520*8
130pw_3564x8:      times 8 dw  3564*8
131pw_2019x8:      times 8 dw  2019*8
132pw_3948x8:      times 8 dw  3948*8
133pw_m1092x8:     times 8 dw -1092*8
134pw_3889x8:      times 8 dw  3889*8
135pw_1285x8:      times 8 dw  1285*8
136pw_3659x8:      times 8 dw  3659*8
137pw_m1842x8:     times 8 dw -1842*8
138pw_3102x8:      times 8 dw  3102*8
139pw_2675x8:      times 8 dw  2675*8
140pw_4085x8:      times 8 dw  4085*8
141pw_m301x8:      times 8 dw  -301*8
142
143SECTION .text
144
; REPX {template}, arg1 [, arg2, ...]
; Expands the brace-enclosed template once per remaining argument, in order,
; replacing the placeholder `x` with that argument.
; Example from this file: REPX {psrad x, 12}, m1, m0, m5, m2
145%macro REPX 2-*
146    %xdefine %%f(x) %1
147%rep %0 - 1
148    %rotate 1
149    %%f(%1)
150%endrep
151%endmacro
152
; m(x): full symbol name of function x, built from private_prefix, x and
; SUFFIX and passed through mangle (all from x86inc.asm, not shown here).
153%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
154
; o(x): address expression for symbol x.
;   x86-64: the symbol itself.
;   x86-32: r5-$$+x, where r5 must hold the address of $$ (the section
;           start); INV_TXFM_FN loads it with `LEA r5, $$`.
155%if ARCH_X86_64
156%define o(x) x
157%else
158%define o(x) r5-$$+x ; PIC
159%endif
160
; WRITE_4X4 src1, src2, tmp1, tmp2, tmp3, row1, row2, row3, row4
; Adds a 4x4 block of 16-bit residuals to 8-bit destination pixels.
;   src1/src2 : register numbers; m<src1> holds two 4-word groups (low, high),
;               m<src2> the next two
;   tmp1-3    : scratch register numbers (clobbered)
;   row1-4    : destination row index (0-3) for each of the four groups, in
;               the order low src1, high src1, low src2, high src2
; Row addresses: bit 1 of the index selects the base (dstq, or r2 =
; dstq + 2*stride), bit 0 adds one stride.
; Clobbers r2. dstq is not modified.
161%macro WRITE_4X4 9  ;src[1-2], tmp[1-3], row[1-4]
162    lea                  r2, [dstq+strideq*2]
163%assign %%i 1
; Rotate so that %1 is row1; the four rotations in the loop bring the total
; to 9, i.e. the argument list is back in its original order afterwards.
164%rotate 5
165%rep 4
166    %if %1 & 2
167        CAT_XDEFINE %%row_adr, %%i, r2   + strideq*(%1&1)
168    %else
169        CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1)
170    %endif
171    %assign %%i %%i + 1
172    %rotate 1
173%endrep
174
175    movd                 m%3, [%%row_adr1]        ;dst0
176    movd                 m%5, [%%row_adr2]        ;dst1
177    punpckldq            m%3, m%5                 ;high: dst1 :low: dst0
178    movd                 m%4, [%%row_adr3]        ;dst2
179    movd                 m%5, [%%row_adr4]        ;dst3
180    punpckldq            m%4, m%5                 ;high: dst3 :low: dst2
181
182    pxor                 m%5, m%5
183    punpcklbw            m%3, m%5                 ;extend byte to word
184    punpcklbw            m%4, m%5                 ;extend byte to word
185
186    paddw                m%3, m%1                 ;high: dst1 + out1 ;low: dst0 + out0
187    paddw                m%4, m%2                 ;high: dst3 + out3 ;low: dst2 + out2
188
; packuswb saturates each sum to the unsigned byte range
189    packuswb             m%3, m%4                 ;high->low: dst3 + out3, dst2 + out2, dst1 + out1, dst0 + out0
190
191    movd        [%%row_adr1], m%3                  ;store dst0 + out0
192    pshuflw              m%4, m%3, q1032
193    movd        [%%row_adr2], m%4                  ;store dst1 + out1
194    punpckhqdq           m%3, m%3
195    movd        [%%row_adr3], m%3                  ;store dst2 + out2
196    psrlq                m%3, 32
197    movd        [%%row_adr4], m%3                  ;store dst3 + out3
198%endmacro
199
; ITX4_END row1, row2, row3, row4 [, rnd=2048]
; Common tail of the 4x4 transforms: optionally scales m0/m1 with
; pmulhrsw by pw_<rnd> (rnd = 2048 gives (x + 8) >> 4; rnd = 0 skips the
; scaling), adds the result to the destination via WRITE_4X4 using the given
; row order, and returns with a plain `ret`.
; Clobbers m2-m4 and r2.
200%macro ITX4_END 4-5 2048 ; row[1-4], rnd
201%if %5
202    mova                 m2, [o(pw_%5)]
203    pmulhrsw             m0, m2
204    pmulhrsw             m1, m2
205%endif
206
207    WRITE_4X4            0, 1, 2, 3, 4, %1, %2, %3, %4
208    ret
209%endmacro
210
; ITX_MUL2X_PACK dst/src, tmp, rnd, coef1, coef2 [, flags=0]
; m<src> holds interleaved word pairs (a, b). Computes two rotated outputs
; per pair, each as (pmaddwd result + m<rnd>) >> 12:
;   flags = 0: tmp = a*coef1 - b*coef2,  dst = a*coef2 + b*coef1
;   flags & 1: the two products are swapped
;              (tmp = a*coef2 + b*coef1, dst = a*coef1 - b*coef2)
;   flags & 2: coef1/coef2 are register numbers holding the multiplier
;              pairs (tmp = m<coef1> . src, dst = src . m<coef2>)
;   flags & 4: leave the dword results unpacked in dst and tmp
; Unless flag 4 is set, the results are packed with signed saturation into
; m<dst>: dst values in the low half, tmp values in the high half.
; NOTE(review): `%6 & 4 == 0` is meant as `(%6 & 4) == 0`; this relies on
; `&` binding tighter than `==` in NASM expressions -- confirm for the
; assembler version in use.
211; flags: 1 = swap, 2: coef_regs, 4: no_pack
212%macro ITX_MUL2X_PACK 5-6 0 ; dst/src, tmp[1], rnd, coef[1-2], flags
213%if %6 & 2
214    pmaddwd              m%2, m%4, m%1
215    pmaddwd              m%1, m%5
216%elif %6 & 1
217    pmaddwd              m%2, m%1, [o(pw_%5_%4)]
218    pmaddwd              m%1, [o(pw_%4_m%5)]
219%else
220    pmaddwd              m%2, m%1, [o(pw_%4_m%5)]
221    pmaddwd              m%1, [o(pw_%5_%4)]
222%endif
223    paddd                m%2, m%3
224    paddd                m%1, m%3
225    psrad                m%2, 12
226    psrad                m%1, 12
227%if %6 & 4 == 0
228    packssdw             m%1, m%2
229%endif
230%endmacro
231
; IDCT4_1D_PACKED
; One 4-point inverse DCT pass on packed data.
;   in : m0 = in0 (low) | in1 (high), m1 = in2 (low) | in3 (high)
;   out: m0 = out0 (low) | out1 (high), m1 = out3 (low) | out2 (high)
; Clobbers m2 and m3 (m3 is loaded with pd_2048).
; The optional macro argument is accepted but not referenced in the body.
232%macro IDCT4_1D_PACKED 0-1   ;pw_2896x8
233    mova                 m3, [o(pd_2048)]
234    punpckhwd            m2, m0, m1            ;unpacked in1 in3
235    punpcklwd            m0, m1                ;unpacked in0 in2
236    ITX_MUL2X_PACK        2, 1, 3, 1567, 3784
237    ITX_MUL2X_PACK        0, 1, 3, 2896, 2896
238    psubsw               m1, m0, m2            ;high: out2 ;low: out3
239    paddsw               m0, m2                ;high: out1 ;low: out0
240%endmacro
241
; INV_TXFM_FN type1, type2, size, xmm/stack
; Emits the public entry point inv_txfm_add_<type1>_<type2>_<size>_8bpc.
; The entry point loads tx2q with the address of the second-pass label
; i<type2>_<size>_internal_8bpc.pass2 and transfers control to the
; first-pass routine %%p1 = i<type1>_<size>_internal_8bpc; the internal
; routines in this file end their first pass with `jmp tx2q`.
; On 32-bit builds r5 is first loaded with $$ so that o() can form addresses.
;
; dct_dct only: when eob == 0 the first pass is skipped and execution
; continues with whatever the invoking macro places right after INV_TXFM_FN
; (the DC-only shortcut).
;
; has_epilogue (x86inc symbol, not shown here) selects how %%p1 is reached:
;   set  : `call %%p1`, then RET.
;   clear: dct_dct uses `jnz %%p1`. For the other combinations the `times`
;          expression emits one `jmp %%p1` when (%%end - %%p1) is negative
;          and nothing when it is zero, i.e. when the aligned %%end label
;          coincides with %%p1 and execution simply falls through.
;          NOTE(review): this assumes %%p1 is never located before %%end.
242%macro INV_TXFM_FN 4+ ; type1, type2, size, xmm/stack
243cglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 6, %4, dst, stride, coeff, eob, tx2
244    %define %%p1 m(i%1_%3_internal_8bpc)
245%if ARCH_X86_32
246    LEA                    r5, $$
247%endif
248%if has_epilogue
249%ifidn %1_%2, dct_dct
250    test                 eobd, eobd
251    jz %%end
252%endif
253    lea                  tx2q, [o(m(i%2_%3_internal_8bpc).pass2)]
254    call %%p1
255    RET
256%%end:
257%else
258    lea                  tx2q, [o(m(i%2_%3_internal_8bpc).pass2)]
259%ifidn %1_%2, dct_dct
260    test                 eobd, eobd
261    jnz %%p1
262%else
263    times ((%%end - %%p1) >> 31) & 1 jmp %%p1
264ALIGN function_align
265%%end:
266%endif
267%endif
268%endmacro
269
; INV_TXFM_4X4_FN type1, type2
; Emits the 4x4 entry point (6 passed as the xmm/stack argument) and, for
; dct_dct, the DC-only path that INV_TXFM_FN reaches when eob == 0:
;   - broadcast coefficient 0 to all eight words of m0
;   - scale twice with pmulhrsw by 2896*8 (once per transform pass)
;   - store eobd to [coeffq]; eob is zero on this path, so the 32-bit store
;     clears the DC coefficient
;   - copy to m1 and tail-call iadst_4x4 .end2 (ITX4_END 0, 1, 2, 3) for the
;     final rounding and the add to the destination.
270%macro INV_TXFM_4X4_FN 2 ; type1, type2
271    INV_TXFM_FN          %1, %2, 4x4, 6
272%ifidn %1_%2, dct_dct
273    pshuflw              m0, [coeffq], q0000
274    punpcklqdq           m0, m0
275    mova                 m1, [o(pw_2896x8)]
276    pmulhrsw             m0, m1
277    mov            [coeffq], eobd                ;0
278    pmulhrsw             m0, m1
279    mova                 m1, m0
280    TAIL_CALL m(iadst_4x4_internal_8bpc).end2
281%endif
282%endmacro
283
; Everything below is assembled with INIT_XMM ssse3 (x86inc macro, not shown
; here; presumably selects xmm registers and the ssse3 symbol suffix).
284INIT_XMM ssse3
285; itx16 relies on dct_dct being the first function. If you change the order, adjust `itx8_start` in itx16.
286
; Public 4x4 entry points whose first pass is the DCT.
287INV_TXFM_4X4_FN dct, dct
288INV_TXFM_4X4_FN dct, adst
289INV_TXFM_4X4_FN dct, flipadst
290INV_TXFM_4X4_FN dct, identity
291
; idct_4x4_internal_8bpc
; First pass : load the 16 coefficients, run IDCT4_1D_PACKED, re-arrange the
;              result into the in0|in1 / in2|in3 layout noted in the inline
;              comments, then jump to the second pass selected by tx2q.
; .pass2     : run IDCT4_1D_PACKED again, clear the coefficient buffer and
;              finish through ITX4_END. Row order 0, 1, 3, 2 matches the
;              out3|out2 ordering that IDCT4_1D_PACKED leaves in m1.
292cglobal idct_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
293    mova                 m0, [coeffq+16*0]      ;high: in1 ;low: in0
294    mova                 m1, [coeffq+16*1]      ;high: in3 ;low: in2
295
296    IDCT4_1D_PACKED
297
298    mova                 m2, [o(deint_shuf)]
299    shufps               m3, m0, m1, q1331
300    shufps               m0, m1, q0220
301    pshufb               m0, m2                 ;high: in1 ;low: in0
302    pshufb               m1, m3, m2             ;high: in3 ;low :in2
303    jmp                tx2q
304
305.pass2:
306    IDCT4_1D_PACKED
307
308    pxor                 m2, m2
309    mova      [coeffq+16*0], m2
310    mova      [coeffq+16*1], m2                 ;memset(coeff, 0, sizeof(*coeff) * sh * sw);
311
312    ITX4_END     0, 1, 3, 2
313
; Public 4x4 entry points whose first pass is the ADST.
314INV_TXFM_4X4_FN adst, dct
315INV_TXFM_4X4_FN adst, adst
316INV_TXFM_4X4_FN adst, flipadst
317INV_TXFM_4X4_FN adst, identity
318
; iadst_4x4_internal_8bpc
; First pass : load coefficients, run .main, interleave the words of the
;              result (punpck*wd) for the second pass, jump to tx2q.
; .pass2     : run .main, then fall into the shared tail:
;   .end  : clear the 32-byte coefficient buffer
;   .end2 : ITX4_END 0, 1, 2, 3 (round, add to dst, ret)
; .end and .end2 are also jumped to from other 4x4 routines in this file.
319cglobal iadst_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
320    mova                 m0, [coeffq+16*0]
321    mova                 m1, [coeffq+16*1]
322    call .main
323    punpckhwd            m2, m0, m1
324    punpcklwd            m0, m1
325    punpckhwd            m1, m0, m2       ;high: in3 ;low :in2
326    punpcklwd            m0, m2           ;high: in1 ;low: in0
327    jmp                tx2q
328
329.pass2:
330    call .main
331
332.end:
333    pxor                 m2, m2
334    mova      [coeffq+16*0], m2
335    mova      [coeffq+16*1], m2
336
337.end2:
338    ITX4_END              0, 1, 2, 3
339
; .main: one 4-point inverse ADST pass.
;   in : m0 = in0 (low) | in1 (high), m1 = in2 (low) | in3 (high)
;   out: m0 = out0 (low) | out1 (high), m1 = out2 (low) | out3 (high)
; Clobbers m2-m5. All products are rounded with pd_2048 and shifted by 12.
340ALIGN function_align
341cglobal_label .main
342    punpcklwd            m2, m0, m1                ;unpacked in0 in2
343    punpckhwd            m0, m1                    ;unpacked in1 in3
344    mova                 m3, m0
345    pmaddwd              m1, m2, [o(pw_3344_m3344)];3344 * in0 - 3344 * in2
346    pmaddwd              m0, [o(pw_0_3344)]        ;3344 * in3
347    paddd                m1, m0                    ;t2
348    pmaddwd              m0, m2, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
349    pmaddwd              m2, [o(pw_2482_m1321)]    ;2482 * in0 - 1321 * in2
350    pmaddwd              m4, m3, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
351    pmaddwd              m5, m3, [o(pw_3344_m3803)];3344 * in1 - 3803 * in3
352    paddd                m4, m0                    ;t0 + t3
353    pmaddwd              m3, [o(pw_m6688_m3803)]   ;-2 * 3344 * in1 - 3803 * in3
354    mova                 m0, [o(pd_2048)]
355    paddd                m1, m0                    ;t2 + 2048
356    paddd                m2, m0
357    paddd                m0, m4                    ;t0 + t3 + 2048
358    paddd                m5, m2                    ;t1 + t3 + 2048
359    paddd                m2, m4
360    paddd                m2, m3                    ;t0 + t1 - t3 + 2048
361    REPX      {psrad x, 12}, m1, m0, m5, m2
362    packssdw             m0, m5                    ;high: out1 ;low: out0
363    packssdw             m1, m2                    ;high: out3 ;low: out2
364    ret
365
; Public 4x4 entry points whose first pass is the flipped ADST.
366INV_TXFM_4X4_FN flipadst, dct
367INV_TXFM_4X4_FN flipadst, adst
368INV_TXFM_4X4_FN flipadst, flipadst
369INV_TXFM_4X4_FN flipadst, identity
370
; iflipadst_4x4_internal_8bpc
; Reuses iadst_4x4 .main and reverses the output order.
; First pass : same interleave as the ADST first pass but with the operand
;              order of the punpck instructions swapped.
; .pass2     : rows are written in the order 3, 2, 1, 0 by ITX4_END.
371cglobal iflipadst_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
372    mova                 m0, [coeffq+16*0]
373    mova                 m1, [coeffq+16*1]
374    call m(iadst_4x4_internal_8bpc).main
375    punpcklwd            m2, m1, m0
376    punpckhwd            m1, m0
377    punpcklwd            m0, m1, m2            ;high: in3 ;low :in2
378    punpckhwd            m1, m2                ;high: in1 ;low: in0
379    jmp                tx2q
380
381.pass2:
382    call m(iadst_4x4_internal_8bpc).main
383
384.end:
385    pxor                 m2, m2
386    mova      [coeffq+16*0], m2
387    mova      [coeffq+16*1], m2
388
389.end2:
390    ITX4_END              3, 2, 1, 0
391
; Public 4x4 entry points whose first pass is the identity transform.
392INV_TXFM_4X4_FN identity, dct
393INV_TXFM_4X4_FN identity, adst
394INV_TXFM_4X4_FN identity, flipadst
395INV_TXFM_4X4_FN identity, identity
396
; iidentity_4x4_internal_8bpc
; Both passes scale every coefficient as x + pmulhrsw(x, 1697*8), i.e.
; x + ((x*1697 + 2048) >> 12), with a saturating add.
; First pass : scale, interleave the words for the second pass, jump to tx2q.
; .pass2     : scale, then continue at iadst_4x4 .end (clear coefficients,
;              round, add to dst in row order 0, 1, 2, 3).
397cglobal iidentity_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
398    mova                 m0, [coeffq+16*0]
399    mova                 m1, [coeffq+16*1]
400    mova                 m3, [o(pw_1697x8)]
401    pmulhrsw             m2, m0, m3
402    pmulhrsw             m3, m1
403    paddsw               m0, m2
404    paddsw               m1, m3
405    punpckhwd            m2, m0, m1
406    punpcklwd            m0, m1
407    punpckhwd            m1, m0, m2            ;high: in3 ;low :in2
408    punpcklwd            m0, m2                ;high: in1 ;low: in0
409    jmp                tx2q
410
411.pass2:
412    mova                 m3, [o(pw_1697x8)]
413    pmulhrsw             m2, m3, m0
414    pmulhrsw             m3, m1
415    paddsw               m0, m2
416    paddsw               m1, m3
417    jmp m(iadst_4x4_internal_8bpc).end
418
; IWHT4_1D_PACKED
; One 4-point inverse Walsh-Hadamard pass using only add/sub/shift.
;   in : m0, m1 (their qword halves are regrouped by the first two
;        instructions as in0|in2 and in1|in3)
;   out: per the inline comments, out0 in the high half of m0, out2 (low)
;        and out1 (high) in m1, out3 in the low half of m2
; Clobbers m3.
419%macro IWHT4_1D_PACKED 0
420    punpckhqdq           m3, m0, m1            ;low: in1 high: in3
421    punpcklqdq           m0, m1                ;low: in0 high: in2
422    psubw                m2, m0, m3            ;low: in0 - in1 high: in2 - in3
423    paddw                m0, m3                ;low: in0 + in1 high: in2 + in3
424    punpckhqdq           m2, m2                ;t2 t2
425    punpcklqdq           m0, m0                ;t0 t0
426    psubw                m1, m0, m2
427    psraw                m1, 1                 ;t4 t4
428    psubw                m1, m3                ;low: t1/out2 high: t3/out1
429    psubw                m0, m1                ;high: out0
430    paddw                m2, m1                ;low: out3
431%endmacro
432
; inv_txfm_add_wht_wht_4x4_8bpc(dst, stride, coeff)
; 4x4 inverse Walsh-Hadamard transform added to the destination.
; Steps: load the 16 coefficients and clear the buffer, arithmetic shift
; right by 2, first 1-D pass, re-pack (punpck) for the second pass, second
; 1-D pass, then ITX4_END with rnd = 0 (no pmulhrsw rounding) and row order
; 0, 3, 2, 1.
; Note: ITX4_END expands WRITE_4X4 with m2-m4 as temporaries and r2 as an
; address register, so m4 and coeffq (r2) are clobbered on the way out.
433cglobal inv_txfm_add_wht_wht_4x4_8bpc, 3, 3, 4, dst, stride, coeff
434    mova                 m0, [coeffq+16*0]
435    mova                 m1, [coeffq+16*1]
436    pxor                 m2, m2
437    mova      [coeffq+16*0], m2
438    mova      [coeffq+16*1], m2
439    psraw                m0, 2
440    psraw                m1, 2
441
442    IWHT4_1D_PACKED
443
444    punpckhwd            m0, m1
445    punpcklwd            m3, m1, m2
446    punpckhdq            m1, m0, m3
447    punpckldq            m0, m3
448
449    IWHT4_1D_PACKED
450
; move out0 from the high half of m0 to the low half and place the low half
; of m2 (out3) in the high half
451    shufpd               m0, m2, 0x01
452    ITX4_END              0, 3, 2, 1, 0
453
454
; IDCT8_1D_PACKED
; One 8-point inverse DCT pass on packed data (two inputs per register).
;   in : m0 = in0|in1, m1 = in2|in3, m2 = in4|in5, m3 = in6|in7 (low|high)
;   out: m0 = out0|out1, m1 = out3|out2, m2 = out4|out5, m3 = out7|out6
; Clobbers m4 and m6 (m6 is loaded with pd_2048).
455%macro IDCT8_1D_PACKED 0
456    mova                 m6, [o(pd_2048)]
457    punpckhwd            m4, m0, m3                 ;unpacked in1 in7
458    punpcklwd            m0, m2                     ;unpacked in0 in4
459    punpckhwd            m2, m1                     ;unpacked in5 in3
460    punpcklwd            m1, m3                     ;unpacked in2 in6
461    ITX_MUL2X_PACK        4, 3, 6,  799, 4017       ;low: t7a high: t4a
462    ITX_MUL2X_PACK        2, 3, 6, 3406, 2276       ;low: t6a high: t5a
463    ITX_MUL2X_PACK        1, 3, 6, 1567, 3784       ;low: t3  high: t2
464    psubsw               m3, m4, m2                 ;low: t6a high: t5a
465    paddsw               m4, m2                     ;low: t7  high: t4
; interleave the t6a and t5a words so the next multiply sees (t6a, t5a) pairs
466    pshufb               m3, [o(deint_shuf1)]
467    ITX_MUL2X_PACK        0, 2, 6, 2896, 2896       ;low: t0  high: t1
468    ITX_MUL2X_PACK        3, 2, 6, 2896, 2896       ;low: t6  high: t5
469    psubsw               m2, m0, m1                 ;low: tmp3 high: tmp2
470    paddsw               m0, m1                     ;low: tmp0 high: tmp1
471    punpcklqdq           m1, m4, m3                 ;low: t7   high: t6
472    punpckhqdq           m4, m3                     ;low: t4   high: t5
473    psubsw               m3, m0, m1                 ;low: out7 high: out6
474    paddsw               m0, m1                     ;low: out0 high: out1
475    paddsw               m1, m2, m4                 ;low: out3 high: out2
476    psubsw               m2, m4                     ;low: out4 high: out5
477%endmacro
478
; ITX_MULSUB_2W dst/src1, dst/src2, tmp1, tmp2, rnd, coef1, coef2
;               [, dst2_in_tmp1=0]
; Rotation of two full registers of words (8 lanes each):
479;dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
480;dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
;   dst1 is returned in m<src1>.
;   dst2 is returned in m<src2>, or in m<tmp1> when dst2_in_tmp1 is set
;   (m<src2> is then used as scratch).
;   rnd is the number of a register holding the dword rounding constant.
;   coef2 < 8 selects the register form: coef1/coef2 are then register
;   numbers holding pre-loaded multiplier pairs instead of table values.
; Clobbers m<tmp2>, plus m<tmp1> when dst2_in_tmp1 is 0.
481%macro ITX_MULSUB_2W 7-8 0 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2_in_tmp1
482    punpckhwd           m%4, m%1, m%2
483    punpcklwd           m%1, m%2
484%if %7 < 8
485    pmaddwd             m%2, m%7, m%1
486    pmaddwd             m%3, m%7, m%4
487%else
488    mova                m%2, [o(pw_%7_%6)]
489%if %8
490    pmaddwd             m%3, m%1, m%2
491    pmaddwd             m%2, m%4
492%else
493    pmaddwd             m%3, m%4, m%2
494    pmaddwd             m%2, m%1
495%endif
496%endif
497    paddd               m%3, m%5
498    paddd               m%2, m%5
499    psrad               m%3, 12
500    psrad               m%2, 12
501%if %8
502    packssdw            m%3, m%2
503%else
504    packssdw            m%2, m%3                 ;dst2
505%endif
506%if %7 < 8
507    pmaddwd             m%4, m%6
508    pmaddwd             m%1, m%6
509%elif %8
510    mova                m%2, [o(pw_%6_m%7)]
511    pmaddwd             m%4, m%2
512    pmaddwd             m%1, m%2
513%else
514    mova                m%3, [o(pw_%6_m%7)]
515    pmaddwd             m%4, m%3
516    pmaddwd             m%1, m%3
517%endif
518    paddd               m%4, m%5
519    paddd               m%1, m%5
520    psrad               m%4, 12
521    psrad               m%1, 12
522    packssdw            m%1, m%4                 ;dst1
523%endmacro
524
; IDCT4_1D src1, src2, src3, src4, tmp1, tmp2, pd_2048
; 4-point inverse DCT on four full registers (8 columns at once).
;   in : m<src1..4> = in0..in3
;   out: m<src1..4> = out0..out3
;   pd_2048 is the number of a register already holding the rounding constant.
; Clobbers m<tmp1> and m<tmp2>. Both ITX_MULSUB_2W calls use the
; dst2_in_tmp1 form, so t3 and t0 arrive in tmp1 and src4 respectively.
525%macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048
526    ITX_MULSUB_2W        %2, %4, %5, %6, %7, 1567, 3784, 1 ;t2, t3
527    ITX_MULSUB_2W        %1, %3, %4, %6, %7, 2896, 2896, 1 ;t1, t0
528    psubsw              m%3, m%1, m%2                      ;out2
529    paddsw              m%2, m%1                           ;out1
530    paddsw              m%1, m%5, m%4                      ;out0
531    psubsw              m%4, m%5                           ;out3
532%endmacro
533
; WRITE_4X8 row1, row2, row3, row4
; Adds a 4x8 block to the destination: rows 0-3 come from m0/m1, then dstq
; is advanced by four strides and rows 4-7 come from m2/m3. The same row
; order is applied to both halves. Uses m4-m6 as temporaries and clobbers r2
; (via WRITE_4X4); dstq is left pointing at row 4.
534%macro WRITE_4X8 4 ;row[1-4]
535    WRITE_4X4             0, 1, 4, 5, 6, %1, %2, %3, %4
536    lea                dstq, [dstq+strideq*4]
537    WRITE_4X4             2, 3, 4, 5, 6, %1, %2, %3, %4
538%endmacro
539
; INV_4X8
; Re-orders the first-pass output in m0-m3 for the 4x8 second pass by
; interleaving words and then dwords. The result is two inputs per
; register, in the layout given by the inline comments. Clobbers m4.
540%macro INV_4X8 0
541    punpckhwd            m4, m2, m3
542    punpcklwd            m2, m3
543    punpckhwd            m3, m0, m1
544    punpcklwd            m0, m1
545    punpckhdq            m1, m0, m2                  ;low: in2 high: in3
546    punpckldq            m0, m2                      ;low: in0 high: in1
547    punpckldq            m2, m3, m4                  ;low: in4 high: in5
548    punpckhdq            m3, m4                      ;low: in6 high: in7
549%endmacro
550
; INV_TXFM_4X8_FN type1, type2
; Emits the 4x8 entry point (8 passed as the xmm/stack argument) and, for
; dct_dct, the DC-only path reached when eob == 0:
;   - broadcast coefficient 0 to all words of m0
;   - scale three times with pmulhrsw by 2896*8, then by pw_2048
;   - store eobd (zero on this path) to [coeffq] to clear the DC coefficient;
;     the tail entered below does not clear the buffer itself
;   - replicate into m1-m3 and tail-call iadst_4x8 .end3 (WRITE_4X8 + RET).
551%macro INV_TXFM_4X8_FN 2 ; type1, type2
552    INV_TXFM_FN          %1, %2, 4x8, 8
553%ifidn %1_%2, dct_dct
554    pshuflw              m0, [coeffq], q0000
555    punpcklqdq           m0, m0
556    mova                 m1, [o(pw_2896x8)]
557    pmulhrsw             m0, m1
558    mov           [coeffq], eobd
559    pmulhrsw             m0, m1
560    pmulhrsw             m0, m1
561    pmulhrsw             m0, [o(pw_2048)]
562    mova                 m1, m0
563    mova                 m2, m0
564    mova                 m3, m0
565    TAIL_CALL m(iadst_4x8_internal_8bpc).end3
566%endif
567%endmacro
568
; Public 4x8 entry points whose first pass is the DCT.
569INV_TXFM_4X8_FN dct, dct
570INV_TXFM_4X8_FN dct, adst
571INV_TXFM_4X8_FN dct, flipadst
572INV_TXFM_4X8_FN dct, identity
573
; idct_4x8_internal_8bpc
; Entry   : load the 32 coefficients, each pre-scaled with pmulhrsw by 2896*8.
; .pass1  : 4-point DCT on full registers (idct_8x4 .main), then the shared
;           re-ordering at iadst_4x8 .pass1_end, which jumps to tx2q.
; .pass2  : 8-point packed DCT (.main), swap the qword halves of m1 and m3
;           (which .main leaves as out3|out2 and out7|out6), load pw_2048
;           into m4 and finish at iadst_4x8 .end2.
; .main   : IDCT8_1D_PACKED as a callable routine; see that macro for the
;           register layout and clobbers.
574cglobal idct_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
575    mova                 m3, [o(pw_2896x8)]
576    pmulhrsw             m0, m3, [coeffq+16*0]
577    pmulhrsw             m1, m3, [coeffq+16*1]
578    pmulhrsw             m2, m3, [coeffq+16*2]
579    pmulhrsw             m3,     [coeffq+16*3]
580
581.pass1:
582    call m(idct_8x4_internal_8bpc).main
583    jmp m(iadst_4x8_internal_8bpc).pass1_end
584
585.pass2:
586    call .main
587    shufps               m1, m1, q1032
588    shufps               m3, m3, q1032
589    mova                 m4, [o(pw_2048)]
590    jmp m(iadst_4x8_internal_8bpc).end2
591
592ALIGN function_align
593cglobal_label .main
594    IDCT8_1D_PACKED
595    ret
596
597
; Public 4x8 entry points whose first pass is the ADST.
598INV_TXFM_4X8_FN adst, dct
599INV_TXFM_4X8_FN adst, adst
600INV_TXFM_4X8_FN adst, flipadst
601INV_TXFM_4X8_FN adst, identity
602
; iadst_4x8_internal_8bpc
; Entry       : load the 32 coefficients, each pre-scaled by 2896*8.
; .pass1      : 4-point ADST on full registers (iadst_8x4 .main).
; .pass1_end  : INV_4X8 re-ordering, then jump to tx2q (shared with the
;               other 4x8 first passes).
; .pass2      : swap the qword halves of m0/m1, run the 8-point packed ADST
;               (.main), then build the final multiplier.
; .end        : m4 = low half of m4 | low half of m5. From .pass2 this is
;               +2048 | -2048, which undoes the negated values .main leaves
;               in the high halves.
; .end2       : scale m0-m3 with pmulhrsw by m4 and clear the 64-byte
;               coefficient buffer.
; .end3       : WRITE_4X8 in row order 0, 1, 2, 3, then RET.
603cglobal iadst_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
604    mova                 m3, [o(pw_2896x8)]
605    pmulhrsw             m0, m3, [coeffq+16*0]
606    pmulhrsw             m1, m3, [coeffq+16*1]
607    pmulhrsw             m2, m3, [coeffq+16*2]
608    pmulhrsw             m3,     [coeffq+16*3]
609
610.pass1:
611    call m(iadst_8x4_internal_8bpc).main
612
613.pass1_end:
614    INV_4X8
615    jmp                tx2q
616
617.pass2:
618    shufps               m0, m0, q1032
619    shufps               m1, m1, q1032
620    call .main
621    mova                 m4, [o(pw_2048)]
622    pxor                 m5, m5
623    psubw                m5, m4
624
625.end:
626    punpcklqdq           m4, m5
627
628.end2:
629    pmulhrsw             m0, m4
630    pmulhrsw             m1, m4
631    pmulhrsw             m2, m4
632    pmulhrsw             m3, m4
633    pxor                 m5, m5
634    mova      [coeffq+16*0], m5
635    mova      [coeffq+16*1], m5
636    mova      [coeffq+16*2], m5
637    mova      [coeffq+16*3], m5
638
639.end3:
640    WRITE_4X8             0, 1, 2, 3
641    RET
642
; .main: one 8-point inverse ADST pass on packed data.
;   in : per the unpack comments below, in0 is read from the high half of
;        m0 and in1 from its low half, in3/in2 likewise from m1, while m2
;        holds in4|in5 and m3 holds in6|in7 (low|high)
;   out: m0 = out0|-out1, m1 = out2|-out3, m2 = out4|-out5, m3 = out6|-out7
;        (the high halves are negated; callers fix the sign afterwards)
; Clobbers m4-m7 (m6 is loaded with pd_2048).
643ALIGN function_align
644cglobal_label .main
645    mova                 m6, [o(pd_2048)]
646    punpckhwd            m4, m3, m0                ;unpacked in7 in0
647    punpckhwd            m5, m2, m1                ;unpacked in5 in2
648    punpcklwd            m1, m2                    ;unpacked in3 in4
649    punpcklwd            m0, m3                    ;unpacked in1 in6
650    ITX_MUL2X_PACK        4, 2, 6,  401, 4076      ;low:  t0a   high:  t1a
651    ITX_MUL2X_PACK        5, 2, 6, 1931, 3612      ;low:  t2a   high:  t3a
652    ITX_MUL2X_PACK        1, 2, 6, 3166, 2598      ;low:  t4a   high:  t5a
653    ITX_MUL2X_PACK        0, 2, 6, 3920, 1189      ;low:  t6a   high:  t7a
654
655    psubsw               m3, m4, m1                ;low:  t4    high:  t5
656    paddsw               m4, m1                    ;low:  t0    high:  t1
657    psubsw               m2, m5, m0                ;low:  t6    high:  t7
658    paddsw               m5, m0                    ;low:  t2    high:  t3
659
660    shufps               m1, m3, m2, q1032
661    punpckhwd            m2, m1
662    punpcklwd            m3, m1
663    ITX_MUL2X_PACK        3, 0, 6, 1567, 3784, 1   ;low:  t5a   high:  t4a
664    ITX_MUL2X_PACK        2, 0, 6, 3784, 1567      ;low:  t7a   high:  t6a
665
666    psubsw               m1, m4, m5                ;low:  t2    high:  t3
667    paddsw               m4, m5                    ;low:  out0  high: -out7
668    psubsw               m5, m3, m2                ;low:  t7    high:  t6
669    paddsw               m3, m2                    ;low:  out6  high: -out1
670    shufps               m0, m4, m3, q3210         ;low:  out0  high: -out1
671    shufps               m3, m4, q3210             ;low:  out6  high: -out7
672
673    mova                 m2, [o(pw_2896_m2896)]
674    mova                 m7, [o(pw_2896_2896)]
675    shufps               m4, m1, m5, q1032         ;low:  t3    high:  t7
676    shufps               m1, m5, q3210             ;low:  t2    high:  t6
677    punpcklwd            m5, m1, m4
678    punpckhwd            m1, m4
679    pmaddwd              m4, m2, m1                ;-out5
680    pmaddwd              m2, m5                    ; out4
681    pmaddwd              m1, m7                    ; out2
682    pmaddwd              m5, m7                    ;-out3
683    REPX      {paddd x, m6}, m4, m2, m1, m5
684    REPX      {psrad x, 12}, m4, m2, m1, m5
685    packssdw             m1, m5                    ;low:  out2  high: -out3
686    packssdw             m2, m4                    ;low:  out4  high: -out5
687    ret
688
; Public 4x8 entry points whose first pass is the flipped ADST.
689INV_TXFM_4X8_FN flipadst, dct
690INV_TXFM_4X8_FN flipadst, adst
691INV_TXFM_4X8_FN flipadst, flipadst
692INV_TXFM_4X8_FN flipadst, identity
693
; iflipadst_4x8_internal_8bpc
; Entry   : load the 32 coefficients, each pre-scaled by 2896*8.
; .pass1  : 4-point ADST (iadst_8x4 .main), followed by a re-ordering like
;           INV_4X8 but with the punpck operand order reversed, then jump
;           to tx2q.
; .pass2  : 8-point packed ADST (iadst_4x8 .main), then reverse the register
;           order and swap the qword halves of each register. The negated
;           values therefore sit in the low halves, so the multiplier is
;           built as m5 = +2048, m4 = -2048; iadst_4x8 .end combines them
;           into -2048 (low) | +2048 (high).
694cglobal iflipadst_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
695    mova                 m3, [o(pw_2896x8)]
696    pmulhrsw             m0, m3, [coeffq+16*0]
697    pmulhrsw             m1, m3, [coeffq+16*1]
698    pmulhrsw             m2, m3, [coeffq+16*2]
699    pmulhrsw             m3,     [coeffq+16*3]
700
701.pass1:
702    call m(iadst_8x4_internal_8bpc).main
703
704    punpcklwd            m4, m3, m2
705    punpckhwd            m3, m2
706    punpcklwd            m5, m1, m0
707    punpckhwd            m1, m0
708    punpckldq            m2, m3, m1                  ;low: in4 high: in5
709    punpckhdq            m3, m1                      ;low: in6 high: in7
710    punpckldq            m0, m4, m5                  ;low: in0 high: in1
711    punpckhdq            m1, m4, m5                  ;low: in2 high: in3
712    jmp                tx2q
713
714.pass2:
715    shufps               m0, m0, q1032
716    shufps               m1, m1, q1032
717    call m(iadst_4x8_internal_8bpc).main
718
719    mova                 m4, m0
720    mova                 m5, m1
721    pshufd               m0, m3, q1032
722    pshufd               m1, m2, q1032
723    pshufd               m2, m5, q1032
724    pshufd               m3, m4, q1032
725    mova                 m5, [o(pw_2048)]
726    pxor                 m4, m4
727    psubw                m4, m5
728    jmp m(iadst_4x8_internal_8bpc).end
729
; Public 4x8 entry points whose first pass is the identity transform.
730INV_TXFM_4X8_FN identity, dct
731INV_TXFM_4X8_FN identity, adst
732INV_TXFM_4X8_FN identity, flipadst
733INV_TXFM_4X8_FN identity, identity
734
; iidentity_4x8_internal_8bpc
; Entry   : load the 32 coefficients, each pre-scaled by 2896*8.
; .pass1  : x += pmulhrsw(x, 1697*8), i.e. x + ((x*1697 + 2048) >> 12), with
;           a saturating add; then the shared re-ordering at
;           iadst_4x8 .pass1_end.
; .pass2  : no per-element transform; only the final multiplier differs
;           (pw_4096, so pmulhrsw yields (x + 4) >> 3) before continuing at
;           iadst_4x8 .end2.
735cglobal iidentity_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
736    mova                 m3, [o(pw_2896x8)]
737    pmulhrsw             m0, m3, [coeffq+16*0]
738    pmulhrsw             m1, m3, [coeffq+16*1]
739    pmulhrsw             m2, m3, [coeffq+16*2]
740    pmulhrsw             m3,     [coeffq+16*3]
741
742.pass1:
743    mova                 m7, [o(pw_1697x8)]
744    pmulhrsw             m4, m7, m0
745    pmulhrsw             m5, m7, m1
746    pmulhrsw             m6, m7, m2
747    pmulhrsw             m7, m3
748    paddsw               m0, m4
749    paddsw               m1, m5
750    paddsw               m2, m6
751    paddsw               m3, m7
752    jmp m(iadst_4x8_internal_8bpc).pass1_end
753
754.pass2:
755    mova                 m4, [o(pw_4096)]
756    jmp m(iadst_4x8_internal_8bpc).end2
757
758
; WRITE_8X2 coef1, coef2, tmp1, tmp2, tmp3
; Adds two rows of eight 16-bit residuals to the 8-bit pixels at [dstq] and
; [dstq+strideq], saturating to the unsigned byte range on the way back.
;   coef1/coef2: either a register number (becomes m<n>) or any other
;                operand that paddw accepts, used as given
;   tmp1-3     : scratch register numbers (clobbered)
; dstq is not modified.
759%macro WRITE_8X2 5       ;coefs[1-2], tmp[1-3]
760    movq                 m%3, [dstq        ]
761    movq                 m%4, [dstq+strideq]
762    pxor                 m%5, m%5
763    punpcklbw            m%3, m%5                 ;extend byte to word
764    punpcklbw            m%4, m%5                 ;extend byte to word
765%ifnum %1
766    paddw                m%3, m%1
767%else
768    paddw                m%3, %1
769%endif
770%ifnum %2
771    paddw                m%4, m%2
772%else
773    paddw                m%4, %2
774%endif
775    packuswb             m%3, m%4
776    movq      [dstq        ], m%3
777    punpckhqdq           m%3, m%3
778    movq      [dstq+strideq], m%3
779%endmacro
780
; WRITE_8X4 coef1, coef2, coef3, coef4, tmp1, tmp2, tmp3
; Adds four rows of eight residuals to the destination using two WRITE_8X2
; expansions. dstq is advanced by two strides in between and is left
; pointing at the third row.
781%macro WRITE_8X4 7      ;coefs[1-4], tmp[1-3]
782    WRITE_8X2             %1, %2, %5, %6, %7
783    lea                dstq, [dstq+strideq*2]
784    WRITE_8X2             %3, %4, %5, %6, %7
785%endmacro
786
; INV_TXFM_8X4_FN type1, type2
; Emits the 8x4 entry point (8 passed as the xmm/stack argument) and, for
; dct_dct, the DC-only path reached when eob == 0:
;   - broadcast coefficient 0 to all words of m0
;   - scale three times with pmulhrsw by 2896*8, then by pw_2048
;   - replicate into m1-m3 and tail-call iadst_8x4 .end2, which clears the
;     whole coefficient buffer (so no explicit DC store is needed here) and
;     writes the block.
787%macro INV_TXFM_8X4_FN 2 ; type1, type2
788    INV_TXFM_FN          %1, %2, 8x4, 8
789%ifidn %1_%2, dct_dct
790    pshuflw              m0, [coeffq], q0000
791    punpcklqdq           m0, m0
792    mova                 m1, [o(pw_2896x8)]
793    pmulhrsw             m0, m1
794    pmulhrsw             m0, m1
795    mova                 m2, [o(pw_2048)]
796    pmulhrsw             m0, m1
797    pmulhrsw             m0, m2
798    mova                 m1, m0
799    mova                 m2, m0
800    mova                 m3, m0
801    TAIL_CALL m(iadst_8x4_internal_8bpc).end2
802%endif
803%endmacro
804
; Public 8x4 entry points whose first pass is the DCT.
805INV_TXFM_8X4_FN dct, dct
806INV_TXFM_8X4_FN dct, adst
807INV_TXFM_8X4_FN dct, flipadst
808INV_TXFM_8X4_FN dct, identity
809
; idct_8x4_internal_8bpc
; First pass : load the 32 coefficients pre-scaled by 2896*8, run the
;              8-point packed DCT (idct_4x8 .main), then re-order with
;              pshufb (deint_shuf1/2) and dword/qword unpacks so that
;              m0-m3 hold in0-in3 for the second pass; jump to tx2q.
; .pass2     : 4-point DCT on full registers (.main), then continue at
;              iadst_8x4 .end (scale by pw_2048, clear coefficients, write).
; .main      : IDCT4_1D on m0-m3 with m4/m5 as temporaries and m6 = pd_2048.
810cglobal idct_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
811    mova                 m3, [o(pw_2896x8)]
812    pmulhrsw             m0, m3, [coeffq+16*0]
813    pmulhrsw             m1, m3, [coeffq+16*1]
814    pmulhrsw             m2, m3, [coeffq+16*2]
815    pmulhrsw             m3,     [coeffq+16*3]
816
817    call m(idct_4x8_internal_8bpc).main
818
819    mova                 m4, [o(deint_shuf1)]
820    mova                 m5, [o(deint_shuf2)]
821    pshufb               m0, m4
822    pshufb               m1, m5
823    pshufb               m2, m4
824    pshufb               m3, m5
825    punpckhdq            m4, m0, m1
826    punpckldq            m0, m1
827    punpckhdq            m5, m2, m3
828    punpckldq            m2, m3
829    punpckhqdq           m1, m0, m2                      ;in1
830    punpcklqdq           m0, m2                          ;in0
831    punpckhqdq           m3, m4, m5                      ;in3
832    punpcklqdq           m2 ,m4, m5                      ;in2
833    jmp                tx2q
834
835.pass2:
836    call .main
837    jmp m(iadst_8x4_internal_8bpc).end
838
839ALIGN function_align
840cglobal_label .main
841    mova                 m6, [o(pd_2048)]
842    IDCT4_1D             0, 1, 2, 3, 4, 5, 6
843    ret
844
; Public 8x4 entry points whose first pass is the ADST.
845INV_TXFM_8X4_FN adst, dct
846INV_TXFM_8X4_FN adst, adst
847INV_TXFM_8X4_FN adst, flipadst
848INV_TXFM_8X4_FN adst, identity
849
; iadst_8x4_internal_8bpc
; First pass : load the 32 coefficients pre-scaled by 2896*8, swap the
;              qword halves of m0/m1, run the 8-point packed ADST
;              (iadst_4x8 .main). That routine returns negated values in
;              the high halves; they are interleaved and negated again here
;              (saturating subtract from zero) before the final unpacks
;              produce in0-in3 for the second pass. Jump to tx2q.
; .pass2     : 4-point ADST on full registers (.main, defined after this
;              block).
; .end       : scale m0-m3 with pmulhrsw by pw_2048.
; .end2      : clear the 64-byte coefficient buffer.
; .end3      : WRITE_8X4 with m4-m6 as temporaries, then RET.
; .end and .end2 are also entered from other 8x4 routines in this file.
850cglobal iadst_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
851    mova                 m3, [o(pw_2896x8)]
852    pmulhrsw             m0, m3, [coeffq+16*0]
853    pmulhrsw             m1, m3, [coeffq+16*1]
854    pmulhrsw             m2, m3, [coeffq+16*2]
855    pmulhrsw             m3,     [coeffq+16*3]
856
857    shufps               m0, m0, q1032
858    shufps               m1, m1, q1032
859    call m(iadst_4x8_internal_8bpc).main
860
861    punpckhwd            m4, m0, m1
862    punpcklwd            m0, m1
863    punpckhwd            m1, m2, m3
864    punpcklwd            m2, m3
865    pxor                 m5, m5
866    psubsw               m3, m5, m1
867    psubsw               m5, m4
868    punpckhdq            m4, m5, m3
869    punpckldq            m5, m3
870    punpckhdq            m3, m0, m2
871    punpckldq            m0, m2
872    punpckhwd            m1, m0, m5      ;in1
873    punpcklwd            m0, m5          ;in0
874    punpcklwd            m2, m3, m4      ;in2
875    punpckhwd            m3, m4          ;in3
876    jmp              tx2q
877
878.pass2:
879    call .main
880
881.end:
882    mova                 m4, [o(pw_2048)]
883    pmulhrsw             m0, m4
884    pmulhrsw             m1, m4
885    pmulhrsw             m2, m4
886    pmulhrsw             m3, m4
887
888.end2:
889    pxor                 m6, m6
890    mova      [coeffq+16*0], m6
891    mova      [coeffq+16*1], m6
892    mova      [coeffq+16*2], m6
893    mova      [coeffq+16*3], m6
894.end3:
895    WRITE_8X4             0, 1, 2, 3, 4, 5, 6
896    RET
897
898ALIGN function_align
899cglobal_label .main
900    punpckhwd            m6, m0, m2                    ;unpacked in0 in2
901    punpcklwd            m0, m2                        ;unpacked in0 in2
902    punpckhwd            m7, m1, m3                    ;unpacked in1 in3
903    punpcklwd            m1, m3                        ;unpacked in1 in3
904
905    mova                 m2, [o(pw_3344_m3344)]
906    mova                 m4, [o(pw_0_3344)]
907    pmaddwd              m3, m2, m6                    ;3344 * in0 - 3344 * in2
908    pmaddwd              m5, m4, m7                    ;3344 * in3
909    pmaddwd              m2, m0
910    pmaddwd              m4, m1
911    paddd                m3, m5
912    paddd                m2, m4
913    mova                 m4, [o(pd_2048)]
914    paddd                m3, m4                        ;t2 + 2048
915    paddd                m2, m4
916    psrad                m3, 12
917    psrad                m2, 12
918    packssdw             m2, m3                        ;out2
919
920    pmaddwd              m4, m0, [o(pw_1321_3803)]     ;1321 * in0 + 3803 * in2
921    pmaddwd              m0, [o(pw_2482_m1321)]        ;2482 * in0 - 1321 * in2
922    pmaddwd              m3, m1, [o(pw_3344_2482)]     ;3344 * in1 + 2482 * in3
923    pmaddwd              m5, m1, [o(pw_3344_m3803)]    ;3344 * in1 - 3803 * in3
924    paddd                m3, m4                        ;t0 + t3
925
926    pmaddwd              m1, [o(pw_m6688_m3803)]       ;-2 * 3344 * in1 - 3803 * in3
927    mova                 m4, [o(pd_2048)]
928    paddd                m0, m4
929    paddd                m4, m3                        ;t0 + t3 + 2048
930    paddd                m5, m0                        ;t1 + t3 + 2048
931    paddd                m3, m0
932    paddd                m3, m1                        ;t0 + t1 - t3 + 2048
933
934    psrad                m4, 12                        ;out0
935    psrad                m5, 12                        ;out1
936    psrad                m3, 12                        ;out3
937    packssdw             m0, m4, m5                    ;low: out0  high: out1
938
939    pmaddwd              m4, m6, [o(pw_1321_3803)]     ;1321 * in0 + 3803 * in2
940    pmaddwd              m6, [o(pw_2482_m1321)]        ;2482 * in0 - 1321 * in2
941    pmaddwd              m1, m7, [o(pw_3344_2482)]     ;3344 * in1 + 2482 * in3
942    pmaddwd              m5, m7, [o(pw_3344_m3803)]    ;3344 * in1 - 3803 * in3
943    paddd                m1, m4                        ;t0 + t3
944    pmaddwd              m7, [o(pw_m6688_m3803)]       ;-2 * 3344 * in1 - 3803 * in3
945
946    mova                 m4, [o(pd_2048)]
947    paddd                m6, m4
948    paddd                m4, m1                        ;t0 + t3 + 2048
949    paddd                m5, m6                        ;t1 + t3 + 2048
950    paddd                m1, m6
951    paddd                m1, m7                        ;t0 + t1 - t3 + 2048
952
953    psrad                m4, 12                        ;out0
954    psrad                m5, 12                        ;out1
955    psrad                m1, 12                        ;out3
956    packssdw             m3, m1                        ;out3
957    packssdw             m4, m5                        ;low: out0  high: out1
958
959    punpckhqdq           m1, m0, m4                    ;out1
960    punpcklqdq           m0, m4                        ;out0
961    ret
962
INV_TXFM_8X4_FN flipadst, dct
INV_TXFM_8X4_FN flipadst, adst
INV_TXFM_8X4_FN flipadst, flipadst
INV_TXFM_8X4_FN flipadst, identity

; 8x4 inverse flipped ADST, 8 bpc.
; Uses the same kernels as the ADST variant; the flip is obtained by feeding
; the kernel outputs to the transpose in reverse register order (pass 1) and
; by reversing the four output rows (pass 2).
cglobal iflipadst_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    mova                 m3, [o(pw_2896x8)]              ;same pre-scale LOAD_8ROWS applies for is_rect2
    pmulhrsw             m0, m3, [coeffq+16*0]
    pmulhrsw             m1, m3, [coeffq+16*1]
    pmulhrsw             m2, m3, [coeffq+16*2]
    pmulhrsw             m3,     [coeffq+16*3]

    shufps               m0, m0, q1032                   ;swap the two 64-bit halves of m0
    shufps               m1, m1, q1032                   ;swap the two 64-bit halves of m1
    call m(iadst_4x8_internal_8bpc).main

    ; interleave in the order m3, m2, m1, m0 (reverse of the ADST variant)
    punpckhwd            m5, m3, m2
    punpcklwd            m3, m2
    punpckhwd            m2, m1, m0
    punpcklwd            m1, m0

    pxor                 m0, m0
    psubsw               m4, m0, m2                      ;negate (0 - x, saturating)
    psubsw               m0, m5                          ;negate (0 - x, saturating)
    punpckhdq            m2, m0, m4
    punpckldq            m0, m4
    punpckhdq            m4, m3, m1
    punpckldq            m3, m1
    punpckhwd            m1, m0, m3      ;in1
    punpcklwd            m0, m3          ;in0
    punpckhwd            m3, m2, m4      ;in3
    punpcklwd            m2, m4          ;in2
    jmp                  tx2q

.pass2:
    call m(iadst_8x4_internal_8bpc).main
    ; reverse the row order: m0 <-> m3 and m1 <-> m2 (m4/m5 are scratch)
    mova                 m4, m0
    mova                 m5, m1
    mova                 m0, m3
    mova                 m1, m2
    mova                 m2, m5
    mova                 m3, m4
    jmp m(iadst_8x4_internal_8bpc).end                   ;shared scale/clear/write tail

INV_TXFM_8X4_FN identity, dct
INV_TXFM_8X4_FN identity, adst
INV_TXFM_8X4_FN identity, flipadst
INV_TXFM_8X4_FN identity, identity

; 8x4 identity transform, 8 bpc.
; Pass 1 only scales the coefficients (pw_2896x8, then a saturating x2) and
; transposes them into in0-in3. Pass 2 computes x + pmulhrsw(x, pw_1697x8)
; and reuses the ADST tail for the final scale, coefficient clear and write.
cglobal iidentity_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    mova                 m3, [o(pw_2896x8)]              ;same pre-scale LOAD_8ROWS applies for is_rect2
    pmulhrsw             m0, m3, [coeffq+16*0]
    pmulhrsw             m1, m3, [coeffq+16*1]
    pmulhrsw             m2, m3, [coeffq+16*2]
    pmulhrsw             m3,     [coeffq+16*3]
    paddsw               m0, m0                          ;x2 with signed saturation
    paddsw               m1, m1
    paddsw               m2, m2
    paddsw               m3, m3

    ; word/dword/word interleaves that regroup the data into in0-in3
    punpckhwd            m4, m0, m1
    punpcklwd            m0, m1
    punpckhwd            m1, m2, m3
    punpcklwd            m2, m3
    punpckhdq            m5, m4, m1
    punpckldq            m4, m1
    punpckhdq            m3, m0, m2
    punpckldq            m0, m2
    punpckhwd            m1, m0, m4      ;in1
    punpcklwd            m0, m4          ;in0
    punpcklwd            m2, m3, m5      ;in2
    punpckhwd            m3, m5          ;in3
    jmp                tx2q

.pass2:
    ; m4-m7 receive the pw_1697x8 products, which are then added back onto
    ; the inputs with signed saturation
    mova                 m7, [o(pw_1697x8)]
    pmulhrsw             m4, m7, m0
    pmulhrsw             m5, m7, m1
    pmulhrsw             m6, m7, m2
    pmulhrsw             m7, m3
    paddsw               m0, m4
    paddsw               m1, m5
    paddsw               m2, m6
    paddsw               m3, m7
    jmp m(iadst_8x4_internal_8bpc).end                   ;shared scale/clear/write tail

; Instantiates the 8x8 entry point for the transform pair type1/type2 by
; forwarding to INV_TXFM_FN with size 8x8 and the extra arguments 8 and 16*4.
; NOTE(review): INV_TXFM_FN is defined outside this view; the 16*4 argument is
; presumably the stack scratch size used by the 8x8 internals - confirm.
; For dct_dct only, the macro additionally emits the code below, which
; broadcasts coefficient 0, scales it and adds it to all eight rows.
%macro INV_TXFM_8X8_FN 2 ; type1, type2
    INV_TXFM_FN          %1, %2, 8x8, 8, 16*4
%ifidn %1_%2, dct_dct
    pshuflw              m0, [coeffq], q0000             ;replicate word 0 across the low qword
    punpcklwd            m0, m0                          ;... and across the whole register
    mova                 m1, [o(pw_2896x8)]
    pmulhrsw             m0, m1
    mova                 m2, [o(pw_16384)]
    ; NOTE(review): presumably eob is zero on this path, so this store clears
    ; the DC coefficient - confirm against INV_TXFM_FN
    mov            [coeffq], eobd
    pmulhrsw             m0, m2
    psrlw                m2, 3                           ;16384 >> 3 = 2048
    pmulhrsw             m0, m1
    pmulhrsw             m0, m2
.end:
    mov                 r3d, 2                           ;two WRITE_8X4 iterations
    lea                tx2q, [o(m(inv_txfm_add_dct_dct_8x8_8bpc).end3)]
.loop:
    WRITE_8X4             0, 0, 0, 0, 1, 2, 3            ;every source row is m0
    lea                dstq, [dstq+strideq*2]
    dec                 r3d
    jg .loop
    jmp                tx2q                              ;continuation; set to .end3 on the path above
.end3:
    RET
%endif
%endmacro

; Loads eight rows from [src + stride*i], i = 0..7, into m0-m7.
;   %1 = base address expression, %2 = byte distance between rows,
;   %3 = is_rect2 (optional, default 0): when non-zero every row is
;        multiplied by pw_2896x8 with pmulhrsw while loading; m7 holds the
;        constant until the last row overwrites it.
%macro LOAD_8ROWS 2-3 0 ; src, stride, is_rect2
%if %3
    mova                 m7, [o(pw_2896x8)]
    pmulhrsw             m0, m7, [%1+%2*0]
    pmulhrsw             m1, m7, [%1+%2*1]
    pmulhrsw             m2, m7, [%1+%2*2]
    pmulhrsw             m3, m7, [%1+%2*3]
    pmulhrsw             m4, m7, [%1+%2*4]
    pmulhrsw             m5, m7, [%1+%2*5]
    pmulhrsw             m6, m7, [%1+%2*6]
    pmulhrsw             m7, [%1+%2*7]
%else
    mova                 m0, [%1+%2*0]
    mova                 m1, [%1+%2*1]
    mova                 m2, [%1+%2*2]
    mova                 m3, [%1+%2*3]
    mova                 m4, [%1+%2*4]
    mova                 m5, [%1+%2*5]
    mova                 m6, [%1+%2*6]
    mova                 m7, [%1+%2*7]
%endif
%endmacro

; Odd half of the 8-point inverse DCT.
;   %1-%4 = registers holding the four odd inputs, %5-%6 = temporaries,
;   %7    = register holding pd_2048 (passed on to ITX_MULSUB_2W).
; On exit the trailing comments give the placement: %1 = t4, %2 = t5,
; %5 = t6, %4 = t7. %3 and %6 are clobbered.
%macro IDCT8_1D_ODDHALF 7 ; src[1-4], tmp[1-2], pd_2048
    ITX_MULSUB_2W         %1, %4, %5, %6, %7,  799, 4017    ;t4a, t7a
    ITX_MULSUB_2W         %3, %2, %5, %6, %7, 3406, 2276, 1 ;t5a, t6a
    psubsw               m%2, m%4, m%5                      ;t6a
    paddsw               m%4, m%5                           ;t7
    psubsw               m%5, m%1, m%3                      ;t5a
    paddsw               m%1, m%3                           ;t4
    ITX_MULSUB_2W         %2, %5, %3, %6, %7, 2896, 2896, 1 ;t5, t6
%endmacro

INV_TXFM_8X8_FN dct, dct
INV_TXFM_8X8_FN dct, adst
INV_TXFM_8X8_FN dct, flipadst
INV_TXFM_8X8_FN dct, identity

; 8x8 inverse DCT, 8 bpc.
; Eight rows live in m0-m7 plus three 16-byte stack slots. The top-level code
; addresses the slots as [rsp+gprsize+16*n]; .main is reached through `call`,
; so it addresses the same slots as [rsp+gprsize*2+16*n] (one more return
; address on the stack).
; .pass1_end2, .pass1_end3, .end2, .end3 and .end4 are also used by the
; adst/flipadst/identity 8x8 functions in this file.
cglobal idct_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    LOAD_8ROWS          coeffq, 16

.pass1:
    call .main                                     ;m0-m6 = out0-out6, slot 0 = out7

.pass1_end:
    mova                    m7, [o(pw_16384)]

.pass1_end1:
    REPX      {pmulhrsw x, m7}, m0, m2, m4, m6
    mova    [rsp+gprsize+16*1], m6                 ;row 6 parks in slot 1 during the transpose

.pass1_end2:
    REPX      {pmulhrsw x, m7}, m1, m3, m5
    pmulhrsw                m7, [rsp+gprsize+16*0] ;row 7 comes from slot 0

; 8x8 word transpose. In: m0-m5 and m7 = rows 0-5 and 7, slot 1 = row 6.
; Out: m0-m7 = transposed rows (row 7 is also left in slot 0).
cglobal_label .pass1_end3
    punpcklwd               m6, m1, m5             ;10 50 11 51 12 52 13 53
    punpckhwd               m1, m5                 ;14 54 15 55 16 56 17 57
    punpckhwd               m5, m0, m4             ;04 44 05 45 06 46 07 47
    punpcklwd               m0, m4                 ;00 40 01 41 02 42 03 43
    punpckhwd               m4, m3, m7             ;34 74 35 75 36 76 37 77
    punpcklwd               m3, m7                 ;30 70 31 71 32 72 33 73
    punpckhwd               m7, m1, m4             ;16 36 56 76 17 37 57 77
    punpcklwd               m1, m4                 ;14 34 54 74 15 35 55 75
    punpckhwd               m4, m6, m3             ;12 32 52 72 13 33 53 73
    punpcklwd               m6, m3                 ;10 30 50 70 11 31 51 71
    mova    [rsp+gprsize+16*2], m6
    mova                    m6, [rsp+gprsize+16*1]
    punpckhwd               m3, m2, m6             ;24 64 25 65 26 66 27 67
    punpcklwd               m2, m6                 ;20 60 21 61 22 62 23 63
    punpckhwd               m6, m5, m3             ;06 26 46 66 07 27 47 67
    punpcklwd               m5, m3                 ;04 24 44 64 05 25 45 65
    punpckhwd               m3, m0, m2             ;02 22 42 62 03 23 43 63
    punpcklwd               m0, m2                 ;00 20 40 60 01 21 41 61

    punpckhwd               m2, m6, m7             ;07 17 27 37 47 57 67 77
    punpcklwd               m6, m7                 ;06 16 26 36 46 56 66 76
    mova    [rsp+gprsize+16*0], m2
    punpcklwd               m2, m3, m4             ;02 12 22 32 42 52 62 72
    punpckhwd               m3, m4                 ;03 13 23 33 43 53 63 73
    punpcklwd               m4, m5, m1             ;04 14 24 34 44 54 64 74
    punpckhwd               m5, m1                 ;05 15 25 35 45 55 65 75
    mova                    m7, [rsp+gprsize+16*2]
    punpckhwd               m1, m0, m7             ;01 11 21 31 41 51 61 71
    punpcklwd               m0, m7                 ;00 10 20 30 40 50 60 70
    mova                    m7, [rsp+gprsize+16*0]
    jmp                   tx2q

.pass2:
    ; from here on tx2q is the continuation taken after the pixels are written
    lea                   tx2q, [o(m(idct_8x8_internal_8bpc).end4)]

.pass2_main:
    call .main

.end:
    mova                    m7, [o(pw_2048)]
    REPX      {pmulhrsw x, m7}, m0, m2, m4, m6
    mova    [rsp+gprsize+16*1], m6

.end2:
    REPX      {pmulhrsw x, m7}, m1, m3, m5
    pmulhrsw                m7, [rsp+gprsize+16*0]
    mova    [rsp+gprsize+16*2], m5
    mova    [rsp+gprsize+16*0], m7

.end3:
    ; In: m0-m4 = rows 0-4, slots 2/1/0 = rows 5/6/7; m5-m7 serve as temporaries
    WRITE_8X4                0, 1, 2, 3, 5, 6, 7
    lea                   dstq, [dstq+strideq*2]
    WRITE_8X4                4, [rsp+gprsize+16*2], [rsp+gprsize+16*1], [rsp+gprsize+16*0], 5, 6, 7
    jmp                   tx2q

.end4:
    ; zero all eight 16-byte coefficient rows
    pxor                    m7, m7
    REPX   {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7
    ret

ALIGN function_align
; 8-point inverse DCT. In: m0-m7 = in0-in7.
; Out: m0-m6 = out0-out6, out7 in stack slot 0 (m7 is left holding a temporary).
cglobal_label .main
    mova  [rsp+gprsize*2+16*0], m7                 ;spill in7
    mova  [rsp+gprsize*2+16*1], m3                 ;spill in3
    mova  [rsp+gprsize*2+16*2], m1                 ;spill in1
    mova                    m7, [o(pd_2048)]
    IDCT4_1D                 0, 2, 4, 6, 1, 3, 7   ;even half on in0/in2/in4/in6
    ; swap the even-half results in m2/m4/m6 with the spilled odd inputs
    mova                    m3, [rsp+gprsize*2+16*2]
    mova  [rsp+gprsize*2+16*2], m2
    mova                    m2, [rsp+gprsize*2+16*1]
    mova  [rsp+gprsize*2+16*1], m4
    mova                    m4, [rsp+gprsize*2+16*0]
    mova  [rsp+gprsize*2+16*0], m6
    IDCT8_1D_ODDHALF         3, 2, 5, 4, 1, 6, 7   ;odd half on in1/in3/in5/in7
    ; final butterflies between the even and odd halves
    mova                    m6, [rsp+gprsize*2+16*0]
    psubsw                  m7, m0, m4                    ;out7
    paddsw                  m0, m4                        ;out0
    mova  [rsp+gprsize*2+16*0], m7
    mova                    m1, [rsp+gprsize*2+16*2]
    psubsw                  m4, m6, m3                    ;out4
    paddsw                  m3, m6                        ;out3
    mova                    m7, [rsp+gprsize*2+16*1]
    psubsw                  m6, m1, m5                    ;out6
    paddsw                  m1, m5                        ;out1
    psubsw                  m5, m7, m2                    ;out5
    paddsw                  m2, m7                        ;out2
    ret


INV_TXFM_8X8_FN adst, dct
INV_TXFM_8X8_FN adst, adst
INV_TXFM_8X8_FN adst, flipadst
INV_TXFM_8X8_FN adst, identity

; 8x8 inverse ADST, 8 bpc.
; .main produces out0/out6 and the negated out1/out7 together with the
; intermediates t2, t3, t6, t7; .main_pass1_end or .main_pass2_end turns
; those into out2, -out3, out4, -out5. The odd rows therefore leave the
; kernels negated and are multiplied by the negated scale constant before
; control joins the shared idct_8x8 tail.
; Stack slots are addressed as in idct_8x8: [rsp+gprsize+16*n] at top level,
; [rsp+gprsize*2+16*n] inside the called subroutines.
cglobal iadst_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    LOAD_8ROWS          coeffq, 16

.pass1:
    call .main
    call .main_pass1_end

.pass1_end:
    mova                    m7, [o(pw_16384)]

.pass1_end1:
    REPX      {pmulhrsw x, m7}, m0, m2, m4, m6
    mova    [rsp+gprsize+16*1], m6
    pxor                    m6, m6
    psubw                   m6, m7                 ;m6 = -m7
    mova                    m7, m6                 ;odd rows get the negated scale
    jmp m(idct_8x8_internal_8bpc).pass1_end2

ALIGN function_align
.pass2:
    lea                   tx2q, [o(m(idct_8x8_internal_8bpc).end4)]

.pass2_main:
    call .main
    call .main_pass2_end

.end:
    mova                    m7, [o(pw_2048)]
    REPX      {pmulhrsw x, m7}, m0, m2, m4, m6
    mova    [rsp+gprsize+16*1], m6
    pxor                    m6, m6
    psubw                   m6, m7                 ;m6 = -m7
    mova                    m7, m6                 ;odd rows get the negated scale
    jmp m(idct_8x8_internal_8bpc).end2

ALIGN function_align
; First part of the 8-point inverse ADST. In: m0-m7 = in0-in7.
; Out: m0 = out0, m1 = -out1, m6 = out6, stack slot 0 = -out7,
;      m4 = t2, m3 = t3, m5 = t6, m2 = t7. m7 is clobbered.
cglobal_label .main
    mova  [rsp+gprsize*2+16*0], m7                 ;spill in7
    mova  [rsp+gprsize*2+16*1], m3                 ;spill in3
    mova  [rsp+gprsize*2+16*2], m4                 ;spill in4
    mova                    m7, [o(pd_2048)]
    ITX_MULSUB_2W            5, 2, 3, 4, 7, 1931, 3612    ;t3a, t2a
    ITX_MULSUB_2W            1, 6, 3, 4, 7, 3920, 1189    ;t7a, t6a
    paddsw                  m3, m2, m6                    ;t2
    psubsw                  m2, m6                        ;t6
    paddsw                  m4, m5, m1                    ;t3
    psubsw                  m5, m1                        ;t7
    ITX_MULSUB_2W            5, 2, 1, 6, 7, 3784, 1567    ;t6a, t7a

    ; swap the intermediates in m5/m2/m3 with the spilled inputs
    mova                    m6, [rsp+gprsize*2+16*2]
    mova  [rsp+gprsize*2+16*2], m5
    mova                    m1, [rsp+gprsize*2+16*1]
    mova  [rsp+gprsize*2+16*1], m2
    mova                    m5, [rsp+gprsize*2+16*0]
    mova  [rsp+gprsize*2+16*0], m3
    ITX_MULSUB_2W            5, 0, 2, 3, 7,  401, 4076    ;t1a, t0a
    ITX_MULSUB_2W            1, 6, 2, 3, 7, 3166, 2598    ;t5a, t4a
    psubsw                  m2, m0, m6                    ;t4
    paddsw                  m0, m6                        ;t0
    paddsw                  m3, m5, m1                    ;t1
    psubsw                  m5, m1                        ;t5
    ITX_MULSUB_2W            2, 5, 1, 6, 7, 1567, 3784    ;t5a, t4a

    mova                    m7, [rsp+gprsize*2+16*0]
    paddsw                  m1, m3, m4                    ;-out7
    psubsw                  m3, m4                        ;t3
    mova  [rsp+gprsize*2+16*0], m1
    psubsw                  m4, m0, m7                    ;t2
    paddsw                  m0, m7                        ;out0
    mova                    m6, [rsp+gprsize*2+16*2]
    mova                    m7, [rsp+gprsize*2+16*1]
    paddsw                  m1, m5, m6                    ;-out1
    psubsw                  m5, m6                        ;t6
    paddsw                  m6, m2, m7                    ;out6
    psubsw                  m2, m7                        ;t7
    ret
ALIGN function_align
; Pass-1 completion: 32-bit pmaddwd form of the final 2896 rotations.
; In: m4 = t2, m3 = t3, m5 = t6, m2 = t7.
; Out: m2 = out2, m3 = -out3, m4 = out4, m5 = -out5.
; m1 and m6 are saved to slots 1/2 and restored; m7 is clobbered.
.main_pass1_end:
    mova  [rsp+gprsize*2+16*1], m1
    mova  [rsp+gprsize*2+16*2], m6
    punpckhwd               m1, m4, m3                    ;t2 t3 pairs, high half
    punpcklwd               m4, m3                        ;t2 t3 pairs, low half
    punpckhwd               m7, m5, m2                    ;t6 t7 pairs, high half
    punpcklwd               m5, m2                        ;t6 t7 pairs, low half
    mova                    m2, [o(pw_2896_2896)]
    mova                    m6, [o(pd_2048)]
    pmaddwd                 m3, m2, m7
    pmaddwd                 m2, m5
    paddd                   m3, m6
    paddd                   m2, m6
    psrad                   m3, 12
    psrad                   m2, 12
    packssdw                m2, m3                        ;out2
    mova                    m3, [o(pw_2896_m2896)]
    pmaddwd                 m7, m3
    pmaddwd                 m5, m3
    paddd                   m7, m6
    paddd                   m5, m6
    psrad                   m7, 12
    psrad                   m5, 12
    packssdw                m5, m7                        ;-out5
    mova                    m3, [o(pw_2896_2896)]
    pmaddwd                 m7, m3, m1
    pmaddwd                 m3, m4
    paddd                   m7, m6
    paddd                   m3, m6
    psrad                   m7, 12
    psrad                   m3, 12
    packssdw                m3, m7                        ;-out3
    mova                    m7, [o(pw_2896_m2896)]
    pmaddwd                 m1, m7
    pmaddwd                 m4, m7
    paddd                   m1, m6
    paddd                   m4, m6
    psrad                   m1, 12
    psrad                   m4, 12
    packssdw                m4, m1                        ;out4
    mova                    m1, [rsp+gprsize*2+16*1]
    mova                    m6, [rsp+gprsize*2+16*2]
    ret
ALIGN function_align
; Pass-2 completion: 16-bit form of the same step using pmulhrsw by pw_2896x8.
; In: m4 = t2, m3 = t3, m5 = t6, m2 = t7.
; Out: m2 = out2, m3 = -out3, m4 = out4, m5 = -out5. m7 is clobbered.
cglobal_label .main_pass2_end
    paddsw                  m7, m4, m3                    ;t2 + t3
    psubsw                  m4, m3                        ;t2 - t3
    paddsw                  m3, m5, m2                    ;t6 + t7
    psubsw                  m5, m2                        ;t6 - t7
    mova                    m2, [o(pw_2896x8)]
    pmulhrsw                m4, m2                        ;out4
    pmulhrsw                m5, m2                        ;-out5
    pmulhrsw                m7, m2                        ;-out3
    pmulhrsw                m2, m3                        ;out2
    mova                    m3, m7
    ret

INV_TXFM_8X8_FN flipadst, dct
INV_TXFM_8X8_FN flipadst, adst
INV_TXFM_8X8_FN flipadst, flipadst
INV_TXFM_8X8_FN flipadst, identity

; 8x8 inverse flipped ADST, 8 bpc.
; Runs the ADST kernels and then reverses the order of the eight rows while
; scaling them. The kernels return the odd rows negated, so the rows coming
; from odd outputs are scaled with the negated constant.
cglobal iflipadst_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    LOAD_8ROWS          coeffq, 16

.pass1:
    call m(iadst_8x8_internal_8bpc).main
    call m(iadst_8x8_internal_8bpc).main_pass1_end

.pass1_end:
    mova                    m7, [o(pw_m16384)]

.pass1_end1:
    ; In: m0-m6 = adst rows 0-6 (odd ones negated), slot 0 = negated row 7.
    ; Out, as expected by .pass1_end3: m0-m5 and m7 = flipped rows 0-5 and 7,
    ; slot 1 = flipped row 6.
    pmulhrsw                m1, m7
    mova    [rsp+gprsize+16*1], m1                 ;old row 1 becomes row 6
    mova                    m1, m6                 ;old row 6 becomes row 1
    mova                    m6, m2
    pmulhrsw                m2, m5, m7             ;old row 5 becomes row 2
    mova                    m5, m6                 ;old row 2 becomes row 5
    mova                    m6, m4
    pmulhrsw                m4, m3, m7             ;old row 3 becomes row 4
    mova                    m3, m6                 ;old row 4 becomes row 3
    mova                    m6, m0
    mova                    m0, m7
    pxor                    m7, m7
    psubw                   m7, m0                 ;m7 = -m0, the positive scale
    pmulhrsw                m0, [rsp+gprsize+16*0] ;old row 7 becomes row 0
    REPX      {pmulhrsw x, m7}, m1, m3, m5
    pmulhrsw                m7, m6                 ;old row 0 becomes row 7
    jmp m(idct_8x8_internal_8bpc).pass1_end3

ALIGN function_align
.pass2:
    lea                   tx2q, [o(m(idct_8x8_internal_8bpc).end4)]

.pass2_main:
    call m(iadst_8x8_internal_8bpc).main
    call m(iadst_8x8_internal_8bpc).main_pass2_end

.end:
    ; Scale and reverse the rows into the layout read by idct_8x8 .end3:
    ; m0-m4 = rows 0-4, slots 2/1/0 = rows 5/6/7.
    mova                    m7, [o(pw_2048)]
    REPX      {pmulhrsw x, m7}, m0, m2, m4, m6
    mova    [rsp+gprsize+16*2], m2                 ;old row 2 becomes row 5
    mova                    m2, m0
    pxor                    m0, m0
    psubw                   m0, m7                 ;m0 = -m7, scale for the negated rows
    mova                    m7, m2                 ;m7 = scaled old row 0
    pmulhrsw                m1, m0
    pmulhrsw                m2, m5, m0             ;old row 5 becomes row 2
    mova    [rsp+gprsize+16*1], m1                 ;old row 1 becomes row 6
    mova                    m5, m4
    mova                    m1, m6                 ;old row 6 becomes row 1
    pmulhrsw                m4, m3, m0             ;old row 3 becomes row 4
    pmulhrsw                m0, [rsp+gprsize+16*0] ;old row 7 becomes row 0
    mova                    m3, m5                 ;old row 4 becomes row 3
    mova    [rsp+gprsize+16*0], m7                 ;old row 0 becomes row 7
    jmp m(idct_8x8_internal_8bpc).end3

INV_TXFM_8X8_FN identity, dct
INV_TXFM_8X8_FN identity, adst
INV_TXFM_8X8_FN identity, flipadst
INV_TXFM_8X8_FN identity, identity

; 8x8 identity transform, 8 bpc.
; Pass 1 performs no arithmetic: the rows are loaded and handed straight to
; the shared transpose. Pass 2 only applies the pw_4096 rounding scale and
; arranges the rows for the shared write-out.
cglobal iidentity_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    LOAD_8ROWS          coeffq, 16
    mova    [rsp+gprsize+16*1], m6                 ;.pass1_end3 reads row 6 from slot 1
    jmp   m(idct_8x8_internal_8bpc).pass1_end3

ALIGN function_align
.pass2:
    lea                   tx2q, [o(m(idct_8x8_internal_8bpc).end4)]

.end:
    ; m7 is needed for the constant, so row 7 is scaled from memory first and
    ; stored to slot 0; rows 5 and 6 go to slots 2 and 1 as idct_8x8 .end3 expects
    pmulhrsw                m7, [o(pw_4096)]
    mova    [rsp+gprsize+16*0], m7
    mova                    m7, [o(pw_4096)]
    REPX      {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
    mova    [rsp+gprsize+16*2], m5
    mova    [rsp+gprsize+16*1], m6
    jmp m(idct_8x8_internal_8bpc).end3


; Instantiates the 4x16 entry point for the transform pair type1/type2 by
; forwarding to INV_TXFM_FN with size 4x16 and the extra argument 8.
; For dct_dct only, the macro additionally emits the code below, which
; broadcasts coefficient 0, scales it and writes it through four WRITE_4X4
; invocations, advancing dstq by four strides between them.
%macro INV_TXFM_4X16_FN 2 ; type1, type2
    INV_TXFM_FN          %1, %2, 4x16, 8
%ifidn %1_%2, dct_dct
    pshuflw               m0, [coeffq], q0000            ;replicate word 0 across the low qword
    punpcklwd             m0, m0                         ;... and across the whole register
    mova                  m1, [o(pw_2896x8)]
    pmulhrsw              m0, m1
    ; NOTE(review): presumably eob is zero on this path, so this store clears
    ; the DC coefficient - confirm against INV_TXFM_FN
    mov             [coeffq], eobd
    pmulhrsw              m0, [o(pw_16384)]
    pmulhrsw              m0, m1
    pmulhrsw              m0, [o(pw_2048)]
.end:
    WRITE_4X4             0, 0, 1, 2, 3, 0, 1, 2, 3      ;both source registers are m0
    lea                dstq, [dstq+strideq*4]
    WRITE_4X4             0, 0, 1, 2, 3, 0, 1, 2, 3
    lea                dstq, [dstq+strideq*4]
    WRITE_4X4             0, 0, 1, 2, 3, 0, 1, 2, 3
    lea                dstq, [dstq+strideq*4]
    WRITE_4X4             0, 0, 1, 2, 3, 0, 1, 2, 3
    RET
%endif
%endmacro

INV_TXFM_4X16_FN dct, dct
INV_TXFM_4X16_FN dct, adst
INV_TXFM_4X16_FN dct, flipadst
INV_TXFM_4X16_FN dct, identity

; 4x16 inverse DCT, 8 bpc.
; Pass 1 reuses the 4x8 first-pass code whose address is in r3, running it
; twice: first on the odd-numbered 16-byte coefficient rows (1, 3, 5, 7),
; then on the even-numbered ones. The 4x8 code returns through tx2q, so tx2q
; is temporarily pointed at the next step and the real continuation is kept
; on the stack. The adst/flipadst 4x16 functions enter at .pass1 with their
; own r3.
cglobal idct_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    lea                  r3, [o(m(idct_4x8_internal_8bpc).pass1)]

.pass1:
    mova                 m0, [coeffq+16*1]
    mova                 m1, [coeffq+16*3]
    mova                 m2, [coeffq+16*5]
    mova                 m3, [coeffq+16*7]
    push               tx2q                              ;save the real continuation
    lea                tx2q, [o(m(idct_4x16_internal_8bpc).pass1_2)]
    jmp                  r3

.pass1_2:
    ; store the first result set back in place and load the even rows
    mova      [coeffq+16*1], m0
    mova      [coeffq+16*3], m1
    mova      [coeffq+16*5], m2
    mova      [coeffq+16*7], m3
    mova                 m0, [coeffq+16*0]
    mova                 m1, [coeffq+16*2]
    mova                 m2, [coeffq+16*4]
    mova                 m3, [coeffq+16*6]
    lea                tx2q, [o(m(idct_4x16_internal_8bpc).pass1_end)]
    jmp                  r3

.pass1_end:
    pop                tx2q                              ;restore the real continuation

    ; scale everything by pw_16384; m0-m6 stay in registers, the eighth
    ; register's worth is scaled from memory and stored back to row 7
    mova                 m4, [coeffq+16*1]
    mova                 m5, [coeffq+16*3]
    mova                 m6, [coeffq+16*5]
    mova                 m7, [o(pw_16384)]
    REPX   {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6

    pmulhrsw             m7, [coeffq+16*7]
    mova       [coeffq+16*7], m7
    jmp                tx2q

.pass2:
    call m(idct_16x4_internal_8bpc).main

.end:
    ; final pw_2048 rounding scale; the eighth value is again taken from row 7
    mova                  m7, [o(pw_2048)]
    REPX    {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
    pmulhrsw              m7, [coeffq+16*7]
    mova       [coeffq+16*4], m4

.end1:
    mova       [coeffq+16*5], m5
    mova       [coeffq+16*6], m6
    ; NOTE(review): the code below addresses the coefficients through r3;
    ; presumably WRITE_4X8 clobbers coeffq - confirm against the macro
    mov                   r3, coeffq
    WRITE_4X8              0, 1, 3, 2

    ; second half of the block: reload the values parked in rows 4-6
    mova                  m0, [r3+16*4]
    mova                  m1, [r3+16*5]
    mova                  m2, [r3+16*6]
    mova                  m3, m7
    lea                 dstq, [dstq+strideq*4]
    WRITE_4X8              0, 1, 3, 2

.end2:
    ; zero all eight 16-byte coefficient rows
    pxor                  m7, m7
    REPX     {mova [r3+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7
    ret

INV_TXFM_4X16_FN adst, dct
INV_TXFM_4X16_FN adst, adst
INV_TXFM_4X16_FN adst, flipadst
INV_TXFM_4X16_FN adst, identity

; 4x16 inverse ADST, 8 bpc.
; Pass 1 is the shared idct_4x16 .pass1 driver with r3 pointing at the 4x8
; ADST first pass. Pass 2 calls the 16-point ADST kernels of the 16x4 code,
; regroups the half-register outputs, scales them (.end1) and writes the
; sixteen rows (.end2). iflipadst_4x16 joins at .end1 with m7 = pw_m2048.
cglobal iadst_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    lea                   r3, [o(m(iadst_4x8_internal_8bpc).pass1)]
    jmp   m(idct_4x16_internal_8bpc).pass1

.pass2:
    call m(iadst_16x4_internal_8bpc).main
    call m(iadst_16x4_internal_8bpc).main_pass2_end

    ; gather outputs that need the same sign treatment into common registers;
    ; coefficient rows 4-7 are used as scratch storage
    punpcklqdq            m6, m5, m4                ;low: -out5  high: -out7
    punpckhqdq            m4, m5                    ;low:  out8  high:  out10
    punpcklqdq            m5, m7, m2                ;low:  out4  high:  out6
    punpckhqdq            m2, m7                    ;low: -out9  high: -out11
    mova       [coeffq+16*4], m2
    mova       [coeffq+16*5], m6
    mova                  m2, [coeffq+16*6]
    mova                  m6, [coeffq+16*7]
    punpckhqdq            m1, m6, m0                ;low: -out13 high: -out15
    punpcklqdq            m0, m6                    ;low:  out0  high:  out2
    punpckhqdq            m6, m3, m2                ;low:  out12 high:  out14
    punpcklqdq            m2, m3                    ;low: -out1  high: -out3

    mova                  m7, [o(pw_2048)]

.end1:
    ; In: m7 = scale for m0, m5, m4, m6; the negated scale (m3 = 0 - m7) is
    ; applied to m2, m1 and the two registers parked in rows 4 and 5
    REPX    {pmulhrsw x, m7}, m0, m5, m4, m6
    pxor                  m3, m3
    psubw                 m3, m7
    mova                  m7, [coeffq+16*4]
    REPX    {pmulhrsw x, m3}, m2, m7, m1
    pmulhrsw              m3, [coeffq+16*5]
    mova       [coeffq+16*7], m5

    ; re-pair the halves so every register holds two consecutive output rows
    punpckhqdq            m5, m4, m7                ;low:  out10 high:  out11
    punpcklqdq            m4, m7                    ;low:  out8  high:  out9
    punpckhqdq            m7, m6, m1                ;low:  out14 high:  out15
    punpcklqdq            m6, m1                    ;low:  out12 high:  out13
    punpckhqdq            m1, m0, m2                ;low:  out2  high:  out3
    punpcklqdq            m0, m2                    ;low:  out0  high:  out1
    mova       [coeffq+16*4], m4
    mova                  m4, [coeffq+16*7]
    punpcklqdq            m2, m4, m3                ;low:  out4  high:  out5
    punpckhqdq            m4, m3                    ;low:  out6  high:  out7
    mova                  m3, m4

.end2:
    mova       [coeffq+16*5], m5
    mova       [coeffq+16*6], m6
    ; NOTE(review): the code below addresses the coefficients through r3;
    ; presumably WRITE_4X8 clobbers coeffq - confirm against the macro
    mov                   r3, coeffq
    WRITE_4X8              0, 1, 2, 3

    ; second half of the block: reload the values parked in rows 4-6
    mova                  m0, [r3+16*4]
    mova                  m1, [r3+16*5]
    mova                  m2, [r3+16*6]
    mova                  m3, m7
    lea                 dstq, [dstq+strideq*4]
    WRITE_4X8              0, 1, 2, 3

.end3:
    ; zero all eight 16-byte coefficient rows
    pxor                  m7, m7
    REPX     {mova [r3+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7
    ret


INV_TXFM_4X16_FN flipadst, dct
INV_TXFM_4X16_FN flipadst, adst
INV_TXFM_4X16_FN flipadst, flipadst
INV_TXFM_4X16_FN flipadst, identity

; 4x16 inverse flipped ADST, 8 bpc.
; Pass 1 is the shared idct_4x16 .pass1 driver with r3 pointing at the 4x8
; flipped-ADST first pass. Pass 2 runs the same 16-point ADST kernels as
; iadst_4x16 but regroups the halves with low/high swapped, then joins
; iadst_4x16 .end1 with the negated scale constant (pw_m2048).
cglobal iflipadst_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    lea                   r3, [o(m(iflipadst_4x8_internal_8bpc).pass1)]
    jmp   m(idct_4x16_internal_8bpc).pass1

.pass2:
    call m(iadst_16x4_internal_8bpc).main
    call m(iadst_16x4_internal_8bpc).main_pass2_end

    ; coefficient rows 4-7 are used as scratch storage, as in iadst_4x16
    punpckhqdq            m6, m5, m4                ;low:  out5  high:  out7
    punpcklqdq            m4, m5                    ;low: -out8  high: -out10
    punpckhqdq            m5, m7, m2                ;low: -out4  high: -out6
    punpcklqdq            m2, m7                    ;low:  out9  high:  out11
    mova       [coeffq+16*4], m2
    mova       [coeffq+16*5], m6
    mova                  m2, [coeffq+16*6]
    mova                  m6, [coeffq+16*7]
    punpcklqdq            m1, m6, m0                ;low:  out13 high:  out15
    punpckhqdq            m0, m6                    ;low: -out0  high: -out2
    punpcklqdq            m6, m3, m2                ;low: -out12 high: -out14
    punpckhqdq            m2, m3                    ;low:  out1  high:  out3

    mova                  m7, [o(pw_m2048)]         ;negated scale for the shared tail
    jmp   m(iadst_4x16_internal_8bpc).end1


; 4x16 entry points whose first transform type is identity, one per second
; transform type (see the INV_TXFM_4X16_FN macro).
INV_TXFM_4X16_FN identity, dct
INV_TXFM_4X16_FN identity, adst
INV_TXFM_4X16_FN identity, flipadst
INV_TXFM_4X16_FN identity, identity

; 16-point identity scaling of one register.
;   %1 = source/destination register number, %2 = temporary register number,
;   %3 = register number holding pw_1697x16,
;   %4 = optional register number holding pw_16384.
; Without %4:  m%1 = 2*m%1 + pmulhrsw(m%1, m%3)
; With %4:     m%1 = m%1 + pmulhrsw(pmulhrsw(m%1, m%3), m%4)
; All additions use signed saturation. Clobbers m%2.
%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16384]
    pmulhrsw            m%2, m%3, m%1
%if %0 == 4 ; if downshifting by 1
    pmulhrsw            m%2, m%4
%else
    paddsw              m%1, m%1
%endif
    paddsw              m%1, m%2
%endmacro

; Internal identity 4x16 kernel.
;   entry/.pass1 : first pass, run twice over four 16-byte coefficient rows
;                  (rows 1,3,5,7 first, then rows 0,2,4,6). The caller's tx2q
;                  is parked in r3 while tx2q serves as the continuation
;                  address after the shared tail in iadst_4x8 (.pass1_end
;                  there is outside this excerpt; presumably it ends with
;                  jmp tx2q).
;   .pass2       : second pass, IDTX16 scaling of all eight registers followed
;                  by a pw_2048 rounding multiply, then the shared store tail
;                  of iadst_4x16 (.end2, outside this excerpt).
1650cglobal iidentity_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
1651    mova                  m0, [coeffq+16*1]
1652    mova                  m6, [o(pw_1697x8)]
1653    mova                  m1, [coeffq+16*3]
1654    mova                  m2, [coeffq+16*5]
1655    mova                  m3, [coeffq+16*7]
1656    pcmpeqw               m7, m7                     ;m7 = all ones (0xFFFF per word)
1657    mov                   r3, tx2q                   ;keep the real pass-2 target
1658    lea                 tx2q, [o(.pass1_2)]
1659.pass1:
    ; For each of m0-m3 (two registers interleaved at a time):
    ;   t = pavgw(pmulhrsw(pw_1697x8, x), x)
    ;   x = (x == 0xFFFF) ? 0 : t
    ; pavgw is an unsigned average, so a lane holding -1 (0xFFFF) would come
    ; out wrong; pcmpeqw against the all-ones m7 plus pandn zeroes those lanes.
    ; NOTE(review): the second trip through here relies on m7 still being all
    ; ones after the iadst_4x8 tail -- depends on code outside this excerpt.
1660    pmulhrsw              m4, m6, m0
1661    pmulhrsw              m5, m6, m1
1662    pavgw                 m4, m0
1663    pcmpeqw               m0, m7
1664    pavgw                 m5, m1
1665    pcmpeqw               m1, m7
1666    pandn                 m0, m4
1667    pmulhrsw              m4, m6, m2
1668    pandn                 m1, m5
1669    pmulhrsw              m5, m6, m3
1670    pavgw                 m4, m2
1671    pcmpeqw               m2, m7
1672    pavgw                 m5, m3
1673    pcmpeqw               m3, m7
1674    pandn                 m2, m4
1675    pandn                 m3, m5
1676    jmp m(iadst_4x8_internal_8bpc).pass1_end
1677.pass1_2:
    ; First trip done: store the processed odd rows back, load the even rows
    ; and run .pass1 once more, this time continuing at .pass1_end.
1678    mova       [coeffq+16*1], m0
1679    mova       [coeffq+16*3], m1
1680    mova       [coeffq+16*5], m2
1681    mova       [coeffq+16*7], m3
1682    mova                  m0, [coeffq+16*0]
1683    mova                  m1, [coeffq+16*2]
1684    mova                  m2, [coeffq+16*4]
1685    mova                  m3, [coeffq+16*6]
1686    lea                 tx2q, [o(.pass1_end)]
1687    jmp .pass1
1688.pass1_end:
    ; Second trip done: m0-m3 hold the second batch, m4-m6 are reloaded from
    ; the first batch (its fourth row stays at [coeffq+16*7]); then continue
    ; at the pass-2 routine saved in r3.
1689    mova                  m4, [coeffq+16*1]
1690    mova                  m5, [coeffq+16*3]
1691    mova                  m6, [coeffq+16*5]
1692    jmp                   r3
1693.pass2:
1694    mova                  m7, [o(pw_1697x16)]
    ; m6 doubles as the IDTX16 temporary, so its data is parked first.
1695    mova       [coeffq+16*6], m6
1696    REPX    {IDTX16 x, 6, 7}, 0, 1, 2, 3, 4, 5
    ; Eighth row: scaled with m7 as both coefficient and temporary, which
    ; destroys the constant in m7.
1697    mova                  m6, [coeffq+16*7]
1698    IDTX16                 6, 7, 7
1699    mova       [coeffq+16*7], m6
    ; Seventh row (parked above): same computation as IDTX16, with the
    ; coefficient taken from memory because m7 no longer holds it.
1700    mova                  m6, [coeffq+16*6]
1701    pmulhrsw              m7, m6, [o(pw_1697x16)]
1702    paddsw                m6, m6
1703    paddsw                m6, m7
1704    mova                  m7, [o(pw_2048)]
1705    REPX    {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
1706    pmulhrsw              m7, [coeffq+16*7]
1707    mova       [coeffq+16*4], m4
1708    jmp m(iadst_4x16_internal_8bpc).end2
1709
1710
; INV_TXFM_16X4_FN type1, type2
; Instantiates the 16x4 entry point for the given transform pair through
; INV_TXFM_FN (defined outside this excerpt). For dct_dct it additionally
; emits a DC-only path: one scaled value is broadcast and added to a block of
; 8-bit pixels, 16 pixels wide and two rows per loop iteration.
; NOTE(review): presumably INV_TXFM_FN falls through to this code only when
; just the DC coefficient is present -- confirm against that macro.
; Registers on the DC path: m0 = DC term, m1/m2 = scale constants, m5 = zero,
; r2d = loop counter (2 iterations, i.e. 4 rows here), tx2q = address jumped
; to once the loop is done. dstq is read and written with mova.
1711%macro INV_TXFM_16X4_FN 2 ; type1, type2
1712    INV_TXFM_FN          %1, %2, 16x4, 8
1713%ifidn %1_%2, dct_dct
1714    movd                 m1, [o(pw_2896x8)]
1715    pmulhrsw             m0, m1, [coeffq]
1716    movd                 m2, [o(pw_16384)]
    ; NOTE(review): storing eobd looks like a way to zero the DC coefficient,
    ; which assumes eob is 0 on this path -- confirm.
1717    mov            [coeffq], eobd
1718    mov                 r2d, 2
1719    lea                tx2q, [o(m(inv_txfm_add_dct_dct_16x4_8bpc).end)]
    ; .dconly expects m0/m1/m2, r2d and tx2q to be set up as above.
1720.dconly:
1721    pmulhrsw             m0, m2
1722    movd                 m2, [o(pw_2048)]              ;intentionally rip-relative
1723    pmulhrsw             m0, m1
1724    pmulhrsw             m0, m2
    ; Broadcast word 0 to all eight word lanes.
1725    pshuflw              m0, m0, q0000
1726    punpcklwd            m0, m0
1727    pxor                 m5, m5
1728.dconly_loop:
    ; Two rows per iteration: widen 16 bytes to words against the zero in m5,
    ; add the DC term, and pack back to bytes with unsigned saturation.
1729    mova                 m1, [dstq]
1730    mova                 m3, [dstq+strideq]
1731    punpckhbw            m2, m1, m5
1732    punpcklbw            m1, m5
1733    punpckhbw            m4, m3, m5
1734    punpcklbw            m3, m5
1735    paddw                m2, m0
1736    paddw                m1, m0
1737    paddw                m4, m0
1738    paddw                m3, m0
1739    packuswb             m1, m2
1740    packuswb             m3, m4
1741    mova             [dstq], m1
1742    mova     [dstq+strideq], m3
1743    lea                dstq, [dstq+strideq*2]
1744    dec                 r2d
1745    jg .dconly_loop
1746    jmp                tx2q
1747.end:
1748    RET
1749%endif
1750%endmacro
1751
; LOAD_7ROWS src, stride
; Loads seven consecutive rows into m0-m6 with mova, row i coming from
; [src + stride*i]. m7 is left untouched.
1752%macro LOAD_7ROWS 2 ;src, stride
1753    mova                 m0, [%1+%2*0]
1754    mova                 m1, [%1+%2*1]
1755    mova                 m2, [%1+%2*2]
1756    mova                 m3, [%1+%2*3]
1757    mova                 m4, [%1+%2*4]
1758    mova                 m5, [%1+%2*5]
1759    mova                 m6, [%1+%2*6]
1760%endmacro
1761
; SAVE_7ROWS dst, stride
; Stores m0-m6 with mova, register i going to [dst + stride*i]. The slot that
; would belong to m7 is not written.
1762%macro SAVE_7ROWS 2 ;dst, stride
1763    mova          [%1+%2*0], m0
1764    mova          [%1+%2*1], m1
1765    mova          [%1+%2*2], m2
1766    mova          [%1+%2*3], m3
1767    mova          [%1+%2*4], m4
1768    mova          [%1+%2*5], m5
1769    mova          [%1+%2*6], m6
1770%endmacro
1771
; IDCT16_1D_PACKED_ODDHALF src1, src2, src3, src4, tmp1, tmp2, tmp3
; Odd half (the t8..t15 terms) of a 16-point inverse DCT on qword-packed
; data. All arguments are register numbers.
; Inputs, going by the unpack comments below (low qword / high qword):
;   src1 = in1 / in3     src2 = in7 / in5
;   src3 = in9 / in11    src4 = in15 / in13
; Results (low / high):
;   src1 = t8a / t9      src4 = t15a / t14
;   src2 = t11 / t10a    src3 = t12 / t13a
; tmp1 and tmp2 are clobbered; tmp3 is loaded with pd_2048 and is passed to
; every ITX_MUL2X_PACK call (that macro is defined outside this excerpt).
1772%macro IDCT16_1D_PACKED_ODDHALF 7  ;src[1-4], tmp[1-3]
1773    punpckhwd            m%5, m%4, m%1                ;packed in13 in3
1774    punpcklwd            m%1, m%4                     ;packed in1  in15
1775    punpcklwd            m%4, m%3, m%2                ;packed in9  in7
1776    punpckhwd            m%2, m%3                     ;packed in5  in11
1777    mova                 m%7, [o(pd_2048)]
1778    ITX_MUL2X_PACK        %1, %6, %7,  401, 4076, 1    ;low: t8a   high: t15a
1779    ITX_MUL2X_PACK        %4, %6, %7, 3166, 2598, 1    ;low: t9a   high: t14a
1780    ITX_MUL2X_PACK        %2, %6, %7, 1931, 3612, 1    ;low: t10a  high: t13a
1781    ITX_MUL2X_PACK        %5, %6, %7, 3920, 1189, 1    ;low: t11a  high: t12a
1782    psubsw               m%6, m%1, m%4                 ;low: t9    high: t14
1783    paddsw               m%1, m%4                      ;low: t8    high: t15
1784    psubsw               m%4, m%5, m%2                 ;low: t10   high: t13
1785    paddsw               m%5, m%2                      ;low: t11   high: t12
    ; src2 is free from here on and holds the deint_shuf2 byte shuffle, which
    ; is applied to each difference register before its rotation.
1786    mova                 m%2, [o(deint_shuf2)]
1787    pshufb               m%6, m%2
1788    pshufb               m%4, m%2
1789    ITX_MUL2X_PACK        %6, %3, %7, 1567, 3784, 1    ;low: t9a   high: t14a
1790    ITX_MUL2X_PACK        %4, %3, %7, m3784, 1567, 1   ;low: t10a  high: t13a
1791    psubsw               m%3, m%1, m%5                 ;low: t11a  high: t12a
1792    paddsw               m%1, m%5                      ;low: t8a   high: t15a
1793    psubsw               m%5, m%6, m%4                 ;low: t10   high: t13
1794    paddsw               m%6, m%4                      ;low: t9    high: t14
1795    pshufb               m%3, m%2
1796    pshufb               m%5, m%2
1797    ITX_MUL2X_PACK        %3, %2, %7, 2896, 2896, 4    ;t12,  t11
1798    ITX_MUL2X_PACK        %5, %4, %7, 2896, 2896, 4    ;t13a, t10a
1799    packssdw             m%2, m%4                      ;low: t11   high: t10a
1800    packssdw             m%3, m%5                      ;low: t12   high: t13a
1801    punpckhqdq           m%4, m%1, m%6                 ;low: t15a  high: t14
1802    punpcklqdq           m%1, m%6                      ;low: t8a   high: t9
1803%endmacro
1804
; 16x4 inverse transforms with dct as type1, one instantiation per type2.
1805INV_TXFM_16X4_FN dct, dct
1806INV_TXFM_16X4_FN dct, adst
1807INV_TXFM_16X4_FN dct, flipadst
1808INV_TXFM_16X4_FN dct, identity
1809
; Internal DCT 16x4 kernel.
;   entry       : first pass. Rows 0-6 are loaded into m0-m6 (row 7 is read
;                 from [coeffq+16*7] inside .main), then .main runs the
;                 16-point DCT.
;   .pass1_end  : interleaves .main's packed outputs.
;   .pass1_end2 : rounding multiply of all eight rows by pw_16384.
;   .pass1_end3 : word interleaves producing rows "0..7(n)" and "8..15(n)";
;                 also the tail the adst and identity kernels jump to.
;                 Leaves the low half in m0-m3, the high half in m4-m6 plus
;                 [coeffq+16*7], and continues at tx2q.
;   .pass2      : second pass for type2 == dct, using the 8x4 kernel twice.
;   .pass2_end  : shared by the other 16x4 kernels, which set tx2q to their
;                 own 8x4 .pass2 routine first.
1810cglobal idct_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
1811    LOAD_7ROWS        coeffq, 16
1812    call .main
1813
1814.pass1_end:
1815    punpckhwd             m7, m0, m2                 ;packed out1,  out5
1816    punpcklwd             m0, m2                     ;packed out0,  out4
1817    punpcklwd             m2, m1, m3                 ;packed out3,  out7
1818    punpckhwd             m1, m3                     ;packed out2,  out6
    ; All eight registers are live: out1/out5 swaps places with the row
    ; .main left at [coeffq+16*7].
1819    mova       [coeffq+16*6], m7
1820    mova                  m7, [coeffq+16*7]
1821    punpckhwd             m3, m4, m6                 ;packed out9,  out13
1822    punpcklwd             m4, m6                     ;packed out8,  out12
1823    punpcklwd             m6, m5, m7                 ;packed out11, out15
1824    punpckhwd             m5, m7                     ;packed out10, out14
1825
1826.pass1_end2:
1827    mova                  m7, [o(pw_16384)]
1828    REPX    {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
    ; Eighth row is scaled straight from memory and written back.
1829    pmulhrsw              m7, [coeffq+16*6]
1830    mova       [coeffq+16*6], m7
1831
1832.pass1_end3:
1833    punpckhwd             m7, m3, m6                 ;packed 9, 11, 13, 15 high
1834    punpcklwd             m3, m6                     ;packed 9, 11, 13, 15 low
1835    punpckhwd             m6, m4, m5                 ;packed 8, 10, 12, 14 high
1836    punpcklwd             m4, m5                     ;packed 8, 10, 12, 14 low
1837    punpckhwd             m5, m4, m3                 ;8, 9, 10, 11, 12, 13, 14, 15(1)
1838    punpcklwd             m4, m3                     ;8, 9, 10, 11, 12, 13, 14, 15(0)
1839    punpckhwd             m3, m6, m7                 ;8, 9, 10, 11, 12, 13, 14, 15(3)
1840    punpcklwd             m6, m7                     ;8, 9, 10, 11, 12, 13, 14, 15(2)
1841    mova       [coeffq+16*7], m3
1842    mova                  m3, [coeffq+16*6]
1843    punpckhwd             m7, m3, m2                 ;packed 1, 3, 5, 7 high
1844    punpcklwd             m3, m2                     ;packed 1, 3, 5, 7 low
1845    punpckhwd             m2, m0, m1                 ;packed 0, 2, 4, 6 high
1846    punpcklwd             m0, m1                     ;packed 0, 2, 4, 6 low
1847    punpckhwd             m1, m0, m3                 ;0, 1, 2, 3, 4, 5, 6, 7(1)
1848    punpcklwd             m0, m3                     ;0, 1, 2, 3, 4, 5, 6, 7(0)
1849    punpckhwd             m3, m2, m7                 ;0, 1, 2, 3, 4, 5, 6, 7(3)
1850    punpcklwd             m2, m7                     ;0, 1, 2, 3, 4, 5, 6, 7(2)
1851    jmp                 tx2q
1852
1853.pass2:
1854    lea                 tx2q, [o(m(idct_8x4_internal_8bpc).pass2)]
1855
1856.pass2_end:
    ; Park the high half next to the row already at [coeffq+16*7], then run
    ; the 8x4 routine on m0-m3 with dstq unchanged. r3 remembers dstq+8 for
    ; the second run.
1857    mova       [coeffq+16*4], m4
1858    mova       [coeffq+16*5], m5
1859    mova       [coeffq+16*6], m6
1860    lea                   r3, [dstq+8]
1861    call                tx2q
1862
    ; Second half: advance coeffq so the parked rows become rows 0-3, move
    ; dstq eight bytes to the right and tail-jump into the same 8x4 routine.
1863    add               coeffq, 16*4
1864    mova                  m0, [coeffq+16*0]
1865    mova                  m1, [coeffq+16*1]
1866    mova                  m2, [coeffq+16*2]
1867    mova                  m3, [coeffq+16*3]
1868    mov                 dstq, r3
1869    jmp                 tx2q
1870
; .main: 16-point inverse DCT on qword-packed rows.
; In : m0-m6 = rows 0-6, row 7 at [coeffq+16*7].
; Out: m0-m6 and [coeffq+16*7] hold the sixteen outputs, packed two per
;      register as the trailing comments below describe
;      (m6 = out12/out13, [coeffq+16*7] = out15/out14).
; [coeffq+16*4..16*6] serve as scratch and are overwritten.
; IDCT8_1D_PACKED is defined outside this excerpt; judging by the spill and
; reload around it, it consumes m0-m3 and m7 is presumably its scratch --
; TODO confirm.
1871ALIGN function_align
1872cglobal_label .main
    ; Split each pair of rows into its low and high qwords. The low-qword
    ; pairs go to m0-m3 for IDCT8_1D_PACKED; the high-qword pairs (labelled
    ; with odd inputs below) are kept for the odd half.
1873    punpckhqdq            m7, m0, m1                 ;low:in1  high:in3
1874    punpcklqdq            m0, m1
1875    punpcklqdq            m1, m2, m3
1876    punpckhqdq            m3, m2                     ;low:in7  high:in5
1877    mova       [coeffq+16*4], m7
1878    mova       [coeffq+16*5], m3
1879    mova                  m7, [coeffq+16*7]
1880    punpcklqdq            m2, m4, m5
1881    punpckhqdq            m4, m5                     ;low:in9  high:in11
1882    punpcklqdq            m3, m6, m7
1883    punpckhqdq            m7, m6                     ;low:in15 high:in13
1884    mova       [coeffq+16*6], m4
1885    IDCT8_1D_PACKED
    ; Swap: reload the odd inputs into m6/m4/m5 and park three of the
    ; IDCT8 results (m1-m3) in the same scratch slots.
1886    mova                  m6, [coeffq+16*4]
1887    mova                  m4, [coeffq+16*5]
1888    mova                  m5, [coeffq+16*6]
1889    mova       [coeffq+16*4], m1
1890    mova       [coeffq+16*5], m2
1891    mova       [coeffq+16*6], m3
1892
1893    IDCT16_1D_PACKED_ODDHALF 6, 4, 5, 7, 1, 2, 3
1894
    ; Final butterflies between the parked IDCT8 results and the odd half.
1895    mova                  m1, [coeffq+16*4]
1896    psubsw                m3, m0, m7                 ;low:out15 high:out14
1897    paddsw                m0, m7                     ;low:out0  high:out1
1898    psubsw                m7, m1, m5                 ;low:out12 high:out13
1899    paddsw                m1, m5                     ;low:out3  high:out2
1900    mova       [coeffq+16*7], m3
1901    mova                  m2, [coeffq+16*5]
1902    mova                  m3, [coeffq+16*6]
1903    psubsw                m5, m2, m4                 ;low:out11 high:out10
1904    paddsw                m2, m4                     ;low:out4  high:out5
1905    psubsw                m4, m3, m6                 ;low:out8  high:out9
1906    paddsw                m3, m6                     ;low:out7  high:out6
1907    mova                  m6, m7
1908    ret
1909
; 16x4 inverse transforms with adst as type1, one instantiation per type2.
1910INV_TXFM_16X4_FN adst, dct
1911INV_TXFM_16X4_FN adst, adst
1912INV_TXFM_16X4_FN adst, flipadst
1913INV_TXFM_16X4_FN adst, identity
1914
; Internal ADST 16x4 kernel.
;   entry      : first pass. Runs .main and .main_pass1_end, then interleaves
;                the packed outputs; several of them are still negated, as
;                the trailing comments show.
;   .pass1_end : scales m0, m1, m4, m5 by m7 and the remaining four rows by
;                -m7, then joins the DCT kernel's interleave tail. The
;                flipadst kernel enters here with m7 = pw_m16384.
;   .pass2     : second pass for type2 == adst, via the DCT kernel's
;                .pass2_end with the 8x4 adst routine.
1915cglobal iadst_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
1916    LOAD_7ROWS        coeffq, 16
1917    call .main
1918    call .main_pass1_end
1919
1920    punpckhwd             m6, m7, m0                 ;packed -out11, -out15
1921    punpcklwd             m0, m7                     ;packed   out0,   out4
1922    punpcklwd             m7, m3, m4                 ;packed  -out3,  -out7
1923    punpckhwd             m4, m3                     ;packed   out8,  out12
    ; The two rows .main spilled are read back; their slots are reused right
    ; away for two of the interleaved rows.
1924    mova                  m1, [coeffq+16*6]
1925    punpcklwd             m3, m1, m5                 ;packed  -out1,  -out5
1926    punpckhwd             m5, m1                     ;packed  out10,  out14
1927    mova                  m1, [coeffq+16*7]
1928    mova       [coeffq+16*6], m3
1929    mova       [coeffq+16*7], m7
1930    punpckhwd             m3, m2, m1                 ;packed  -out9,  -out13
1931    punpcklwd             m1, m2                     ;packed   out2,   out6
1932
1933    mova                  m7, [o(pw_16384)]
1934
1935.pass1_end:
1936    REPX    {pmulhrsw x, m7}, m0, m1, m4, m5
    ; m2 = 0 - m7, the negated scale for the other four rows.
1937    pxor                  m2, m2
1938    psubw                 m2, m7
1939    mova                  m7, [coeffq+16*6]
1940    REPX    {pmulhrsw x, m2}, m7, m3, m6
1941    pmulhrsw              m2, [coeffq+16*7]
    ; Register/memory layout now matches what .pass1_end3 expects.
1942    mova       [coeffq+16*6], m7
1943    jmp   m(idct_16x4_internal_8bpc).pass1_end3
1944
1945.pass2:
1946    lea                 tx2q, [o(m(iadst_8x4_internal_8bpc).pass2)]
1947    jmp   m(idct_16x4_internal_8bpc).pass2_end
1948
; .main: core of the 16-point inverse ADST on qword-packed rows; the last
; stage is left to .main_pass1_end or .main_pass2_end.
; In : m0-m6 = rows 0-6, row 7 at [coeffq+16*7].
; Out: m0 = out0 / -out15          m3 = -out3 / out12
;      [coeffq+16*6] = -out1 / out14
;      [coeffq+16*7] = out2 / -out13
;      m1 = t14a/t15a, m2 = t10/t11, m4 = t2a/t3a, m5 = t6/t7 (unfinished)
;      m6 = pd_2048 (the pass-1 tail relies on this)
; m7 and [coeffq+16*4..16*5] are scratch. ITX_MUL2X_PACK is defined outside
; this excerpt.
; Recurring idiom: "punpcklqdq a, b" followed by "punpckhwd a, b" copies the
; low qword of b into the high qword of a and then interleaves it with the
; high qword of b, pairing word i of b's low half with word i of its high
; half.
1949ALIGN function_align
1950cglobal_label .main
1951    mova       [coeffq+16*6], m0
1952    pshufd                m0, m1, q1032
1953    pshufd                m2, m2, q1032
1954    punpckhwd             m1, m6, m0                 ;packed in13,  in2
1955    punpcklwd             m0, m6                     ;packed  in3, in12
1956    punpckhwd             m7, m5, m2                 ;packed in11,  in4
1957    punpcklwd             m2, m5                     ;packed  in5, in10
1958    mova                  m6, [o(pd_2048)]
1959    ITX_MUL2X_PACK         1, 5, 6,  995, 3973       ;low:t2   high:t3
1960    ITX_MUL2X_PACK         7, 5, 6, 1751, 3703       ;low:t4   high:t5
1961    ITX_MUL2X_PACK         2, 5, 6, 3513, 2106       ;low:t10  high:t11
1962    ITX_MUL2X_PACK         0, 5, 6, 3857, 1380       ;low:t12  high:t13
1963    psubsw                m5, m1, m2                 ;low:t10a high:t11a
1964    paddsw                m1, m2                     ;low:t2a  high:t3a
1965    psubsw                m2, m7, m0                 ;low:t12a high:t13a
1966    paddsw                m7, m0                     ;low:t4a  high:t5a
1967    punpcklqdq            m0, m5
1968    punpckhwd             m0, m5                     ;packed t10a, t11a
1969    punpcklqdq            m5, m2
1970    punpckhwd             m2, m5                     ;packed t13a, t12a
1971    ITX_MUL2X_PACK         0, 5, 6, 3406, 2276       ;low:t10  high:t11
1972    ITX_MUL2X_PACK         2, 5, 6, 4017,  799, 1    ;low:t12  high:t13
    ; Park t2a/t3a and t4a/t5a, then fetch row 0 (saved at entry) and row 7.
1973    mova       [coeffq+16*4], m1
1974    mova       [coeffq+16*5], m7
1975    mova                  m1, [coeffq+16*6]
1976    mova                  m7, [coeffq+16*7]
1977    pshufd                m1, m1, q1032
1978    pshufd                m3, m3, q1032
1979    punpckhwd             m5, m7, m1                 ;packed in15,  in0
1980    punpcklwd             m1, m7                     ;packed  in1, in14
1981    punpckhwd             m7, m4, m3                 ;packed  in9,  in6
1982    punpcklwd             m3, m4                     ;packed  in7,  in8
1983    ITX_MUL2X_PACK         5, 4, 6,  201, 4091       ;low:t0    high:t1
1984    ITX_MUL2X_PACK         7, 4, 6, 2440, 3290       ;low:t6    high:t7
1985    ITX_MUL2X_PACK         3, 4, 6, 3035, 2751       ;low:t8    high:t9
1986    ITX_MUL2X_PACK         1, 4, 6, 4052,  601       ;low:t14   high:t15
1987    psubsw                m4, m5, m3                 ;low:t8a   high:t9a
1988    paddsw                m5, m3                     ;low:t0a   high:t1a
1989    psubsw                m3, m7, m1                 ;low:t14a  high:t15a
1990    paddsw                m7, m1                     ;low:t6a   high:t7a
1991    punpcklqdq            m1, m4
1992    punpckhwd             m1, m4                     ;packed  t8a,  t9a
1993    punpcklqdq            m4, m3
1994    punpckhwd             m3, m4                     ;packed t15a, t14a
1995    ITX_MUL2X_PACK         1, 4, 6,  799, 4017       ;low:t8    high:t9
1996    ITX_MUL2X_PACK         3, 4, 6, 2276, 3406, 1    ;low:t14   high:t15
    ; NOTE(review): the labels on the next two lines look swapped. The
    ; difference left in m1 is what gets packed as "t12a, t13a" a few lines
    ; further down, while the sum in m4 later feeds the out1/t10 butterfly
    ; together with t10a. Code unchanged; comment naming to be confirmed.
1997    paddsw                m4, m1, m2                 ;low:t12a  high:t13a
1998    psubsw                m1, m2                     ;low:t8a   high:t9a
1999    psubsw                m2, m0, m3                 ;low:t14a  high:t15a
2000    paddsw                m0, m3                     ;low:t10a  high:t11a
2001    punpcklqdq            m3, m1
2002    punpckhwd             m3, m1                     ;packed t12a, t13a
2003    punpcklqdq            m1, m2
2004    punpckhwd             m2, m1                     ;packed t15a, t14a
2005    ITX_MUL2X_PACK         3, 1, 6, 1567, 3784       ;low:t12   high:t13
2006    ITX_MUL2X_PACK         2, 1, 6, 3784, 1567, 1    ;low:t14   high:t15
2007    psubsw                m1, m3, m2                 ;low:t14a  high:t15a
2008    paddsw                m3, m2                     ;low:out2  high:-out13
2009    psubsw                m2, m4, m0                 ;low:t10   high:t11
2010    paddsw                m0, m4                     ;low:-out1 high:out14
    ; Two finished rows are spilled so m0/m3 can take the parked t2a..t5a.
2011    mova       [coeffq+16*6], m0
2012    mova       [coeffq+16*7], m3
2013    mova                  m0, [coeffq+16*4]
2014    mova                  m3, [coeffq+16*5]
2015    psubsw                m4, m5, m3                 ;low:t4    high:t5
2016    paddsw                m5, m3                     ;low:t0    high:t1
2017    psubsw                m3, m0, m7                 ;low:t6    high:t7
2018    paddsw                m0, m7                     ;low:t2    high:t3
2019    punpcklqdq            m7, m4
2020    punpckhwd             m7, m4                     ;packed t4, t5
2021    punpcklqdq            m4, m3
2022    punpckhwd             m3, m4                     ;packed t7, t6
2023    ITX_MUL2X_PACK         7, 4, 6, 1567, 3784       ;low:t4a   high:t5a
2024    ITX_MUL2X_PACK         3, 4, 6, 3784, 1567, 1    ;low:t6a   high:t7a
2025    psubsw                m4, m5, m0                 ;low:t2a   high:t3a
2026    paddsw                m0, m5                     ;low:out0  high:-out15
2027    psubsw                m5, m7, m3                 ;low:t6    high:t7
2028    paddsw                m3, m7                     ;low:-out3 high:out12
2029    ret
; .main_pass1_end: last ADST stage for the first pass, done in 32-bit
; precision. Each of m1, m2, m4, m5 (as left by .main) is word-shuffled with
; deint_shuf1 and multiplied with pmaddwd against pw_2896_2896 and
; pw_2896_m2896; m6 is added to the dword products before an arithmetic
; shift right by 12, and the results are packed back to words.
; In : m1, m2, m4, m5 from .main; m6 must still hold the rounding constant
;      .main loaded (pd_2048).
; Out: m2 = out6 / -out9     m4 = -out7 / out8
;      m7 = out4 / -out11    m5 = -out5 / out10
; m0 and m3 are used for the two constants; their contents are saved to
; [coeffq+16*4] / [coeffq+16*5] and restored before returning.
2030ALIGN function_align
2031.main_pass1_end:
2032    mova                  m7, [o(deint_shuf1)]
2033    mova       [coeffq+16*4], m0
2034    mova       [coeffq+16*5], m3
2035    mova                  m0, [o(pw_2896_m2896)]
2036    mova                  m3, [o(pw_2896_2896)]
2037    pshufb                m1, m7                     ;t14a t15a
2038    pshufb                m2, m7                     ;t10  t11
2039    pshufb                m4, m7                     ;t2a  t3a
2040    pshufb                m5, m7                     ;t6   t7
2041    pmaddwd               m7, m0, m2
2042    pmaddwd               m2, m3
2043    paddd                 m7, m6
2044    paddd                 m2, m6
2045    psrad                 m7, 12
2046    psrad                 m2, 12
2047    packssdw              m2, m7                     ;low:out6  high:-out9
2048    pmaddwd               m7, m0, m4
2049    pmaddwd               m4, m3
2050    paddd                 m7, m6
2051    paddd                 m4, m6
2052    psrad                 m7, 12
2053    psrad                 m4, 12
2054    packssdw              m4, m7                     ;low:-out7 high:out8
2055    pmaddwd               m7, m3, m5
2056    pmaddwd               m5, m0
2057    paddd                 m7, m6
2058    paddd                 m5, m6
2059    psrad                 m7, 12
2060    psrad                 m5, 12
2061    packssdw              m7, m5                     ;low:out4  high:-out11
2062    pmaddwd               m5, m3, m1
2063    pmaddwd               m1, m0
2064    paddd                 m5, m6
2065    paddd                 m1, m6
2066    psrad                 m5, 12
2067    psrad                 m1, 12
2068    packssdw              m5, m1                     ;low:-out5 high:out10
2069    mova                  m0, [coeffq+16*4]
2070    mova                  m3, [coeffq+16*5]
2071    ret
; .main_pass2_end: last ADST stage for the second pass, done in 16-bit
; precision: saturating add/sub of the qword-regrouped terms followed by a
; rounding multiply with pw_2896x8.
; In : m1, m2, m4, m5 as left by .main.
; Out: m7 = out4 / -out11    m4 = -out7 / out8
;      m5 = -out5 / out10    m2 = out6 / -out9
; m1 and m6 are clobbered; m0 and m3 are not touched.
2072ALIGN function_align
2073cglobal_label .main_pass2_end
2074    mova                  m7, [o(pw_2896x8)]
2075    punpckhqdq            m6, m2, m1                 ;low:t11   high:t15a
2076    punpcklqdq            m2, m1                     ;low:t10   high:t14a
2077    psubsw                m1, m2, m6
2078    paddsw                m2, m6
2079    punpckhqdq            m6, m4, m5                 ;low:t3a   high:t7
2080    punpcklqdq            m4, m5                     ;low:t2a   high:t6
2081    psubsw                m5, m4, m6
2082    paddsw                m4, m6
2083    pmulhrsw              m1, m7                     ;low:-out9 high:out10
2084    pmulhrsw              m2, m7                     ;low:out6  high:-out5
2085    pmulhrsw              m5, m7                     ;low:out8  high:-out11
2086    pmulhrsw              m4, m7                     ;low:-out7 high:out4
    ; Regroup the qwords into the pairing listed in the header.
2087    punpckhqdq            m7, m4, m5                 ;low:out4  high:-out11
2088    punpcklqdq            m4, m5                     ;low:-out7 high:out8
2089    punpckhqdq            m5, m2, m1                 ;low:-out5 high:out10
2090    punpcklqdq            m2, m1                     ;low:out6  high:-out9
2091    ret
2092
2093
; 16x4 inverse transforms with flipadst as type1, one instantiation per type2.
2094INV_TXFM_16X4_FN flipadst, dct
2095INV_TXFM_16X4_FN flipadst, adst
2096INV_TXFM_16X4_FN flipadst, flipadst
2097INV_TXFM_16X4_FN flipadst, identity
2098
; Internal flipadst 16x4 kernel.
;   entry  : first pass. Same ADST core as the adst kernel, but the interleaves
;            use the opposite low/high halves, so the packed outputs come out
;            in reversed order (see the trailing comments). It then enters the
;            adst kernel's scaling tail with the negated constant pw_m16384.
;   .pass2 : second pass for type2 == flipadst, via the DCT kernel's
;            .pass2_end with the 8x4 flipadst routine.
2099cglobal iflipadst_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
2100    LOAD_7ROWS        coeffq, 16
2101    call m(iadst_16x4_internal_8bpc).main
2102    call m(iadst_16x4_internal_8bpc).main_pass1_end
2103
2104    punpcklwd             m6, m7, m0                 ;packed  out11,  out15
2105    punpckhwd             m0, m7                     ;packed  -out0,  -out4
2106    punpckhwd             m7, m3, m4                 ;packed   out3,   out7
2107    punpcklwd             m4, m3                     ;packed  -out8, -out12
    ; Read back the two rows .main spilled; their slots are reused for two
    ; of the interleaved rows.
2108    mova                  m1, [coeffq+16*6]
2109    punpckhwd             m3, m1, m5                 ;packed   out1,   out5
2110    punpcklwd             m5, m1                     ;packed -out10, -out14
2111    mova                  m1, [coeffq+16*7]
2112    mova       [coeffq+16*6], m3
2113    mova       [coeffq+16*7], m7
2114    punpcklwd             m3, m2, m1                 ;packed   out9,  out13
2115    punpckhwd             m1, m2                     ;packed  -out2,  -out6
2116
    ; With the negated constant, the shared tail scales m0, m1, m4, m5 by
    ; pw_m16384 and the other four rows by its negation.
2117    mova                  m7, [o(pw_m16384)]
2118    jmp   m(iadst_16x4_internal_8bpc).pass1_end
2119
2120.pass2:
2121    lea                 tx2q, [o(m(iflipadst_8x4_internal_8bpc).pass2)]
2122    jmp   m(idct_16x4_internal_8bpc).pass2_end
2123
2124
; 16x4 inverse transforms with identity as type1, one instantiation per type2.
2125INV_TXFM_16X4_FN identity, dct
2126INV_TXFM_16X4_FN identity, adst
2127INV_TXFM_16X4_FN identity, flipadst
2128INV_TXFM_16X4_FN identity, identity
2129
; Internal identity 16x4 kernel.
;   entry  : first pass. Every one of the eight coefficient rows is replaced by
;              x + pmulhrsw(pmulhrsw(x, pw_1697x16), pw_16384)
;            using saturating adds (the same arithmetic as the 4-argument
;            form of IDTX16, written out and interleaved by hand). The rows
;            are handled in three batches because only eight registers are
;            available; rows 5-7 are written back to the coefficient buffer
;            in between. The result is interleaved and handed to the DCT
;            kernel's .pass1_end3.
;   .pass2 : second pass for type2 == identity, via the DCT kernel's
;            .pass2_end with the 8x4 identity routine.
2130cglobal iidentity_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    ; Batch 1: rows 6, 5, 7 in m1, m0, m2.
2131    mova                  m1, [coeffq+16*6]
2132    mova                  m0, [coeffq+16*5]
2133    mova                  m2, [coeffq+16*7]
2134    mova                  m6, [o(pw_1697x16)]
2135    mova                  m7, [o(pw_16384)]
2136    pmulhrsw              m4, m6, m1
2137    pmulhrsw              m3, m6, m0
2138    pmulhrsw              m5, m6, m2
2139    pmulhrsw              m4, m7
2140    pmulhrsw              m3, m7
2141    pmulhrsw              m5, m7
2142    paddsw                m1, m4
2143    paddsw                m0, m3
2144    paddsw                m5, m2
    ; Batch 2: rows 2, 3, 4 in m2, m3, m4; batch 1 results are stored while
    ; the loads are in flight.
2145    mova                  m2, [coeffq+16*2]
2146    mova                  m3, [coeffq+16*3]
2147    mova                  m4, [coeffq+16*4]
2148    mova       [coeffq+16*6], m1
2149    mova       [coeffq+16*5], m0
2150    mova       [coeffq+16*7], m5
2151    pmulhrsw              m0, m6, m2
2152    pmulhrsw              m1, m6, m3
2153    pmulhrsw              m5, m6, m4
2154    pmulhrsw              m0, m7
2155    pmulhrsw              m1, m7
2156    pmulhrsw              m5, m7
2157    paddsw                m2, m0
2158    paddsw                m3, m1
2159    paddsw                m4, m5
    ; Batch 3: rows 0 and 1; m6 (the constant) is consumed as the temporary
    ; for row 1.
2160    mova                  m0, [coeffq+16*0]
2161    mova                  m1, [coeffq+16*1]
2162    pmulhrsw              m5, m6, m0
2163    pmulhrsw              m6, m1
2164    pmulhrsw              m5, m7
2165    pmulhrsw              m6, m7
2166    paddsw                m0, m5
2167    paddsw                m1, m6
    ; m0-m6 now hold rows 0-6 in order; row 7 stays in memory.
2168    mova                  m6, [coeffq+16*6]
2169    mova                  m5, [coeffq+16*5]
2170    punpckhwd             m7, m0, m2                 ;packed out1,  out5
2171    punpcklwd             m0, m2                     ;packed out0,  out4
2172    punpckhwd             m2, m1, m3                 ;packed out3,  out7
2173    punpcklwd             m1, m3                     ;packed out2,  out6
2174    mova       [coeffq+16*6], m7
2175    mova                  m7, [coeffq+16*7]
2176    punpckhwd             m3, m4, m6                 ;packed out9,  out13
2177    punpcklwd             m4, m6                     ;packed out8,  out12
2178    punpckhwd             m6, m5, m7                 ;packed out11, out15
2179    punpcklwd             m5, m7                     ;packed out10, out14
    ; The downshift was already applied above, so the pw_16384 multiply at
    ; .pass1_end2 is skipped.
2180    jmp   m(idct_16x4_internal_8bpc).pass1_end3
2181
2182.pass2:
2183    lea                 tx2q, [o(m(iidentity_8x4_internal_8bpc).pass2)]
2184    jmp   m(idct_16x4_internal_8bpc).pass2_end
2185
2186
; SAVE_8ROWS dst, stride
; Stores m0-m7 with mova, register i going to [dst + stride*i].
2187%macro SAVE_8ROWS 2  ;dst, stride
2188    mova                 [%1+%2*0], m0
2189    mova                 [%1+%2*1], m1
2190    mova                 [%1+%2*2], m2
2191    mova                 [%1+%2*3], m3
2192    mova                 [%1+%2*4], m4
2193    mova                 [%1+%2*5], m5
2194    mova                 [%1+%2*6], m6
2195    mova                 [%1+%2*7], m7
2196%endmacro
2197
; INV_TXFM_8X16_FN type1, type2
; Instantiates the 8x16 entry point for the given transform pair through
; INV_TXFM_FN (defined outside this excerpt), passing 16*16 as an extra
; argument -- presumably the stack scratch size that the 8x16 kernels address
; through rsp+gprsize; TODO confirm against INV_TXFM_FN.
; For dct_dct it also emits a DC-only path: word 0 of the coefficients is
; broadcast, multiplied with pmulhrsw by pw_2896x8, pw_2896x8, pw_16384,
; pw_2896x8 and 2048 (in that order), and handed to the loop of the 8x8
; dct_dct entry point with r3d = 4 and tx2q = .end.
; NOTE(review): r3d looks like that loop's iteration count -- the loop body
; is outside this excerpt.
2198%macro INV_TXFM_8X16_FN 2 ; type1, type2
2199    INV_TXFM_FN          %1, %2, 8x16, 8, 16*16
2200%ifidn %1_%2, dct_dct
2201    pshuflw              m0, [coeffq], q0000
2202    punpcklwd            m0, m0
2203    mova                 m1, [o(pw_2896x8)]
2204    pmulhrsw             m0, m1
2205    mova                 m2, [o(pw_16384)]
    ; NOTE(review): storing eobd looks like a way to zero the DC coefficient,
    ; which assumes eob is 0 on this path -- confirm.
2206    mov            [coeffq], eobd
2207    pmulhrsw             m0, m1
2208    pmulhrsw             m0, m2
2209    psrlw                m2, 3              ; pw_2048
2210    pmulhrsw             m0, m1
2211    pmulhrsw             m0, m2
2212    mov                 r3d, 4
2213    lea                tx2q, [o(m(inv_txfm_add_dct_dct_8x16_8bpc).end)]
2214    jmp m(inv_txfm_add_dct_dct_8x8_8bpc).loop
2215.end:
2216    RET
2217%endif
2218%endmacro
2219
; 8x16 inverse transforms with dct as type1, one instantiation per type2.
2220INV_TXFM_8X16_FN dct, dct
2221INV_TXFM_8X16_FN dct, adst
2222INV_TXFM_8X16_FN dct, flipadst
2223INV_TXFM_8X16_FN dct, identity
2224
; Internal DCT 8x16 kernel.
;   entry       : points r3 at the 8x8 DCT .pass1 routine.
;   .pass1      : generic first-pass driver, also entered by the adst and
;                 flipadst 8x16 kernels with their own r3. It runs the 8x8
;                 routine in r3 twice: first on the eight rows at
;                 coeffq+16*1 (stride 32), then on those at coeffq+16*0.
;                 tx2q is saved on the stack meanwhile and temporarily
;                 replaced by .pass1_end; this assumes the 8x8 routine leaves
;                 through jmp tx2q (its code is outside this excerpt).
;   .pass2      : second pass for type2 == dct. .pass2_pre rearranges the
;                 registers, .pass2_main runs the 8-point and 16x8 DCT cores.
;   .end/.end1  : continuation points around the 8x8 store routine
;                 (idct_8x8 .end), which is used twice; .end1 finally zeroes
;                 the 16 coefficient rows.
; LOAD_8ROWS is defined outside this excerpt; the meaning of its third
; argument is not visible here.
2225cglobal idct_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
2226    lea                    r3, [o(m(idct_8x8_internal_8bpc).pass1)]
2227
2228.pass1:
2229    LOAD_8ROWS    coeffq+16*1, 32, 1
2230    mov   [rsp+gprsize+16*11], tx2q
2231    lea                  tx2q, [o(m(idct_8x16_internal_8bpc).pass1_end)]
2232    jmp                    r3
2233
2234.pass1_end:
    ; First 8x8 batch done: store it over its source rows, load the other
    ; eight rows, restore the real tx2q and run the 8x8 routine again.
2235    SAVE_8ROWS    coeffq+16*1, 32
2236    LOAD_8ROWS    coeffq+16*0, 32, 1
2237    mov                  tx2q, [rsp+gprsize+16*11]
2238    jmp                    r3
2239
2240.pass2:
2241    lea                  tx2q, [o(m(idct_8x16_internal_8bpc).end)]
2242
2243.pass2_pre:
    ; m1, m3, m5, m7 are parked in the coefficient buffer; m0, m2, m4, m6 are
    ; compacted into m0-m3 and m4-m7 are filled from rows 1, 5, 9, 13.
2244    mova       [coeffq+16*2 ], m1
2245    mova       [coeffq+16*6 ], m3
2246    mova       [coeffq+16*10], m5
2247    mova       [coeffq+16*14], m7
2248    mova                   m1, m2
2249    mova                   m2, m4
2250    mova                   m3, m6
2251    mova                   m4, [coeffq+16*1 ]
2252    mova                   m5, [coeffq+16*5 ]
2253    mova                   m6, [coeffq+16*9 ]
2254    mova                   m7, [coeffq+16*13]
2255
2256.pass2_main:
2257    call m(idct_8x8_internal_8bpc).main
2258
    ; Keep m0-m6 of the result on the stack (m7 is not stored by this
    ; macro), then feed the parked rows plus rows 3, 7, 11, 15 to the 16x8
    ; DCT core.
2259    SAVE_7ROWS   rsp+gprsize+16*3, 16
2260    mova                   m0, [coeffq+16*2 ]
2261    mova                   m1, [coeffq+16*6 ]
2262    mova                   m2, [coeffq+16*10]
2263    mova                   m3, [coeffq+16*14]
2264    mova                   m4, [coeffq+16*3 ]
2265    mova                   m5, [coeffq+16*7 ]
2266    mova                   m6, [coeffq+16*11]
2267    mova                   m7, [coeffq+16*15]
2268    call m(idct_16x8_internal_8bpc).main
2269
    ; r3 keeps the original dstq; the registers are first written eight rows
    ; further down.
2270    mov                    r3, dstq
2271    lea                  dstq, [dstq+strideq*8]
2272    jmp  m(idct_8x8_internal_8bpc).end
2273
2274.end:
    ; Reload the eight rows kept on the stack, restore dstq and run the 8x8
    ; store routine a second time, continuing at .end1 afterwards.
2275    LOAD_8ROWS   rsp+gprsize+16*3, 16
2276    mova   [rsp+gprsize+16*0], m7
2277    lea                  tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
2278    mov                  dstq, r3
2279    jmp  m(idct_8x8_internal_8bpc).end
2280
2281.end1:
    ; Clear all sixteen 16-byte coefficient rows.
2282    pxor                   m7, m7
2283    REPX  {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15
2284    ret
2285
; 8x16 inverse transforms with adst as type1, one instantiation per type2.
2286INV_TXFM_8X16_FN adst, dct
2287INV_TXFM_8X16_FN adst, adst
2288INV_TXFM_8X16_FN adst, flipadst
2289INV_TXFM_8X16_FN adst, identity
2290
; Internal ADST 8x16 kernel.
;   entry  : first pass through the shared driver in idct_8x16 (.pass1), with
;            r3 pointing at the 8x8 adst .pass1 routine.
;   .pass2 : second pass for type2 == adst. Inputs are spread over m0-m7 and
;            stack slots rsp+gprsize+16*3..16*10 before calling the 16x8 ADST
;            core; the slot assignment presumably matches what
;            iadst_16x8 .main expects (that routine is outside this excerpt).
;   .end   : continuation after the first run of the 8x8 adst store routine.
2291cglobal iadst_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
2292    lea                    r3, [o(m(iadst_8x8_internal_8bpc).pass1)]
2293    jmp  m(idct_8x16_internal_8bpc).pass1
2294
2295.pass2:
2296    lea                  tx2q, [o(m(iadst_8x16_internal_8bpc).end)]
2297
2298.pass2_pre:
    ; m0, m1, m6, m7 go to the stack; m2-m5 slide down into m0-m3.
2299    mova    [rsp+gprsize+16*7], m0
2300    mova    [rsp+gprsize+16*8], m1
2301    mova    [rsp+gprsize+16*5], m6
2302    mova    [rsp+gprsize+16*6], m7
2303    mova                    m0, m2
2304    mova                    m1, m3
2305    mova                    m2, m4
2306    mova                    m3, m5
2307
2308.pass2_main:
    ; Coefficient rows 1, 3, 13, 15 are copied to the stack (32*5 is the same
    ; offset as 16*10); rows 5, 7, 9, 11 stay in m4-m7.
2309    mova                    m4, [coeffq+16*1 ]
2310    mova                    m5, [coeffq+16*3 ]
2311    mova                    m6, [coeffq+16*13]
2312    mova                    m7, [coeffq+16*15]
2313    mova    [rsp+gprsize+16*3], m4
2314    mova    [rsp+gprsize+16*4], m5
2315    mova    [rsp+gprsize+16*9], m6
2316    mova    [rsp+gprsize+32*5], m7
2317    mova                    m4, [coeffq+16*5 ]
2318    mova                    m5, [coeffq+16*7 ]
2319    mova                    m6, [coeffq+16*9 ]
2320    mova                    m7, [coeffq+16*11]
2321
2322    call m(iadst_16x8_internal_8bpc).main
2323    call m(iadst_16x8_internal_8bpc).main_pass2_end
2324
    ; r3 keeps the original dstq; the registers are first written eight rows
    ; further down.
2325    mov                    r3, dstq
2326    lea                  dstq, [dstq+strideq*8]
2327    jmp m(iadst_8x8_internal_8bpc).end
2328
2329.end:
    ; Reload eight rows from the stack, restore dstq and run the store
    ; routine again; it then continues at the DCT kernel's .end1, which
    ; clears the coefficient buffer.
2330    LOAD_8ROWS   rsp+gprsize+16*3, 16
2331    mova   [rsp+gprsize+16*0], m7
2332    lea                  tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
2333    mov                  dstq, r3
2334    jmp  m(iadst_8x8_internal_8bpc).end
2335
2336
; 8x16 inverse transforms with flipadst as type1, one instantiation per type2.
2337INV_TXFM_8X16_FN flipadst, dct
2338INV_TXFM_8X16_FN flipadst, adst
2339INV_TXFM_8X16_FN flipadst, flipadst
2340INV_TXFM_8X16_FN flipadst, identity
2341
; Internal flipadst 8x16 kernel.
;   entry  : first pass through the shared driver in idct_8x16 (.pass1), with
;            r3 pointing at the 8x8 flipadst .pass1 routine.
;   .pass2 : second pass for type2 == flipadst. Same register/stack setup and
;            ADST core as the adst 8x16 kernel, but the destination order is
;            mirrored: the first run of the store routine uses the original
;            dstq, the second one (from .end) uses dstq + 8*stride, which is
;            computed up front into r3.
2342cglobal iflipadst_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
2343    lea                    r3, [o(m(iflipadst_8x8_internal_8bpc).pass1)]
2344    jmp  m(idct_8x16_internal_8bpc).pass1
2345
2346.pass2:
2347    lea                   tx2q, [o(m(iflipadst_8x16_internal_8bpc).end)]
2348    lea                     r3, [dstq+strideq*8]
2349
2350.pass2_pre:
    ; m0, m1, m6, m7 go to the stack; m2-m5 slide down into m0-m3.
2351    mova    [rsp+gprsize+16*7], m0
2352    mova    [rsp+gprsize+16*8], m1
2353    mova    [rsp+gprsize+16*5], m6
2354    mova    [rsp+gprsize+16*6], m7
2355    mova                    m0, m2
2356    mova                    m1, m3
2357    mova                    m2, m4
2358    mova                    m3, m5
2359
2360.pass2_main:
    ; Coefficient rows 1, 3, 13, 15 are copied to the stack (32*5 is the same
    ; offset as 16*10); rows 5, 7, 9, 11 stay in m4-m7.
2361    mova                    m4, [coeffq+16*1 ]
2362    mova                    m5, [coeffq+16*3 ]
2363    mova                    m6, [coeffq+16*13]
2364    mova                    m7, [coeffq+16*15]
2365    mova    [rsp+gprsize+16*3], m4
2366    mova    [rsp+gprsize+16*4], m5
2367    mova    [rsp+gprsize+16*9], m6
2368    mova    [rsp+gprsize+32*5], m7
2369    mova                    m4, [coeffq+16*5 ]
2370    mova                    m5, [coeffq+16*7 ]
2371    mova                    m6, [coeffq+16*9 ]
2372    mova                    m7, [coeffq+16*11]
2373
2374    call m(iadst_16x8_internal_8bpc).main
2375    call m(iadst_16x8_internal_8bpc).main_pass2_end
2376    jmp  m(iflipadst_8x8_internal_8bpc).end
2377
2378.end:
    ; Reload eight rows from the stack, switch dstq to the address saved in
    ; r3 and run the store routine again; it then continues at the DCT
    ; kernel's .end1, which clears the coefficient buffer.
2379    LOAD_8ROWS    rsp+gprsize+16*3, 16
2380    mova    [rsp+gprsize+16*0], m7
2381    lea                   tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
2382    mov                   dstq, r3
2383    jmp  m(iflipadst_8x8_internal_8bpc).end
2384
2385
; Instantiate the 8x16 entry points whose first transform type is identity
; (INV_TXFM_8X16_FN is defined earlier in the file, outside this chunk).
INV_TXFM_8X16_FN identity, dct
INV_TXFM_8X16_FN identity, adst
INV_TXFM_8X16_FN identity, flipadst
INV_TXFM_8X16_FN identity, identity

; 8x16 identity transform, 8 bpc.
; Pass 1 sends each 8-row group (odd 16-byte rows first, then even rows)
; straight to idct_8x8's .pass1_end3 with no arithmetic of its own beyond what
; LOAD_8ROWS' third argument requests (macro defined outside this chunk).
; Pass 2 (.end) applies IDTX16 with pw_1697x16 to all eight rows, multiplies by
; pw_2048 with pmulhrsw and jumps to idct_8x8's .end3; .end1 repeats this for
; the second group of rows.
; "Slot N" in the comments below means [rsp+gprsize+16*N].
cglobal iidentity_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    LOAD_8ROWS    coeffq+16*1, 32, 1
    ; Park the caller's tx2q in r3 while tx2q points at .pass1_end.
    mov                    r3, tx2q
    lea                  tx2q, [o(m(iidentity_8x16_internal_8bpc).pass1_end)]
    ; .pass1_end3 is entered with m6 also stored in slot 1.
    mova   [rsp+gprsize+16*1], m6
    jmp  m(idct_8x8_internal_8bpc).pass1_end3

.pass1_end:
    ; Store the first group back to the odd rows, load the even rows and run
    ; the same path again, this time with the caller's tx2q restored.
    SAVE_8ROWS    coeffq+16*1, 32
    LOAD_8ROWS    coeffq+16*0, 32, 1
    mov                  tx2q, r3
    mova   [rsp+gprsize+16*1], m6
    jmp  m(idct_8x8_internal_8bpc).pass1_end3

.pass2:
    lea                  tx2q, [o(m(iidentity_8x16_internal_8bpc).end1)]

.end:
    ; m6 and m7 are spilled so they can serve as IDTX16's temporary and
    ; constant registers for rows 0..5 (argument roles inferred from the call
    ; pattern; the macro itself is outside this chunk).
    mova   [rsp+gprsize+16*0], m7
    mova   [rsp+gprsize+16*1], m6
    mova                   m7, [o(pw_1697x16)]
    REPX     {IDTX16 x, 6, 7}, 0, 1, 2, 3, 4, 5
    ; Row 6 comes back into m6 and is processed with m5 as temporary, so row 5
    ; is parked in slot 2 first.
    mova                   m6, [rsp+gprsize+16*1]
    mova   [rsp+gprsize+16*2], m5
    IDTX16                  6, 5, 7
    ; Row 7 (saved in slot 0) is processed in m5; m7 doubles as temporary and
    ; constant in this last call.
    mova                   m5, [rsp+gprsize+16*0]
    IDTX16                  5, 7, 7
    mova                   m7, [o(pw_2048)]
    REPX     {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
    ; m7 = pw_2048 * row 5 (from slot 2). From here on m5 holds row 7 and m7
    ; holds row 5.
    pmulhrsw               m7, [rsp+gprsize+16*2]
    ; Slots 0, 1, 2 = rows 7, 6, 5 for idct_8x8's .end3.
    mova   [rsp+gprsize+16*0], m5
    mova   [rsp+gprsize+16*1], m6
    mova   [rsp+gprsize+16*2], m7
    jmp  m(idct_8x8_internal_8bpc).end3

.end1:
    ; Second group: rows stored at coeffq+16*1 (stride 32). dstq is moved two
    ; rows further down before re-entering .end; the next continuation is
    ; idct_8x16's .end1.
    LOAD_8ROWS    coeffq+16*1, 32
    lea                  tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
    lea                  dstq, [dstq+strideq*2]
    jmp .end
2431
2432
; INV_TXFM_16X8_FN type1, type2
; Wraps INV_TXFM_FN (defined outside this chunk) with the 16x8 parameters
; "16x8, 8, 16*16". For the dct/dct combination only, it additionally emits a
; code sequence that ends in a jump to the 16x4 dct_dct function's .dconly:
;   m0   = pmulhrsw(pmulhrsw([coeffq], pw_2896x8), pw_2896x8)
;   m2   = pw_16384 (low dword, via movd)
;   r2d  = 4
;   tx2q = address of the local .end label, which is a plain RET
; The store of eobd to [coeffq] presumably clears the coefficient (i.e. eob is
; expected to be 0 on this path) -- TODO confirm against INV_TXFM_FN.
%macro INV_TXFM_16X8_FN 2 ; type1, type2
    INV_TXFM_FN          %1, %2, 16x8, 8, 16*16
%ifidn %1_%2, dct_dct
    movd                 m1, [o(pw_2896x8)]
    pmulhrsw             m0, m1, [coeffq]
    movd                 m2, [o(pw_16384)]
    mov            [coeffq], eobd
    pmulhrsw             m0, m1
    mov                 r2d, 4
    lea                tx2q, [o(m(inv_txfm_add_dct_dct_16x8_8bpc).end)]
    jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
.end:
    RET
%endif
%endmacro
2448
; Instantiate the 16x8 entry points whose first transform type is dct.
INV_TXFM_16X8_FN dct, dct
INV_TXFM_16X8_FN dct, adst
INV_TXFM_16X8_FN dct, flipadst
INV_TXFM_16X8_FN dct, identity

; 16x8 inverse DCT, 8 bpc.
; Pass 1 is a 16-point IDCT: the even 16-byte rows go through idct_8x8's
; .main, the odd rows through .main below, and both resulting 8-row halves are
; sent through idct_8x8's .pass1_end. Pass 2 runs idct_8x8's .pass2_main once
; per 8-column half of the destination.
; "Slot N" in the comments below means [rsp+gprsize+16*N].
cglobal idct_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    ; Even rows -> 8-point IDCT; seven of its result rows are parked in
    ; slots 3..9.
    LOAD_8ROWS    coeffq+16*0, 32, 1
    call m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS   rsp+gprsize+16*3, 16

    ; Odd rows -> odd half plus final butterflies (see .main below).
    LOAD_8ROWS    coeffq+16*1, 32, 1
    call  .main
    ; Park the caller's tx2q in r3 while tx2q points at .pass1_end.
    mov                    r3, tx2q
    lea                  tx2q, [o(m(idct_16x8_internal_8bpc).pass1_end)]
    jmp  m(idct_8x8_internal_8bpc).pass1_end

.pass1_end:
    ; Store the rows currently in registers to the odd rows of the
    ; coefficient buffer, then reload the half that .main left in slots 3..
    ; and send it through the same path with the caller's tx2q restored.
    SAVE_8ROWS    coeffq+16*1, 32
    LOAD_8ROWS   rsp+gprsize+16*3, 16
    mova   [rsp+gprsize+16*0], m7
    mov                  tx2q, r3
    jmp  m(idct_8x8_internal_8bpc).pass1_end

.pass2:
    ; First 8-column half; r3 remembers dst + 8 bytes (8 pixels at 8 bpc) for
    ; the second half.
    lea                  tx2q, [o(m(idct_16x8_internal_8bpc).end)]
    lea                    r3, [dstq+8]
    jmp  m(idct_8x8_internal_8bpc).pass2_main

.end:
    ; Second half: rows stored at coeffq+16*1 (stride 32), written at r3;
    ; finishes through idct_8x16's .end1.
    LOAD_8ROWS    coeffq+16*1, 32
    lea                  tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
    mov                  dstq, r3
    jmp  m(idct_8x8_internal_8bpc).pass2_main
2482
2483
; 16-point inverse DCT: odd half plus the final output butterflies.
; "Slot N" below means [rsp+gprsize*2+16*N]; the extra gprsize compared with
; the callers' [rsp+gprsize+16*N] accounts for the return address pushed by
; the call, so both forms address the same memory.
; In:  m0..m7    = in1, in3, in5, in7, in9, in11, in13, in15
;      slots 3..9 = even-half terms for outputs 0..6 (written by SAVE_7ROWS in
;                   the callers visible in this chunk)
;      slot 0     = even-half term for outputs 7/8
; Out: m0..m6     = out8..out14
;      slot 0     = out15
;      slots 3..9 = out0..out6
;      slot 10    = out7 (32*5 == 16*10)
; Slots 1, 2 and 10 are used as scratch along the way.
; pd_2048 is passed as the fifth ITX_MULSUB_2W argument (presumably the
; rounding term; the macro is defined outside this chunk).
ALIGN function_align
cglobal_label .main
    ; Spill in5, in13 and in11 to free registers for the first rotations.
    mova [rsp+gprsize*2+16*1], m2
    mova [rsp+gprsize*2+16*2], m6
    mova [rsp+gprsize*2+32*5], m5

    mova                   m6, [o(pd_2048)]
    ITX_MULSUB_2W           0, 7, 2, 5, 6,  401, 4076   ;t8a, t15a
    ITX_MULSUB_2W           4, 3, 2, 5, 6, 3166, 2598   ;t9a, t14a
    psubsw                 m2, m0, m4                   ;t9
    paddsw                 m0, m4                       ;t8
    psubsw                 m4, m7, m3                   ;t14
    paddsw                 m7, m3                       ;t15
    ITX_MULSUB_2W           4, 2, 3, 5, 6, 1567, 3784   ;t9a, t14a
    ; Swap the spilled inputs back in (m3 = in5, m5 = in11, m2 = in13) while
    ; parking t14a, t9a and t15 in slots 1, 10 and 2.
    mova                   m3, [rsp+gprsize*2+16*1]
    mova                   m5, [rsp+gprsize*2+32*5]
    mova [rsp+gprsize*2+16*1], m2
    mova [rsp+gprsize*2+32*5], m4
    mova                   m2, [rsp+gprsize*2+16*2]
    mova [rsp+gprsize*2+16*2], m7
    ITX_MULSUB_2W           3, 5, 7, 4, 6, 1931, 3612   ;t10a, t13a
    ITX_MULSUB_2W           2, 1, 7, 4, 6, 3920, 1189   ;t11a, t12a
    psubsw                 m4, m2, m3                   ;t10
    paddsw                 m2, m3                       ;t11
    psubsw                 m3, m1, m5                   ;t13
    paddsw                 m1, m5                       ;t12
    ITX_MULSUB_2W           3, 4, 7, 5, 6, m3784, 1567  ;t10a, t13a
    ; m7 = t9a (parked in slot 10 above).
    mova                   m7, [rsp+gprsize*2+32*5]
    psubsw                 m6, m0, m2                   ;t11a
    paddsw                 m0, m2                       ;t8a
    paddsw                 m2, m7, m3                   ;t9
    psubsw                 m7, m3                       ;t10
    ; Combine with the even half: slot 0 -> out7/out8, slot 9 -> out6/out9.
    mova                   m5, [rsp+gprsize*2+16*0]
    psubsw                 m3, m5, m0                   ;out8
    paddsw                 m0, m5                       ;out7
    mova [rsp+gprsize*2+32*5], m0
    mova                   m5, [rsp+gprsize*2+16*9]
    psubsw                 m0, m5, m2                   ;out9
    paddsw                 m2, m5                       ;out6
    ; out9 and out8 wait in slots 0 and 1 until the end of the routine.
    mova [rsp+gprsize*2+16*0], m0
    mova [rsp+gprsize*2+16*9], m2
    mova                   m0, [rsp+gprsize*2+16*1]
    mova                   m2, [rsp+gprsize*2+16*2]
    mova [rsp+gprsize*2+16*1], m3
    psubsw                 m5, m0, m4                   ;t13
    paddsw                 m0, m4                       ;t14
    ; m6 no longer holds pd_2048 (it was reused for t11a), so reload it in m3.
    mova                   m3, [o(pd_2048)]
    psubsw                 m4, m2, m1                   ;t12a
    paddsw                 m1, m2                       ;t15a
    mova [rsp+gprsize*2+16*2], m1
    ITX_MULSUB_2W           5, 7, 1, 2, 3, 2896, 2896   ;t10a, t13a
    ITX_MULSUB_2W           4, 6, 1, 2, 3, 2896, 2896   ;t11,  t12
    ; Final butterflies against even-half slots 8 down to 3; each slot is
    ; overwritten with the low-index output of its pair.
    mova                   m3, [rsp+gprsize*2+16*8]
    psubsw                 m2, m3, m5                   ;out10
    paddsw                 m3, m5                       ;out5
    mova                   m5, [rsp+gprsize*2+16*7]
    mova [rsp+gprsize*2+16*8], m3
    psubsw                 m3, m5, m4                   ;out11
    paddsw                 m5, m4                       ;out4
    mova                   m4, [rsp+gprsize*2+16*6]
    mova [rsp+gprsize*2+16*7], m5
    paddsw                 m5, m4, m6                   ;out3
    psubsw                 m4, m6                       ;out12
    mova                   m6, [rsp+gprsize*2+16*5]
    mova [rsp+gprsize*2+16*6], m5
    psubsw                 m5, m6, m7                   ;out13
    paddsw                 m6, m7                       ;out2
    mova                   m7, [rsp+gprsize*2+16*4]
    mova [rsp+gprsize*2+16*5], m6
    psubsw                 m6, m7, m0                   ;out14
    paddsw                 m7, m0                       ;out1
    mova                   m1, [rsp+gprsize*2+16*2]
    mova                   m0, [rsp+gprsize*2+16*3]
    mova [rsp+gprsize*2+16*4], m7
    psubsw                 m7, m0, m1                   ;out15
    paddsw                 m0, m1                       ;out0
    mova [rsp+gprsize*2+16*3], m0
    ; Bring out9 and out8 back into m1/m0 and leave out15 in slot 0.
    mova                   m1, [rsp+gprsize*2+16*0]
    mova                   m0, [rsp+gprsize*2+16*1]
    mova [rsp+gprsize*2+16*0], m7
    ret
2565
; Instantiate the 16x8 entry points whose first transform type is adst.
INV_TXFM_16X8_FN adst, dct
INV_TXFM_16X8_FN adst, adst
INV_TXFM_16X8_FN adst, flipadst
INV_TXFM_16X8_FN adst, identity

; 16x8 inverse ADST, 8 bpc.
; Pass 1 multiplies all sixteen 16-byte coefficient rows by pw_2896x8
; (pmulhrsw), distributes them between m0..m7 and stack slots 3..10 as .main
; expects, and runs the 16-point ADST (.main + .main_pass1_end). Both 8-row
; halves then go through iadst_8x8's .pass1_end. Pass 2 runs iadst_8x8's
; .pass2_main once per 8-column half of the destination.
; "Slot N" in the comments below means [rsp+gprsize+16*N].
cglobal iadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    mova                    m7, [o(pw_2896x8)]
    ; Rows 0, 1, 14, 15 -> slots 7, 8, 9, 10 (32*5 == 16*10).
    pmulhrsw                m0, m7, [coeffq+16*0 ]
    pmulhrsw                m1, m7, [coeffq+16*1 ]
    pmulhrsw                m2, m7, [coeffq+16*14]
    pmulhrsw                m3, m7, [coeffq+16*15]
    mova    [rsp+gprsize+16*7], m0
    mova    [rsp+gprsize+16*8], m1
    mova    [rsp+gprsize+16*9], m2
    mova    [rsp+gprsize+32*5], m3
    ; Rows 8, 9 -> slots 3, 4 and rows 6, 7 -> slots 5, 6.
    pmulhrsw                m0, m7, [coeffq+16*6 ]
    pmulhrsw                m1, m7, [coeffq+16*7 ]
    pmulhrsw                m2, m7, [coeffq+16*8 ]
    pmulhrsw                m3, m7, [coeffq+16*9 ]
    mova    [rsp+gprsize+16*3], m2
    mova    [rsp+gprsize+16*4], m3
    mova    [rsp+gprsize+16*5], m0
    mova    [rsp+gprsize+16*6], m1
    ; Rows 2..5 and 10..13 stay in m0..m7; m7 (the constant) is consumed last.
    pmulhrsw                m0, m7, [coeffq+16*2 ]
    pmulhrsw                m1, m7, [coeffq+16*3 ]
    pmulhrsw                m2, m7, [coeffq+16*4 ]
    pmulhrsw                m3, m7, [coeffq+16*5 ]
    pmulhrsw                m4, m7, [coeffq+16*10]
    pmulhrsw                m5, m7, [coeffq+16*11]
    pmulhrsw                m6, m7, [coeffq+16*12]
    pmulhrsw                m7,     [coeffq+16*13]

    call .main
    call .main_pass1_end
    ; Park the caller's tx2q in r3 while tx2q points at .pass1_end.
    mov                    r3, tx2q
    lea                  tx2q, [o(m(iadst_16x8_internal_8bpc).pass1_end)]
    jmp m(iadst_8x8_internal_8bpc).pass1_end

.pass1_end:
    ; Store the rows currently in registers to the odd rows of the
    ; coefficient buffer, then reload the half left in slots 3.. and send it
    ; through the same path with the caller's tx2q restored.
    SAVE_8ROWS    coeffq+16*1, 32
    LOAD_8ROWS   rsp+gprsize+16*3, 16
    mova   [rsp+gprsize+16*0], m7
    mov                  tx2q, r3
    jmp m(iadst_8x8_internal_8bpc).pass1_end

.pass2:
    ; First 8-column half; r3 remembers dst + 8 bytes for the second half.
    lea                  tx2q, [o(m(iadst_16x8_internal_8bpc).end)]
    lea                    r3, [dstq+8]
    jmp m(iadst_8x8_internal_8bpc).pass2_main

.end:
    ; Second half: rows stored at coeffq+16*1 (stride 32), written at r3;
    ; finishes through idct_8x16's .end1.
    LOAD_8ROWS    coeffq+16*1, 32
    lea                  tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
    mov                  dstq, r3
    jmp m(iadst_8x8_internal_8bpc).pass2_main
2621
; 16-point inverse ADST butterflies. Called by the 16x8 and 8x16 ADST and
; flipADST functions in this chunk; always followed by .main_pass1_end or
; .main_pass2_end, which finish the four output pairs left open here.
; "Slot N" below means [rsp+gprsize*2+16*N]; the extra gprsize compared with
; the callers' [rsp+gprsize+16*N] accounts for the return address pushed by
; the call, so both forms address the same memory.
; In:  m0..m7      = in2, in3, in4, in5, in10, in11, in12, in13
;      slots 3..10 = in8, in9, in6, in7, in0, in1, in14, in15
; Out: slot 3 = out0,  slot 4 = -out1, slot 5 = out2, slot 6 = -out3,
;      slot 2 = out12, m5 = -out13,    m6 = out14,    slot 0 = -out15
;      Pairs still awaiting the final 2896 rotation:
;        slots 7/12  = t6,   t7     (become out4  / -out11)
;        slots 10/11 = t2a,  t3a    (become -out7 / out8)
;        slots 9/14  = t10,  t11    (become out6  / -out9)
;        m3/m4       = t14a, t15a   (become -out5 / out10)
;      The "out4", "-out7" and "out6" labels on the stores to slots 7, 10
;      and 9 name the value each slot holds after the pass-end routine.
; pd_2048 in m6 is passed as the fifth ITX_MULSUB_2W argument (presumably the
; rounding term; the macro is defined outside this chunk).
ALIGN function_align
cglobal_label .main
    ; Spill in3, in4 and in12 to free registers for the first rotations.
    mova  [rsp+gprsize*2+16*0], m1
    mova  [rsp+gprsize*2+16*1], m2
    mova  [rsp+gprsize*2+16*2], m6

    mova                    m6, [o(pd_2048)]
    ITX_MULSUB_2W            7, 0, 1, 2, 6,  995, 3973   ;t3,  t2
    ITX_MULSUB_2W            3, 4, 1, 2, 6, 3513, 2106   ;t11, t10
    psubsw                  m1, m0, m4                   ;t10a
    paddsw                  m0, m4                       ;t2a
    psubsw                  m4, m7, m3                   ;t11a
    paddsw                  m3, m7                       ;t3a
    ITX_MULSUB_2W            1, 4, 7, 2, 6, 3406, 2276   ;t11, t10
    mova                    m2, [rsp+gprsize*2+16*0]     ;in3
    mova                    m7, [rsp+gprsize*2+16*1]     ;in4
    mova  [rsp+gprsize*2+16*0], m1                       ;t11
    mova  [rsp+gprsize*2+16*1], m4                       ;t10
    mova                    m1, [rsp+gprsize*2+16*2]     ;in12
    mova  [rsp+gprsize*2+16*2], m0                       ;t2a
    ITX_MULSUB_2W            5, 7, 0, 4, 6, 1751, 3703   ;t5,  t4
    ITX_MULSUB_2W            2, 1, 0, 4, 6, 3857, 1380   ;t13, t12
    psubsw                  m0, m7, m1                   ;t12a
    paddsw                  m1, m7                       ;t4a
    psubsw                  m4, m5, m2                   ;t13a
    paddsw                  m5, m2                       ;t5a
    ITX_MULSUB_2W            4, 0, 7, 2, 6, 4017,  799   ;t12, t13
    mova                    m2, [rsp+gprsize*2+16*8]     ;in1
    mova                    m7, [rsp+gprsize*2+16*9]     ;in14
    mova  [rsp+gprsize*2+16*8], m4                       ;t12
    mova  [rsp+gprsize*2+16*9], m0                       ;t13
    mova                    m4, [rsp+gprsize*2+16*4]     ;in9
    mova                    m0, [rsp+gprsize*2+16*5]     ;in6
    mova  [rsp+gprsize*2+16*4], m1                       ;t4a
    mova  [rsp+gprsize*2+16*5], m5                       ;t5a
    ITX_MULSUB_2W            2, 7, 1, 5, 6, 4052,  601   ;t15, t14
    ITX_MULSUB_2W            4, 0, 1, 5, 6, 2440, 3290   ;t7,  t6
    psubsw                  m1, m0, m7                   ;t14a
    paddsw                  m0, m7                       ;t6a
    psubsw                  m5, m4, m2                   ;t15a
    paddsw                  m4, m2                       ;t7a
    ITX_MULSUB_2W            5, 1, 7, 2, 6, 2276, 3406   ;t14, t15
    mova                    m2, [rsp+gprsize*2+16*2]     ;t2a
    mova  [rsp+gprsize*2+16*2], m5                       ;t14
    psubsw                  m7, m2, m0                   ;t6
    paddsw                  m2, m0                       ;t2
    psubsw                  m0, m3, m4                   ;t7
    paddsw                  m3, m4                       ;t3
    ITX_MULSUB_2W            0, 7, 4, 5, 6, 3784, 1567   ;t6a, t7a
    mova                    m4, [rsp+gprsize*2+16*7]     ;in0
    mova                    m5, [rsp+gprsize*2+32*5]     ;in15
    mova  [rsp+gprsize*2+16*7], m3                       ;t3
    mova  [rsp+gprsize*2+32*5], m1                       ;t15
    mova                    m1, [rsp+gprsize*2+16*6]     ;in7
    mova                    m3, [rsp+gprsize*2+16*3]     ;in8
    mova  [rsp+gprsize*2+16*6], m7                       ;t7a
    mova  [rsp+gprsize*2+16*3], m0                       ;t6a
    ITX_MULSUB_2W            5, 4, 0, 7, 6,  201, 4091   ;t1,  t0
    ITX_MULSUB_2W            1, 3, 0, 7, 6, 3035, 2751   ;t9,  t8
    psubsw                  m0, m4, m3                   ;t8a
    paddsw                  m4, m3                       ;t0a
    psubsw                  m3, m5, m1                   ;t9a
    paddsw                  m5, m1                       ;t1a
    ITX_MULSUB_2W            0, 3, 1, 7, 6,  799, 4017   ;t9,  t8
    mova                    m1, [rsp+gprsize*2+16*4]     ;t4a
    mova                    m7, [rsp+gprsize*2+16*5]     ;t5a
    mova  [rsp+gprsize*2+16*4], m3                       ;t8
    mova  [rsp+gprsize*2+16*5], m0                       ;t9
    psubsw                  m0, m4, m1                   ;t4
    paddsw                  m4, m1                       ;t0
    psubsw                  m3, m5, m7                   ;t5
    paddsw                  m5, m7                       ;t1
    ITX_MULSUB_2W            0, 3, 1, 7, 6, 1567, 3784   ;t5a, t4a
    mova                    m7, [rsp+gprsize*2+16*3]     ;t6a
    psubsw                  m1, m4, m2                   ;t2a
    paddsw                  m4, m2                       ;out0
    mova  [rsp+gprsize*2+16*3], m4                       ;out0
    mova                    m4, [rsp+gprsize*2+16*6]     ;t7a
    psubsw                  m2, m3, m7                   ;t6
    paddsw                  m3, m7                       ;-out3
    mova  [rsp+gprsize*2+16*6], m3                       ;-out3
    psubsw                  m3, m0, m4                   ;t7
    paddsw                  m0, m4                       ;out12
    ; t7 -> slot 12, pairing with t6 in slot 7 for the pass-end routine.
    mova [rsp+gprsize*2+16*12], m3
    mova                    m3, [rsp+gprsize*2+16*7]     ;t3
    mova [rsp+gprsize*2+16* 7], m2                       ;out4
    psubsw                  m2, m5, m3                   ;t3a
    paddsw                  m5, m3                       ;-out15
    ; t3a -> slot 11, pairing with t2a in slot 10 for the pass-end routine.
    mova [rsp+gprsize*2+16*11], m2
    mova                    m2, [rsp+gprsize*2+32*5]     ;t15
    mova [rsp+gprsize*2+16*10], m1                       ;-out7
    mova                    m1, [rsp+gprsize*2+16*0]     ;t11
    mova [rsp+gprsize*2+16*0 ], m5                       ;-out15
    mova                    m3, [rsp+gprsize*2+16*1]     ;t10
    ; NOTE(review): m4 still holds t7a at this point, and neither this
    ; routine nor the two pass-end routines read slot 1 afterwards; the
    ; "-out11" label below may be stale -- confirm before relying on it.
    mova [rsp+gprsize*2+16*1 ], m4                       ;-out11
    mova                    m4, [rsp+gprsize*2+16*2]     ;t14
    mova [rsp+gprsize*2+16*2 ], m0                       ;out12
    psubsw                  m0, m3, m4                   ;t14a
    paddsw                  m3, m4                       ;t10a
    psubsw                  m5, m1, m2                   ;t15a
    paddsw                  m1, m2                       ;t11a
    ITX_MULSUB_2W            5, 0, 2, 4, 6, 3784, 1567   ;t14, t15
    mova                    m2, [rsp+gprsize*2+16*4]     ;t8
    mova                    m4, [rsp+gprsize*2+16*5]     ;t9
    mova  [rsp+gprsize*2+16*4], m3                       ;t10a
    mova  [rsp+gprsize*2+16*5], m1                       ;t11a
    mova                    m3, [rsp+gprsize*2+16*8]     ;t12
    mova                    m1, [rsp+gprsize*2+16*9]     ;t13
    mova  [rsp+gprsize*2+16*8], m5                       ;t14
    mova  [rsp+gprsize*2+16*9], m0                       ;t15
    psubsw                  m5, m2, m3                   ;t12a
    paddsw                  m2, m3                       ;t8a
    psubsw                  m0, m4, m1                   ;t13a
    paddsw                  m4, m1                       ;t9a
    ITX_MULSUB_2W            5, 0, 1, 3, 6, 1567, 3784   ;t13, t12
    ; Last use of pd_2048 is above; m6 is reused as a data register from here.
    mova                    m6, [rsp+gprsize*2+16*4]     ;t10a
    mova                    m1, [rsp+gprsize*2+16*5]     ;t11a
    psubsw                  m3, m2, m6                   ;t10
    paddsw                  m2, m6                       ;-out1
    paddsw                  m6, m4, m1                   ;out14
    psubsw                  m4, m1                       ;t11
    ; t11 -> slot 14, pairing with t10 in slot 9 for the pass-end routine.
    mova [rsp+gprsize*2+16*14], m4
    mova [rsp+gprsize*2+16* 4], m2                       ;-out1
    mova                    m4, [rsp+gprsize*2+16*8]     ;t14
    mova                    m2, [rsp+gprsize*2+16*9]     ;t15
    mova [rsp+gprsize*2+16* 9], m3                       ;out6
    psubsw                  m3, m0, m4                   ;t14a
    paddsw                  m0, m4                       ;out2
    psubsw                  m4, m5, m2                   ;t15a
    paddsw                  m5, m2                       ;-out13
    mova [rsp+gprsize*2+16* 5], m0                       ;out2
    ret
; Finishes the 16-point ADST after .main, pass-1 variant: each of the four
; open pairs (a, b) is interleaved with punpck{l,h}wd and turned into
; ((a+b)*2896 + 2048) >> 12 and ((a-b)*2896 + 2048) >> 12 with pmaddwd against
; pw_2896_2896 / pw_2896_m2896, then packed back to words with packssdw.
; "Slot N" below means [rsp+gprsize*2+16*N].
; In:  register/stack state left by .main
; Out: m0..m6 = out8, -out9, out10, -out11, out12, -out13, out14
;      slots 7..10 = out4, -out5, out6, -out7
;      m7 = pd_2048; slots 14/15 are used to preserve m5/m6 across the routine.
ALIGN function_align
.main_pass1_end:
    ; m0 = t11 from slot 14; slots 14/15 then keep -out13/out14 (m5/m6) safe
    ; while m5/m6 hold the two multiplier constants.
    mova                    m0, [rsp+gprsize*2+16*14]
    mova [rsp+gprsize*2+16*14], m5
    mova [rsp+gprsize*2+16*15], m6
    mova                    m5, [o(pw_2896_2896)]
    mova                    m6, [o(pw_2896_m2896)]
    mova                    m7, [o(pd_2048)]
    ; Pair m3/m4 (t14a, t15a).
    punpcklwd               m2, m3, m4
    punpckhwd               m3, m4
    pmaddwd                 m4, m5, m2
    pmaddwd                 m2, m6
    pmaddwd                 m1, m5, m3
    pmaddwd                 m3, m6
    REPX         {paddd x, m7}, m4, m2, m1, m3
    REPX         {psrad x, 12}, m4, m1, m2, m3
    packssdw                m4, m1                       ;-out5
    packssdw                m2, m3                       ;out10
    mova [rsp+gprsize*2+16* 8], m4
    ; Pair slot 9 / m0 (t10, t11).
    mova                    m3, [rsp+gprsize*2+16* 9]
    punpcklwd               m1, m3, m0
    punpckhwd               m3, m0
    pmaddwd                 m0, m5, m1
    pmaddwd                 m1, m6
    pmaddwd                 m4, m5, m3
    pmaddwd                 m3, m6
    REPX         {paddd x, m7}, m0, m1, m4, m3
    REPX         {psrad x, 12}, m0, m4, m1, m3
    packssdw                m0, m4                       ;out6
    packssdw                m1, m3                       ;-out9
    mova [rsp+gprsize*2+16* 9], m0
    ; Pair slots 7/12 (t6, t7). m5 (pw_2896_2896) is overwritten here.
    mova                    m0, [rsp+gprsize*2+16* 7]
    mova                    m4, [rsp+gprsize*2+16*12]
    punpcklwd               m3, m0, m4
    punpckhwd               m0, m4
    pmaddwd                 m4, m5, m3
    pmaddwd                 m3, m6
    pmaddwd                 m5, m0
    pmaddwd                 m0, m6
    REPX         {paddd x, m7}, m4, m3, m5, m0
    REPX         {psrad x, 12}, m4, m5, m3, m0
    packssdw                m4, m5                       ;out4
    packssdw                m3, m0                       ;-out11
    mova [rsp+gprsize*2+16* 7], m4
    ; Pair slots 10/11 (t2a, t3a). pw_2896_2896 is read from memory because m5
    ; no longer holds it; m6 (pw_2896_m2896) is overwritten on its last use.
    mova                    m4, [rsp+gprsize*2+16*10]
    mova                    m5, [rsp+gprsize*2+16*11]
    punpcklwd               m0, m4, m5
    punpckhwd               m4, m5
    pmaddwd                 m5, m0, [o(pw_2896_2896)]
    pmaddwd                 m0, m6
    pmaddwd                 m6, m4
    pmaddwd                 m4, [o(pw_2896_2896)]
    REPX         {paddd x, m7}, m5, m0, m6, m4
    REPX         {psrad x, 12}, m0, m6, m5, m4
    packssdw                m0, m6                       ;out8
    packssdw                m5, m4                       ;-out7
    mova [rsp+gprsize*2+16*10], m5
    mova                    m4, [rsp+gprsize*2+16* 2]    ;out12
    mova                    m5, [rsp+gprsize*2+16*14]    ;-out13
    mova                    m6, [rsp+gprsize*2+16*15]    ;out14
    ret
; Finishes the 16-point ADST after .main, pass-2 variant: the same four pairs
; as in .main_pass1_end, but each sum/difference is formed with saturating
; 16-bit paddsw/psubsw and then scaled with pmulhrsw by pw_2896x8 instead of
; going through 32-bit pmaddwd arithmetic.
; "Slot N" below means [rsp+gprsize*2+16*N].
; In:  register/stack state left by .main (m5 = -out13 and m6 = out14 are not
;      touched here)
; Out: m0..m4 = out8, -out9, out10, -out11, out12
;      slots 7..10 = out4, -out5, out6, -out7
;      m7 = pw_2896x8
ALIGN function_align
cglobal_label .main_pass2_end
    mova                    m7, [o(pw_2896x8)]
    ; Pair slots 9/14 (t10, t11).
    mova                    m1, [rsp+gprsize*2+16* 9]
    mova                    m2, [rsp+gprsize*2+16*14]
    paddsw                  m0, m1, m2
    psubsw                  m1, m2
    pmulhrsw                m0, m7                       ;out6
    pmulhrsw                m1, m7                       ;-out9
    mova [rsp+gprsize*2+16* 9], m0
    ; Pair m3/m4 (t14a, t15a).
    psubsw                  m2, m3, m4
    paddsw                  m3, m4
    pmulhrsw                m2, m7                       ;out10
    pmulhrsw                m3, m7                       ;-out5
    mova [rsp+gprsize*2+16* 8], m3
    ; Pair slots 7/12 (t6, t7).
    mova                    m3, [rsp+gprsize*2+16* 7]
    mova                    m4, [rsp+gprsize*2+16*12]
    paddsw                  m0, m3, m4
    psubsw                  m3, m4
    pmulhrsw                m0, m7                       ;out4
    pmulhrsw                m3, m7                       ;-out11
    mova [rsp+gprsize*2+16* 7], m0
    ; Pair slots 10/11 (t2a, t3a).
    mova                    m0, [rsp+gprsize*2+16*10]
    paddsw                  m4, m0, [rsp+gprsize*2+16*11]
    psubsw                  m0, [rsp+gprsize*2+16*11]
    pmulhrsw                m4, m7                       ;-out7
    pmulhrsw                m0, m7                       ;out8
    mova [rsp+gprsize*2+16*10], m4
    mova                    m4, [rsp+gprsize*2+16*2 ]    ;out12
    ret
2845
; Instantiate the 16x8 entry points whose first transform type is flipadst.
INV_TXFM_16X8_FN flipadst, dct
INV_TXFM_16X8_FN flipadst, adst
INV_TXFM_16X8_FN flipadst, flipadst
INV_TXFM_16X8_FN flipadst, identity

; 16x8 inverse flipped ADST, 8 bpc.
; Pass 1 loads and scales the coefficients exactly as iadst_16x8 does and
; runs the same 16-point ADST kernel. Compared with iadst_16x8 the two 8-row
; halves are sent to the 8x8 pass-1 tail in the opposite order: the half in
; registers is parked at coeffq+16*0 (stride 32) while the half in stack slots
; 3.. goes through iflipadst_8x8's .pass1_end first.
; "Slot N" in the comments below means [rsp+gprsize+16*N].
cglobal iflipadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    mova                    m7, [o(pw_2896x8)]
    ; Rows 0, 1, 14, 15 -> slots 7, 8, 9, 10 (32*5 == 16*10).
    pmulhrsw                m0, m7, [coeffq+16*0 ]
    pmulhrsw                m1, m7, [coeffq+16*1 ]
    pmulhrsw                m2, m7, [coeffq+16*14]
    pmulhrsw                m3, m7, [coeffq+16*15]
    mova    [rsp+gprsize+16*7], m0
    mova    [rsp+gprsize+16*8], m1
    mova    [rsp+gprsize+16*9], m2
    mova    [rsp+gprsize+32*5], m3
    ; Rows 8, 9 -> slots 3, 4 and rows 6, 7 -> slots 5, 6.
    pmulhrsw                m0, m7, [coeffq+16*6 ]
    pmulhrsw                m1, m7, [coeffq+16*7 ]
    pmulhrsw                m2, m7, [coeffq+16*8 ]
    pmulhrsw                m3, m7, [coeffq+16*9 ]
    mova    [rsp+gprsize+16*3], m2
    mova    [rsp+gprsize+16*4], m3
    mova    [rsp+gprsize+16*5], m0
    mova    [rsp+gprsize+16*6], m1
    ; Rows 2..5 and 10..13 stay in m0..m7; m7 (the constant) is consumed last.
    pmulhrsw                m0, m7, [coeffq+16*2 ]
    pmulhrsw                m1, m7, [coeffq+16*3 ]
    pmulhrsw                m2, m7, [coeffq+16*4 ]
    pmulhrsw                m3, m7, [coeffq+16*5 ]
    pmulhrsw                m4, m7, [coeffq+16*10]
    pmulhrsw                m5, m7, [coeffq+16*11]
    pmulhrsw                m6, m7, [coeffq+16*12]
    pmulhrsw                m7,     [coeffq+16*13]

    call m(iadst_16x8_internal_8bpc).main
    call m(iadst_16x8_internal_8bpc).main_pass1_end

    ; Complete the register half with the row kept in slot 0 and park all
    ; eight rows in the even rows of the coefficient buffer.
    mova                    m7, [rsp+gprsize+16*0]
    SAVE_8ROWS     coeffq+16*0, 32
    ; Stack half goes first.
    LOAD_8ROWS    rsp+gprsize+16*3, 16
    mova    [rsp+gprsize+16*0], m7
    ; Park the caller's tx2q in r3 while tx2q points at .pass1_end.
    mov                     r3, tx2q
    lea                   tx2q, [o(m(iflipadst_16x8_internal_8bpc).pass1_end)]
    jmp m(iflipadst_8x8_internal_8bpc).pass1_end

.pass1_end:
    ; Store the finished half to the odd rows, reload the parked half from the
    ; even rows and run the same tail with the caller's tx2q restored.
    SAVE_8ROWS     coeffq+16*1, 32
    LOAD_8ROWS     coeffq+16*0, 32
    mova    [rsp+gprsize+16*0], m7
    mov                   tx2q, r3
    jmp m(iflipadst_8x8_internal_8bpc).pass1_end

.pass2:
    ; First 8-column half; r3 remembers dst + 8 bytes for the second half.
    lea                   tx2q, [o(m(iflipadst_16x8_internal_8bpc).end)]
    lea                     r3, [dstq+8]
    jmp m(iflipadst_8x8_internal_8bpc).pass2_main

.end:
    ; Second half: rows stored at coeffq+16*1 (stride 32), written at r3;
    ; finishes through idct_8x16's .end1.
    LOAD_8ROWS     coeffq+16*1, 32
    lea                   tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
    mov                   dstq, r3
    jmp m(iflipadst_8x8_internal_8bpc).pass2_main
2906
2907
; Instantiate the 16x8 entry points whose first transform type is identity.
INV_TXFM_16X8_FN identity, dct
INV_TXFM_16X8_FN identity, adst
INV_TXFM_16X8_FN identity, flipadst
INV_TXFM_16X8_FN identity, identity

; 16x8 identity transform, 8 bpc.
; Pass 1 runs .pass1 twice, each time on eight 16-byte coefficient rows
; (rows 8..15 first, then rows 0..7). Every row x is first scaled with
; pmulhrsw by pw_2896x8 and then replaced by
;     x + pmulhrsw(pmulhrsw(x, pw_1697x16), pw_16384)
; before the group is handed to idct_8x8's .pass1_end3.
; Pass 2 runs iidentity_8x8's .end once per 8-column half of the destination.
; "Slot N" in the comments below means [rsp+gprsize+16*N].
cglobal iidentity_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    ; coeffq is biased by 16 rows, so the loads below fetch rows 9, 11, 13, 15
    ; of the original buffer.
    add                coeffq, 16*16
    mova                   m4, [coeffq-16*7]
    mova                   m5, [coeffq-16*5]
    mova                   m6, [coeffq-16*3]
    mova                   m7, [coeffq-16*1]
    ; Park the caller's tx2q in r3 while tx2q points at .pass1_end.
    mov                    r3, tx2q
    lea                  tx2q, [o(m(iidentity_16x8_internal_8bpc).pass1_end)]

.pass1:
    ; In:  m4..m7 = the group's rows 1, 3, 5, 7 (unscaled); after the sub
    ;      below, coeffq+16*{0,2,4,6} address the group's rows 0, 2, 4, 6.
    ; Out: m0..m5 and m7 = result rows 0..5 and 7, slot 1 = result row 6.
    ; The trailing "; N" comments mark the instruction that completes row N.
    mova                   m0, [o(pw_2896x8)]
    mova                   m2, [o(pw_1697x16)]
    mova                   m3, [o(pw_16384)]
    sub                coeffq, 8*16
    REPX     {pmulhrsw x, m0}, m4, m5, m6, m7
    pmulhrsw               m1, m2, m4
    pmulhrsw               m1, m3
    paddsw                 m1, m4 ; 1
    pmulhrsw               m4, m2, m5
    pmulhrsw               m4, m3
    paddsw                 m4, m5 ; 3
    pmulhrsw               m5, m2, m6
    pmulhrsw               m5, m3
    paddsw                 m5, m6 ; 5
    pmulhrsw               m6, m2, m7
    pmulhrsw               m6, m3
    paddsw                 m7, m6 ; 7
    pmulhrsw               m6, m0, [coeffq+16*6]
    ; Row 3 waits in slot 0 until m3 (pw_16384) is no longer needed.
    mova   [rsp+gprsize+16*0], m4
    pmulhrsw               m4, m2, m6
    pmulhrsw               m4, m3
    paddsw                 m6, m4 ; 6
    pmulhrsw               m4, m0, [coeffq+16*4]
    ; Row 6 is handed over in slot 1.
    mova   [rsp+gprsize+16*1], m6
    pmulhrsw               m6, m2, m4
    pmulhrsw               m6, m3
    paddsw                 m4, m6 ; 4
    pmulhrsw               m6, m0, [coeffq+16*2]
    pmulhrsw               m0,     [coeffq+16*0]
    ; m2 (pw_1697x16) is consumed in place here, so row 0 below takes the
    ; constant from memory instead.
    pmulhrsw               m2, m6
    pmulhrsw               m2, m3
    paddsw                 m2, m6 ; 2
    pmulhrsw               m6, m0, [o(pw_1697x16)]
    pmulhrsw               m6, m3
    mova                   m3, [rsp+gprsize+16*0]
    paddsw                 m0, m6
    jmp   m(idct_8x8_internal_8bpc).pass1_end3

.pass1_end:
    ; coeffq points at original row 8 here. m4..m7 are stored to original
    ; rows 9, 11, 13, 15; original rows 1, 3, 5, 7 are fetched into m4..m7 as
    ; input for the second .pass1 run and only then overwritten with m0..m3.
    ; The finished group thus ends up in the odd rows, which is where .end
    ; reads it from.
    mova        [coeffq+16*1], m4
    mova        [coeffq+16*3], m5
    mova        [coeffq+16*5], m6
    mova        [coeffq+16*7], m7
    mova                   m4, [coeffq-16*7]
    mova                   m5, [coeffq-16*5]
    mova                   m6, [coeffq-16*3]
    mova                   m7, [coeffq-16*1]
    mova        [coeffq-16*7], m0
    mova        [coeffq-16*5], m1
    mova        [coeffq-16*3], m2
    mova        [coeffq-16*1], m3
    ; Second run returns through the caller's tx2q.
    mov                  tx2q, r3
    jmp .pass1

.pass2:
    ; First 8-column half; r3 remembers dst + 8 bytes for the second half.
    lea                  tx2q, [o(m(iidentity_16x8_internal_8bpc).end)]
    lea                    r3, [dstq+8]
    jmp  m(iidentity_8x8_internal_8bpc).end

.end:
    ; Second half: rows stored at coeffq+16*1 (stride 32), written at r3;
    ; finishes through idct_8x16's .end1.
    LOAD_8ROWS    coeffq+16*1, 32
    lea                  tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
    mov                  dstq, r3
    jmp  m(iidentity_8x8_internal_8bpc).end
2987
2988
; INV_TXFM_16X16_FN type1, type2
; Wraps INV_TXFM_FN (defined outside this chunk) with the 16x16 parameters
; "16x16, 8, 16*16". For the dct/dct combination only, it additionally emits a
; code sequence that ends in a jump to the 16x4 dct_dct function's .dconly:
;   m0   = pmulhrsw([coeffq], pw_2896x8)   (a single multiply, unlike the
;          16x8 variant of this macro, which applies it twice)
;   m2   = pw_8192 (low dword, via movd)
;   r2d  = 8
;   tx2q = address of the local .end label, which is a plain RET
; The store of eobd to [coeffq] presumably clears the coefficient (i.e. eob is
; expected to be 0 on this path) -- TODO confirm against INV_TXFM_FN.
%macro INV_TXFM_16X16_FN 2 ; type1, type2
    INV_TXFM_FN          %1, %2, 16x16, 8, 16*16
%ifidn %1_%2, dct_dct
    movd                   m1, [o(pw_2896x8)]
    pmulhrsw               m0, m1, [coeffq]
    movd                   m2, [o(pw_8192)]
    mov              [coeffq], eobd
    mov                   r2d, 8
    lea                  tx2q, [o(m(inv_txfm_add_dct_dct_16x16_8bpc).end)]
    jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
.end:
    RET
%endif
%endmacro
3003
; 16x16 entry points whose first type is dct. The expansions come from
; INV_TXFM_16X16_FN / INV_TXFM_FN; presumably each of them runs the internal
; function below as its first pass - confirm in INV_TXFM_FN.
3004INV_TXFM_16X16_FN dct, dct
3005INV_TXFM_16X16_FN dct, adst
3006INV_TXFM_16X16_FN dct, flipadst
3007INV_TXFM_16X16_FN dct, identity
3008
; idct_16x16_internal_8bpc
; Internal 16x16 inverse DCT operating on the coefficient buffer at coeffq.
; Control flow is continuation-style: each stage stores the address of the next
; stage in tx2q and jumps into shared 8x8 / 8x16 code, which is expected to
; jump back through tx2q (NOTE(review): that code is outside this chunk -
; confirm). The tx2q value received on entry is parked in r3 during pass 1 and
; restored before the last pass-1 stage.
; LOAD_8ROWS / SAVE_8ROWS / SAVE_7ROWS are defined elsewhere; from their use
; here they presumably move m0-m7 (m0-m6) from/to base + i*step - confirm.
3009cglobal idct_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    ; Pass 1, first half: vectors at coeffq+16*1 (step 64) go through the 8x8
    ; idct .main, seven rows are parked at rsp+gprsize+16*3, then vectors at
    ; coeffq+16*3 (step 64) go through the 16x8 idct .main.
3010    LOAD_8ROWS     coeffq+16*1, 64
3011    call  m(idct_8x8_internal_8bpc).main
3012    SAVE_7ROWS    rsp+gprsize+16*3, 16
3013    LOAD_8ROWS     coeffq+16*3, 64
3014    call m(idct_16x8_internal_8bpc).main
    ; Park the incoming tx2q in r3 and continue at .pass1_end after the shared
    ; 8x8 pass-1 tail, which is entered with m7 = pw_8192.
3015    mov                     r3, tx2q
3016    lea                   tx2q, [o(m(idct_16x16_internal_8bpc).pass1_end)]
3017    mova                    m7, [o(pw_8192)]
3018    jmp   m(idct_8x8_internal_8bpc).pass1_end1
3019
3020.pass1_end:
    ; Store the rows just produced at coeffq+16*17 (step 32), fetch the rows
    ; parked on the stack, spill m7 to stack slot 0 (m7 is reloaded with
    ; pw_8192 right after) and run the same 8x8 tail again.
3021    SAVE_8ROWS    coeffq+16*17, 32
3022    LOAD_8ROWS    rsp+gprsize+16*3, 16
3023    mova    [rsp+gprsize+16*0], m7
3024    lea                   tx2q, [o(m(idct_16x16_internal_8bpc).pass1_end1)]
3025    mova                    m7, [o(pw_8192)]
3026    jmp   m(idct_8x8_internal_8bpc).pass1_end1
3027
3028.pass1_end1:
    ; Store at coeffq+16*1 (step 32), then repeat the sequence of the entry
    ; code for the vectors at coeffq+16*0 and coeffq+16*2 (step 64).
3029    SAVE_8ROWS     coeffq+16*1, 32
3030    LOAD_8ROWS     coeffq+16*0, 64
3031    call  m(idct_8x8_internal_8bpc).main
3032    SAVE_7ROWS    rsp+gprsize+16*3, 16
3033    LOAD_8ROWS     coeffq+16*2, 64
3034    call m(idct_16x8_internal_8bpc).main
3035    lea                   tx2q, [o(m(idct_16x16_internal_8bpc).pass1_end2)]
3036    mova                    m7, [o(pw_8192)]
3037    jmp   m(idct_8x8_internal_8bpc).pass1_end1
3038
3039.pass1_end2:
    ; Store at coeffq+16*16 (step 32) and run the 8x8 tail one last time on the
    ; rows parked on the stack, this time with the tx2q saved in r3 as the
    ; continuation, i.e. control leaves pass 1 from here.
3040    SAVE_8ROWS    coeffq+16*16, 32
3041    LOAD_8ROWS    rsp+gprsize+16*3, 16
3042    mova    [rsp+gprsize+16*0], m7
3043    mov                   tx2q, r3
3044    mova                    m7, [o(pw_8192)]
3045    jmp   m(idct_8x8_internal_8bpc).pass1_end1
3046
3047.pass2:
    ; Pass 2 reuses the 8x16 idct pass-2 code, continuing at .end below.
3048    lea                   tx2q, [o(m(idct_16x16_internal_8bpc).end)]
3049    jmp  m(idct_8x16_internal_8bpc).pass2_pre
3050
3051.end:
    ; Reload eight rows from the stack and spill m7 to stack slot 0, then hand
    ; them to the 8x8 .end code with .end1 as continuation.
    ; dstq is taken from r3 and r3 is advanced to dstq+8 for .end1.
    ; NOTE(review): the r3 value read here is presumably set up by the 8x16
    ; pass-2 code entered from .pass2 - confirm there.
3052    LOAD_8ROWS    rsp+gprsize+16*3, 16
3053    mova    [rsp+gprsize+16*0], m7
3054    lea                   tx2q, [o(m(idct_16x16_internal_8bpc).end1)]
3055    mov                   dstq, r3
3056    lea                     r3, [dstq+8]
3057    jmp   m(idct_8x8_internal_8bpc).end
3058
3059.end1:
    ; Zero the sixteen 16-byte vectors at coeffq+16*0 .. coeffq+16*15.
3060    pxor                    m7, m7
3061    REPX   {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15
3062
    ; Move on to the data 32*8 bytes further into the buffer and to the
    ; destination position kept in r3 (dstq+8 as computed in .end).
3063    add                 coeffq, 32*8
3064    mov                   dstq, r3
3065
    ; Load m0-m3 from vectors 0, 4, 8, 12 and m4-m7 from vectors 1, 5, 9, 13,
    ; then tail-jump into the 8x16 pass-2 main code with the 8x16 .end label as
    ; continuation.
3066    mova                    m0, [coeffq+16*0 ]
3067    mova                    m1, [coeffq+16*4 ]
3068    mova                    m2, [coeffq+16*8 ]
3069    mova                    m3, [coeffq+16*12]
3070    mova                    m4, [coeffq+16*1 ]
3071    mova                    m5, [coeffq+16*5 ]
3072    mova                    m6, [coeffq+16*9 ]
3073    mova                    m7, [coeffq+16*13]
3074    lea                   tx2q, [o(m(idct_8x16_internal_8bpc).end)]
3075    jmp  m(idct_8x16_internal_8bpc).pass2_main
3076
3077
; ITX_16X16_ADST_LOAD_ODD_COEFS (no arguments)
; Distributes the sixteen odd-indexed 16-byte vectors of the coefficient buffer
; (coeffq+16*1, +16*3, ..., +16*31) between the stack and m0-m7:
;   [rsp+gprsize+16*3 .. 16*10] <- vectors 17, 19, 13, 15, 1, 3, 29, 31
;   m0 .. m7                    <- vectors  5,  7,  9, 11, 21, 23, 25, 27
; (32*5 addresses the same slot as 16*10.)
; In the uses visible in this chunk the macro is immediately followed by a call
; to iadst_16x8 .main, so this is presumably the input layout that routine
; expects - confirm there. Clobbers m0-m7.
3078%macro ITX_16X16_ADST_LOAD_ODD_COEFS 0
    ; vectors 1, 3, 29, 31 -> stack slots 7, 8, 9, 10
3079    mova                    m0, [coeffq+16*1 ]
3080    mova                    m1, [coeffq+16*3 ]
3081    mova                    m2, [coeffq+16*29]
3082    mova                    m3, [coeffq+16*31]
3083    mova    [rsp+gprsize+16*7], m0
3084    mova    [rsp+gprsize+16*8], m1
3085    mova    [rsp+gprsize+16*9], m2
3086    mova    [rsp+gprsize+32*5], m3
    ; vectors 17, 19 -> stack slots 3, 4; vectors 13, 15 -> stack slots 5, 6
3087    mova                    m0, [coeffq+16*13]
3088    mova                    m1, [coeffq+16*15]
3089    mova                    m2, [coeffq+16*17]
3090    mova                    m3, [coeffq+16*19]
3091    mova    [rsp+gprsize+16*3], m2
3092    mova    [rsp+gprsize+16*4], m3
3093    mova    [rsp+gprsize+16*5], m0
3094    mova    [rsp+gprsize+16*6], m1
    ; remaining eight vectors stay in registers
3095    mova                    m0, [coeffq+16*5 ]
3096    mova                    m1, [coeffq+16*7 ]
3097    mova                    m2, [coeffq+16*9 ]
3098    mova                    m3, [coeffq+16*11]
3099    mova                    m4, [coeffq+16*21]
3100    mova                    m5, [coeffq+16*23]
3101    mova                    m6, [coeffq+16*25]
3102    mova                    m7, [coeffq+16*27]
3103%endmacro
3104
; ITX_16X16_ADST_LOAD_EVEN_COEFS (no arguments)
; Distributes the sixteen even-indexed 16-byte vectors of the coefficient
; buffer (coeffq+16*0, +16*2, ..., +16*30) between the stack and m0-m7:
;   [rsp+gprsize+16*3 .. 16*10] <- vectors 16, 18, 12, 14, 0, 2, 28, 30
;   m0 .. m7                    <- vectors  4,  6,  8, 10, 20, 22, 24, 26
; (32*5 addresses the same slot as 16*10.)
; In the uses visible in this chunk the macro is immediately followed by a call
; to iadst_16x8 .main, so this is presumably the input layout that routine
; expects - confirm there. Clobbers m0-m7.
3105%macro ITX_16X16_ADST_LOAD_EVEN_COEFS 0
    ; vectors 0, 2, 28, 30 -> stack slots 7, 8, 9, 10
3106    mova                    m0, [coeffq+16*0 ]
3107    mova                    m1, [coeffq+16*2 ]
3108    mova                    m2, [coeffq+16*28]
3109    mova                    m3, [coeffq+16*30]
3110    mova    [rsp+gprsize+16*7], m0
3111    mova    [rsp+gprsize+16*8], m1
3112    mova    [rsp+gprsize+16*9], m2
3113    mova    [rsp+gprsize+32*5], m3
    ; vectors 16, 18 -> stack slots 3, 4; vectors 12, 14 -> stack slots 5, 6
3114    mova                    m0, [coeffq+16*12]
3115    mova                    m1, [coeffq+16*14]
3116    mova                    m2, [coeffq+16*16]
3117    mova                    m3, [coeffq+16*18]
3118    mova    [rsp+gprsize+16*3], m2
3119    mova    [rsp+gprsize+16*4], m3
3120    mova    [rsp+gprsize+16*5], m0
3121    mova    [rsp+gprsize+16*6], m1
    ; remaining eight vectors stay in registers
3122    mova                    m0, [coeffq+16*4 ]
3123    mova                    m1, [coeffq+16*6 ]
3124    mova                    m2, [coeffq+16*8 ]
3125    mova                    m3, [coeffq+16*10]
3126    mova                    m4, [coeffq+16*20]
3127    mova                    m5, [coeffq+16*22]
3128    mova                    m6, [coeffq+16*24]
3129    mova                    m7, [coeffq+16*26]
3130%endmacro
3131
; 16x16 entry points whose first type is adst. The expansions come from
; INV_TXFM_16X16_FN / INV_TXFM_FN; presumably each of them runs the internal
; function below as its first pass - confirm in INV_TXFM_FN.
3132INV_TXFM_16X16_FN adst, dct
3133INV_TXFM_16X16_FN adst, adst
3134INV_TXFM_16X16_FN adst, flipadst
3135
; iadst_16x16_internal_8bpc
; Internal 16x16 inverse ADST operating on the coefficient buffer at coeffq.
; Same continuation-style structure as the other 16x16 internals in this file:
; tx2q holds the address of the next stage while shared 8x8 / 8x16 code runs
; (NOTE(review): that code is outside this chunk and is expected to jump back
; through tx2q - confirm). The tx2q received on entry is parked in r3 during
; pass 1.
3136cglobal iadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    ; Pass 1, first half: odd-indexed vectors through the 16x8 iadst code.
3137    ITX_16X16_ADST_LOAD_ODD_COEFS
3138    call m(iadst_16x8_internal_8bpc).main
3139    call m(iadst_16x8_internal_8bpc).main_pass1_end
3140
    ; Park the incoming tx2q in r3; continue at .pass1_end after the shared
    ; 8x8 iadst pass-1 tail, which is entered with m7 = pw_8192.
3141    mov                     r3, tx2q
3142    lea                   tx2q, [o(m(iadst_16x16_internal_8bpc).pass1_end)]
3143    mova                    m7, [o(pw_8192)]
3144    jmp  m(iadst_8x8_internal_8bpc).pass1_end1
3145
3146.pass1_end:
    ; Store the rows just produced at coeffq+16*17 (step 32), fetch the rows
    ; kept at rsp+gprsize+16*3, spill m7 to stack slot 0 (m7 is reloaded with
    ; pw_8192 right after) and run the 8x8 tail again.
3147    SAVE_8ROWS    coeffq+16*17, 32
3148    LOAD_8ROWS    rsp+gprsize+16*3, 16
3149    mova    [rsp+gprsize+16*0], m7
3150    lea                   tx2q, [o(m(iadst_16x16_internal_8bpc).pass1_end1)]
3151    mova                    m7, [o(pw_8192)]
3152    jmp  m(iadst_8x8_internal_8bpc).pass1_end1
3153
3154.pass1_end1:
    ; Store at coeffq+16*1 (step 32), then second half of pass 1: even-indexed
    ; vectors through the same 16x8 iadst code.
3155    SAVE_8ROWS     coeffq+16*1, 32
3156    ITX_16X16_ADST_LOAD_EVEN_COEFS
3157    call m(iadst_16x8_internal_8bpc).main
3158    call m(iadst_16x8_internal_8bpc).main_pass1_end
3159
3160    lea                   tx2q, [o(m(iadst_16x16_internal_8bpc).pass1_end2)]
3161    mova                    m7, [o(pw_8192)]
3162    jmp  m(iadst_8x8_internal_8bpc).pass1_end1
3163
3164.pass1_end2:
    ; Store at coeffq+16*16 (step 32) and run the 8x8 tail a last time on the
    ; rows kept on the stack, with the tx2q saved in r3 as continuation, i.e.
    ; control leaves pass 1 from here.
3165    SAVE_8ROWS    coeffq+16*16, 32
3166    LOAD_8ROWS    rsp+gprsize+16*3, 16
3167    mova    [rsp+gprsize+16*0], m7
3168    mov                   tx2q, r3
3169    mova                    m7, [o(pw_8192)]
3170    jmp  m(iadst_8x8_internal_8bpc).pass1_end1
3171
3172.pass2:
    ; Pass 2 reuses the 8x16 iadst pass-2 code, continuing at .end below.
3173    lea                   tx2q, [o(m(iadst_16x16_internal_8bpc).end)]
3174    jmp m(iadst_8x16_internal_8bpc).pass2_pre
3175
3176.end:
    ; Reload eight rows from the stack and spill m7 to stack slot 0, then hand
    ; them to the 8x8 iadst .end code with .end1 as continuation.
    ; dstq is taken from r3 and r3 is advanced to dstq+8 for .end1.
    ; NOTE(review): the r3 value read here is presumably set up by the 8x16
    ; pass-2 code entered from .pass2 - confirm there.
3177    LOAD_8ROWS    rsp+gprsize+16*3, 16
3178    mova    [rsp+gprsize+16*0], m7
3179    lea                   tx2q, [o(m(iadst_16x16_internal_8bpc).end1)]
3180    mov                   dstq, r3
3181    lea                     r3, [dstq+8]
3182    jmp  m(iadst_8x8_internal_8bpc).end
3183
3184.end1:
    ; Zero the sixteen 16-byte vectors at coeffq+16*0 .. coeffq+16*15.
3185    pxor                    m7, m7
3186    REPX   {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15
3187
    ; Move on to the data 32*8 bytes further into the buffer and to the
    ; destination position kept in r3 (dstq+8 as computed in .end).
3188    add                 coeffq, 32*8
3189    mov                   dstq, r3
3190
    ; Load the even-indexed vectors 0..14: vectors 4, 6, 8, 10 stay in m0-m3,
    ; vectors 0, 2 go to stack slots 7, 8 and vectors 12, 14 to stack slots
    ; 5, 6 (m4-m7 also keep their copies). Then tail-jump into the 8x16 iadst
    ; pass-2 main code with the 8x16 iadst .end label as continuation.
3191    mova                    m4, [coeffq+16*0 ]
3192    mova                    m5, [coeffq+16*2 ]
3193    mova                    m0, [coeffq+16*4 ]
3194    mova                    m1, [coeffq+16*6 ]
3195    mova                    m2, [coeffq+16*8 ]
3196    mova                    m3, [coeffq+16*10]
3197    mova                    m6, [coeffq+16*12]
3198    mova                    m7, [coeffq+16*14]
3199    mova    [rsp+gprsize+16*7], m4
3200    mova    [rsp+gprsize+16*8], m5
3201    mova    [rsp+gprsize+16*5], m6
3202    mova    [rsp+gprsize+16*6], m7
3203    lea                   tx2q, [o(m(iadst_8x16_internal_8bpc).end)]
3204    jmp m(iadst_8x16_internal_8bpc).pass2_main
3205
3206
; 16x16 entry points whose first type is flipadst. The expansions come from
; INV_TXFM_16X16_FN / INV_TXFM_FN; presumably each of them runs the internal
; function below as its first pass - confirm in INV_TXFM_FN.
3207INV_TXFM_16X16_FN flipadst, dct
3208INV_TXFM_16X16_FN flipadst, adst
3209INV_TXFM_16X16_FN flipadst, flipadst
3210
; iflipadst_16x16_internal_8bpc
; Internal 16x16 inverse flipped ADST. It shares the iadst_16x8 .main code with
; the non-flipped variant; the differences visible here are the use of
; pw_m8192 with the flipadst 8x8 tail and the different order in which results
; are written to and read back from the coefficient buffer.
; Control flow is continuation-style via tx2q, with the incoming tx2q parked in
; r3 during pass 1 (NOTE(review): the shared routines jumped to are outside
; this chunk and are expected to jump back through tx2q - confirm).
3211cglobal iflipadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    ; Pass 1, first half: odd-indexed vectors through the 16x8 iadst code.
3212    ITX_16X16_ADST_LOAD_ODD_COEFS
3213    call m(iadst_16x8_internal_8bpc).main
3214    call m(iadst_16x8_internal_8bpc).main_pass1_end
3215
    ; Park the incoming tx2q in r3; continue at .pass1_end after the shared
    ; 8x8 flipadst pass-1 tail, which is entered with m7 = pw_m8192.
3216    mov                     r3, tx2q
3217    lea                   tx2q, [o(m(iflipadst_16x16_internal_8bpc).pass1_end)]
3218    mova                    m7, [o(pw_m8192)]
3219    jmp  m(iflipadst_8x8_internal_8bpc).pass1_end1
3220
3221.pass1_end:
    ; Store the rows just produced at coeffq+16*1 (step 32), fetch the rows
    ; kept at rsp+gprsize+16*3, spill m7 to stack slot 0 and run the tail again.
3222    SAVE_8ROWS     coeffq+16*1, 32
3223    LOAD_8ROWS    rsp+gprsize+16*3, 16
3224    mova    [rsp+gprsize+16*0], m7
3225    lea                   tx2q, [o(m(iflipadst_16x16_internal_8bpc).pass1_end1)]
3226    mova                    m7, [o(pw_m8192)]
3227    jmp  m(iflipadst_8x8_internal_8bpc).pass1_end1
3228
3229.pass1_end1:
    ; Store at coeffq+16*17 (step 32), then second half of pass 1: even-indexed
    ; vectors through the same 16x8 iadst code.
3230    SAVE_8ROWS    coeffq+16*17, 32
3231    ITX_16X16_ADST_LOAD_EVEN_COEFS
3232    call m(iadst_16x8_internal_8bpc).main
3233    call m(iadst_16x8_internal_8bpc).main_pass1_end
3234
    ; Unlike the non-flipped variant, the register half is not finished first:
    ; m7 is reloaded from stack slot 0, m0-m7 are set aside at coeffq+16*0
    ; (step 32), and the rows kept on the stack go through the 8x8 tail first.
3235    mova                    m7, [rsp+gprsize+16*0]
3236    SAVE_8ROWS     coeffq+16*0, 32
3237    LOAD_8ROWS    rsp+gprsize+16*3, 16
3238    mova    [rsp+gprsize+16*0], m7
3239    lea                   tx2q, [o(m(iflipadst_16x16_internal_8bpc).pass1_end2)]
3240    mova                    m7, [o(pw_m8192)]
3241    jmp  m(iflipadst_8x8_internal_8bpc).pass1_end1
3242
3243.pass1_end2:
    ; Store at coeffq+16*16 (step 32), fetch back the rows set aside at
    ; coeffq+16*0 and run the 8x8 tail a last time with the tx2q saved in r3
    ; as continuation, i.e. control leaves pass 1 from here.
3244    SAVE_8ROWS    coeffq+16*16, 32
3245    LOAD_8ROWS    coeffq+16* 0, 32
3246    mova    [rsp+gprsize+16*0], m7
3247    mov                   tx2q, r3
3248    mova                    m7, [o(pw_m8192)]
3249    jmp m(iflipadst_8x8_internal_8bpc).pass1_end1
3250
3251.pass2:
    ; Pass 2 reuses the 8x16 flipadst pass-2 code, continuing at .end below.
    ; r3 = dstq+8 is computed up front and consumed in .end1.
3252    lea                   tx2q, [o(m(iflipadst_16x16_internal_8bpc).end)]
3253    lea                     r3, [dstq+8]
3254    jmp m(iflipadst_8x16_internal_8bpc).pass2_pre
3255
3256.end:
    ; Reload eight rows from the stack, spill m7 to stack slot 0, advance dstq
    ; by two strides and hand over to the 8x8 flipadst .end code, continuing
    ; at .end1.
3257    LOAD_8ROWS    rsp+gprsize+16*3, 16
3258    mova    [rsp+gprsize+16*0], m7
3259    lea                   tx2q, [o(m(iflipadst_16x16_internal_8bpc).end1)]
3260    lea                   dstq, [dstq+strideq*2]
3261    jmp  m(iflipadst_8x8_internal_8bpc).end
3262
3263.end1:
    ; Zero the sixteen 16-byte vectors at coeffq+16*0 .. coeffq+16*15.
3264    pxor                    m7, m7
3265    REPX   {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15
3266
    ; Move on to the data 32*8 bytes further into the buffer.
3267    add                 coeffq, 32*8
3268
    ; Load the even-indexed vectors 0..14: vectors 4, 6, 8, 10 stay in m0-m3,
    ; vectors 0, 2 go to stack slots 7, 8 and vectors 12, 14 to stack slots
    ; 5, 6 (m4-m7 also keep their copies).
3269    mova                    m4, [coeffq+16*0 ]
3270    mova                    m5, [coeffq+16*2 ]
3271    mova                    m0, [coeffq+16*4 ]
3272    mova                    m1, [coeffq+16*6 ]
3273    mova                    m2, [coeffq+16*8 ]
3274    mova                    m3, [coeffq+16*10]
3275    mova                    m6, [coeffq+16*12]
3276    mova                    m7, [coeffq+16*14]
3277    mova    [rsp+gprsize+16*7], m4
3278    mova    [rsp+gprsize+16*8], m5
3279    mova    [rsp+gprsize+16*5], m6
3280    mova    [rsp+gprsize+16*6], m7
3281
    ; Switch dstq to the position prepared in r3 (dstq+8 from .pass2) and run
    ; the 8x16 flipadst pass-2 main code, continuing at .end2.
3282    lea                   tx2q, [o(m(iflipadst_16x16_internal_8bpc).end2)]
3283    mov                   dstq, r3
3284    jmp m(iflipadst_8x16_internal_8bpc).pass2_main
3285
3286.end2:
    ; Same as .end, but for the second destination position and with the
    ; idct 8x16 .end1 label as the final continuation.
3287    LOAD_8ROWS    rsp+gprsize+16*3, 16
3288    mova    [rsp+gprsize+16*0], m7
3289    lea                   tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
3290    lea                   dstq, [dstq+strideq*2]
3291    jmp  m(iflipadst_8x8_internal_8bpc).end
3292
3293
; IDTX16B src/dst, tmp, pw_1697x16
; Per 16-bit lane: m%2 = pmulhrsw(m%3, m%1) >> 1 (arithmetic shift), then
; m%1 = pavgw(m%1, m%2), i.e. the rounded average of the source and the
; halved scaled product.
; m%2 is clobbered. tmp may name the same register as the constant (see the
; "IDTX16B 7, 6, 6" use in this file), in which case the constant is destroyed.
3294%macro IDTX16B 3 ; src/dst, tmp, pw_1697x16
3295    pmulhrsw            m%2, m%3, m%1
3296    psraw               m%2, 1
3297    pavgw               m%1, m%2
3298%endmacro
3299
; 16x16 entry points whose first type is identity. The expansions come from
; INV_TXFM_16X16_FN / INV_TXFM_FN; presumably each of them runs the internal
; function below as its first pass - confirm in INV_TXFM_FN.
3300INV_TXFM_16X16_FN identity, dct
3301INV_TXFM_16X16_FN identity, identity
3302
; iidentity_16x16_internal_8bpc
; Internal 16x16 identity transform. Pass 1 runs the .pass1 body four times,
; once per group of eight vectors, with coeffq stepping through the offsets
; +16*17, +16*16, +16*1 and +16*0 relative to its value on entry. Pass 2 runs
; the .end body four times. Stages are chained through tx2q; the tx2q received
; on entry is parked in r3 during pass 1.
; NOTE(review): the idct_8x8 labels jumped to are outside this chunk and are
; expected to jump back through tx2q - confirm.
3303cglobal iidentity_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
3304    add                 coeffq, 16*17
3305    mov                     r3, tx2q
3306    lea                   tx2q, [o(m(iidentity_16x16_internal_8bpc).pass1_end)]
3307
3308.pass1:
    ; Apply IDTX16B to the eight vectors at coeffq+32*0 .. coeffq+32*7.
    ; m6 holds pw_1697x16 and m5 serves as scratch for the first six, so the
    ; vector for m5 is loaded afterwards. The result for vector 6 is parked in
    ; stack slot 1, which frees m7 as scratch and then as the register for
    ; vector 7. The last IDTX16B uses m6 as scratch and thereby consumes the
    ; constant.
3309    mova                    m6, [o(pw_1697x16)]
3310    mova                    m7, [coeffq+32*6]
3311    mova                    m0, [coeffq+32*0]
3312    mova                    m1, [coeffq+32*1]
3313    mova                    m2, [coeffq+32*2]
3314    mova                    m3, [coeffq+32*3]
3315    mova                    m4, [coeffq+32*4]
3316    REPX     {IDTX16B x, 5, 6}, 7, 0, 1, 2, 3, 4
3317    mova                    m5, [coeffq+32*5]
3318    mova    [rsp+gprsize+16*1], m7
3319    IDTX16B                  5, 7, 6
3320    mova                    m7, [coeffq+32*7]
3321    IDTX16B                  7, 6, 6
3322    jmp   m(idct_8x8_internal_8bpc).pass1_end3
3323
3324.pass1_end:
    ; Store the first group back in place, step coeffq down by 16 (to +16*16)
    ; and run .pass1 on the next group.
3325    SAVE_8ROWS          coeffq, 32
3326    sub                 coeffq, 16
3327    lea                   tx2q, [o(m(iidentity_16x16_internal_8bpc).pass1_end1)]
3328    jmp .pass1
3329
3330.pass1_end1:
    ; Store the second group, step coeffq down by 15*16 (to +16*1).
3331    SAVE_8ROWS          coeffq, 32
3332    sub                 coeffq, 15*16
3333    lea                   tx2q, [o(m(iidentity_16x16_internal_8bpc).pass1_end2)]
3334    jmp .pass1
3335
3336.pass1_end2:
    ; Store the third group, step coeffq down by 16 (back to its entry value)
    ; and run .pass1 a last time with the tx2q saved in r3 as continuation,
    ; i.e. control leaves pass 1 from here.
3337    SAVE_8ROWS          coeffq, 32
3338    sub                 coeffq, 16
3339    mov                   tx2q, r3
3340    jmp .pass1
3341
3342.pass2:
    ; r3 remembers dstq+8; it is copied into dstq in .end2.
3343    lea                     r3, [dstq+8]
3344    lea                   tx2q, [o(m(iidentity_16x16_internal_8bpc).end1)]
3345
3346.end:
    ; Pass-2 body, entered with eight rows in m0-m7.
    ; Each row goes through IDTX16 (defined outside this chunk) with
    ; m7 = pw_1697x16 and is then multiplied by pw_2048 via pmulhrsw.
    ; m7 and m4 are spilled first so they can serve as constant and scratch;
    ; the stack slots 0-2 written here are presumably what the idct_8x8 .end3
    ; code expects in addition to the registers - confirm there.
3347    mova    [rsp+gprsize+16*0], m7
3348    mova    [rsp+gprsize+16*1], m4
3349    mova                    m7, [o(pw_1697x16)]
3350    REPX      {IDTX16 x, 4, 7}, 5, 6, 0, 1, 2, 3
3351    mova                    m4, [o(pw_2048)]
3352    pmulhrsw                m5, m4
3353    pmulhrsw                m6, m4
3354    mova    [rsp+gprsize+16*2], m5
3355    mova                    m5, [rsp+gprsize+16*1]
3356    mova    [rsp+gprsize+16*1], m6
3357    IDTX16                   5, 6, 7
3358    mova                    m6, [rsp+gprsize+16*0]
3359    IDTX16                   6, 7, 7
3360    REPX      {pmulhrsw x, m4}, m0, m1, m2, m3, m6
3361    pmulhrsw                m4, m5
3362    mova    [rsp+gprsize+16*0], m6
3363    jmp   m(idct_8x8_internal_8bpc).end3
3364
3365.end1:
    ; Second run of .end: rows from coeffq+16*1 (step 32), dstq advanced by
    ; two strides.
3366    LOAD_8ROWS     coeffq+16*1, 32
3367    lea                   tx2q, [o(m(iidentity_16x16_internal_8bpc).end2)]
3368    lea                   dstq, [dstq+strideq*2]
3369    jmp .end
3370
3371.end2:
    ; Zero the sixteen 16-byte vectors at coeffq+16*0 .. coeffq+16*15.
3372    pxor                    m7, m7
3373    REPX   {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15
3374
    ; Third run of .end: data 32*8 bytes further into the buffer, written to
    ; the destination position kept in r3 (dstq+8 from .pass2).
3375    add                 coeffq, 32*8
3376    LOAD_8ROWS          coeffq, 32
3377    lea                   tx2q, [o(m(iidentity_16x16_internal_8bpc).end3)]
3378    mov                   dstq, r3
3379    jmp .end
3380
3381.end3:
    ; Fourth and last run of .end, with the idct 8x16 .end1 label as the final
    ; continuation.
3382    LOAD_8ROWS     coeffq+16*1, 32
3383    lea                   tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
3384    lea                   dstq, [dstq+strideq*2]
3385    jmp .end
3386
3387
; inv_txfm_add_dct_dct_8x32_8bpc
; Entry point for the 8x32 dct_dct inverse transform.
; If eob is zero it takes the DC-only path below, otherwise it calls
; idct_8x32_internal_8bpc and returns.
; The cglobal parameters (4, 6, 8, 16*36) are interpreted by x86inc.asm, which
; is included at the top of the file.
3388cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
    ; 32-bit builds load the address of $$ into r5.
    ; NOTE(review): presumably the base that the o() macro addresses constants
    ; from - confirm against its definition.
3389%if ARCH_X86_32
3390    LEA                     r5, $$
3391%endif
3392    test                  eobd, eobd
3393    jz .dconly
3394    call  m(idct_8x32_internal_8bpc)
3395    RET
3396
3397.dconly:
    ; m0 = DC coefficient run through four pmulhrsw steps: pw_2896x8, pw_8192,
    ; pw_2896x8 again and pw_2048 (obtained by shifting pw_8192 right by 2).
    ; eobd is zero on this path, so the store to [coeffq] clears the first
    ; coefficient dword after it has been read.
3398    movd                 m1, [o(pw_2896x8)]
3399    pmulhrsw             m0, m1, [coeffq]
3400    movd                 m2, [o(pw_8192)]
3401    mov            [coeffq], eobd
3402    pmulhrsw             m0, m2
3403    psrlw                m2, 2            ;pw_2048
3404    pmulhrsw             m0, m1
3405    pmulhrsw             m0, m2
    ; Broadcast the low word of m0 to all eight word lanes.
3406    pshuflw              m0, m0, q0000
3407    punpcklwd            m0, m0
    ; Hand over to the loop of the 8x8 dct_dct function with r3d = 8 and
    ; tx2q = address of .end; presumably an iteration count and the
    ; continuation to jump back to - confirm against that loop.
3408    mov                 r3d, 8
3409    lea                tx2q, [o(m(inv_txfm_add_dct_dct_8x32_8bpc).end)]
3410    jmp m(inv_txfm_add_dct_dct_8x8_8bpc).loop
3411
3412.end:
3413    RET
3414
3415
3416
3417cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
3418    %undef cmp
3419    cmp                   eobd, 106
3420    jle .fast
3421
3422    LOAD_8ROWS     coeffq+16*3, 64
3423    call  m(idct_8x8_internal_8bpc).main
3424    mova                    m7, [o(pw_8192)]
3425    lea                   tx2q, [o(m(idct_8x32_internal_8bpc).pass1)]
3426    jmp   m(idct_8x8_internal_8bpc).pass1_end1
3427
3428.pass1:
3429    mova   [rsp+gprsize+16*9 ], m0                        ;in24
3430    mova   [rsp+gprsize+16*10], m4                        ;in28
3431    mova   [rsp+gprsize+16*17], m2                        ;in26
3432    mova   [rsp+gprsize+16*18], m6                        ;in30
3433    mova   [rsp+gprsize+16*31], m1                        ;in25
3434    mova   [rsp+gprsize+16*30], m3                        ;in27
3435    mova   [rsp+gprsize+16*27], m5                        ;in29
3436    mova   [rsp+gprsize+16*34], m7                        ;in31
3437    LOAD_8ROWS     coeffq+16*2, 64
3438    call  m(idct_8x8_internal_8bpc).main
3439    mova                    m7, [o(pw_8192)]
3440    lea                   tx2q, [o(m(idct_8x32_internal_8bpc).pass1_1)]
3441    jmp   m(idct_8x8_internal_8bpc).pass1_end1
3442
3443.pass1_1:
3444    mova   [rsp+gprsize+16*7 ], m0                        ;in16
3445    mova   [rsp+gprsize+16*8 ], m4                        ;in20
3446    mova   [rsp+gprsize+16*15], m2                        ;in18
3447    mova   [rsp+gprsize+16*16], m6                        ;in22
3448    mova   [rsp+gprsize+16*33], m1                        ;in17
3449    mova   [rsp+gprsize+16*28], m3                        ;in19
3450    mova   [rsp+gprsize+16*29], m5                        ;in21
3451    mova   [rsp+gprsize+16*32], m7                        ;in23
3452
3453.fast:
3454    LOAD_8ROWS     coeffq+16*1, 64
3455    call  m(idct_8x8_internal_8bpc).main
3456    mova                    m7, [o(pw_8192)]
3457    lea                   tx2q, [o(m(idct_8x32_internal_8bpc).pass1_end)]
3458    jmp   m(idct_8x8_internal_8bpc).pass1_end1
3459
3460.pass1_end:
3461    mova   [rsp+gprsize+16*5 ], m0                        ;in8
3462    mova   [rsp+gprsize+16*6 ], m4                        ;in12
3463    mova   [rsp+gprsize+16*13], m2                        ;in10
3464    mova   [rsp+gprsize+16*14], m6                        ;in14
3465    mova   [rsp+gprsize+16*21], m1                        ;in9
3466    mova   [rsp+gprsize+16*24], m3                        ;in11
3467    mova   [rsp+gprsize+16*25], m5                        ;in13
3468    mova   [rsp+gprsize+16*20], m7                        ;in15
3469    LOAD_8ROWS     coeffq+16*0, 64
3470    call  m(idct_8x8_internal_8bpc).main
3471    mova                    m7, [o(pw_8192)]
3472    lea                   tx2q, [o(m(idct_8x32_internal_8bpc).pass1_end1)]
3473    jmp   m(idct_8x8_internal_8bpc).pass1_end1
3474
3475.pass1_end1:
3476    mova   [rsp+gprsize+16*11], m2                        ;in2
3477    mova   [rsp+gprsize+16*12], m6                        ;in6
3478    mova   [rsp+gprsize+16*19], m1                        ;in1
3479    mova   [rsp+gprsize+16*26], m3                        ;in3
3480    mova   [rsp+gprsize+16*23], m5                        ;in5
3481    mova   [rsp+gprsize+16*22], m7                        ;in7
3482    mova                    m1, m4                        ;in4
3483    mova                    m2, [rsp+gprsize+16*5 ]       ;in8
3484    mova                    m3, [rsp+gprsize+16*6 ]       ;in12
3485
3486    cmp                   eobd, 106
3487    jg .full
3488
3489    pxor                    m4, m4
3490    REPX          {mova x, m4}, m5, m6, m7
3491    call  m(idct_8x8_internal_8bpc).main
3492    SAVE_7ROWS   rsp+gprsize+16*3 , 16
3493    mova                    m0, [rsp+gprsize+16*11]
3494    mova                    m1, [rsp+gprsize+16*12]
3495    mova                    m2, [rsp+gprsize+16*13]
3496    mova                    m3, [rsp+gprsize+16*14]
3497    pxor                    m4, m4
3498    REPX          {mova x, m4}, m5, m6, m7
3499    call m(idct_16x8_internal_8bpc).main
3500    mova                    m7, [rsp+gprsize+16*0]
3501    SAVE_8ROWS   rsp+gprsize+16*11, 16
3502
3503    call .main_fast
3504    jmp  .pass2
3505
3506.full:
3507    mova                    m4, [rsp+gprsize+16*7 ]       ;in16
3508    mova                    m5, [rsp+gprsize+16*8 ]       ;in20
3509    mova                    m6, [rsp+gprsize+16*9 ]       ;in24
3510    mova                    m7, [rsp+gprsize+16*10]       ;in28
3511    call  m(idct_8x8_internal_8bpc).main
3512    SAVE_7ROWS   rsp+gprsize+16*3 , 16
3513    LOAD_8ROWS   rsp+gprsize+16*11, 16
3514    call m(idct_16x8_internal_8bpc).main
3515    mova                    m7, [rsp+gprsize+16*0]
3516    SAVE_8ROWS   rsp+gprsize+16*11, 16
3517    call .main
3518
3519.pass2:
3520    lea                     r3, [o(m(idct_8x32_internal_8bpc).end6)]
3521
3522.end:
3523    mova   [rsp+gprsize+16*0 ], m7
3524    lea                   tx2q, [o(m(idct_8x32_internal_8bpc).end2)]
3525
3526.end1:
3527    pxor                    m7, m7
3528    REPX   {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  \
3529                                     8,  9,  10, 11, 12, 13, 14, 15, \
3530                                     16, 17, 18, 19, 20, 21, 22, 23, \
3531                                     24, 25, 26, 27, 28, 29, 30, 31
3532
3533    jmp                   tx2q
3534
3535.end2:
3536    lea                   tx2q, [o(m(idct_8x32_internal_8bpc).end3)]
3537    jmp   m(idct_8x8_internal_8bpc).end
3538
3539.end3:
3540    LOAD_8ROWS   rsp+gprsize+16*11, 16
3541    mova   [rsp+gprsize+16*0 ], m7
3542    lea                   dstq, [dstq+strideq*2]
3543    lea                   tx2q, [o(m(idct_8x32_internal_8bpc).end4)]
3544    jmp   m(idct_8x8_internal_8bpc).end
3545
3546.end4:
3547    LOAD_8ROWS   rsp+gprsize+16*19, 16
3548    mova   [rsp+gprsize+16*0 ], m7
3549    lea                   dstq, [dstq+strideq*2]
3550    lea                   tx2q, [o(m(idct_8x32_internal_8bpc).end5)]
3551    jmp   m(idct_8x8_internal_8bpc).end
3552
3553.end5:
3554    LOAD_8ROWS   rsp+gprsize+16*27, 16
3555    mova   [rsp+gprsize+16*0 ], m7
3556    lea                   dstq, [dstq+strideq*2]
3557    mov                   tx2q, r3
3558    jmp   m(idct_8x8_internal_8bpc).end
3559
3560.end6:
3561    ret
3562
3563ALIGN function_align
3564.main_veryfast:
3565    mova                    m0, [rsp+gprsize*2+16*19]     ;in1
3566    pmulhrsw                m3, m0, [o(pw_4091x8)]        ;t30,t31
3567    pmulhrsw                m0, [o(pw_201x8)]             ;t16,t17
3568    mova                    m7, [o(pd_2048)]
3569    mova [rsp+gprsize*2+16*19], m0                        ;t16
3570    mova [rsp+gprsize*2+16*34], m3                        ;t31
3571    ITX_MULSUB_2W            3, 0, 1, 2, 7,  799, 4017    ;t17a, t30a
3572    mova [rsp+gprsize*2+16*20], m3                        ;t17a
3573    mova [rsp+gprsize*2+16*33], m0                        ;t30a
3574    mova                    m1, [rsp+gprsize*2+16*22]     ;in7
3575    pmulhrsw                m2, m1, [o(pw_3857x8)]        ;t28,t29
3576    pmulhrsw                m1, [o(pw_m1380x8)]           ;t18,t19
3577    mova [rsp+gprsize*2+16*22], m1                        ;t19
3578    mova [rsp+gprsize*2+16*31], m2                        ;t28
3579    ITX_MULSUB_2W            2, 1, 0, 3, 7, m4017, 799    ;t18a, t29a
3580    mova [rsp+gprsize*2+16*21], m2                        ;t18a
3581    mova [rsp+gprsize*2+16*32], m1                        ;t29a
3582    mova                    m0, [rsp+gprsize*2+16*23]     ;in5
3583    pmulhrsw                m3, m0, [o(pw_3973x8)]        ;t26, t27
3584    pmulhrsw                m0, [o(pw_995x8)]             ;t20, t21
3585    mova [rsp+gprsize*2+16*23], m0                        ;t20
3586    mova [rsp+gprsize*2+16*30], m3                        ;t27
3587    ITX_MULSUB_2W            3, 0, 1, 2, 7, 3406, 2276    ;t21a, t26a
3588    mova [rsp+gprsize*2+16*24], m3                        ;t21a
3589    mova [rsp+gprsize*2+16*29], m0                        ;t26a
3590    mova                    m2, [rsp+gprsize*2+16*26]     ;in3
3591    pxor                    m0, m0
3592    mova                    m3, m0
3593    pmulhrsw                m1, m2, [o(pw_4052x8)]
3594    pmulhrsw                m2, [o(pw_m601x8)]
3595    jmp .main2
3596
3597ALIGN function_align
3598.main_fast: ;bottom half is zero
3599    mova                    m0, [rsp+gprsize*2+16*19]     ;in1
3600    mova                    m1, [rsp+gprsize*2+16*20]     ;in15
3601    pmulhrsw                m3, m0, [o(pw_4091x8)]        ;t31a
3602    pmulhrsw                m0, [o(pw_201x8)]             ;t16a
3603    pmulhrsw                m2, m1, [o(pw_3035x8)]        ;t30a
3604    pmulhrsw                m1, [o(pw_m2751x8)]           ;t17a
3605    mova                    m7, [o(pd_2048)]
3606    psubsw                  m4, m0, m1                    ;t17
3607    paddsw                  m0, m1                        ;t16
3608    psubsw                  m5, m3, m2                    ;t30
3609    paddsw                  m3, m2                        ;t31
3610    ITX_MULSUB_2W            5, 4, 1, 2, 7,  799, 4017    ;t17a, t30a
3611    mova [rsp+gprsize*2+16*19], m0                        ;t16
3612    mova [rsp+gprsize*2+16*20], m5                        ;t17a
3613    mova [rsp+gprsize*2+16*33], m4                        ;t30a
3614    mova [rsp+gprsize*2+16*34], m3                        ;t31
3615    mova                    m0, [rsp+gprsize*2+16*21]     ;in9
3616    mova                    m1, [rsp+gprsize*2+16*22]     ;in7
3617    pmulhrsw                m3, m0, [o(pw_3703x8)]
3618    pmulhrsw                m0, [o(pw_1751x8)]
3619    pmulhrsw                m2, m1, [o(pw_3857x8)]
3620    pmulhrsw                m1, [o(pw_m1380x8)]
3621    psubsw                  m4, m1, m0                    ;t18
3622    paddsw                  m0, m1                        ;t19
3623    psubsw                  m5, m2, m3                    ;t29
3624    paddsw                  m3, m2                        ;t28
3625    ITX_MULSUB_2W            5, 4, 1, 2, 7, m4017, 799    ;t18a, t29a
3626    mova [rsp+gprsize*2+16*21], m5                        ;t18a
3627    mova [rsp+gprsize*2+16*22], m0                        ;t19
3628    mova [rsp+gprsize*2+16*31], m3                        ;t28
3629    mova [rsp+gprsize*2+16*32], m4                        ;t29a
3630    mova                    m0, [rsp+gprsize*2+16*23]     ;in5
3631    mova                    m1, [rsp+gprsize*2+16*24]     ;in11
3632    pmulhrsw                m3, m0, [o(pw_3973x8)]
3633    pmulhrsw                m0, [o(pw_995x8)]
3634    pmulhrsw                m2, m1, [o(pw_3513x8)]
3635    pmulhrsw                m1, [o(pw_m2106x8)]
3636    psubsw                  m4, m0, m1                    ;t21
3637    paddsw                  m0, m1                        ;t20
3638    psubsw                  m5, m3, m2                    ;t26
3639    paddsw                  m3, m2                        ;t27
3640    ITX_MULSUB_2W            5, 4, 1, 2, 7, 3406, 2276    ;t21a, t26a
3641    mova [rsp+gprsize*2+16*23], m0                        ;t20
3642    mova [rsp+gprsize*2+16*24], m5                        ;t21a
3643    mova [rsp+gprsize*2+16*29], m4                        ;t26a
3644    mova [rsp+gprsize*2+16*30], m3                        ;t27
3645    mova                    m0, [rsp+gprsize*2+16*25]     ;in13
3646    mova                    m2, [rsp+gprsize*2+16*26]     ;in3
3647    pmulhrsw                m3, m0, [o(pw_3290x8)]
3648    pmulhrsw                m0, [o(pw_2440x8)]
3649    pmulhrsw                m1, m2, [o(pw_4052x8)]
3650    pmulhrsw                m2, [o(pw_m601x8)]
3651    jmp .main2
3652
3653ALIGN function_align
3654.main:
3655    mova                    m7, [o(pd_2048)]
3656    mova                    m0, [rsp+gprsize*2+16*19]     ;in1
3657    mova                    m1, [rsp+gprsize*2+16*20]     ;in15
3658    mova                    m2, [rsp+gprsize*2+16*33]     ;in17
3659    mova                    m3, [rsp+gprsize*2+16*34]     ;in31
3660    ITX_MULSUB_2W            0, 3, 4, 5, 7,  201, 4091    ;t16a, t31a
3661    ITX_MULSUB_2W            2, 1, 4, 5, 7, 3035, 2751    ;t17a, t30a
3662    psubsw                  m4, m0, m2                    ;t17
3663    paddsw                  m0, m2                        ;t16
3664    psubsw                  m5, m3, m1                    ;t30
3665    paddsw                  m3, m1                        ;t31
3666    ITX_MULSUB_2W            5, 4, 1, 2, 7,  799, 4017    ;t17a, t30a
3667    mova [rsp+gprsize*2+16*19], m0                        ;t16
3668    mova [rsp+gprsize*2+16*20], m5                        ;t17a
3669    mova [rsp+gprsize*2+16*33], m4                        ;t30a
3670    mova [rsp+gprsize*2+16*34], m3                        ;t31
3671    mova                    m0, [rsp+gprsize*2+16*21]     ;in9
3672    mova                    m1, [rsp+gprsize*2+16*22]     ;in7
3673    mova                    m2, [rsp+gprsize*2+16*31]     ;in25
3674    mova                    m3, [rsp+gprsize*2+16*32]     ;in23
3675    ITX_MULSUB_2W            0, 3, 4, 5, 7, 1751, 3703    ;t18a, t29a
3676    ITX_MULSUB_2W            2, 1, 4, 5, 7, 3857, 1380    ;t19a, t28a
3677    psubsw                  m4, m2, m0                    ;t18
3678    paddsw                  m0, m2                        ;t19
3679    psubsw                  m5, m1, m3                    ;t29
3680    paddsw                  m3, m1                        ;t28
3681    ITX_MULSUB_2W            5, 4, 1, 2, 7, m4017, 799    ;t18a, t29a
3682    mova [rsp+gprsize*2+16*21], m5                        ;t18a
3683    mova [rsp+gprsize*2+16*22], m0                        ;t19
3684    mova [rsp+gprsize*2+16*31], m3                        ;t28
3685    mova [rsp+gprsize*2+16*32], m4                        ;t29a
3686    mova                    m0, [rsp+gprsize*2+16*23]     ;in5
3687    mova                    m1, [rsp+gprsize*2+16*24]     ;in11
3688    mova                    m2, [rsp+gprsize*2+16*29]     ;in21
3689    mova                    m3, [rsp+gprsize*2+16*30]     ;in27
3690    ITX_MULSUB_2W            0, 3, 4, 5, 7,  995, 3973    ;t20a, t27a
3691    ITX_MULSUB_2W            2, 1, 4, 5, 7, 3513, 2106    ;t21a, t26a
3692    psubsw                  m4, m0, m2                    ;t21
3693    paddsw                  m0, m2                        ;t20
3694    psubsw                  m5, m3, m1                    ;t26
3695    paddsw                  m3, m1                        ;t27
3696    ITX_MULSUB_2W            5, 4, 1, 2, 7, 3406, 2276    ;t21a, t26a
3697    mova [rsp+gprsize*2+16*23], m0                        ;t20
3698    mova [rsp+gprsize*2+16*24], m5                        ;t21a
3699    mova [rsp+gprsize*2+16*29], m4                        ;t26a
3700    mova [rsp+gprsize*2+16*30], m3                        ;t27
3701    mova                    m0, [rsp+gprsize*2+16*25]     ;in13
3702    mova                    m1, [rsp+gprsize*2+16*26]     ;in3
3703    mova                    m2, [rsp+gprsize*2+16*27]     ;in29
3704    mova                    m3, [rsp+gprsize*2+16*28]     ;in19
3705    ITX_MULSUB_2W            0, 3, 4, 5, 7, 2440, 3290    ;t22a, t25a
3706    ITX_MULSUB_2W            2, 1, 4, 5, 7, 4052,  601    ;t23a, t24a
3707
3708.main2:
3709    psubsw                  m4, m2, m0                    ;t22
3710    paddsw                  m0, m2                        ;t23
3711    psubsw                  m5, m1, m3                    ;t25
3712    paddsw                  m3, m1                        ;t24
3713    ITX_MULSUB_2W            5, 4, 1, 2, 7, m2276, 3406   ;t22a, t25a
3714    mova                    m2, [rsp+gprsize*2+16*24]     ;t21a
3715    psubsw                  m1, m5, m2                    ;t21
3716    paddsw                  m5, m2                        ;t22
3717    mova [rsp+gprsize*2+16*25], m5                        ;t22
3718    mova                    m2, [rsp+gprsize*2+16*29]     ;t26a
3719    psubsw                  m5, m4, m2                    ;t26
3720    paddsw                  m4, m2                        ;t25
3721    mova [rsp+gprsize*2+16*28], m4                        ;t25
3722    ITX_MULSUB_2W            5, 1, 2, 4, 7, m3784, 1567   ;t21a, t26a
3723    mova [rsp+gprsize*2+16*24], m5                        ;t21a
3724    mova [rsp+gprsize*2+16*29], m1                        ;t26a
3725
3726    mova                    m1, [rsp+gprsize*2+16*23]     ;t20
3727    mova                    m5, [rsp+gprsize*2+16*30]     ;t27
3728    psubsw                  m2, m0, m1                    ;t20a
3729    paddsw                  m0, m1                        ;t23a
3730    psubsw                  m6, m3, m5                    ;t27a
3731    paddsw                  m3, m5                        ;t24a
3732    ITX_MULSUB_2W            6, 2, 1, 5, 7, m3784, 1567   ;t20, t27
3733    mova [rsp+gprsize*2+16*26], m0                        ;t23a
3734    mova [rsp+gprsize*2+16*27], m3                        ;t24a
3735    mova [rsp+gprsize*2+16*30], m2                        ;t27
3736
3737    mova                    m0, [rsp+gprsize*2+16*20]     ;t17a
3738    mova                    m1, [rsp+gprsize*2+16*21]     ;t18a
3739    mova                    m2, [rsp+gprsize*2+16*32]     ;t29a
3740    mova                    m3, [rsp+gprsize*2+16*33]     ;t30a
3741    psubsw                  m4, m0, m1                    ;t18
3742    paddsw                  m0, m1                        ;t17
3743    psubsw                  m5, m3, m2                    ;t29
3744    paddsw                  m3, m2                        ;t30
3745    ITX_MULSUB_2W            5, 4, 1, 2, 7, 1567, 3784    ;t18a, t29a
3746    mova [rsp+gprsize*2+16*20], m0                        ;t17
3747    mova [rsp+gprsize*2+16*21], m5                        ;t18a
3748    mova [rsp+gprsize*2+16*32], m4                        ;t29a
3749    mova [rsp+gprsize*2+16*33], m3                        ;t30
3750    mova                    m0, [rsp+gprsize*2+16*19]     ;t16
3751    mova                    m1, [rsp+gprsize*2+16*22]     ;t19
3752    mova                    m2, [rsp+gprsize*2+16*31]     ;t28
3753    mova                    m3, [rsp+gprsize*2+16*34]     ;t31
3754    psubsw                  m4, m0, m1                    ;t19a
3755    paddsw                  m0, m1                        ;t16a
3756    psubsw                  m5, m3, m2                    ;t28a
3757    paddsw                  m3, m2                        ;t31a
3758    ITX_MULSUB_2W            5, 4, 1, 2, 7, 1567, 3784    ;t19, t28
3759    mova                    m2, [rsp+gprsize*2+16*15]     ;tmp12
3760    psubsw                  m1, m5, m6                    ;t20a
3761    paddsw                  m5, m6                        ;t19a
3762    psubsw                  m6, m2, m5                    ;out19
3763    paddsw                  m2, m5                        ;out12
3764    mova                    m5, [rsp+gprsize*2+16*30]     ;t27
3765    mova [rsp+gprsize*2+16*22], m6                        ;out19
3766    mova [rsp+gprsize*2+16*15], m2                        ;out12
3767    psubsw                  m6, m4, m5                    ;t27a
3768    paddsw                  m4, m5                        ;t28a
3769    ITX_MULSUB_2W            6, 1, 2, 5, 7, 2896, 2896    ;t20, t27
3770    mova                    m2, [rsp+gprsize*2+16*6 ]     ;tmp3
3771    psubsw                  m5, m2, m4                    ;out28
3772    paddsw                  m2, m4                        ;out3
3773    mova                    m4, [rsp+gprsize*2+16*14]     ;tmp11
3774    mova [rsp+gprsize*2+16*31], m5                        ;out28
3775    mova [rsp+gprsize*2+16*6 ], m2                        ;out3
3776    psubsw                  m5, m4, m6                    ;out20
3777    paddsw                  m4, m6                        ;out11
3778    mova                    m2, [rsp+gprsize*2+16*7 ]     ;tmp4
3779    mova [rsp+gprsize*2+16*23], m5                        ;out20
3780    mova [rsp+gprsize*2+16*14], m4                        ;out11
3781    psubsw                  m5, m2, m1                    ;out27
3782    paddsw                  m2, m1                        ;out4
3783    mova                    m1, [rsp+gprsize*2+16*26]     ;t23a
3784    mova                    m4, [rsp+gprsize*2+16*27]     ;t24a
3785    mova [rsp+gprsize*2+16*30], m5                        ;out27
3786    mova [rsp+gprsize*2+16*7 ], m2                        ;out4
3787    psubsw                  m5, m0, m1                    ;t23
3788    paddsw                  m0, m1                        ;t16
3789    psubsw                  m2, m3, m4                    ;t24
3790    paddsw                  m3, m4                        ;t31
3791    ITX_MULSUB_2W            2, 5, 4, 6, 7, 2896, 2896    ;t23a, t24a
3792    mova                    m6, [rsp+gprsize*2+16*18]     ;tmp15
3793    psubsw                  m4, m6, m0                    ;out16
3794    paddsw                  m6, m0                        ;out15
3795    mova                    m0, [rsp+gprsize*2+16*3 ]     ;tmp0
3796    mova                    m1, [rsp+gprsize*2+16*11]     ;tmp8
3797    mova [rsp+gprsize*2+16*18], m6                        ;out15
3798    mova [rsp+gprsize*2+16*19], m4                        ;out16
3799    psubsw                  m6, m0, m3                    ;out31
3800    paddsw                  m0, m3                        ;out0
3801    psubsw                  m4, m1, m2                    ;out23
3802    paddsw                  m1, m2                        ;out8
3803    mova                    m3, [rsp+gprsize*2+16*10]     ;tmp7
3804    mova [rsp+gprsize*2+16*34], m6                        ;out31
3805    mova [rsp+gprsize*2+16*11], m1                        ;out8
3806    mova [rsp+gprsize*2+16*26], m4                        ;out23
3807    paddsw                  m6, m3, m5                    ;out7
3808    psubsw                  m3, m5                        ;out24
3809    mova                    m1, [rsp+gprsize*2+16*20]     ;t17
3810    mova                    m5, [rsp+gprsize*2+16*25]     ;t22
3811    mova                    m2, [rsp+gprsize*2+16*17]     ;tmp14
3812    mova [rsp+gprsize*2+16*27], m3                        ;out24
3813    psubsw                  m4, m1, m5                    ;t22a
3814    paddsw                  m1, m5                        ;t17a
3815    psubsw                  m3, m2, m1                    ;out17
3816    paddsw                  m2, m1                        ;out14
3817    mova                    m5, [rsp+gprsize*2+16*28]     ;t25
3818    mova                    m1, [rsp+gprsize*2+16*33]     ;t30
3819    mova [rsp+gprsize*2+16*17], m2                        ;out14
3820    mova [rsp+gprsize*2+16*20], m3                        ;out17
3821    psubsw                  m2, m1, m5                    ;t25a
3822    paddsw                  m1, m5                        ;t30a
3823    ITX_MULSUB_2W            2, 4, 3, 5, 7, 2896, 2896    ;t22, t25
3824    mova                    m5, [rsp+gprsize*2+16*4 ]     ;tmp1
3825    psubsw                  m3, m5, m1                    ;out30
3826    paddsw                  m5, m1                        ;out1
3827    mova                    m1, [rsp+gprsize*2+16*12]     ;tmp9
3828    mova [rsp+gprsize*2+16*33], m3                        ;out30
3829    mova [rsp+gprsize*2+16*4 ], m5                        ;out1
3830    psubsw                  m3, m1, m2                    ;out22
3831    paddsw                  m1, m2                        ;out9
3832    mova                    m5, [rsp+gprsize*2+16*9 ]     ;tmp6
3833    mova [rsp+gprsize*2+16*25], m3                        ;out22
3834    mova [rsp+gprsize*2+16*12], m1                        ;out9
3835    psubsw                  m3, m5, m4                    ;out25
3836    paddsw                  m5, m4                        ;out6
3837    mova                    m4, [rsp+gprsize*2+16*21]     ;t18a
3838    mova                    m1, [rsp+gprsize*2+16*24]     ;t21a
3839    mova                    m2, [rsp+gprsize*2+16*16]     ;tmp13
3840    mova [rsp+gprsize*2+16*28], m3                        ;out25
3841    mova [rsp+gprsize*2+16*9 ], m5                        ;out6
3842    paddsw                  m3, m4, m1                    ;t18
3843    psubsw                  m4, m1                        ;t21
3844    psubsw                  m5, m2, m3                    ;out18
3845    paddsw                  m2, m3                        ;out13
3846    mova                    m1, [rsp+gprsize*2+16*29]     ;t26a
3847    mova                    m3, [rsp+gprsize*2+16*32]     ;t29a
3848    mova [rsp+gprsize*2+16*21], m5                        ;out18
3849    mova [rsp+gprsize*2+16*16], m2                        ;out13
3850    psubsw                  m5, m3, m1                    ;t26
3851    paddsw                  m3, m1                        ;t29
3852    ITX_MULSUB_2W            5, 4, 1, 2, 7, 2896, 2896    ;t21a, t26a
3853    mova                    m2, [rsp+gprsize*2+16*5 ]     ;tmp2
3854    psubsw                  m1, m2, m3                    ;out29
3855    paddsw                  m2, m3                        ;out2
3856    mova                    m3, [rsp+gprsize*2+16*13]     ;tmp10
3857    mova [rsp+gprsize*2+16*32], m1                        ;out29
3858    psubsw                  m7, m3, m5                    ;out21
3859    paddsw                  m3, m5                        ;out10
3860    mova                    m5, [rsp+gprsize*2+16*8 ]     ;tmp5
3861    mova [rsp+gprsize*2+16*24], m7                        ;out21
3862    mova [rsp+gprsize*2+16*13], m3                        ;out10
3863    psubsw                  m1, m5, m4                    ;out26
3864    paddsw                  m5, m4                        ;out5
3865    mova                    m7, m6                        ;out7
3866    mova                    m3, [rsp+gprsize*2+16*6 ]     ;out3
3867    mova                    m4, [rsp+gprsize*2+16*7 ]     ;out4
3868    mova [rsp+gprsize*2+16*29], m1                        ;out26
3869    mova                    m6, [rsp+gprsize*2+16*9 ]     ;out6
3870    mova                    m1, [rsp+gprsize*2+16*4 ]     ;out1
3871    ret
3872
3873
; ---------------------------------------------------------------------------
; inv_txfm_add_dct_dct_32x8_8bpc(dst, stride, coeff, eob)
;
; Public entry point (tx2 is only used as a scratch continuation pointer).
;   eob != 0 : all work is delegated to idct_32x8_internal_8bpc.
;   eob == 0 : .dconly turns the first coefficient into one 16-bit value and
;              adds it to 8 rows of 32 destination bytes.
;
; .body is a shared tail that is also entered from other functions in this
; file (e.g. the 32x16 DC path). Register contract on entry to .body:
;   m0   = DC value, already multiplied once by m1
;   m1   = pw_2896x8
;   m2   = multiplier applied first in .body
;   r3d  = number of rows to process
;   tx2q = address to jump to after the last row
; ---------------------------------------------------------------------------
3874cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
3875%if ARCH_X86_32
; x86-32 only: r5 = start of the current section ($$); presumably the base
; that the o() macro addresses constants from -- confirm against its definition.
3876    LEA                     r5, $$
3877%endif
3878    test                  eobd, eobd
3879    jz .dconly
3880    call  m(idct_32x8_internal_8bpc)
3881    RET
3882
; DC-only path: eob is known to be zero here.
3883.dconly:
3884    movd                    m1, [o(pw_2896x8)]
3885    pmulhrsw                m0, m1, [coeffq]
3886    movd                    m2, [o(pw_8192)]
; eobd == 0 on this path, so this store clears the first dword of coeff.
3887    mov               [coeffq], eobd
; r3 is reused as the row counter from here on (8 rows).
3888    mov                    r3d, 8
3889    lea                   tx2q, [o(m(inv_txfm_add_dct_dct_32x8_8bpc).end)]
3890
; Shared tail: finish scaling the DC value and broadcast it to all 8 words.
3891.body:
3892    pmulhrsw                m0, m2
3893    movd                    m2, [o(pw_2048)]  ;intentionally rip-relative
3894    pmulhrsw                m0, m1
3895    pmulhrsw                m0, m2
; pshuflw q0000 replicates word 0 across the low four words; punpcklwd with
; itself then fills the whole register with that word.
3896    pshuflw                 m0, m0, q0000
3897    punpcklwd               m0, m0
; m5 = 0, used to zero-extend destination bytes to words.
3898    pxor                    m5, m5
3899
; One destination row (32 bytes, accessed with aligned mova) per iteration:
; widen bytes to words, add the DC word, pack back with unsigned saturation.
3900.loop:
3901    mova                    m1, [dstq+16*0]
3902    mova                    m3, [dstq+16*1]
3903    punpckhbw               m2, m1, m5
3904    punpcklbw               m1, m5
3905    punpckhbw               m4, m3, m5
3906    punpcklbw               m3, m5
3907    paddw                   m2, m0
3908    paddw                   m1, m0
3909    paddw                   m4, m0
3910    paddw                   m3, m0
3911    packuswb                m1, m2
3912    packuswb                m3, m4
3913    mova           [dstq+16*0], m1
3914    mova           [dstq+16*1], m3
3915    add                   dstq, strideq
3916    dec                    r3d
3917    jg .loop
; Return through the continuation chosen by whoever entered .body.
3918    jmp                   tx2q
3919
3920.end:
3921    RET
3922
3923
; ---------------------------------------------------------------------------
; idct_32x8_internal_8bpc
;
; Worker for the 32x8 DCT_DCT transform, reached with `call` (hence every
; stack slot is addressed as rsp+gprsize+...). It feeds the coefficient
; vectors to the 8-, 16- and 32-point helpers, then walks a chain of tail
; jumps (.end .. .end8) in which tx2q always carries the address the helper
; should continue at. The helpers themselves are defined outside this view;
; they are expected to finish with `jmp tx2q`.
;
; Stack slots touched here, in 16-byte units above rsp+gprsize:
;   0        spill slot for m7
;   3..      rows saved after idct_8x8 .main (SAVE_7ROWS)
;   11..18   rows saved after idct_16x8 .main
;   19..26   in1..in15  (scattered; order presumably set by idct_8x32 .main)
;   27..34   in17..in31 (full path only)
; ---------------------------------------------------------------------------
3924cglobal idct_32x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
; Drop any preprocessor macro named cmp so the plain instruction is emitted.
3925    %undef cmp
; Vectors 0,4,8,..,28 of coeff -> 8-point helper.
3926    LOAD_8ROWS     coeffq+16*0, 64
3927    call  m(idct_8x8_internal_8bpc).main
3928    SAVE_7ROWS    rsp+gprsize+16*3, 16
3929
; Vectors 2,6,10,..,30 of coeff -> 16-point helper.
3930    LOAD_8ROWS     coeffq+16*2, 64
3931    call m(idct_16x8_internal_8bpc).main
3932    mova                    m7, [rsp+gprsize+16*0]
3933    SAVE_8ROWS   rsp+gprsize+16*11, 16
3934
; Vectors 1,3,5,..,15 of coeff, parked in their stack slots.
3935    LOAD_8ROWS     coeffq+16*1, 32
3936    mova   [rsp+gprsize+16*19], m0                        ;in1
3937    mova   [rsp+gprsize+16*26], m1                        ;in3
3938    mova   [rsp+gprsize+16*23], m2                        ;in5
3939    mova   [rsp+gprsize+16*22], m3                        ;in7
3940    mova   [rsp+gprsize+16*21], m4                        ;in9
3941    mova   [rsp+gprsize+16*24], m5                        ;in11
3942    mova   [rsp+gprsize+16*25], m6                        ;in13
3943    mova   [rsp+gprsize+16*20], m7                        ;in15
3944
; eob <= 106: in17..in31 are not loaded and the reduced .main_fast is used.
3945    cmp                   eobd, 106
3946    jg  .full
3947    call m(idct_8x32_internal_8bpc).main_fast
3948    jmp .pass2
3949
; Vectors 17,19,..,31 of coeff, then the complete 32-point odd half.
3950.full:
3951    LOAD_8ROWS    coeffq+16*17, 32
3952    mova   [rsp+gprsize+16*33], m0                        ;in17
3953    mova   [rsp+gprsize+16*28], m1                        ;in19
3954    mova   [rsp+gprsize+16*29], m2                        ;in21
3955    mova   [rsp+gprsize+16*32], m3                        ;in23
3956    mova   [rsp+gprsize+16*31], m4                        ;in25
3957    mova   [rsp+gprsize+16*30], m5                        ;in27
3958    mova   [rsp+gprsize+16*27], m6                        ;in29
3959    mova   [rsp+gprsize+16*34], m7                        ;in31
3960    call m(idct_8x32_internal_8bpc).main
3961
; Start of the continuation chain; each step below sets tx2q to the next
; label before tail-jumping into a helper.
3962.pass2:
3963    mova   [rsp+gprsize+16*0 ], m7
3964    lea                   tx2q, [o(m(idct_32x8_internal_8bpc).end)]
3965    jmp  m(idct_8x32_internal_8bpc).end1
3966
; Group 1: m7 = pw_8192 is handed to the 8x8 pass-1 tail.
3967.end:
3968    mova                    m7, [o(pw_8192)]
3969    lea                   tx2q, [o(m(idct_32x8_internal_8bpc).end1)]
3970    jmp   m(idct_8x8_internal_8bpc).pass1_end1
3971
; r3 remembers where the next group starts in dst (8 bytes to the right).
3972.end1:
3973    lea                     r3, [dstq+8]
3974    lea                   tx2q, [o(m(idct_32x8_internal_8bpc).end2)]
3975    jmp   m(idct_8x8_internal_8bpc).pass2_main
3976
; Group 2: rows from stack slots 11..18.
3977.end2:
3978    LOAD_8ROWS   rsp+gprsize+16*11, 16
3979    mova   [rsp+gprsize+16*0 ], m7
3980    mova                    m7, [o(pw_8192)]
3981    lea                   tx2q, [o(m(idct_32x8_internal_8bpc).end3)]
3982    jmp   m(idct_8x8_internal_8bpc).pass1_end1
3983
3984.end3:
3985    mov                   dstq, r3
3986    add                     r3, 8
3987    lea                   tx2q, [o(m(idct_32x8_internal_8bpc).end4)]
3988    jmp   m(idct_8x8_internal_8bpc).pass2_main
3989
; Group 3: rows from stack slots 19..26.
3990.end4:
3991    LOAD_8ROWS   rsp+gprsize+16*19, 16
3992    mova   [rsp+gprsize+16*0 ], m7
3993    mova                    m7, [o(pw_8192)]
3994    lea                   tx2q, [o(m(idct_32x8_internal_8bpc).end5)]
3995    jmp   m(idct_8x8_internal_8bpc).pass1_end1
3996
3997.end5:
3998    mov                   dstq, r3
3999    add                     r3, 8
4000    lea                   tx2q, [o(m(idct_32x8_internal_8bpc).end6)]
4001    jmp   m(idct_8x8_internal_8bpc).pass2_main
4002
; Group 4: rows from stack slots 27..34.
4003.end6:
4004    LOAD_8ROWS   rsp+gprsize+16*27, 16
4005    mova   [rsp+gprsize+16*0 ], m7
4006    mova                    m7, [o(pw_8192)]
4007    lea                   tx2q, [o(m(idct_32x8_internal_8bpc).end7)]
4008    jmp   m(idct_8x8_internal_8bpc).pass1_end1
4009
; Last group: no further dst bookkeeping is needed.
4010.end7:
4011    mov                   dstq, r3
4012    lea                   tx2q, [o(m(idct_32x8_internal_8bpc).end8)]
4013    jmp   m(idct_8x8_internal_8bpc).pass2_main
4014
; Bare return; also used by other functions as a "just return" continuation.
4015.end8:
4016    ret
4017
4018
; ---------------------------------------------------------------------------
; inv_txfm_add_identity_identity_8x32_8bpc(dst, stride, coeff, eob)
;
; Processes the coefficient buffer in blocks of eight vectors (stride 64),
; one block per loop iteration. The iteration count is 2 when eob < 107 and
; 4 otherwise. Each block is offset by pw_5, passed through the 8x8 helper
; tails, shifted right by 3, handed to the 8x8 .end3 helper, and its source
; coefficients are zeroed afterwards.
;
; Stack slots [rsp+16*0..2] are how m5..m7 are passed to the helpers; inside
; a `call`ed helper the same memory is rsp+gprsize+16*n.
; ---------------------------------------------------------------------------
4019cglobal inv_txfm_add_identity_identity_8x32_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
; r3d = (eob >= 107) ? 4 : 2. cmovns copies r5d when eob-107 is non-negative.
4020    mov                    r5d, 4
4021    mov                   tx2d, 2
4022    cmp                   eobd, 107
4023    cmovns                tx2d, r5d
4024    mov                    r3d, tx2d
4025%if ARCH_X86_32
4026    LEA                     r5, $$
4027%endif
; Continuation for .pass1_end3: idct_32x8_internal .end8 is a bare `ret`.
4028    lea                   tx2q, [o(m(idct_32x8_internal_8bpc).end8)]
4029.loop:
4030    LOAD_8ROWS     coeffq+16*0, 64
; m6 is offset first and parked on the stack so the register can hold the
; pw_5 constant for the remaining seven rows. The helper presumably reloads
; m6 from that slot -- confirm in idct_8x8_internal .pass1_end3.
4031    paddsw                  m6, [o(pw_5)]
4032    mova            [rsp+16*1], m6
4033    mova                    m6, [o(pw_5)]
4034    REPX        {paddsw x, m6}, m0, m1, m2, m3, m4, m5, m7
4035    call  m(idct_8x8_internal_8bpc).pass1_end3
4036    REPX        {psraw  x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7
4037    mova            [rsp+16*2], m5
4038    mova            [rsp+16*1], m6
4039    mova            [rsp+16*0], m7
4040    call  m(idct_8x8_internal_8bpc).end3
; Advance dst by two more rows; the helper presumably moved it already.
4041    lea                   dstq, [dstq+strideq*2]
; Clear the eight coefficient vectors that were just consumed.
4042    pxor                    m7, m7
4043    REPX   {mova [coeffq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
4044    add                 coeffq, 16
4045    dec                    r3d
4046    jg .loop
4047    RET
4048
; ---------------------------------------------------------------------------
; inv_txfm_add_identity_identity_32x8_8bpc(dst, stride, coeff, eob)
;
; Processes the coefficient buffer in blocks of eight consecutive vectors
; (stride 16), one block per loop iteration, moving dst 8 bytes to the right
; after each block. The iteration count is 2 when eob < 107 and 4 otherwise.
; Each block is scaled with pw_4096 (pmulhrsw) and handed to the 8x8 helper
; tails, which are defined outside this view.
;
; Stack slots: [rsp+16*0..2] pass m7/m6/m5 to the helpers, [rsp+16*3] keeps
; the dst pointer across the helper call.
; ---------------------------------------------------------------------------
4049cglobal inv_txfm_add_identity_identity_32x8_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
; r3d = (eob >= 107) ? 4 : 2. cmovns copies r5d when eob-107 is non-negative.
4050    mov                    r5d, 4
4051    mov                   tx2d, 2
4052    cmp                   eobd, 107
4053    cmovns                tx2d, r5d
4054    mov                    r3d, tx2d
4055%if ARCH_X86_32
4056    LEA                     r5, $$
4057%endif
4058
4059.loop:
4060    LOAD_8ROWS     coeffq+16*0, 16
; m6 is scaled first and parked on the stack so the register can hold the
; pw_4096 constant for the remaining seven rows.
4061    pmulhrsw                m6, [o(pw_4096)]
4062    mova            [rsp+16*1], m6
4063    mova                    m6, [o(pw_4096)]
4064    REPX      {pmulhrsw x, m6}, m0, m1, m2, m3, m4, m5, m7
; Continuation for .pass1_end3: idct_32x8_internal .end8 is a bare `ret`.
4065    lea                   tx2q, [o(m(idct_32x8_internal_8bpc).end8)]
4066    call  m(idct_8x8_internal_8bpc).pass1_end3
4067
; Keep dst: it is restored after the helper call below.
4068    mov             [rsp+16*3], dstq
4069    mova            [rsp+16*2], m5
4070    mova            [rsp+16*1], m6
4071    mova            [rsp+16*0], m7
4072    lea                   tx2q, [o(m(idct_8x8_internal_8bpc).end4)]
4073    call  m(idct_8x8_internal_8bpc).end3
4074
; Next block: eight vectors further in coeff, eight bytes to the right in dst.
4075    add                 coeffq, 16*8
4076    mov                   dstq, [rsp+16*3]
4077    lea                   dstq, [dstq+8]
; The counter alone decides termination. No carry-flag test belongs here:
; CF was last written by the add above and dec leaves it unchanged, so a
; jnc at this point would always be taken and the loop would never end.
4078    dec                    r3d
4079    jg .loop
4081    RET
4082
4083
; ---------------------------------------------------------------------------
; inv_txfm_add_dct_dct_16x32_8bpc(dst, stride, coeff, eob)
;
; Public entry point (tx2 is only used as a scratch continuation pointer).
;   eob != 0 : all work is delegated to idct_16x32_internal_8bpc.
;   eob == 0 : the first coefficient is multiplied twice by pw_2896x8 and the
;              shared DC tail of the 16x4 function finishes the job.
; ---------------------------------------------------------------------------
4084cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
4085%if ARCH_X86_32
4086    LEA                     r5, $$
4087%endif
4088    test                  eobd, eobd
4089    jz .dconly
4090    call  m(idct_16x32_internal_8bpc)
4091    RET
4092
; DC-only path: eob is known to be zero here.
4093.dconly:
4094    movd                    m1, [o(pw_2896x8)]
4095    pmulhrsw                m0, m1, [coeffq]
4096    movd                    m2, [o(pw_16384)]
; eobd == 0 on this path, so this store clears the first dword of coeff.
4097    mov               [coeffq], eobd
4098    pmulhrsw                m0, m1
; r2d = 16 is an input to the 16x4 .dconly tail, presumably its loop count.
; NOTE(review): under x86inc argument naming r2 should be the coeff register,
; which is no longer needed after the store above -- confirm.
4099    mov                    r2d, 16
4100    lea                   tx2q, [o(m(inv_txfm_add_dct_dct_16x32_8bpc).end)]
4101    jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
4102
4103.end:
4104    RET
4105
; ---------------------------------------------------------------------------
; idct_16x32_internal_8bpc
;
; Worker for the 16x32 DCT_DCT transform, reached with `call` (stack slots
; are addressed as rsp+gprsize+...).
;
; First pass: the coefficient buffer is handled in bands selected by the
; offsets 16*1, 16*0, 16*2, 16*3. For each band the vectors at stride 128 go
; through idct_8x8 .main and idct_16x8 .main, followed by two trips through
; idct_8x8 .pass1_end (continuation in tx2q). The first trip's rows are saved
; to coeffq+16*32.. with stride 64. The second trip's rows are distributed to
; the slots the first 32-point column transform reads from.
; When eob <= 150 the bands at 16*2 and 16*3 are skipped and in16..in31 are
; taken as zero (.main_fast).
;
; Second pass: .pass2 runs the first 8-wide column group through
; idct_8x32 .end, .end reloads the inputs for the second group from the rows
; saved at coeffq+16*32.. and finishes through idct_8x32 .pass2.
;
; Stack slots, in 16-byte units above rsp+gprsize:
;   0        spill slot for m7
;   3..      rows saved after idct_8x8 .main
;   11..18   in2,in6,..,in30 before / result after idct_16x8 .main
;   19..34   in1..in31
;   35       eob (at gprsize*1) and dst+8 (at gprsize*2) across the 1st group
; ---------------------------------------------------------------------------
4106cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
; Drop any preprocessor macro named cmp so the plain instruction is emitted.
4107    %undef cmp
4108
; ---- band at offset 16*1 -------------------------------------------------
4109    LOAD_8ROWS     coeffq+16*1, 128, 1
4110    call  m(idct_8x8_internal_8bpc).main
4111    SAVE_7ROWS    rsp+gprsize+16*3, 16
4112    LOAD_8ROWS     coeffq+16*5, 128, 1
4113    call m(idct_16x8_internal_8bpc).main
4114    lea                   tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end)]
4115    jmp   m(idct_8x8_internal_8bpc).pass1_end
4116
4117.pass1_end:
4118    SAVE_8ROWS    coeffq+16*33, 64               ;in8~in15
4119    LOAD_8ROWS    rsp+gprsize+16*3, 16
4120    mova    [rsp+gprsize+16*0], m7
4121    lea                   tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end1)]
4122    jmp   m(idct_8x8_internal_8bpc).pass1_end
4123
; in8/in12 are parked in the (already consumed) coefficient buffer, the rest
; go straight to their stack slots.
4124.pass1_end1:
4125    mova        [coeffq+16*1 ], m0                        ;in8
4126    mova        [coeffq+16*5 ], m4                        ;in12
4127    mova   [rsp+gprsize+16*13], m2                        ;in10
4128    mova   [rsp+gprsize+16*14], m6                        ;in14
4129    mova   [rsp+gprsize+16*21], m1                        ;in9
4130    mova   [rsp+gprsize+16*24], m3                        ;in11
4131    mova   [rsp+gprsize+16*25], m5                        ;in13
4132    mova   [rsp+gprsize+16*20], m7                        ;in15
; ---- band at offset 16*0 -------------------------------------------------
4133    LOAD_8ROWS     coeffq+16*0, 128, 1
4134    call  m(idct_8x8_internal_8bpc).main
4135    SAVE_7ROWS    rsp+gprsize+16*3, 16
4136    LOAD_8ROWS     coeffq+16*4, 128, 1
4137    call m(idct_16x8_internal_8bpc).main
4138    lea                   tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end2)]
4139    jmp   m(idct_8x8_internal_8bpc).pass1_end
4140
4141.pass1_end2:
4142    SAVE_8ROWS    coeffq+16*32, 64               ;in0~in7
4143    LOAD_8ROWS    rsp+gprsize+16*3, 16
4144    mova    [rsp+gprsize+16*0], m7
4145    lea                   tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end3)]
4146    jmp   m(idct_8x8_internal_8bpc).pass1_end
4147
; m0 (in0) and m4 (in4) stay in registers for the branch below.
4148.pass1_end3:
4149    mova   [rsp+gprsize+16*11], m2                        ;in2
4150    mova   [rsp+gprsize+16*12], m6                        ;in6
4151    mova   [rsp+gprsize+16*19], m1                        ;in1
4152    mova   [rsp+gprsize+16*26], m3                        ;in3
4153    mova   [rsp+gprsize+16*23], m5                        ;in5
4154    mova   [rsp+gprsize+16*22], m7                        ;in7
4155
4156    cmp                   eobd, 150
4157    jg .full
4158
; ---- reduced path (eob <= 150): upper inputs are zero ---------------------
4159    mova                    m1, m4                        ;in4
4160    mova                    m2, [coeffq+16*1 ]            ;in8
4161    mova                    m3, [coeffq+16*5 ]            ;in12
4162    pxor                    m4, m4
4163    REPX          {mova x, m4}, m5, m6, m7
4164    call  m(idct_8x8_internal_8bpc).main
4165    SAVE_7ROWS    rsp+gprsize+16*3, 16
4166    mova                    m0, [rsp+gprsize+16*11]       ;in2
4167    mova                    m1, [rsp+gprsize+16*12]       ;in6
4168    mova                    m2, [rsp+gprsize+16*13]       ;in10
4169    mova                    m3, [rsp+gprsize+16*14]       ;in14
4170    pxor                    m4, m4
4171    REPX          {mova x, m4}, m5, m6, m7
4172    call m(idct_16x8_internal_8bpc).main
4173    mova                    m7, [rsp+gprsize+16*0]
4174    SAVE_8ROWS   rsp+gprsize+16*11, 16
4175
4176    call m(idct_8x32_internal_8bpc).main_fast
4177    jmp  .pass2
4178
; ---- full path: park in0/in4, then do the bands at 16*2 and 16*3 ----------
4179.full:
4180    mova        [coeffq+16*0 ], m0                        ;in0
4181    mova        [coeffq+16*4 ], m4                        ;in4
4182
4183    LOAD_8ROWS     coeffq+16*2, 128, 1
4184    call  m(idct_8x8_internal_8bpc).main
4185    SAVE_7ROWS    rsp+gprsize+16*3, 16
4186    LOAD_8ROWS     coeffq+16*6, 128, 1
4187    call m(idct_16x8_internal_8bpc).main
4188    lea                   tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end4)]
4189    jmp   m(idct_8x8_internal_8bpc).pass1_end
4190
4191.pass1_end4:
4192    SAVE_8ROWS    coeffq+16*34, 64               ;in16~in23
4193    LOAD_8ROWS    rsp+gprsize+16*3, 16
4194    mova    [rsp+gprsize+16*0], m7
4195    lea                   tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end5)]
4196    jmp   m(idct_8x8_internal_8bpc).pass1_end
4197
4198.pass1_end5:
4199    mova        [coeffq+16*2 ], m0                        ;in16
4200    mova        [coeffq+16*6 ], m4                        ;in20
4201    mova   [rsp+gprsize+16*15], m2                        ;in18
4202    mova   [rsp+gprsize+16*16], m6                        ;in22
4203    mova   [rsp+gprsize+16*33], m1                        ;in17
4204    mova   [rsp+gprsize+16*28], m3                        ;in19
4205    mova   [rsp+gprsize+16*29], m5                        ;in21
4206    mova   [rsp+gprsize+16*32], m7                        ;in23
4207
4208    LOAD_8ROWS     coeffq+16*3, 128, 1
4209    call  m(idct_8x8_internal_8bpc).main
4210    SAVE_7ROWS    rsp+gprsize+16*3, 16
4211    LOAD_8ROWS     coeffq+16*7, 128, 1
4212    call m(idct_16x8_internal_8bpc).main
4213    lea                   tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end6)]
4214    jmp   m(idct_8x8_internal_8bpc).pass1_end
4215
4216.pass1_end6:
4217    SAVE_8ROWS    coeffq+16*35, 64                        ;in24~in31
4218    LOAD_8ROWS    rsp+gprsize+16*3, 16
4219    mova    [rsp+gprsize+16*0], m7
4220    lea                   tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end7)]
4221    jmp   m(idct_8x8_internal_8bpc).pass1_end
4222
4223.pass1_end7:
4224    mova   [rsp+gprsize+16*17], m2                        ;in26
4225    mova   [rsp+gprsize+16*18], m6                        ;in30
4226    mova   [rsp+gprsize+16*31], m1                        ;in25
4227    mova   [rsp+gprsize+16*30], m3                        ;in27
4228    mova   [rsp+gprsize+16*27], m5                        ;in29
4229    mova   [rsp+gprsize+16*34], m7                        ;in31
4230
; Gather in0,in4,..,in28 for the 8-point helper; m0/m4 still hold in24/in28.
4231    mova                    m6, m0                        ;in24
4232    mova                    m7, m4                        ;in28
4233    mova                    m0, [coeffq+16*0 ]            ;in0
4234    mova                    m1, [coeffq+16*4 ]            ;in4
4235    mova                    m2, [coeffq+16*1 ]            ;in8
4236    mova                    m3, [coeffq+16*5 ]            ;in12
4237    mova                    m4, [coeffq+16*2 ]            ;in16
4238    mova                    m5, [coeffq+16*6 ]            ;in20
4239    call  m(idct_8x8_internal_8bpc).main
4240    SAVE_7ROWS   rsp+gprsize+16*3 , 16
4241    LOAD_8ROWS   rsp+gprsize+16*11, 16
4242    call m(idct_16x8_internal_8bpc).main
4243    mova                    m7, [rsp+gprsize+16*0]
4244    SAVE_8ROWS   rsp+gprsize+16*11, 16
4245
4246    call m(idct_8x32_internal_8bpc).main
4247
; ---- first column group: output via idct_8x32 .end ------------------------
; eob is saved before r3 is reused (first for dst+8, then for the
; continuation address); .end below restores both.
4248.pass2:
4249    mov  [rsp+gprsize*1+16*35], eobd
4250    lea                     r3, [dstq+8]
4251    mov  [rsp+gprsize*2+16*35], r3
4252    lea                     r3, [o(m(idct_16x32_internal_8bpc).end)]
4253    jmp  m(idct_8x32_internal_8bpc).end
4254
; ---- second column group ---------------------------------------------------
; coeffq is moved to the rows saved during the first pass. There in(8*j+i)
; lives at 16*(4*i+j).
4255.end:
4256    mov                   dstq, [rsp+gprsize*2+16*35]
4257    mov                   eobd, [rsp+gprsize*1+16*35]
4258    add                 coeffq, 16*32
4259
4260    mova                    m0, [coeffq+16*4 ]            ;in1
4261    mova                    m1, [coeffq+16*12]            ;in3
4262    mova                    m2, [coeffq+16*20]            ;in5
4263    mova                    m3, [coeffq+16*28]            ;in7
4264    mova                    m4, [coeffq+16*5 ]            ;in9
4265    mova                    m5, [coeffq+16*13]            ;in11
4266    mova                    m6, [coeffq+16*21]            ;in13
4267    mova                    m7, [coeffq+16*29]            ;in15
4268
4269    mova   [rsp+gprsize+16*19], m0                        ;in1
4270    mova   [rsp+gprsize+16*26], m1                        ;in3
4271    mova   [rsp+gprsize+16*23], m2                        ;in5
4272    mova   [rsp+gprsize+16*22], m3                        ;in7
4273    mova   [rsp+gprsize+16*21], m4                        ;in9
4274    mova   [rsp+gprsize+16*24], m5                        ;in11
4275    mova   [rsp+gprsize+16*25], m6                        ;in13
4276    mova   [rsp+gprsize+16*20], m7                        ;in15
4277
4278    mova                    m0, [coeffq+16*0 ]            ;in0
4279    mova                    m1, [coeffq+16*16]            ;in4
4280    mova                    m2, [coeffq+16*1 ]            ;in8
4281    mova                    m3, [coeffq+16*17]            ;in12
4282
; Same eob threshold as in the first pass.
4283    cmp                   eobd, 150
4284    jg .full1
4285
; Reduced path: in16..in31 are taken as zero.
4286    pxor                    m4, m4
4287    REPX          {mova x, m4}, m5, m6, m7
4288    call  m(idct_8x8_internal_8bpc).main
4289    SAVE_7ROWS    rsp+gprsize+16*3, 16
4290
4291    mova                    m0, [coeffq+16*8 ]            ;in2
4292    mova                    m1, [coeffq+16*24]            ;in6
4293    mova                    m2, [coeffq+16*9 ]            ;in10
4294    mova                    m3, [coeffq+16*25]            ;in14
4295    pxor                    m4, m4
4296    REPX          {mova x, m4}, m5, m6, m7
4297    call m(idct_16x8_internal_8bpc).main
4298    mova                    m7, [rsp+gprsize+16*0]
4299    SAVE_8ROWS   rsp+gprsize+16*11, 16
4300
4301    call m(idct_8x32_internal_8bpc).main_fast
4302    jmp  .end1
4303
4304.full1:
4305    mova                    m4, [coeffq+16*2 ]            ;in16
4306    mova                    m5, [coeffq+16*18]            ;in20
4307    mova                    m6, [coeffq+16*3 ]            ;in24
4308    mova                    m7, [coeffq+16*19]            ;in28
4309    call  m(idct_8x8_internal_8bpc).main
4310    SAVE_7ROWS    rsp+gprsize+16*3, 16
4311
4312    mova                    m0, [coeffq+16*8 ]            ;in2
4313    mova                    m1, [coeffq+16*24]            ;in6
4314    mova                    m2, [coeffq+16*9 ]            ;in10
4315    mova                    m3, [coeffq+16*25]            ;in14
4316    mova                    m4, [coeffq+16*10]            ;in18
4317    mova                    m5, [coeffq+16*26]            ;in22
4318    mova                    m6, [coeffq+16*11]            ;in26
4319    mova                    m7, [coeffq+16*27]            ;in30
4320    call m(idct_16x8_internal_8bpc).main
4321    mova                    m7, [rsp+gprsize+16*0]
4322    SAVE_8ROWS   rsp+gprsize+16*11, 16
4323
4324    mova                    m0, [coeffq+16*6 ]            ;in17
4325    mova                    m1, [coeffq+16*14]            ;in19
4326    mova                    m2, [coeffq+16*22]            ;in21
4327    mova                    m3, [coeffq+16*30]            ;in23
4328    mova                    m4, [coeffq+16*7 ]            ;in25
4329    mova                    m5, [coeffq+16*15]            ;in27
4330    mova                    m6, [coeffq+16*23]            ;in29
4331    mova                    m7, [coeffq+16*31]            ;in31
4332
4333    mova   [rsp+gprsize+16*33], m0                        ;in17
4334    mova   [rsp+gprsize+16*28], m1                        ;in19
4335    mova   [rsp+gprsize+16*29], m2                        ;in21
4336    mova   [rsp+gprsize+16*32], m3                        ;in23
4337    mova   [rsp+gprsize+16*31], m4                        ;in25
4338    mova   [rsp+gprsize+16*30], m5                        ;in27
4339    mova   [rsp+gprsize+16*27], m6                        ;in29
4340    mova   [rsp+gprsize+16*34], m7                        ;in31
4341
4342    call m(idct_8x32_internal_8bpc).main
4343
; Tail-jump: the 8x32 second pass finishes this column group and returns to
; this function's caller.
4344.end1:
4345    jmp m(idct_8x32_internal_8bpc).pass2
4346
4347
4348
; ---------------------------------------------------------------------------
; inv_txfm_add_dct_dct_32x16_8bpc(dst, stride, coeff, eob)
;
; Public entry point (tx2 is only used as a scratch continuation pointer).
;   eob != 0 : idct_32x16_internal_8bpc runs first, then idct_8x16 .pass2 is
;              called four times, once per 8-wide column group. Before each
;              of the last three calls coeffq advances by 16*16 bytes, dstq
;              is set to r3+8, and eight rows are reloaded from this
;              function's stack (slots 11, 19, 27) and sent through
;              idct_8x8 .pass1_end.
;   eob == 0 : the first coefficient is multiplied twice by pw_2896x8 and the
;              shared .body tail of the 32x8 function adds it to 16 rows.
;
; NOTE(review): r3 is expected to hold the previous group's dst base when
; idct_8x16 .pass2 returns; that helper is outside this view -- confirm.
; ---------------------------------------------------------------------------
4349cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
4350%if ARCH_X86_32
4351    LEA                     r5, $$
4352%endif
4353    test                  eobd, eobd
4354    jz .dconly
4355
; Column group 1.
4356    call m(idct_32x16_internal_8bpc)
4357    call m(idct_8x16_internal_8bpc).pass2
4358
; Column group 2. Slots are addressed without gprsize here because this is
; the outer frame; the callee sees the same memory at rsp+gprsize+...
4359    add                 coeffq, 16*16
4360    lea                   dstq, [r3+8]
4361    LOAD_8ROWS       rsp+16*11, 16
4362    mova            [rsp+16*0], m7
4363    lea                   tx2q, [o(m(idct_32x16_internal_8bpc).end)]
4364    call  m(idct_8x8_internal_8bpc).pass1_end
4365    call m(idct_8x16_internal_8bpc).pass2
4366
; Column group 3.
4367    add                 coeffq, 16*16
4368    lea                   dstq, [r3+8]
4369    LOAD_8ROWS       rsp+16*19, 16
4370    mova            [rsp+16*0], m7
4371    lea                   tx2q, [o(m(idct_32x16_internal_8bpc).end)]
4372    call  m(idct_8x8_internal_8bpc).pass1_end
4373    call m(idct_8x16_internal_8bpc).pass2
4374
; Column group 4.
4375    add                 coeffq, 16*16
4376    lea                   dstq, [r3+8]
4377    LOAD_8ROWS       rsp+16*27, 16
4378    mova            [rsp+16*0], m7
4379    lea                   tx2q, [o(m(idct_32x16_internal_8bpc).end)]
4380    call  m(idct_8x8_internal_8bpc).pass1_end
4381    call m(idct_8x16_internal_8bpc).pass2
4382    RET
4383
; DC-only path: eob is known to be zero here.
4384.dconly:
4385    movd                    m1, [o(pw_2896x8)]
4386    pmulhrsw                m0, m1, [coeffq]
4387    movd                    m2, [o(pw_16384)]
; eobd == 0 on this path, so this store clears the first dword of coeff.
4388    mov               [coeffq], eobd
4389    pmulhrsw                m0, m1
; Row count for the shared loop (one 32-byte row per iteration).
4390    mov                    r3d, 16
; The 32x8 function's .end is a plain RET, reused as the continuation.
4391    lea                   tx2q, [o(m(inv_txfm_add_dct_dct_32x8_8bpc).end)]
4392    jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body
4393
4394
; idct_32x16_internal_8bpc
; Pass 1 of the 8 bpc 32x16 inverse DCT. The .pass1 body runs twice: first on
; the 16-byte strip at coeffq+16, then on the strip at coeffq+0.
; Declared with no arguments, registers or stack of its own (0, 0, 0): it
; works on the caller's registers and stack, so every stack slot is addressed
; as rsp+gprsize+... (presumably to skip the return address pushed by the
; call).
; LOAD_8ROWS / SAVE_8ROWS / SAVE_7ROWS and the m(...).main helpers are defined
; elsewhere in this file; the notes below assume the macros move m0-m7
; (m0-m6 for SAVE_7ROWS) from/to base + i*stride -- TODO confirm.
cglobal idct_32x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    %undef cmp

    ; first iteration: strip at +16; r3 = continuation used by .pass1_end
    add                 coeffq, 16
    lea                     r3, [o(m(idct_32x16_internal_8bpc).pass1_end1)]
.pass1:
    ; lines 0, 4, 8, ... (128 bytes apart) go through the 8-point helper
    LOAD_8ROWS     coeffq+16*0, 128, 1
    call  m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS    rsp+gprsize+16*3, 16

    ; lines 2, 6, 10, ... go through the 16-point helper; m7 is then reloaded
    ; from stack slot 0 before the 8 results are parked at slots 11..18
    LOAD_8ROWS     coeffq+16*4, 128, 1
    call m(idct_16x8_internal_8bpc).main
    mova                    m7, [rsp+gprsize+16*0]
    SAVE_8ROWS   rsp+gprsize+16*11, 16

    ; odd lines 1..15 are scattered into the stack slots that
    ; idct_8x32_internal_8bpc.main reads (slot order dictated by that helper)
    LOAD_8ROWS     coeffq+16*2, 64, 1
    mova   [rsp+gprsize+16*19], m0                        ;in1
    mova   [rsp+gprsize+16*26], m1                        ;in3
    mova   [rsp+gprsize+16*23], m2                        ;in5
    mova   [rsp+gprsize+16*22], m3                        ;in7
    mova   [rsp+gprsize+16*21], m4                        ;in9
    mova   [rsp+gprsize+16*24], m5                        ;in11
    mova   [rsp+gprsize+16*25], m6                        ;in13
    mova   [rsp+gprsize+16*20], m7                        ;in15

    ; odd lines 17..31
    LOAD_8ROWS    coeffq+16*34, 64, 1
    mova   [rsp+gprsize+16*33], m0                        ;in17
    mova   [rsp+gprsize+16*28], m1                        ;in19
    mova   [rsp+gprsize+16*29], m2                        ;in21
    mova   [rsp+gprsize+16*32], m3                        ;in23
    mova   [rsp+gprsize+16*31], m4                        ;in25
    mova   [rsp+gprsize+16*30], m5                        ;in27
    mova   [rsp+gprsize+16*27], m6                        ;in29
    mova   [rsp+gprsize+16*34], m7                        ;in31
    call m(idct_8x32_internal_8bpc).main

.pass1_end:
    ; spill m7 to slot 0 and hand the continuation in r3 to the shared 8x8
    ; pass-1 tail (presumably it finishes with a jump through tx2q)
    mova   [rsp+gprsize+16*0 ], m7
    mov                   tx2q, r3
    jmp   m(idct_8x8_internal_8bpc).pass1_end

    ; .pass1_end1 .. .pass1_end4 are only reached on the first iteration
    ; (r3 = .pass1_end1): each stores the 8 rows just finished back into the
    ; coefficient buffer (32-byte stride) and reloads the next 8 rows from the
    ; stack (slots 11, 19, 27) for the same tail.
.pass1_end1:
    SAVE_8ROWS     coeffq+16*0, 32
    LOAD_8ROWS   rsp+gprsize+16*11, 16
    mova   [rsp+gprsize+16*0 ], m7
    lea                   tx2q, [o(m(idct_32x16_internal_8bpc).pass1_end2)]
    jmp   m(idct_8x8_internal_8bpc).pass1_end

.pass1_end2:
    SAVE_8ROWS    coeffq+16*16, 32
    LOAD_8ROWS   rsp+gprsize+16*19, 16
    mova   [rsp+gprsize+16*0 ], m7
    lea                   tx2q, [o(m(idct_32x16_internal_8bpc).pass1_end3)]
    jmp   m(idct_8x8_internal_8bpc).pass1_end

.pass1_end3:
    SAVE_8ROWS    coeffq+16*32, 32
    LOAD_8ROWS   rsp+gprsize+16*27, 16
    mova   [rsp+gprsize+16*0 ], m7
    lea                   tx2q, [o(m(idct_32x16_internal_8bpc).pass1_end4)]
    jmp   m(idct_8x8_internal_8bpc).pass1_end

.pass1_end4:
    SAVE_8ROWS    coeffq+16*48, 32

    ; second iteration: strip at +0. With r3 = .end the tail invoked from
    ; .pass1_end comes back to the ret below after the first 8 rows only; the
    ; remaining rows stay in stack slots 11 / 19 / 27, which is where
    ; inv_txfm_add_dct_dct_32x16_8bpc reloads them from.
    sub                 coeffq, 16
    lea                     r3, [o(m(idct_32x16_internal_8bpc).end)]
    jmp .pass1

.end:
    ; bare return; other routines in this file also use this label as a
    ; "just return" continuation for tx2q
    ret
4466
4467
; inv_txfm_add_identity_identity_16x32_8bpc(dst, stride, coeff, eob)
; Identity/identity inverse transform + add for 16x32 blocks (8 bpc).
; Work is organised as two outer passes (dst, then dst+8) of 1..4 inner
; iterations each; the inner count is derived from eob below.
; Stack slots at rsp+16*3: [+0] dst+8 for the second outer pass,
; [+gprsize] inner iteration count (zeroed once consumed),
; [+gprsize*2] original coeff pointer.
; NOTE(review): with gprsize == 8 the [+gprsize*2] slot starts at byte 64,
; i.e. right at the end of the 16*4 bytes requested in cglobal; presumably
; covered by the stack padding x86inc adds -- confirm.
cglobal inv_txfm_add_identity_identity_16x32_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
    %undef cmp

    ; eobd and r3d are the same register (4th argument), so eob is copied to
    ; r4d before r3d is rebuilt as the counter. Each cmp sets CF when
    ; eob < imm and each sbb subtracts CF, giving
    ;   r3d = 4 - (eob < 43) - (eob < 150) - (eob < 278)      (range 1..4)
    mov                    r4d, eobd
    cmp                   eobd, 43                ;if (eob >= 43)
    sbb                    r3d, r3d               ;  iteration_count++
    cmp                    r4d, 150               ;if (eob >= 150)
    sbb                    r3d, 0                 ;  iteration_count++
    cmp                    r4d, 278               ;if (eob >= 278)
    sbb                    r3d, -4                ;  iteration_count++

%if ARCH_X86_32
    LEA                     r5, $$
%endif
    lea                     r4, [dstq+8]
    mov             [rsp+16*3], r4
    mov     [rsp+gprsize+16*3], r3d
    mov   [rsp+gprsize*2+16*3], coeffq

.loop:
    ; load 8 coefficient lines (64 bytes apart), spill m6 to [rsp+16*1] and
    ; use the freed register to clear those 8 lines in the coefficient buffer
    LOAD_8ROWS          coeffq, 64, 1
    mova            [rsp+16*1], m6
    pxor                    m6, m6
    REPX   {mova [coeffq+64*x], m6}, 0,  1,  2,  3,  4,  5,  6,  7
    ; tx2q = address of a bare ret, so the shared 8x8 helper (presumably a
    ; transpose ending in a jump through tx2q) returns straight to this loop
    lea                   tx2q, [o(m(idct_32x16_internal_8bpc).end)]
    call  m(idct_8x8_internal_8bpc).pass1_end3
    ; park m2/m3/m4 so they can serve as IDTX16 temp/constant registers
    ; (IDTX16 is a macro defined elsewhere in this file)
    mova            [rsp+16*0], m2
    mova            [rsp+16*1], m3
    mova            [rsp+16*2], m4
    mova                    m3, [o(pw_1697x16)]
    mova                    m4, [o(pw_16384)]
    REPX   {IDTX16 x, 2, 3, 4}, 5, 6, 7, 0, 1
    mova                    m2, [o(pw_8192)]
    REPX      {pmulhrsw x, m2}, m5, m6, m7, m0, m1
    ; bring the parked rows back one at a time, swapping finished rows into
    ; the stack slots they vacate
    mova                    m2, [rsp+16*0]
    mova            [rsp+16*0], m7
    IDTX16                   2, 7, 3, 4
    mova                    m7, [rsp+16*2]
    mova            [rsp+16*2], m5
    IDTX16                   7, 5, 3, 4
    mova                    m5, [rsp+16*1]
    mova            [rsp+16*1], m6
    ; last parked row (now in m5) is scaled with open-coded arithmetic that
    ; overwrites the constant registers m3/m4 with results
    pmulhrsw                m3, m5
    pmulhrsw                m3, m4
    psrlw                   m4, 1 ; pw_8192
    paddsw                  m3, m5
    pmulhrsw                m2, m4
    pmulhrsw                m3, m4
    pmulhrsw                m4, m7
    call  m(idct_8x8_internal_8bpc).end3
    lea                   dstq, [dstq+strideq*2]
    add                 coeffq, 16
    dec                    r3d
    jg .loop
    ; inner loop finished: move to the coefficients 64*8 bytes past the
    ; original pointer and to the saved dst+8. The saved count is reloaded and
    ; its slot zeroed, so the second time through here r3d is 0 and we exit.
    mov                 coeffq, [rsp+gprsize*2+16*3]
    add                 coeffq, 64*8
    mov                    r3d, [rsp+gprsize+16*3]
    xor                   dstq, dstq
    mov     [rsp+gprsize+16*3], dstq
    mov                   dstq, [rsp+16*3]
    test                   r3d, r3d
    jnz .loop
    RET
4531
4532
; inv_txfm_add_identity_identity_32x16_8bpc(dst, stride, coeff, eob)
; Identity/identity inverse transform + add for 32x16 blocks (8 bpc).
; r3d holds a schedule bit mask that .loop_end consumes two bits per
; iteration. Tracing .loop_end for the three masks set up below gives:
;   12    (eob <  44)  -> 2 iterations
;   136   (eob < 151)  -> 4 iterations
;   34952 (otherwise)  -> 8 iterations
; [rsp+16*3] holds the dst pointer for the next 8-byte-wide output block.
cglobal inv_txfm_add_identity_identity_32x16_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
    %undef cmp

    mov                    r4d, 12                ;1100b
    mov                    r5d, 136               ;1000 1000b
    cmp                   eobd, 44                ;if (eob > 43)
    cmovns                 r4d, r5d               ;  r4d = 136
    cmp                   eobd, 151               ;if (eob <= 150)
    mov                    r3d, 34952             ;1000 1000 1000 1000b
    cmovs                  r3d, r4d               ;  r3d = r4d, else keep 34952

%if ARCH_X86_32
    LEA                     r5, $$
%endif
    lea                     r4, [dstq+8]
    mov             [rsp+16*3], r4

.loop:
    ; load 8 coefficient lines (32 bytes apart) and double them with
    ; saturation before the shared 8x8 helper is called
    LOAD_8ROWS          coeffq, 32, 1
    REPX         {paddsw x, x}, m0, m1, m2, m3, m4, m5, m6, m7
    mova            [rsp+16*1], m6
    ; tx2q = address of a bare ret, so the helper (presumably ending in a jump
    ; through tx2q) returns straight to this loop
    lea                   tx2q, [o(m(idct_32x16_internal_8bpc).end)]
    call  m(idct_8x8_internal_8bpc).pass1_end3
    ; park m5/m6 so they can serve as IDTX16 temp/constant registers
    ; (IDTX16 is a macro defined elsewhere in this file)
    mova            [rsp+16*1], m5
    mova            [rsp+16*2], m6
    mova                    m6, [o(pw_1697x16)]
    REPX      {IDTX16 x, 5, 6}, 7, 0, 1, 2, 3, 4
    pmulhrsw                m7, [o(pw_2048)]
    mova                    m5, [rsp+16*1]
    mova            [rsp+16*0], m7
    IDTX16                   5, 7, 6
    mova                    m7, [rsp+16*2]
    ; last use of the constant in m6: here it is passed as both the temp and
    ; the coefficient argument
    IDTX16                   7, 6, 6
    mova                    m6, [o(pw_2048)]
    REPX      {pmulhrsw x, m6}, m0, m1, m2, m3, m4, m5, m7
    mova            [rsp+16*2], m5
    mova            [rsp+16*1], m7
    call  m(idct_8x8_internal_8bpc).end3
    lea                   dstq, [dstq+strideq*2]
    ; clear the 8 coefficient lines that were just consumed
    pxor                    m7, m7
    REPX {mova [coeffq+32*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7

.loop_end:
    ; advance to the next 16-byte strip and consume two schedule bits:
    ;   mask exhausted -> done
    ;   bit 1 set      -> loop again with the same dst progression
    ;   bit 1 clear    -> switch to the saved dst (+8) and skip coeffq ahead
    add                 coeffq, 16
    shr                    r3d, 2
    jz .ret
    test                   r3d, 2
    jnz .loop
    ; r4 = bit 0 of the mask; for the three masks used above this bit is 0
    ; whenever this point is reached, so only the 32*7 skip takes effect
    mov                    r4d, r3d
    and                    r4d, 1
    lea                 coeffq, [coeffq+r4*8+32*7]
    mov                   dstq, [rsp+16*3]
    lea                     r4, [dstq+8]
    mov             [rsp+16*3], r4
    jmp .loop

.ret:
    RET
4591
4592
; inv_txfm_add_dct_dct_32x32_8bpc(dst, stride, coeff, eob)
; Entry point for the 8 bpc 32x32 DCT/DCT inverse transform + add.
; eob == 0 takes the DC-only shortcut; otherwise all work is done by
; idct_32x32_internal_8bpc, which runs on the 16*36 bytes of stack reserved
; here.
cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
%if ARCH_X86_32
    LEA                     r5, $$
%endif
    test                  eobd, eobd
    jz .dconly

    call m(idct_32x32_internal_8bpc)
    RET

.dconly:
    ; m1 = pw_2896x8, m0 = first coefficients scaled by it, m2 = pw_8192;
    ; the 32x8 DC path jumped to below consumes these registers (its exact use
    ; of them is outside this view)
    movd                    m1, [o(pw_2896x8)]
    pmulhrsw                m0, m1, [coeffq]
    movd                    m2, [o(pw_8192)]
    ; eobd is zero on this path, so this clears the first dword of coeff
    mov               [coeffq], eobd
    ; r3d = 32, presumably the row count for the shared body -- confirm
    mov                    r3d, 32
    lea                   tx2q, [o(m(inv_txfm_add_dct_dct_32x8_8bpc).end)]
    jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body
4611
4612
; idct_32x32_internal_8bpc
; Both passes of the 8 bpc 32x32 inverse DCT. Declared with (0, 0, 0): it
; runs on the registers and stack of its caller, so slots are addressed as
; rsp+gprsize+... (presumably to skip the return address).
; Scratch slots at 16*35:
;   [rsp+gprsize*1+16*35]  eob - 136 (negative selects the fast paths)
;   [rsp+gprsize*2+16*35]  pass 1: original coeff pointer
;                          pass 2: dst pointer for the next iteration
;   [rsp+gprsize*3+16*35]  pass 2 loop counter
; LOAD_8ROWS / SAVE_8ROWS / SAVE_7ROWS and the m(...) helpers are defined
; elsewhere in this file.
cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    %undef cmp

    ; r3d = pass-1 iteration count: 2 if eob < 136, else 4 (the mov between
    ; sub and cmovs leaves the flags from the sub intact)
    mov                    r4d, 2
    sub                   eobd, 136
    mov  [rsp+gprsize*1+16*35], eobd
    mov                    r3d, 4
    cmovs                  r3d, r4d

%if ARCH_X86_32
    LEA                     r5, $$
%endif

    mov  [rsp+gprsize*2+16*35], coeffq

.pass1_loop:
    ; odd lines 1..15 into the stack slots read by the 32-point helper
    LOAD_8ROWS     coeffq+64*1, 64*2
    mova   [rsp+gprsize+16*19], m0                        ;in1
    mova   [rsp+gprsize+16*26], m1                        ;in3
    mova   [rsp+gprsize+16*23], m2                        ;in5
    mova   [rsp+gprsize+16*22], m3                        ;in7
    mova   [rsp+gprsize+16*21], m4                        ;in9
    mova   [rsp+gprsize+16*24], m5                        ;in11
    mova   [rsp+gprsize+16*25], m6                        ;in13
    mova   [rsp+gprsize+16*20], m7                        ;in15

    ; tx2d is used as scratch here to test the saved eob - 136
    mov                   tx2d, [rsp+gprsize*1+16*35]
    test                  tx2d, tx2d
    jl .fast

.full:
    ; all 32 input lines are used
    LOAD_8ROWS     coeffq+64*0, 64*4
    call  m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS    rsp+gprsize+16*3, 16
    LOAD_8ROWS     coeffq+64*2, 64*4
    call m(idct_16x8_internal_8bpc).main
    mova                    m7, [rsp+gprsize+16*0]
    SAVE_8ROWS   rsp+gprsize+16*11, 16

    LOAD_8ROWS    coeffq+64*17, 64*2
    mova   [rsp+gprsize+16*33], m0                        ;in17
    mova   [rsp+gprsize+16*28], m1                        ;in19
    mova   [rsp+gprsize+16*29], m2                        ;in21
    mova   [rsp+gprsize+16*32], m3                        ;in23
    mova   [rsp+gprsize+16*31], m4                        ;in25
    mova   [rsp+gprsize+16*30], m5                        ;in27
    mova   [rsp+gprsize+16*27], m6                        ;in29
    mova   [rsp+gprsize+16*34], m7                        ;in31

    call m(idct_8x32_internal_8bpc).main
    jmp .pass1_end

.fast:
    ; eob < 136: only four inputs are loaded for each helper and m4-m7 are
    ; zeroed; lines 17..31 are not loaded (main_fast is called instead of main)
    mova                    m0, [coeffq+256*0]
    mova                    m1, [coeffq+256*1]
    mova                    m2, [coeffq+256*2]
    mova                    m3, [coeffq+256*3]
    pxor                    m4, m4
    REPX          {mova x, m4}, m5, m6, m7
    call  m(idct_8x8_internal_8bpc).main

    SAVE_7ROWS    rsp+gprsize+16*3, 16
    mova                    m0, [coeffq+128*1]
    mova                    m1, [coeffq+128*3]
    mova                    m2, [coeffq+128*5]
    mova                    m3, [coeffq+128*7]
    pxor                    m4, m4
    REPX          {mova x, m4}, m5, m6, m7
    call m(idct_16x8_internal_8bpc).main
    mova                    m7, [rsp+gprsize+16*0]
    SAVE_8ROWS   rsp+gprsize+16*11, 16

    call m(idct_8x32_internal_8bpc).main_fast

    ; .pass1_end .. .pass1_end4: hand 8 rows at a time to the shared 8x8
    ; pass-1 tail with m7 = pw_8192 (the tail presumably comes back through
    ; tx2q), store them to the coefficient buffer with a 64-byte stride, then
    ; reload the next 8 rows from stack slots 11 / 19 / 27.
.pass1_end:
    mova    [rsp+gprsize+16*0], m7
    mova                    m7, [o(pw_8192)]
    lea                   tx2q, [o(m(idct_32x32_internal_8bpc).pass1_end1)]
    jmp   m(idct_8x8_internal_8bpc).pass1_end1

.pass1_end1:
    SAVE_8ROWS     coeffq+64*0, 64
    LOAD_8ROWS   rsp+gprsize+16*11, 16
    mova    [rsp+gprsize+16*0], m7
    mova                    m7, [o(pw_8192)]
    lea                   tx2q, [o(m(idct_32x32_internal_8bpc).pass1_end2)]
    jmp   m(idct_8x8_internal_8bpc).pass1_end1

.pass1_end2:
    SAVE_8ROWS     coeffq+64*8, 64
    LOAD_8ROWS   rsp+gprsize+16*19, 16
    mova    [rsp+gprsize+16*0], m7
    mova                    m7, [o(pw_8192)]
    lea                   tx2q, [o(m(idct_32x32_internal_8bpc).pass1_end3)]
    jmp   m(idct_8x8_internal_8bpc).pass1_end1

.pass1_end3:
    SAVE_8ROWS    coeffq+64*16, 64
    LOAD_8ROWS   rsp+gprsize+16*27, 16
    mova    [rsp+gprsize+16*0], m7
    mova                    m7, [o(pw_8192)]
    lea                   tx2q, [o(m(idct_32x32_internal_8bpc).pass1_end4)]
    jmp   m(idct_8x8_internal_8bpc).pass1_end1

.pass1_end4:
    SAVE_8ROWS    coeffq+64*24, 64

    ; next 16-byte strip of pass 1
    add                 coeffq, 16
    dec                    r3d
    jg .pass1_loop


.pass2:
    ; pass 2 always runs 4 iterations; each one advances coeffq by 16*32 and
    ; dst by 8 (see .pass2_end1)
    mov                 coeffq, [rsp+gprsize*2+16*35]
    mov                    r3d, 4
    lea                   tx2q, [o(m(idct_32x32_internal_8bpc).pass2_end)]

.pass2_loop:
    ; save the loop counter and the dst pointer for the following iteration
    mov  [rsp+gprsize*3+16*35], r3d
    lea                     r3, [dstq+8]
    mov  [rsp+gprsize*2+16*35], r3

    mova                    m0, [coeffq+16*4 ]
    mova                    m1, [coeffq+16*12]
    mova                    m2, [coeffq+16*20]
    mova                    m3, [coeffq+16*28]
    mova                    m4, [coeffq+16*5 ]
    mova                    m5, [coeffq+16*13]
    mova                    m6, [coeffq+16*21]
    mova                    m7, [coeffq+16*29]
    mova   [rsp+gprsize+16*19], m0                        ;in1
    mova   [rsp+gprsize+16*26], m1                        ;in3
    mova   [rsp+gprsize+16*23], m2                        ;in5
    mova   [rsp+gprsize+16*22], m3                        ;in7
    mova   [rsp+gprsize+16*21], m4                        ;in9
    mova   [rsp+gprsize+16*24], m5                        ;in11
    mova   [rsp+gprsize+16*25], m6                        ;in13
    mova   [rsp+gprsize+16*20], m7                        ;in15

    ; same eob - 136 test as in pass 1 selects the reduced path
    mov                   eobd, [rsp+gprsize*1+16*35]
    test                  eobd, eobd
    jl .fast1

.full1:
    mova                    m0, [coeffq+16*0 ]
    mova                    m1, [coeffq+16*16]
    mova                    m2, [coeffq+16*1 ]
    mova                    m3, [coeffq+16*17]
    mova                    m4, [coeffq+16*2 ]
    mova                    m5, [coeffq+16*18]
    mova                    m6, [coeffq+16*3 ]
    mova                    m7, [coeffq+16*19]
    call  m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS    rsp+gprsize+16*3, 16

    mova                    m0, [coeffq+16*8 ]
    mova                    m1, [coeffq+16*24]
    mova                    m2, [coeffq+16*9 ]
    mova                    m3, [coeffq+16*25]
    mova                    m4, [coeffq+16*10]
    mova                    m5, [coeffq+16*26]
    mova                    m6, [coeffq+16*11]
    mova                    m7, [coeffq+16*27]
    call m(idct_16x8_internal_8bpc).main
    mova                    m7, [rsp+gprsize+16*0]
    SAVE_8ROWS   rsp+gprsize+16*11, 16

    mova                    m0, [coeffq+16*6 ]
    mova                    m1, [coeffq+16*14]
    mova                    m2, [coeffq+16*22]
    mova                    m3, [coeffq+16*30]
    mova                    m4, [coeffq+16*7 ]
    mova                    m5, [coeffq+16*15]
    mova                    m6, [coeffq+16*23]
    mova                    m7, [coeffq+16*31]
    mova   [rsp+gprsize+16*33], m0                        ;in17
    mova   [rsp+gprsize+16*28], m1                        ;in19
    mova   [rsp+gprsize+16*29], m2                        ;in21
    mova   [rsp+gprsize+16*32], m3                        ;in23
    mova   [rsp+gprsize+16*31], m4                        ;in25
    mova   [rsp+gprsize+16*30], m5                        ;in27
    mova   [rsp+gprsize+16*27], m6                        ;in29
    mova   [rsp+gprsize+16*34], m7                        ;in31

    call m(idct_8x32_internal_8bpc).main
    ; tx2q was set to .pass2_end before the loop / in .pass2_end1
    jmp                   tx2q

.fast1:
    ; reduced path: four inputs per helper, m4-m7 zeroed, in17..in31 not loaded
    mova                    m0, [coeffq+16*0 ]
    mova                    m1, [coeffq+16*16]
    mova                    m2, [coeffq+16*1 ]
    mova                    m3, [coeffq+16*17]
    pxor                    m4, m4
    REPX          {mova x, m4}, m5, m6, m7
    call  m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS    rsp+gprsize+16*3, 16

    mova                    m0, [coeffq+16*8 ]
    mova                    m1, [coeffq+16*24]
    mova                    m2, [coeffq+16*9 ]
    mova                    m3, [coeffq+16*25]
    pxor                    m4, m4
    REPX          {mova x, m4}, m5, m6, m7
    call m(idct_16x8_internal_8bpc).main
    mova                    m7, [rsp+gprsize+16*0]
    SAVE_8ROWS   rsp+gprsize+16*11, 16

    call m(idct_8x32_internal_8bpc).main_fast
    jmp                   tx2q

.pass2_end:
    ; the shared 8x32 tail is given r3 = .pass2_end1 as its continuation
    ; (presumably it jumps there when done)
    lea                     r3, [o(m(idct_32x32_internal_8bpc).pass2_end1)]
    jmp  m(idct_8x32_internal_8bpc).end

.pass2_end1:
    ; re-arm tx2q, step to the next coefficient group and the saved dst, and
    ; count the iteration down
    lea                   tx2q, [o(m(idct_32x32_internal_8bpc).pass2_end)]
    add                 coeffq, 16*32
    mov                   dstq, [rsp+gprsize*2+16*35]
    mov                    r3d, [rsp+gprsize*3+16*35]
    dec                    r3d
    jg .pass2_loop

    ret
4836
4837
; inv_txfm_add_identity_identity_32x32_8bpc(dst, stride, coeff, eob)
; Identity/identity inverse transform + add for 32x32 blocks (8 bpc).
; n = 2 if eob < 136, else 4. The code runs n outer passes (one per
; 8-byte-wide dst block) of n inner iterations each.
; Stack slots at rsp+16*3:
;   [+gprsize*0] dst pointer for the next outer pass
;   [+gprsize*1] n, reloaded as the inner count at each outer pass
;   [+gprsize*2] remaining outer passes
;   [+gprsize*3] coeff pointer at the start of the current outer pass
cglobal inv_txfm_add_identity_identity_32x32_8bpc, 4, 6, 8, 16*5, dst, stride, coeff, eob, tx2
    %undef cmp

    ; the mov between cmp and cmovs leaves the flags from the cmp intact
    mov                    r4d, 2
    cmp                   eobd, 136
    mov                    r3d, 4
    cmovs                  r3d, r4d

%if ARCH_X86_32
    LEA                     r5, $$
%endif

    lea                     r4, [dstq+8]
    mov   [rsp+gprsize*0+16*3], r4
    mov   [rsp+gprsize*1+16*3], r3d
    mov   [rsp+gprsize*2+16*3], r3d
    mov   [rsp+gprsize*3+16*3], coeffq

.loop:
    ; load 8 coefficient lines (64 bytes apart) and run them through the
    ; shared 8x8 helper; tx2q points at a bare ret so the helper (presumably
    ; ending in a jump through tx2q) returns straight to this loop
    LOAD_8ROWS          coeffq, 64
    mova            [rsp+16*1], m6
    lea                   tx2q, [o(m(idct_32x16_internal_8bpc).end)]
    call  m(idct_8x8_internal_8bpc).pass1_end3
    ; scale all 8 rows by pw_8192; m7 is scaled first and parked in slot 0 so
    ; the register can hold the constant for the other rows
    pmulhrsw                m7, [o(pw_8192)]
    mova            [rsp+16*0], m7
    mova                    m7, [o(pw_8192)]
    REPX      {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
    mova            [rsp+16*1], m6
    mova            [rsp+16*2], m5
    call  m(idct_8x8_internal_8bpc).end3
    lea                   dstq, [dstq+strideq*2]

    ; clear the 8 coefficient lines that were just consumed
    pxor                    m7, m7
    REPX   {mova [coeffq+64*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7

    add                 coeffq, 16
    dec                    r3d
    jg .loop

    ; inner loop done: count down the outer passes
    mov                    r4d, [rsp+gprsize*2+16*3]
    dec                    r4d
    jle .ret

    ; set up the next outer pass: dst advances by 8, coeff by 64*8 from the
    ; start of the previous pass, inner count is reloaded
    mov                   dstq, [rsp+gprsize*0+16*3]
    mov                 coeffq, [rsp+gprsize*3+16*3]
    mov   [rsp+gprsize*2+16*3], r4
    lea                     r3, [dstq+8]
    add                 coeffq, 64*8
    mov   [rsp+gprsize*0+16*3], r3
    mov                    r3d, [rsp+gprsize*1+16*3]
    mov   [rsp+gprsize*3+16*3], coeffq
    jmp .loop

.ret:
    RET
4893
4894
; inv_txfm_add_dct_dct_16x64_8bpc(dst, stride, coeff, eob)
; Entry point for the 8 bpc 16x64 DCT/DCT inverse transform + add.
; eob == 0 takes the DC-only shortcut; otherwise all work is done by
; idct_16x64_internal_8bpc, which runs on the 16*68 bytes of stack reserved
; here.
cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2
%if ARCH_X86_32
    LEA                     r5, $$
%endif
    test                  eobd, eobd
    jz .dconly

    call m(idct_16x64_internal_8bpc)
    RET

.dconly:
    ; m1 = pw_2896x8, m0 = first coefficients scaled by it, m2 = pw_8192;
    ; the 16x4 DC path jumped to below consumes these registers (its exact use
    ; of them is outside this view)
    movd                    m1, [o(pw_2896x8)]
    pmulhrsw                m0, m1, [coeffq]
    movd                    m2, [o(pw_8192)]
    ; eobd is zero on this path, so this clears the first dword of coeff
    mov               [coeffq], eobd
    ; r2 is the coeff register; it is no longer needed and is reused to pass
    ; 32 to the shared path (presumably a loop count -- confirm)
    mov                    r2d, 32
    ; the shared path is given .end below as its continuation in tx2q
    lea                   tx2q, [o(m(inv_txfm_add_dct_dct_16x64_8bpc).end)]
    jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly

.end:
    RET
4916
4917
; idct_16x64_internal_8bpc
; Both passes of the 8 bpc 16x64 inverse DCT. Declared with (0, 0, 0): it
; runs on the registers and stack of its caller, so slots are addressed as
; rsp+gprsize+... (presumably to skip the return address).
; Scratch slots at 16*67:
;   [rsp+gprsize*1+16*67]  eob - 151 (negative selects the fast paths)
;   [rsp+gprsize*2+16*67]  pass 1: original coeff pointer
;                          pass 2: dst pointer for the next iteration
;   [rsp+gprsize*3+16*67]  pass 2 loop counter
; The local helpers .main / .main_fast follow this routine.
; LOAD_8ROWS / SAVE_8ROWS / SAVE_7ROWS and the m(...) helpers are defined
; elsewhere in this file.
cglobal idct_16x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    %undef cmp

    ; r3d = pass-1 iteration count: 2 if eob < 151, else 4 (the mov between
    ; sub and cmovs leaves the flags from the sub intact)
    mov                    r4d, 2
    sub                   eobd, 151
    mov  [rsp+gprsize*1+16*67], eobd
    mov                    r3d, 4
    cmovs                  r3d, r4d

%if ARCH_X86_32
    LEA                     r5, $$
%endif

    mov  [rsp+gprsize*2+16*67], coeffq

.pass1_loop:
    ; 16-point pass 1 on one 16-byte strip: lines 0, 2, 4, ... through the
    ; 8-point helper, lines 1, 3, 5, ... through the 16-point helper
    LOAD_8ROWS     coeffq+64*0, 64*2
    call  m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS    rsp+gprsize+16*3, 16
    LOAD_8ROWS     coeffq+64*1, 64*2
    call m(idct_16x8_internal_8bpc).main
    ; shared 8x8 pass-1 tail with m7 = pw_8192; it presumably comes back
    ; through tx2q
    mova                    m7, [o(pw_8192)]
    lea                   tx2q, [o(m(idct_16x64_internal_8bpc).pass1_end)]
    jmp   m(idct_8x8_internal_8bpc).pass1_end1

.pass1_end:
    ; store the rows just finished at lines 8..15, then process the 8 rows
    ; parked at stack slot 3
    SAVE_8ROWS     coeffq+64*8, 64
    LOAD_8ROWS    rsp+gprsize+16*3, 16
    mova    [rsp+gprsize+16*0], m7
    mova                    m7, [o(pw_8192)]
    lea                   tx2q, [o(m(idct_16x64_internal_8bpc).pass1_end1)]
    jmp   m(idct_8x8_internal_8bpc).pass1_end1

.pass1_end1:
    SAVE_8ROWS     coeffq+64*0, 64

    add                 coeffq, 16
    dec                    r3d
    jg .pass1_loop

    ; pass 2 setup: 2 iterations; the coeff slot is reused for dst+8 and r4
    ; holds the continuation (.end1) that .end passes on in r3
    mov                 coeffq, [rsp+gprsize*2+16*67]
    mov                    r3d, 2
    lea                     r4, [dstq+8]
    mov  [rsp+gprsize*2+16*67], r4
    lea                     r4, [o(m(idct_16x64_internal_8bpc).end1)]

.pass2_loop:
    mov  [rsp+gprsize*3+16*67], r3d
    mov                   eobd, [rsp+gprsize*1+16*67]

    ; inputs for the local .main / .main_fast helpers go to the odd-numbered
    ; stack slots 35..49
    mova                    m0, [coeffq+16*4 ]            ;in1
    mova                    m1, [coeffq+16*12]            ;in3
    mova                    m2, [coeffq+16*20]            ;in5
    mova                    m3, [coeffq+16*28]            ;in7
    mova                    m4, [coeffq+16*5 ]            ;in9
    mova                    m5, [coeffq+16*13]            ;in11
    mova                    m6, [coeffq+16*21]            ;in13
    mova                    m7, [coeffq+16*29]            ;in15
    mova   [rsp+gprsize+16*35], m0                        ;in1
    mova   [rsp+gprsize+16*49], m1                        ;in3
    mova   [rsp+gprsize+16*43], m2                        ;in5
    mova   [rsp+gprsize+16*41], m3                        ;in7
    mova   [rsp+gprsize+16*39], m4                        ;in9
    mova   [rsp+gprsize+16*45], m5                        ;in11
    mova   [rsp+gprsize+16*47], m6                        ;in13
    mova   [rsp+gprsize+16*37], m7                        ;in15

    ; m4 = 0 is the zero source for the REPX fills in both branches below
    pxor                    m4, m4
    mova                    m0, [coeffq+16*0]
    mova                    m1, [coeffq+16*1]

    ; eob - 151 < 0 selects the reduced path
    test                  eobd, eobd
    jl .fast

.full:
    ; four inputs for the 8-point helper, m4-m7 zero
    mova                    m2, [coeffq+16*2]
    mova                    m3, [coeffq+16*3]

    REPX          {mova x, m4}, m5, m6, m7
    call  m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS    rsp+gprsize+16*3, 16

    ; four inputs for the 16-point helper, m4-m7 zero
    pxor                    m4, m4
    mova                    m0, [coeffq+16*16]
    mova                    m1, [coeffq+16*17]
    mova                    m2, [coeffq+16*18]
    mova                    m3, [coeffq+16*19]

    REPX          {mova x, m4}, m5, m6, m7
    call m(idct_16x8_internal_8bpc).main
    mova                    m7, [rsp+gprsize+16*0]
    SAVE_8ROWS   rsp+gprsize+16*11, 16

    ; eight inputs for the 32-point helper, stored to the same slots the
    ; 32x32 code labels in1..in15
    mova                    m0, [coeffq+16*8 ]
    mova                    m1, [coeffq+16*24]
    mova                    m2, [coeffq+16*9 ]
    mova                    m3, [coeffq+16*25]
    mova                    m4, [coeffq+16*10]
    mova                    m5, [coeffq+16*26]
    mova                    m6, [coeffq+16*11]
    mova                    m7, [coeffq+16*27]
    mova   [rsp+gprsize+16*19], m0
    mova   [rsp+gprsize+16*26], m1
    mova   [rsp+gprsize+16*23], m2
    mova   [rsp+gprsize+16*22], m3
    mova   [rsp+gprsize+16*21], m4
    mova   [rsp+gprsize+16*24], m5
    mova   [rsp+gprsize+16*25], m6
    mova   [rsp+gprsize+16*20], m7

    call m(idct_8x32_internal_8bpc).main_fast
    SAVE_8ROWS    rsp+gprsize+16*3, 16

    ; remaining inputs (in17..in31) for the local .main helper
    mova                    m0, [coeffq+16*6 ]            ;in17
    mova                    m1, [coeffq+16*14]            ;in19
    mova                    m2, [coeffq+16*22]            ;in21
    mova                    m3, [coeffq+16*30]            ;in23
    mova                    m4, [coeffq+16*7 ]            ;in25
    mova                    m5, [coeffq+16*15]            ;in27
    mova                    m6, [coeffq+16*23]            ;in29
    mova                    m7, [coeffq+16*31]            ;in31
    mova   [rsp+gprsize+16*63], m0                        ;in17
    mova   [rsp+gprsize+16*53], m1                        ;in19
    mova   [rsp+gprsize+16*55], m2                        ;in21
    mova   [rsp+gprsize+16*61], m3                        ;in23
    mova   [rsp+gprsize+16*59], m4                        ;in25
    mova   [rsp+gprsize+16*57], m5                        ;in27
    mova   [rsp+gprsize+16*51], m6                        ;in29
    mova   [rsp+gprsize+16*65], m7                        ;in31

    call .main
    jmp  .end

.fast:
    ; reduced path: two inputs per 8/16-point helper, four for the 32-point
    ; helper (main_veryfast), and the local .main_fast, which reads only
    ; in1..in15
    REPX          {mova x, m4}, m2, m3, m5, m6, m7
    call  m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS    rsp+gprsize+16*3, 16

    pxor                    m4, m4
    mova                    m0, [coeffq+16*16]
    mova                    m1, [coeffq+16*17]

    REPX          {mova x, m4}, m2, m3, m5, m6, m7
    call m(idct_16x8_internal_8bpc).main
    mova                    m7, [rsp+gprsize+16*0]
    SAVE_8ROWS   rsp+gprsize+16*11, 16

    mova                    m0, [coeffq+16*8 ]
    mova                    m1, [coeffq+16*24]
    mova                    m2, [coeffq+16*9 ]
    mova                    m3, [coeffq+16*25]
    mova   [rsp+gprsize+16*19], m0                        ;in1
    mova   [rsp+gprsize+16*26], m1                        ;in3
    mova   [rsp+gprsize+16*23], m2                        ;in5
    mova   [rsp+gprsize+16*22], m3                        ;in7

    call m(idct_8x32_internal_8bpc).main_veryfast
    SAVE_8ROWS    rsp+gprsize+16*3, 16

    call .main_fast

.end:
    ; first half of the output: reload the rows at slot 3 and enter the shared
    ; 8x32 tail at .end2 with r3 = .end1 as its continuation
    LOAD_8ROWS   rsp+gprsize+16*3, 16
    mova    [rsp+gprsize+16*0], m7
    mov                     r3, r4
    jmp  m(idct_8x32_internal_8bpc).end2

.end1:
    ; second half: rows from slot 35 on. rsp is biased by 16*32 while the
    ; shared tail runs (presumably so its rsp-relative row addressing lands on
    ; the upper 32 slots); .end2 undoes the bias.
    LOAD_8ROWS   rsp+gprsize+16*35, 16
    lea                   dstq, [dstq+strideq*2]
    add                    rsp, 16*32
    lea                     r3, [o(m(idct_16x64_internal_8bpc).end2)]
    jmp  m(idct_8x32_internal_8bpc).end

.end2:
    add                 coeffq, 16*32
    sub                    rsp, 16*32

    ; next iteration: dst from the saved slot (which is then advanced by 8),
    ; r4 re-armed with the .end1 continuation
    mov                   dstq, [rsp+gprsize*2+16*67]
    mov                    r3d, [rsp+gprsize*3+16*67]
    lea                     r4, [dstq+8]
    mov  [rsp+gprsize*2+16*67], r4
    lea                     r4, [o(m(idct_16x64_internal_8bpc).end1)]

    dec                    r3d
    jg .pass2_loop
    ret
5105
5106
; .main_fast: reduced first stage of the odd half of the 64-point IDCT
; (terms t32..t63), used when only in1..in15 are available. With the missing
; partner input absent, each t pair comes from one input times one constant
; (pmulhrsw), followed by an ITX_MULSUB_2W rotation (macro defined elsewhere;
; m7 = pd_2048 is handed to it as its fifth argument).
; Stack slots are addressed as rsp+gprsize*2+16*n, one gprsize more than in
; the calling routine (presumably for the extra return address of the call
; that reaches this helper). Inputs are read from slots
; 35/37/39/41/43/45/47/49; results go to slots 35..66, except t44/t45 (m6),
; t48/t49 (m4) and t46/t47 (m5), which stay in registers for .main2.
ALIGN function_align
.main_fast:
    mova                    m0, [rsp+gprsize*2+16*35]     ;in1
    pmulhrsw                m3, m0, [o(pw_4095x8)]        ;t62,t63
    pmulhrsw                m0, [o(pw_101x8)]             ;t32,t33
    mova                    m7, [o(pd_2048)]
    mova [rsp+gprsize*2+16*35], m0                        ;t32
    mova [rsp+gprsize*2+16*66], m3                        ;t63
    ITX_MULSUB_2W            3, 0, 1, 2, 7,  401, 4076    ;t33a, t62a
    mova [rsp+gprsize*2+16*36], m3                        ;t33a
    mova [rsp+gprsize*2+16*65], m0                        ;t62a

    mova                    m1, [rsp+gprsize*2+16*37]     ;in15
    pmulhrsw                m2, m1, [o(pw_3822x8)]        ;t60,t61
    pmulhrsw                m1, [o(pw_m1474x8)]           ;t34,t35
    mova [rsp+gprsize*2+16*38], m1                        ;t35
    mova [rsp+gprsize*2+16*63], m2                        ;t60
    ITX_MULSUB_2W            2, 1, 0, 3, 7, m4076, 401    ;t34a, t61a
    mova [rsp+gprsize*2+16*37], m2                        ;t34a
    mova [rsp+gprsize*2+16*64], m1                        ;t61a

    mova                    m0, [rsp+gprsize*2+16*39]     ;in9
    pmulhrsw                m3, m0, [o(pw_3996x8)]        ;t58,t59
    pmulhrsw                m0, [o(pw_897x8)]             ;t36,t37
    mova [rsp+gprsize*2+16*39], m0                        ;t36
    mova [rsp+gprsize*2+16*62], m3                        ;t59
    ITX_MULSUB_2W            3, 0, 1, 2, 7, 3166, 2598    ;t37a, t58a
    mova [rsp+gprsize*2+16*40], m3                        ;t37a
    mova [rsp+gprsize*2+16*61], m0                        ;t58a

    mova                    m1, [rsp+gprsize*2+16*41]     ;in7
    pmulhrsw                m2, m1, [o(pw_4036x8)]        ;t56,t57
    pmulhrsw                m1, [o(pw_m700x8)]            ;t38,t39
    mova [rsp+gprsize*2+16*42], m1                        ;t39
    mova [rsp+gprsize*2+16*59], m2                        ;t56
    ITX_MULSUB_2W            2, 1, 0, 3, 7, m2598, 3166   ;t38a, t57a
    mova [rsp+gprsize*2+16*41], m2                        ;t38a
    mova [rsp+gprsize*2+16*60], m1                        ;t57a

    mova                    m0, [rsp+gprsize*2+16*43]     ;in5
    pmulhrsw                m3, m0, [o(pw_4065x8)]        ;t54,t55
    pmulhrsw                m0, [o(pw_501x8)]             ;t40,t41
    mova [rsp+gprsize*2+16*43], m0                        ;t40
    mova [rsp+gprsize*2+16*58], m3                        ;t55
    ITX_MULSUB_2W            3, 0, 1, 2, 7, 1931, 3612    ;t41a, t54a
    mova [rsp+gprsize*2+16*44], m3                        ;t41a
    mova [rsp+gprsize*2+16*57], m0                        ;t54a

    mova                    m1, [rsp+gprsize*2+16*45]     ;in11
    pmulhrsw                m2, m1, [o(pw_3948x8)]        ;t52,t53
    pmulhrsw                m1, [o(pw_m1092x8)]           ;t42,t43
    mova [rsp+gprsize*2+16*46], m1                        ;t43
    mova [rsp+gprsize*2+16*55], m2                        ;t52
    ITX_MULSUB_2W            2, 1, 0, 3, 7, m3612, 1931   ;t42a, t53a
    mova [rsp+gprsize*2+16*45], m2                        ;t42a
    mova [rsp+gprsize*2+16*56], m1                        ;t53a

    ; from here on some results stay in registers instead of going to the
    ; stack: m6 keeps the unrotated t44/t45 value
    mova                    m0, [rsp+gprsize*2+16*47]     ;in13
    pmulhrsw                m3, m0, [o(pw_3889x8)]        ;t50,t51
    pmulhrsw                m0, [o(pw_1285x8)]            ;t44,t45
    mova                    m6, m0
    mova [rsp+gprsize*2+16*54], m3                        ;t51
    ITX_MULSUB_2W            3, 0, 1, 2, 7, 3920, 1189    ;t45a, t50a
    mova [rsp+gprsize*2+16*48], m3                        ;t45a
    mova [rsp+gprsize*2+16*53], m0                        ;t50a

    ; last pair is left in m4 (t48/t49) and m5 (t46/t47) without a rotation
    mova                    m0, [rsp+gprsize*2+16*49]     ;in3
    pmulhrsw                m3, m0, [o(pw_4085x8)]        ;t48,t49
    pmulhrsw                m0, [o(pw_m301x8)]            ;t46,t47
    mova                    m4, m3
    mova                    m5, m0

    ; continue in the later stages at .main2 (beyond this excerpt)
    jmp .main2
5180
; .main
; Processes the sixteen input vectors held in the odd-numbered stack slots
; [rsp+gprsize*2+16*35] .. [rsp+gprsize*2+16*65] (labelled in1..in31 in the
; trailing comments) as eight groups of two inputs. Each group:
;   1. scales both inputs by a pair of pw_*x8 constants with pmulhrsw,
;   2. forms saturating sums and differences with paddsw/psubsw,
;   3. passes the two differences to ITX_MULSUB_2W together with a coefficient
;      pair, using m7 as the macro's fifth argument,
;   4. writes the results back to the stack (labelled t32..t63).
; A group only overwrites the odd slots it has just read; its remaining
; results go to even-numbered slots, so no later group's input is clobbered
; before it is loaded.
;
; The eighth group stores nothing. On exit the following stay in registers:
;   m0 = t47, m3 = t48, m4 = t49, m5 = t46, m6 = t44 (from the seventh group),
;   m7 = pd_2048
; There is no ret/jmp at the end: execution falls through into .main2, which
; consumes those registers. All of m0-m7 are overwritten.
;
; NOTE(review): ITX_MULSUB_2W is defined outside this excerpt; the result
; names on its lines are taken from the existing comments - confirm against
; the macro. The in*/t* labels suggest the odd-coefficient stage of a
; 64-point inverse transform - confirm against the enclosing function.
5181ALIGN function_align
5182.main:
; in1/in31 -> t32, t33a, t62a, t63
5183    mova                    m0, [rsp+gprsize*2+16*35]     ;in1
5184    mova                    m1, [rsp+gprsize*2+16*65]     ;in31
5185    pmulhrsw                m3, m0, [o(pw_4095x8)]        ;t63a
5186    pmulhrsw                m0, [o(pw_101x8)]             ;t32a
5187    pmulhrsw                m2, m1, [o(pw_2967x8)]        ;t62a
5188    pmulhrsw                m1, [o(pw_m2824x8)]           ;t33a
5189    mova                    m7, [o(pd_2048)]              ;loaded once; 5th arg of every ITX_MULSUB_2W here and in .main2
5190    psubsw                  m4, m0, m1                    ;t33
5191    paddsw                  m0, m1                        ;t32
5192    psubsw                  m5, m3, m2                    ;t62
5193    paddsw                  m3, m2                        ;t63
5194    ITX_MULSUB_2W            5, 4, 1, 2, 7,  401, 4076    ;t33a, t62a
5195    mova [rsp+gprsize*2+16*35], m0                        ;t32
5196    mova [rsp+gprsize*2+16*36], m5                        ;t33a
5197    mova [rsp+gprsize*2+16*65], m4                        ;t62a
5198    mova [rsp+gprsize*2+16*66], m3                        ;t63
5199
; in17/in15 -> t34a, t35, t60, t61a
5200    mova                    m0, [rsp+gprsize*2+16*63]     ;in17
5201    mova                    m1, [rsp+gprsize*2+16*37]     ;in15
5202    pmulhrsw                m3, m0, [o(pw_3745x8)]        ;t61a
5203    pmulhrsw                m0, [o(pw_1660x8)]            ;t34a
5204    pmulhrsw                m2, m1, [o(pw_3822x8)]        ;t60a
5205    pmulhrsw                m1, [o(pw_m1474x8)]           ;t35a
5206    psubsw                  m4, m1, m0                    ;t34
5207    paddsw                  m0, m1                        ;t35
5208    psubsw                  m5, m2, m3                    ;t61
5209    paddsw                  m3, m2                        ;t60
5210    ITX_MULSUB_2W            5, 4, 1, 2, 7, m4076, 401    ;t34a, t61a
5211    mova [rsp+gprsize*2+16*37], m5                        ;t34a
5212    mova [rsp+gprsize*2+16*38], m0                        ;t35
5213    mova [rsp+gprsize*2+16*63], m3                        ;t60
5214    mova [rsp+gprsize*2+16*64], m4                        ;t61a
5215
; in9/in23 -> t36, t37a, t58a, t59
5216    mova                    m0, [rsp+gprsize*2+16*39]     ;in9
5217    mova                    m1, [rsp+gprsize*2+16*61]     ;in23
5218    pmulhrsw                m3, m0, [o(pw_3996x8)]        ;t59a
5219    pmulhrsw                m0, [o(pw_897x8)]             ;t36a
5220    pmulhrsw                m2, m1, [o(pw_3461x8)]        ;t58a
5221    pmulhrsw                m1, [o(pw_m2191x8)]           ;t37a
5222    psubsw                  m4, m0, m1                    ;t37
5223    paddsw                  m0, m1                        ;t36
5224    psubsw                  m5, m3, m2                    ;t58
5225    paddsw                  m3, m2                        ;t59
5226    ITX_MULSUB_2W            5, 4, 1, 2, 7, 3166, 2598    ;t37a, t58a
5227    mova [rsp+gprsize*2+16*39], m0                        ;t36
5228    mova [rsp+gprsize*2+16*40], m5                        ;t37a
5229    mova [rsp+gprsize*2+16*61], m4                        ;t58a
5230    mova [rsp+gprsize*2+16*62], m3                        ;t59
5231
; in25/in7 -> t38a, t39, t56, t57a
5232    mova                    m0, [rsp+gprsize*2+16*59]     ;in25
5233    mova                    m1, [rsp+gprsize*2+16*41]     ;in7
5234    pmulhrsw                m3, m0, [o(pw_3349x8)]        ;t57a
5235    pmulhrsw                m0, [o(pw_2359x8)]            ;t38a
5236    pmulhrsw                m2, m1, [o(pw_4036x8)]        ;t56a
5237    pmulhrsw                m1, [o(pw_m700x8)]            ;t39a
5238    psubsw                  m4, m1, m0                    ;t38
5239    paddsw                  m0, m1                        ;t39
5240    psubsw                  m5, m2, m3                    ;t57
5241    paddsw                  m3, m2                        ;t56
5242    ITX_MULSUB_2W            5, 4, 1, 2, 7, m2598, 3166   ;t38a, t57a
5243    mova [rsp+gprsize*2+16*41], m5                        ;t38a
5244    mova [rsp+gprsize*2+16*42], m0                        ;t39
5245    mova [rsp+gprsize*2+16*59], m3                        ;t56
5246    mova [rsp+gprsize*2+16*60], m4                        ;t57a
5247
; in5/in27 -> t40, t41a, t54a, t55
5248    mova                    m0, [rsp+gprsize*2+16*43]     ;in5
5249    mova                    m1, [rsp+gprsize*2+16*57]     ;in27
5250    pmulhrsw                m3, m0, [o(pw_4065x8)]        ;t55a
5251    pmulhrsw                m0, [o(pw_501x8)]             ;t40a
5252    pmulhrsw                m2, m1, [o(pw_3229x8)]        ;t54a
5253    pmulhrsw                m1, [o(pw_m2520x8)]           ;t41a
5254    psubsw                  m4, m0, m1                    ;t41
5255    paddsw                  m0, m1                        ;t40
5256    psubsw                  m5, m3, m2                    ;t54
5257    paddsw                  m3, m2                        ;t55
5258    ITX_MULSUB_2W            5, 4, 1, 2, 7, 1931, 3612    ;t41a, t54a
5259    mova [rsp+gprsize*2+16*43], m0                        ;t40
5260    mova [rsp+gprsize*2+16*44], m5                        ;t41a
5261    mova [rsp+gprsize*2+16*57], m4                        ;t54a
5262    mova [rsp+gprsize*2+16*58], m3                        ;t55
5263
; in21/in11 -> t42a, t43, t52, t53a
5264    mova                    m0, [rsp+gprsize*2+16*55]     ;in21
5265    mova                    m1, [rsp+gprsize*2+16*45]     ;in11
5266    pmulhrsw                m3, m0, [o(pw_3564x8)]        ;t53a
5267    pmulhrsw                m0, [o(pw_2019x8)]            ;t42a
5268    pmulhrsw                m2, m1, [o(pw_3948x8)]        ;t52a
5269    pmulhrsw                m1, [o(pw_m1092x8)]           ;t43a
5270    psubsw                  m4, m1, m0                    ;t42
5271    paddsw                  m0, m1                        ;t43
5272    psubsw                  m5, m2, m3                    ;t53
5273    paddsw                  m3, m2                        ;t52
5274    ITX_MULSUB_2W            5, 4, 1, 2, 7, m3612, 1931   ;t42a, t53a
5275    mova [rsp+gprsize*2+16*45], m5                        ;t42a
5276    mova [rsp+gprsize*2+16*46], m0                        ;t43
5277    mova [rsp+gprsize*2+16*55], m3                        ;t52
5278    mova [rsp+gprsize*2+16*56], m4                        ;t53a
5279
; in13/in19 -> t44 (kept in m6, not stored), t45a, t50a, t51
5280    mova                    m0, [rsp+gprsize*2+16*47]     ;in13
5281    mova                    m1, [rsp+gprsize*2+16*53]     ;in19
5282    pmulhrsw                m3, m0, [o(pw_3889x8)]        ;t51a
5283    pmulhrsw                m0, [o(pw_1285x8)]            ;t44a
5284    pmulhrsw                m2, m1, [o(pw_3659x8)]        ;t50a
5285    pmulhrsw                m1, [o(pw_m1842x8)]           ;t45a
5286    psubsw                  m4, m0, m1                    ;t45
5287    paddsw                  m0, m1                        ;t44
5288    psubsw                  m5, m3, m2                    ;t50
5289    paddsw                  m3, m2                        ;t51
5290    ITX_MULSUB_2W            5, 4, 1, 2, 7, 3920, 1189    ;t45a, t50a
5291    mova                    m6, m0                        ;t44 parked in m6; .main2 reads it from there
5292    mova [rsp+gprsize*2+16*48], m5                        ;t45a
5293    mova [rsp+gprsize*2+16*53], m4                        ;t50a
5294    mova [rsp+gprsize*2+16*54], m3                        ;t51
5295
; in29/in3 -> t46 (m5), t47 (m0), t48 (m3), t49 (m4); nothing is stored,
; these registers are handed to .main2 by falling through.
5296    mova                    m0, [rsp+gprsize*2+16*51]     ;in29
5297    mova                    m1, [rsp+gprsize*2+16*49]     ;in3
5298    pmulhrsw                m3, m0, [o(pw_3102x8)]        ;t49a
5299    pmulhrsw                m0, [o(pw_2675x8)]            ;t46a
5300    pmulhrsw                m2, m1, [o(pw_4085x8)]        ;t48a
5301    pmulhrsw                m1, [o(pw_m301x8)]            ;t47a
5302    psubsw                  m5, m1, m0                    ;t46
5303    paddsw                  m0, m1                        ;t47
5304    psubsw                  m4, m2, m3                    ;t49
5305    paddsw                  m3, m2                        ;t48
5306
5307ALIGN function_align
5308.main2:
5309    ITX_MULSUB_2W            4, 5, 1, 2, 7, m1189, 3920   ;t46a, t49a
5310    mova                    m1, [rsp+gprsize*2+16*54]     ;t51
5311    psubsw                  m2, m0, m6                    ;t44a
5312    paddsw                  m0, m6                        ;t47a
5313    psubsw                  m6, m3, m1                    ;t51a
5314    paddsw                  m3, m1                        ;t48a
5315    mova [rsp+gprsize*2+16*50], m0                        ;t47a
5316    mova [rsp+gprsize*2+16*51], m3                        ;t48a
5317    ITX_MULSUB_2W            6, 2, 0, 3, 7, m2276, 3406   ;t44, t51
5318    mova [rsp+gprsize*2+16*47], m6                        ;t44
5319    mova [rsp+gprsize*2+16*54], m2                        ;t51
5320
5321    mova                    m0, [rsp+gprsize*2+16*48]     ;t45a
5322    mova                    m3, [rsp+gprsize*2+16*53]     ;t50a
5323    psubsw                  m2, m4, m0                    ;t45
5324    paddsw                  m4, m0                        ;t46
5325    psubsw                  m6, m5, m3                    ;t50
5326    paddsw                  m5, m3                        ;t49
5327    ITX_MULSUB_2W            6, 2, 0, 3, 7, m2276, 3406   ;t45a, t50a
5328    mova [rsp+gprsize*2+16*48], m6                        ;t45a
5329    mova [rsp+gprsize*2+16*49], m4                        ;t46
5330    mova [rsp+gprsize*2+16*52], m5                        ;t49
5331    mova [rsp+gprsize*2+16*53], m2                        ;t50a
5332
5333    mova                    m0, [rsp+gprsize*2+16*43]     ;t40
5334    mova                    m2, [rsp+gprsize*2+16*46]     ;t43
5335    mova                    m3, [rsp+gprsize*2+16*55]     ;t52
5336    mova                    m1, [rsp+gprsize*2+16*58]     ;t55
5337    psubsw                  m4, m0, m2                    ;t43a
5338    paddsw                  m0, m2                        ;t40a
5339    psubsw                  m5, m1, m3                    ;t52a
5340    paddsw                  m1, m3                        ;t55a
5341    ITX_MULSUB_2W            5, 4, 2, 3, 7, 3406, 2276    ;t43, t52
5342    mova [rsp+gprsize*2+16*43], m0                        ;t40a
5343    mova [rsp+gprsize*2+16*46], m5                        ;t43
5344    mova [rsp+gprsize*2+16*55], m4                        ;t52
5345    mova [rsp+gprsize*2+16*58], m1                        ;t55a
5346
5347    mova                    m0, [rsp+gprsize*2+16*44]     ;t41a
5348    mova                    m2, [rsp+gprsize*2+16*45]     ;t42a
5349    mova                    m3, [rsp+gprsize*2+16*56]     ;t53a
5350    mova                    m1, [rsp+gprsize*2+16*57]     ;t54a
5351    psubsw                  m4, m0, m2                    ;t42
5352    paddsw                  m0, m2                        ;t41
5353    psubsw                  m5, m1, m3                    ;t53
5354    paddsw                  m1, m3                        ;t54
5355    ITX_MULSUB_2W            5, 4, 2, 3, 7, 3406, 2276    ;t42a, t53a
5356    mova [rsp+gprsize*2+16*44], m0                        ;t41
5357    mova [rsp+gprsize*2+16*45], m5                        ;t42a
5358    mova [rsp+gprsize*2+16*56], m4                        ;t53a
5359    mova [rsp+gprsize*2+16*57], m1                        ;t54
5360
5361    mova                    m0, [rsp+gprsize*2+16*41]     ;t38a
5362    mova                    m2, [rsp+gprsize*2+16*40]     ;t37a
5363    mova                    m3, [rsp+gprsize*2+16*61]     ;t58a
5364    mova                    m1, [rsp+gprsize*2+16*60]     ;t57a
5365    psubsw                  m4, m0, m2                    ;t37
5366    paddsw                  m0, m2                        ;t38
5367    psubsw                  m5, m1, m3                    ;t58
5368    paddsw                  m1, m3                        ;t57
5369    ITX_MULSUB_2W            5, 4, 2, 3, 7, m4017, 799    ;t37a, t58a
5370    mova [rsp+gprsize*2+16*41], m0                        ;t38
5371    mova [rsp+gprsize*2+16*40], m5                        ;t37a
5372    mova [rsp+gprsize*2+16*61], m4                        ;t58a
5373    mova [rsp+gprsize*2+16*60], m1                        ;t57
5374
5375    mova                    m0, [rsp+gprsize*2+16*42]     ;t39
5376    mova                    m2, [rsp+gprsize*2+16*39]     ;t36
5377    mova                    m3, [rsp+gprsize*2+16*62]     ;t59
5378    mova                    m1, [rsp+gprsize*2+16*59]     ;t56
5379    psubsw                  m4, m0, m2                    ;t36a
5380    paddsw                  m0, m2                        ;t39a
5381    psubsw                  m5, m1, m3                    ;t59a
5382    paddsw                  m1, m3                        ;t56a
5383    ITX_MULSUB_2W            5, 4, 2, 3, 7, m4017, 799    ;t36, t59
5384    mova [rsp+gprsize*2+16*42], m0                        ;t39a
5385    mova [rsp+gprsize*2+16*39], m5                        ;t36
5386    mova [rsp+gprsize*2+16*62], m4                        ;t59
5387    mova [rsp+gprsize*2+16*59], m1                        ;t56a
5388
5389    mova                    m0, [rsp+gprsize*2+16*35]     ;t32
5390    mova                    m2, [rsp+gprsize*2+16*38]     ;t35
5391    mova                    m3, [rsp+gprsize*2+16*63]     ;t60
5392    mova                    m1, [rsp+gprsize*2+16*66]     ;t63
5393    psubsw                  m4, m0, m2                    ;t35a
5394    paddsw                  m0, m2                        ;t32a
5395    psubsw                  m5, m1, m3                    ;t60a
5396    paddsw                  m1, m3                        ;t63a
5397    ITX_MULSUB_2W            5, 4, 2, 3, 7,  799, 4017    ;t35, t60
5398    mova [rsp+gprsize*2+16*35], m0                        ;t32a
5399    mova [rsp+gprsize*2+16*38], m5                        ;t35
5400    mova [rsp+gprsize*2+16*63], m4                        ;t60
5401    mova [rsp+gprsize*2+16*66], m1                        ;t63a
5402
5403    mova                    m0, [rsp+gprsize*2+16*36]     ;t33a
5404    mova                    m2, [rsp+gprsize*2+16*37]     ;t34a
5405    mova                    m3, [rsp+gprsize*2+16*64]     ;t61a
5406    mova                    m1, [rsp+gprsize*2+16*65]     ;t62a
5407    psubsw                  m4, m0, m2                    ;t34
5408    paddsw                  m0, m2                        ;t33
5409    psubsw                  m5, m1, m3                    ;t61
5410    paddsw                  m1, m3                        ;t62
5411    ITX_MULSUB_2W            5, 4, 2, 3, 7,  799, 4017    ;t34a, t61a
5412
5413    mova                    m2, [rsp+gprsize*2+16*41]     ;t38
5414    mova                    m3, [rsp+gprsize*2+16*60]     ;t57
5415    psubsw                  m6, m0, m2                    ;t38a
5416    paddsw                  m0, m2                        ;t33a
5417    psubsw                  m2, m1, m3                    ;t57a
5418    paddsw                  m1, m3                        ;t62a
5419    mova [rsp+gprsize*2+16*36], m0                        ;t33a
5420    mova [rsp+gprsize*2+16*65], m1                        ;t62a
5421    ITX_MULSUB_2W            2, 6, 0, 3, 7, 1567, 3784    ;t38, t57
5422    mova [rsp+gprsize*2+16*41], m2                        ;t38
5423    mova [rsp+gprsize*2+16*60], m6                        ;t57
5424
5425    mova                    m2, [rsp+gprsize*2+16*40]     ;t37
5426    mova                    m3, [rsp+gprsize*2+16*61]     ;t58
5427    psubsw                  m0, m5, m2                    ;t37
5428    paddsw                  m5, m2                        ;t34
5429    psubsw                  m1, m4, m3                    ;t58
5430    paddsw                  m4, m3                        ;t61
5431    ITX_MULSUB_2W            1, 0, 2, 3, 7, 1567, 3784    ;t37a, t58a
5432    mova [rsp+gprsize*2+16*37], m5                        ;t34
5433    mova [rsp+gprsize*2+16*64], m4                        ;t61
5434    mova [rsp+gprsize*2+16*40], m1                        ;t37a
5435    mova [rsp+gprsize*2+16*61], m0                        ;t58a
5436
5437    mova                    m0, [rsp+gprsize*2+16*38]     ;t35
5438    mova                    m2, [rsp+gprsize*2+16*39]     ;t36
5439    mova                    m3, [rsp+gprsize*2+16*62]     ;t59
5440    mova                    m1, [rsp+gprsize*2+16*63]     ;t60
5441    psubsw                  m4, m0, m2                    ;t36a
5442    paddsw                  m0, m2                        ;t35a
5443    psubsw                  m5, m1, m3                    ;t59a
5444    paddsw                  m1, m3                        ;t60a
5445    ITX_MULSUB_2W            5, 4, 2, 3, 7, 1567, 3784    ;t36, t59
5446    mova [rsp+gprsize*2+16*38], m0                        ;t35a
5447    mova [rsp+gprsize*2+16*39], m5                        ;t36
5448    mova [rsp+gprsize*2+16*62], m4                        ;t59
5449    mova [rsp+gprsize*2+16*63], m1                        ;t60a
5450
5451    mova                    m0, [rsp+gprsize*2+16*35]     ;t32a
5452    mova                    m2, [rsp+gprsize*2+16*42]     ;t39a
5453    mova                    m3, [rsp+gprsize*2+16*59]     ;t56a
5454    mova                    m1, [rsp+gprsize*2+16*66]     ;t63a
5455    psubsw                  m4, m0, m2                    ;t39
5456    paddsw                  m0, m2                        ;t32
5457    psubsw                  m5, m1, m3                    ;t56
5458    paddsw                  m1, m3                        ;t63
5459    ITX_MULSUB_2W            5, 4, 2, 3, 7, 1567, 3784    ;t39a, t56a
5460    mova [rsp+gprsize*2+16*35], m0                        ;t32
5461    mova [rsp+gprsize*2+16*42], m5                        ;t39a
5462    mova [rsp+gprsize*2+16*59], m4                        ;t56a
5463    mova [rsp+gprsize*2+16*66], m1                        ;t63
5464
5465    mova                    m0, [rsp+gprsize*2+16*50]     ;t47a
5466    mova                    m2, [rsp+gprsize*2+16*43]     ;t40a
5467    mova                    m3, [rsp+gprsize*2+16*58]     ;t55a
5468    mova                    m1, [rsp+gprsize*2+16*51]     ;t48a
5469    psubsw                  m4, m0, m2                    ;t40
5470    paddsw                  m0, m2                        ;t47
5471    psubsw                  m5, m1, m3                    ;t55
5472    paddsw                  m1, m3                        ;t48
5473    ITX_MULSUB_2W            5, 4, 2, 3, 7, m3784, 1567   ;t40a, t55a
5474    mova [rsp+gprsize*2+16*50], m0                        ;t47
5475    mova [rsp+gprsize*2+16*43], m5                        ;t40a
5476    mova [rsp+gprsize*2+16*58], m4                        ;t55a
5477    mova [rsp+gprsize*2+16*51], m1                        ;t48
5478
5479    mova                    m0, [rsp+gprsize*2+16*49]     ;t46
5480    mova                    m2, [rsp+gprsize*2+16*44]     ;t41
5481    mova                    m3, [rsp+gprsize*2+16*57]     ;t54
5482    mova                    m1, [rsp+gprsize*2+16*52]     ;t49
5483    psubsw                  m4, m0, m2                    ;t41a
5484    paddsw                  m0, m2                        ;t46a
5485    psubsw                  m5, m1, m3                    ;t54a
5486    paddsw                  m1, m3                        ;t49a
5487    ITX_MULSUB_2W            5, 4, 2, 3, 7, m3784, 1567   ;t41, t54
5488    mova [rsp+gprsize*2+16*49], m0                        ;t46a
5489    mova [rsp+gprsize*2+16*44], m5                        ;t41
5490    mova [rsp+gprsize*2+16*57], m4                        ;t54
5491    mova [rsp+gprsize*2+16*52], m1                        ;t49a
5492
5493    mova                    m0, [rsp+gprsize*2+16*48]     ;t45a
5494    mova                    m2, [rsp+gprsize*2+16*45]     ;t42a
5495    mova                    m3, [rsp+gprsize*2+16*56]     ;t53a
5496    mova                    m1, [rsp+gprsize*2+16*53]     ;t50a
5497    psubsw                  m4, m0, m2                    ;t42
5498    paddsw                  m0, m2                        ;t45
5499    psubsw                  m5, m1, m3                    ;t53
5500    paddsw                  m1, m3                        ;t50
5501    ITX_MULSUB_2W            5, 4, 2, 3, 7, m3784, 1567   ;t42a, t53a
5502    mova [rsp+gprsize*2+16*48], m0                        ;t45
5503    mova [rsp+gprsize*2+16*45], m5                        ;t42a
5504    mova [rsp+gprsize*2+16*56], m4                        ;t53a
5505    mova [rsp+gprsize*2+16*53], m1                        ;t50
5506
5507    mova                    m0, [rsp+gprsize*2+16*47]     ;t44
5508    mova                    m2, [rsp+gprsize*2+16*46]     ;t43
5509    mova                    m3, [rsp+gprsize*2+16*55]     ;t52
5510    mova                    m1, [rsp+gprsize*2+16*54]     ;t51
5511    psubsw                  m4, m0, m2                    ;t43a
5512    paddsw                  m0, m2                        ;t44a
5513    psubsw                  m5, m1, m3                    ;t52a
5514    paddsw                  m1, m3                        ;t51a
5515    ITX_MULSUB_2W            5, 4, 2, 3, 7, m3784, 1567   ;t43, t52
5516
5517    mova                    m2, [rsp+gprsize*2+16*38]     ;t35a
5518    mova                    m3, [rsp+gprsize*2+16*31]     ;tmp[28]
5519    psubsw                  m6, m2, m0                    ;t44
5520    paddsw                  m2, m0                        ;t35
5521    psubsw                  m0, m3, m2                    ;out35
5522    paddsw                  m2, m3                        ;out28
5523    mova                    m3, [rsp+gprsize*2+16*63]     ;t60a
5524    mova [rsp+gprsize*2+16*38], m0                        ;out35
5525    mova [rsp+gprsize*2+16*31], m2                        ;out28
5526    psubsw                  m0, m3, m1                    ;t51
5527    paddsw                  m3, m1                        ;t60
5528    ITX_MULSUB_2W            0, 6, 1, 2, 7, 2896, 2896    ;t44a, t51a
5529    mova                    m2, [rsp+gprsize*2+16*6 ]     ;tmp[3]
5530    psubsw                  m1, m2, m3                    ;out60
5531    paddsw                  m2, m3                        ;out3
5532    mova                    m3, [rsp+gprsize*2+16*22]     ;tmp[19]
5533    mova [rsp+gprsize*2+16*63], m1                        ;out60
5534    mova [rsp+gprsize*2+16*6 ], m2                        ;out3
5535    psubsw                  m1, m3, m0                    ;out44
5536    paddsw                  m3, m0                        ;out19
5537    mova                    m2, [rsp+gprsize*2+16*15]     ;tmp[12]
5538
5539    mova                    m0, [rsp+gprsize*2+16*39]     ;t36
5540    mova [rsp+gprsize*2+16*47], m1                        ;out44
5541    mova [rsp+gprsize*2+16*22], m3                        ;out19
5542    mova                    m1, [rsp+gprsize*2+16*62]     ;t59
5543    psubsw                  m3, m2, m6                    ;out51
5544    paddsw                  m2, m6                        ;out12
5545    mova [rsp+gprsize*2+16*54], m3                        ;out51
5546    mova [rsp+gprsize*2+16*15], m2                        ;out12
5547    psubsw                  m2, m0, m5                    ;t43a
5548    paddsw                  m0, m5                        ;t36a
5549    mova                    m5, [rsp+gprsize*2+16*30]     ;tmp[27]
5550    psubsw                  m3, m1, m4                    ;t52a
5551    paddsw                  m1, m4                        ;t59a
5552    ITX_MULSUB_2W            3, 2, 4, 6, 7, 2896, 2896    ;t43, t52
5553    mova                    m4, [rsp+gprsize*2+16*7 ]     ;tmp[4 ]
5554    psubsw                  m6, m5, m0                    ;out36
5555    paddsw                  m5, m0                        ;out27
5556    psubsw                  m0, m4, m1                    ;out59
5557    paddsw                  m4, m1                        ;out4
5558    mova [rsp+gprsize*2+16*39], m6                        ;out36
5559    mova [rsp+gprsize*2+16*30], m5                        ;out27
5560    mova [rsp+gprsize*2+16*62], m0                        ;out59
5561    mova [rsp+gprsize*2+16*7 ], m4                        ;out4
5562    mova                    m0, [rsp+gprsize*2+16*23]     ;tmp[20]
5563    mova                    m5, [rsp+gprsize*2+16*14]     ;tmp[11]
5564    psubsw                  m4, m0, m3                    ;out43
5565    paddsw                  m0, m3                        ;out20
5566    psubsw                  m6, m5, m2                    ;out52
5567    paddsw                  m5, m2                        ;out11
5568    mova [rsp+gprsize*2+16*46], m4                        ;out43
5569    mova [rsp+gprsize*2+16*23], m0                        ;out20
5570    mova [rsp+gprsize*2+16*55], m6                        ;out52
5571    mova [rsp+gprsize*2+16*14], m5                        ;out11
5572
5573    mova                    m0, [rsp+gprsize*2+16*40]     ;t37a
5574    mova                    m5, [rsp+gprsize*2+16*45]     ;t42a
5575    mova                    m3, [rsp+gprsize*2+16*56]     ;t53a
5576    mova                    m1, [rsp+gprsize*2+16*61]     ;t58a
5577    mova                    m2, [rsp+gprsize*2+16*29]     ;tmp[26]
5578    psubsw                  m4, m0, m5                    ;t42
5579    paddsw                  m0, m5                        ;t37
5580    psubsw                  m5, m1, m3                    ;t53
5581    paddsw                  m1, m3                        ;t58
5582    ITX_MULSUB_2W            5, 4, 3, 6, 7, 2896, 2896    ;t43, t52
5583    mova                    m3, [rsp+gprsize*2+16*8 ]     ;tmp[5 ]
5584    psubsw                  m6, m2, m0                    ;out37
5585    paddsw                  m2, m0                        ;out26
5586    psubsw                  m0, m3, m1                    ;out58
5587    paddsw                  m3, m1                        ;out5
5588    mova [rsp+gprsize*2+16*40], m6                        ;out37
5589    mova [rsp+gprsize*2+16*29], m2                        ;out26
5590    mova [rsp+gprsize*2+16*61], m0                        ;out58
5591    mova [rsp+gprsize*2+16*8 ], m3                        ;out5
5592    mova                    m0, [rsp+gprsize*2+16*24]     ;tmp[21]
5593    mova                    m1, [rsp+gprsize*2+16*13]     ;tmp[10]
5594    psubsw                  m2, m0, m5                    ;out42
5595    paddsw                  m0, m5                        ;out21
5596    psubsw                  m3, m1, m4                    ;out53
5597    paddsw                  m1, m4                        ;out10
5598    mova [rsp+gprsize*2+16*45], m2                        ;out42
5599    mova [rsp+gprsize*2+16*24], m0                        ;out21
5600    mova [rsp+gprsize*2+16*56], m3                        ;out53
5601    mova [rsp+gprsize*2+16*13], m1                        ;out10
5602
5603    mova                    m0, [rsp+gprsize*2+16*41]     ;t38
5604    mova                    m5, [rsp+gprsize*2+16*44]     ;t41
5605    mova                    m3, [rsp+gprsize*2+16*57]     ;t54
5606    mova                    m1, [rsp+gprsize*2+16*60]     ;t57
5607    mova                    m2, [rsp+gprsize*2+16*28]     ;tmp[25]
5608    psubsw                  m4, m0, m5                    ;t41a
5609    paddsw                  m0, m5                        ;t38a
5610    psubsw                  m5, m1, m3                    ;t54a
5611    paddsw                  m1, m3                        ;t57a
5612    ITX_MULSUB_2W            5, 4, 3, 6, 7, 2896, 2896    ;t41a, t54a
5613    mova                    m3, [rsp+gprsize*2+16*9 ]     ;tmp[6 ]
5614    psubsw                  m6, m2, m0                    ;out38
5615    paddsw                  m2, m0                        ;out25
5616    psubsw                  m0, m3, m1                    ;out57
5617    paddsw                  m3, m1                        ;out6
5618    mova [rsp+gprsize*2+16*41], m6                        ;out38
5619    mova [rsp+gprsize*2+16*28], m2                        ;out25
5620    mova [rsp+gprsize*2+16*60], m0                        ;out57
5621    mova [rsp+gprsize*2+16*9 ], m3                        ;out6
5622    mova                    m0, [rsp+gprsize*2+16*25]     ;tmp[22]
5623    mova                    m1, [rsp+gprsize*2+16*12]     ;tmp[9 ]
5624    psubsw                  m2, m0, m5                    ;out41
5625    paddsw                  m0, m5                        ;out22
5626    psubsw                  m3, m1, m4                    ;out54
5627    paddsw                  m1, m4                        ;out9
5628    mova [rsp+gprsize*2+16*44], m2                        ;out41
5629    mova [rsp+gprsize*2+16*25], m0                        ;out22
5630    mova [rsp+gprsize*2+16*57], m3                        ;out54
5631    mova [rsp+gprsize*2+16*12], m1                        ;out9
5632
5633    mova                    m0, [rsp+gprsize*2+16*42]     ;t39a
5634    mova                    m5, [rsp+gprsize*2+16*43]     ;t40a
5635    mova                    m3, [rsp+gprsize*2+16*58]     ;t55a
5636    mova                    m1, [rsp+gprsize*2+16*59]     ;t56a
5637    mova                    m2, [rsp+gprsize*2+16*27]     ;tmp[24]
5638    psubsw                  m4, m0, m5                    ;t40
5639    paddsw                  m0, m5                        ;t39
5640    psubsw                  m5, m1, m3                    ;t55
5641    paddsw                  m1, m3                        ;t56
5642    ITX_MULSUB_2W            5, 4, 3, 6, 7, 2896, 2896    ;t40a, t55a
5643    mova                    m3, [rsp+gprsize*2+16*10]     ;tmp[7 ]
5644    psubsw                  m6, m2, m0                    ;out39
5645    paddsw                  m2, m0                        ;out24
5646    psubsw                  m0, m3, m1                    ;out56
5647    paddsw                  m3, m1                        ;out7
5648    mova [rsp+gprsize*2+16*42], m6                        ;out39
5649    mova [rsp+gprsize*2+16*27], m2                        ;out24
5650    mova [rsp+gprsize*2+16*59], m0                        ;out56
5651    mova [rsp+gprsize*2+16*10], m3                        ;out7
5652    mova                    m0, [rsp+gprsize*2+16*26]     ;tmp[23]
5653    mova                    m1, [rsp+gprsize*2+16*11]     ;tmp[8 ]
5654    psubsw                  m2, m0, m5                    ;out40
5655    paddsw                  m0, m5                        ;out23
5656    psubsw                  m3, m1, m4                    ;out55
5657    paddsw                  m1, m4                        ;out8
5658    mova [rsp+gprsize*2+16*43], m2                        ;out40
5659    mova [rsp+gprsize*2+16*26], m0                        ;out23
5660    mova [rsp+gprsize*2+16*58], m3                        ;out55
5661    mova [rsp+gprsize*2+16*11], m1                        ;out8
5662
5663    mova                    m0, [rsp+gprsize*2+16*37]     ;t34
5664    mova                    m5, [rsp+gprsize*2+16*48]     ;t45
5665    mova                    m3, [rsp+gprsize*2+16*53]     ;t50
5666    mova                    m1, [rsp+gprsize*2+16*64]     ;t61
5667    mova                    m2, [rsp+gprsize*2+16*32]     ;tmp[29]
5668    psubsw                  m4, m0, m5                    ;t45a
5669    paddsw                  m0, m5                        ;t34a
5670    psubsw                  m5, m1, m3                    ;t50a
5671    paddsw                  m1, m3                        ;t61a
5672    ITX_MULSUB_2W            5, 4, 3, 6, 7, 2896, 2896    ;t45, t50
5673    mova                    m3, [rsp+gprsize*2+16*5 ]     ;tmp[2 ]
5674    psubsw                  m6, m2, m0                    ;out34
5675    paddsw                  m2, m0                        ;out29
5676    psubsw                  m0, m3, m1                    ;out61
5677    paddsw                  m3, m1                        ;out2
5678    mova [rsp+gprsize*2+16*37], m6                        ;out34
5679    mova [rsp+gprsize*2+16*32], m2                        ;out29
5680    mova [rsp+gprsize*2+16*64], m0                        ;out61
5681    mova [rsp+gprsize*2+16*5 ], m3                        ;out2
5682    mova                    m0, [rsp+gprsize*2+16*21]     ;tmp[18]
5683    mova                    m1, [rsp+gprsize*2+16*16]     ;tmp[13]
5684    psubsw                  m2, m0, m5                    ;out45
5685    paddsw                  m0, m5                        ;out18
5686    psubsw                  m3, m1, m4                    ;out50
5687    paddsw                  m1, m4                        ;out13
5688    mova [rsp+gprsize*2+16*48], m2                        ;out45
5689    mova [rsp+gprsize*2+16*21], m0                        ;out18
5690    mova [rsp+gprsize*2+16*53], m3                        ;out50
5691    mova [rsp+gprsize*2+16*16], m1                        ;out13
5692
5693    mova                    m0, [rsp+gprsize*2+16*36]     ;t33a
5694    mova                    m5, [rsp+gprsize*2+16*49]     ;t46a
5695    mova                    m3, [rsp+gprsize*2+16*52]     ;t49a
5696    mova                    m1, [rsp+gprsize*2+16*65]     ;t62a
5697    mova                    m2, [rsp+gprsize*2+16*33]     ;tmp[30]
5698    psubsw                  m4, m0, m5                    ;t46
5699    paddsw                  m0, m5                        ;t33
5700    psubsw                  m5, m1, m3                    ;t49
5701    paddsw                  m1, m3                        ;t62
5702    ITX_MULSUB_2W            5, 4, 3, 6, 7, 2896, 2896    ;t45, t50
5703    mova                    m3, [rsp+gprsize*2+16*4 ]     ;tmp[1 ]
5704    psubsw                  m6, m2, m0                    ;out33
5705    paddsw                  m2, m0                        ;out30
5706    psubsw                  m0, m3, m1                    ;out62
5707    paddsw                  m3, m1                        ;out1
5708    mova [rsp+gprsize*2+16*36], m6                        ;out33
5709    mova [rsp+gprsize*2+16*33], m2                        ;out30
5710    mova [rsp+gprsize*2+16*65], m0                        ;out62
5711    mova [rsp+gprsize*2+16*4 ], m3                        ;out1
5712    mova                    m0, [rsp+gprsize*2+16*20]     ;tmp[17]
5713    mova                    m1, [rsp+gprsize*2+16*17]     ;tmp[14]
5714    psubsw                  m2, m0, m5                    ;out46
5715    paddsw                  m0, m5                        ;out17
5716    psubsw                  m3, m1, m4                    ;out49
5717    paddsw                  m1, m4                        ;out14
5718    mova [rsp+gprsize*2+16*49], m2                        ;out46
5719    mova [rsp+gprsize*2+16*20], m0                        ;out17
5720    mova [rsp+gprsize*2+16*52], m3                        ;out49
5721    mova [rsp+gprsize*2+16*17], m1                        ;out14
5722
5723    mova                    m0, [rsp+gprsize*2+16*35]     ;t32
5724    mova                    m5, [rsp+gprsize*2+16*50]     ;t47
5725    mova                    m3, [rsp+gprsize*2+16*51]     ;t48
5726    mova                    m1, [rsp+gprsize*2+16*66]     ;t63
5727    mova                    m2, [rsp+gprsize*2+16*34]     ;tmp[31]
5728    psubsw                  m4, m0, m5                    ;t47a
5729    paddsw                  m0, m5                        ;t32a
5730    psubsw                  m5, m1, m3                    ;t48a
5731    paddsw                  m1, m3                        ;t63a
5732    ITX_MULSUB_2W            5, 4, 3, 6, 7, 2896, 2896    ;t47, t48
5733    mova                    m3, [rsp+gprsize*2+16*3 ]     ;tmp[0 ]
5734    psubsw                  m6, m2, m0                    ;out32
5735    paddsw                  m2, m0                        ;out31
5736    psubsw                  m0, m3, m1                    ;out63
5737    paddsw                  m3, m1                        ;out0
5738    mova [rsp+gprsize*2+16*35], m6                        ;out32
5739    mova [rsp+gprsize*2+16*34], m2                        ;out31
5740    mova [rsp+gprsize*2+16*66], m0                        ;out63
5741    mova [rsp+gprsize*2+16*3 ], m3                        ;out0
5742    mova                    m0, [rsp+gprsize*2+16*19]     ;tmp[16]
5743    mova                    m1, [rsp+gprsize*2+16*18]     ;tmp[15]
5744    psubsw                  m2, m0, m5                    ;out47
5745    paddsw                  m0, m5                        ;out16
5746    psubsw                  m3, m1, m4                    ;out48
5747    paddsw                  m1, m4                        ;out15
5748    mova [rsp+gprsize*2+16*50], m2                        ;out47
5749    mova [rsp+gprsize*2+16*19], m0                        ;out16
5750    mova [rsp+gprsize*2+16*51], m3                        ;out48
5751    mova [rsp+gprsize*2+16*18], m1                        ;out15
5752    ret
5753
5754
; inv_txfm_add_dct_dct_64x16_8bpc(dst, stride, coeff, eob)
; Entry point for the 64x16 DCT_DCT inverse transform + add (8 bpc).
; A zero eob takes the DC-only shortcut below; anything else is handed to
; idct_64x16_internal_8bpc. tx2 names a scratch GPR that carries the address
; the shared row loop jumps to when it is done.
cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 6, 8, 16*132, dst, stride, coeff, eob, tx2
%if ARCH_X86_32
    LEA                     r5, $$
%endif
    test                  eobd, eobd
    jz .dconly

    call m(idct_64x16_internal_8bpc)
    RET

.dconly:
    movd                    m1, [o(pw_2896x8)]
    pmulhrsw                m0, m1, [coeffq]
    movd                    m2, [o(pw_8192)]
    mov               [coeffq], eobd          ; eobd == 0 here: clears the first dword at coeffq
    mov                    r3d, 16            ; row count; kept after the store above because it
                                              ; overwrites r3 (eob under x86inc argument naming)
    lea                   tx2q, [o(m(inv_txfm_add_dct_dct_64x16_8bpc).end)]

; Shared tail: entered by fall-through and by jumps from the 64x32 and 64x64
; DC-only paths. On entry: m0 = DC product, m1 = pw_2896x8, m2 = first
; rounding multiplier, r3d = number of rows, tx2q = where to go afterwards.
.body:
    pmulhrsw                m0, m2
    movd                    m2, [o(pw_2048)]  ;intentionally rip-relative
    pmulhrsw                m0, m1
    pmulhrsw                m0, m2
    pshuflw                 m0, m0, q0000
    punpcklwd               m0, m0            ; DC word replicated into all eight lanes
    pxor                    m7, m7            ; zero, used to widen bytes to words

; One 64-byte destination row per iteration, processed as two 32-byte halves:
; widen to words, add the DC term, pack back with unsigned saturation.
.loop:
    ; bytes 0..31
    mova                    m1, [dstq+16*0]
    mova                    m3, [dstq+16*1]
    punpcklbw               m2, m1, m7
    punpckhbw               m1, m7
    punpcklbw               m4, m3, m7
    punpckhbw               m3, m7
    REPX         {paddw x, m0}, m2, m1, m4, m3
    packuswb                m2, m1
    packuswb                m4, m3
    mova           [dstq+16*0], m2
    mova           [dstq+16*1], m4
    ; bytes 32..63
    mova                    m5, [dstq+16*2]
    mova                    m6, [dstq+16*3]
    punpcklbw               m1, m5, m7
    punpckhbw               m5, m7
    punpcklbw               m3, m6, m7
    punpckhbw               m6, m7
    REPX         {paddw x, m0}, m1, m5, m3, m6
    packuswb                m1, m5
    packuswb                m3, m6
    mova           [dstq+16*2], m1
    mova           [dstq+16*3], m3
    add                   dstq, strideq
    dec                    r3d
    jg .loop
    jmp                   tx2q

.end:
    RET
5818
5819
; LOAD_4ROWS src, stride[, is_rect2]
; Fills m0..m3 with the four 16-byte vectors at src+stride*{0,1,2,3}.
; With a non-zero third argument every vector is additionally multiplied by
; pw_2896x8 using pmulhrsw. No register other than m0..m3 is written.
%macro LOAD_4ROWS 2-3 0 ;src, stride, is_rect2
%if %3 == 0
    mova                 m3, [%1+%2*3]
    mova                 m2, [%1+%2*2]
    mova                 m1, [%1+%2*1]
    mova                 m0, [%1+%2*0]
%else
    ; m0 holds the multiplier until the final product replaces it
    mova                 m0, [o(pw_2896x8)]
    pmulhrsw             m3, m0, [%1+%2*3]
    pmulhrsw             m2, m0, [%1+%2*2]
    pmulhrsw             m1, m0, [%1+%2*1]
    pmulhrsw             m0, [%1+%2*0]
%endif
%endmacro
5835
; LOAD_4ROWS_H src, stride
; Companion of LOAD_4ROWS for the upper register half: fills m4..m7 with the
; four 16-byte vectors at src+stride*{0,1,2,3}. m0..m3 are left untouched.
%macro LOAD_4ROWS_H 2 ;src, stride
    mova                 m7, [%1+%2*3]
    mova                 m6, [%1+%2*2]
    mova                 m5, [%1+%2*1]
    mova                 m4, [%1+%2*0]
%endmacro
5842
; Worker for the 64x16 inverse DCT, entered by `call` from
; inv_txfm_add_dct_dct_64x16_8bpc and returning with a plain `ret`.
; It declares no arguments/registers/stack of its own (0, 0, 0) and works in
; rsp-relative slots (presumably the stack area reserved by the calling wrapper):
;   [rsp+gprsize*1+16*67]   pass-2 loop counter
;   [rsp+gprsize*2+16*67]   destination pointer
;   rsp+gprsize+16*68 ...   buffer that receives the second half of the pass-1 output
;   rsp+gprsize+16*0..66    scratch exchanged with the .main helpers called below
; eob is not consulted here; r3 is reused as a loop counter.
; NOTE(review): the helpers reached through m(...) (.main, .main_fast,
; .pass1_end1, .end) and the LOAD_8ROWS/SAVE_7ROWS/SAVE_8ROWS macros are
; defined outside this chunk; remarks about them below are assumptions.
5843cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
5844    mov                    r3d, 2
5845    mov  [rsp+gprsize*2+16*67], dstq
5846    lea                   dstq, [rsp+gprsize+16*68]
5847
    ; ---- pass 1: two iterations (r3d = 2), each advancing coeffq and dstq by 16 bytes.
    ; Input lines are 32 bytes apart; dstq points into the stack buffer during this pass.
5848.pass1_loop:
    ; lines 0, 8, 16, 24 in m0-m3, m4-m7 zeroed
5849    LOAD_4ROWS     coeffq+32*0, 32*8
5850    pxor                    m4, m4
5851    REPX          {mova x, m4}, m5, m6, m7
5852    call  m(idct_8x8_internal_8bpc).main
5853    SAVE_7ROWS    rsp+gprsize+16*3, 16
5854
    ; lines 4, 12, 20, 28 in m0-m3, m4-m7 zeroed
5855    pxor                    m4, m4
5856    LOAD_4ROWS     coeffq+32*4, 32*8
5857
5858    REPX          {mova x, m4}, m5, m6, m7
5859    call m(idct_16x8_internal_8bpc).main
5860    mova                    m7, [rsp+gprsize+16*0]
5861    SAVE_8ROWS   rsp+gprsize+16*11, 16
5862
    ; lines 2, 6, ..., 30 are placed in scratch slots 19..26 in the order
    ; the 8x32 helper reads them (presumably its odd-input layout)
5863    LOAD_8ROWS     coeffq+32*2, 32*4
5864    mova   [rsp+gprsize+16*19], m0
5865    mova   [rsp+gprsize+16*26], m1
5866    mova   [rsp+gprsize+16*23], m2
5867    mova   [rsp+gprsize+16*22], m3
5868    mova   [rsp+gprsize+16*21], m4
5869    mova   [rsp+gprsize+16*24], m5
5870    mova   [rsp+gprsize+16*25], m6
5871    mova   [rsp+gprsize+16*20], m7
5872
5873    call m(idct_8x32_internal_8bpc).main_fast
5874    SAVE_8ROWS    rsp+gprsize+16*3, 16
5875
    ; odd lines 1..15 and 17..31, scattered over slots 35..65 for the 64-point stage
5876    LOAD_8ROWS     coeffq+32*1, 32*2
5877    mova   [rsp+gprsize+16*35], m0                        ;in1
5878    mova   [rsp+gprsize+16*49], m1                        ;in3
5879    mova   [rsp+gprsize+16*43], m2                        ;in5
5880    mova   [rsp+gprsize+16*41], m3                        ;in7
5881    mova   [rsp+gprsize+16*39], m4                        ;in9
5882    mova   [rsp+gprsize+16*45], m5                        ;in11
5883    mova   [rsp+gprsize+16*47], m6                        ;in13
5884    mova   [rsp+gprsize+16*37], m7                        ;in15
5885
5886    LOAD_8ROWS    coeffq+32*17, 32*2
5887    mova   [rsp+gprsize+16*63], m0                        ;in17
5888    mova   [rsp+gprsize+16*53], m1                        ;in19
5889    mova   [rsp+gprsize+16*55], m2                        ;in21
5890    mova   [rsp+gprsize+16*61], m3                        ;in23
5891    mova   [rsp+gprsize+16*59], m4                        ;in25
5892    mova   [rsp+gprsize+16*57], m5                        ;in27
5893    mova   [rsp+gprsize+16*51], m6                        ;in29
5894    mova   [rsp+gprsize+16*65], m7                        ;in31
5895
5896    call m(idct_16x64_internal_8bpc).main
5897
    ; The results are now drained in eight groups of eight vectors (scratch
    ; slots 3, 11, ..., 59). Each group is loaded, m7 is parked in slot 0 and
    ; replaced by pw_8192, and control jumps to idct_8x8's .pass1_end1 with
    ; tx2q holding the label to come back to. The first four groups are stored
    ; into the coefficient buffer, the last four into the stack buffer at dstq.
5898    LOAD_8ROWS    rsp+gprsize+16*3, 16
5899    mova    [rsp+gprsize+16*0], m7
5900    mova                    m7, [o(pw_8192)]
5901    lea                   tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end)]
5902    jmp   m(idct_8x8_internal_8bpc).pass1_end1
5903
5904.pass1_end:
5905    SAVE_8ROWS     coeffq+32*0, 32
5906    LOAD_8ROWS   rsp+gprsize+16*11, 16
5907    mova    [rsp+gprsize+16*0], m7
5908    mova                    m7, [o(pw_8192)]
5909    lea                   tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end1)]
5910    jmp   m(idct_8x8_internal_8bpc).pass1_end1
5911
5912.pass1_end1:
5913    SAVE_8ROWS     coeffq+32*8, 32
5914    LOAD_8ROWS   rsp+gprsize+16*19, 16
5915    mova    [rsp+gprsize+16*0], m7
5916    mova                    m7, [o(pw_8192)]
5917    lea                   tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end2)]
5918    jmp   m(idct_8x8_internal_8bpc).pass1_end1
5919
5920.pass1_end2:
5921    SAVE_8ROWS    coeffq+32*16, 32
5922    LOAD_8ROWS   rsp+gprsize+16*27, 16
5923    mova    [rsp+gprsize+16*0], m7
5924    mova                    m7, [o(pw_8192)]
5925    lea                   tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end3)]
5926    jmp   m(idct_8x8_internal_8bpc).pass1_end1
5927
5928.pass1_end3:
5929    SAVE_8ROWS    coeffq+32*24, 32
5930    LOAD_8ROWS   rsp+gprsize+16*35, 16
5931    mova    [rsp+gprsize+16*0], m7
5932    mova                    m7, [o(pw_8192)]
5933    lea                   tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end4)]
5934    jmp   m(idct_8x8_internal_8bpc).pass1_end1
5935
5936.pass1_end4:
5937    SAVE_8ROWS       dstq+32*0, 32
5938    LOAD_8ROWS   rsp+gprsize+16*43, 16
5939    mova    [rsp+gprsize+16*0], m7
5940    mova                    m7, [o(pw_8192)]
5941    lea                   tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end5)]
5942    jmp   m(idct_8x8_internal_8bpc).pass1_end1
5943
5944.pass1_end5:
5945    SAVE_8ROWS       dstq+32*8, 32
5946    LOAD_8ROWS   rsp+gprsize+16*51, 16
5947    mova    [rsp+gprsize+16*0], m7
5948    mova                    m7, [o(pw_8192)]
5949    lea                   tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end6)]
5950    jmp   m(idct_8x8_internal_8bpc).pass1_end1
5951
5952.pass1_end6:
5953    SAVE_8ROWS      dstq+32*16, 32
5954    LOAD_8ROWS   rsp+gprsize+16*59, 16
5955    mova    [rsp+gprsize+16*0], m7
5956    mova                    m7, [o(pw_8192)]
5957    lea                   tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end7)]
5958    jmp   m(idct_8x8_internal_8bpc).pass1_end1
5959
5960.pass1_end7:
5961    SAVE_8ROWS      dstq+32*24, 32
5962
5963    add                 coeffq, 16
5964    add                   dstq, 16
5965    dec                    r3d
5966    jg .pass1_loop
5967
    ; ---- pass 2, first half: four iterations over the coefficient buffer.
    ; coeffq is rewound by the 2*16 bytes added during pass 1 and the real
    ; destination pointer is restored.
5968.pass2:
5969    mov                   dstq, [rsp+gprsize*2+16*67]
5970    sub                 coeffq, 32
5971    mov                    r3d, 4
5972
5973.pass2_loop:
5974    mov  [rsp+gprsize*1+16*67], r3d
5975
    ; 16 vectors (16*16 bytes) per iteration: those at 16*0/16*1 (+64*k) feed
    ; the 8-point main, those at 16*2/16*3 (+64*k) feed the 16-point main.
5976    LOAD_4ROWS     coeffq+16*0, 32*2
5977    LOAD_4ROWS_H   coeffq+16*1, 32*2
5978    call  m(idct_8x8_internal_8bpc).main
5979    SAVE_7ROWS    rsp+gprsize+16*3, 16
5980    LOAD_4ROWS     coeffq+16*2, 32*2
5981    LOAD_4ROWS_H   coeffq+16*3, 32*2
5982    call m(idct_16x8_internal_8bpc).main
5983
    ; idct_8x8's .end is used twice: first with dstq advanced by 8 rows for the
    ; registers just produced, then (at .end) with the original dstq, kept in r3,
    ; for the eight vectors reloaded from scratch. tx2q is the return label.
5984    mov                    r3, dstq
5985    lea                  tx2q, [o(m(idct_64x16_internal_8bpc).end)]
5986    lea                  dstq, [dstq+strideq*8]
5987    jmp  m(idct_8x8_internal_8bpc).end
5988
5989.end:
5990    LOAD_8ROWS   rsp+gprsize+16*3, 16
5991    mova   [rsp+gprsize+16*0], m7
5992    lea                  tx2q, [o(m(idct_64x16_internal_8bpc).end1)]
5993    mov                  dstq, r3
5994    jmp  m(idct_8x8_internal_8bpc).end
5995
5996.end1:
    ; zero the 16*16 bytes of coefficients just consumed
5997    pxor                   m7, m7
5998    REPX  {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15
5999
    ; next 16 vectors; the saved destination pointer moves 8 bytes to the right
6000    add                 coeffq, 16*16
6001    mov                    r3d, [rsp+gprsize*1+16*67]
6002    mov                   dstq, [rsp+gprsize*2+16*67]
6003    add                   dstq, 8
6004    mov  [rsp+gprsize*2+16*67], dstq
6005    dec                    r3d
6006    jg .pass2_loop
6007
    ; ---- pass 2, second half: the same loop, now reading the stack buffer
    ; filled by .pass1_end4..7. The destination pointer continues from where the
    ; first half stopped, and the buffer is not cleared afterwards.
6008    mov                    r3d, 4
6009    lea                 coeffq, [rsp+gprsize+16*68]
6010.pass2_loop2:
6011    mov  [rsp+gprsize*1+16*67], r3d
6012
6013    LOAD_4ROWS     coeffq+16*0, 32*2
6014    LOAD_4ROWS_H   coeffq+16*1, 32*2
6015    call  m(idct_8x8_internal_8bpc).main
6016    SAVE_7ROWS    rsp+gprsize+16*3, 16
6017    LOAD_4ROWS     coeffq+16*2, 32*2
6018    LOAD_4ROWS_H   coeffq+16*3, 32*2
6019    call m(idct_16x8_internal_8bpc).main
6020
6021    mov                    r3, dstq
6022    lea                  tx2q, [o(m(idct_64x16_internal_8bpc).end2)]
6023    lea                  dstq, [dstq+strideq*8]
6024    jmp  m(idct_8x8_internal_8bpc).end
6025
6026.end2:
6027    LOAD_8ROWS   rsp+gprsize+16*3, 16
6028    mova   [rsp+gprsize+16*0], m7
6029    lea                  tx2q, [o(m(idct_64x16_internal_8bpc).end3)]
6030    mov                  dstq, r3
6031    jmp  m(idct_8x8_internal_8bpc).end
6032
6033.end3:
6034
6035    add                 coeffq, 16*16
6036    mov                    r3d, [rsp+gprsize*1+16*67]
6037    mov                   dstq, [rsp+gprsize*2+16*67]
6038    add                   dstq, 8
6039    mov  [rsp+gprsize*2+16*67], dstq
6040    dec                    r3d
6041    jg .pass2_loop2
6042    ret
6043
6044
; inv_txfm_add_dct_dct_32x64_8bpc(dst, stride, coeff, eob)
; Entry point for the 32x64 DCT_DCT inverse transform + add (8 bpc); requests
; 16*68 bytes of stack. eob == 0 takes the DC-only path, everything else is
; delegated to idct_32x64_internal_8bpc.
6045cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2
6046%if ARCH_X86_32
6047    LEA                     r5, $$
6048%endif
6049    test                  eobd, eobd
6050    jz .dconly
6051
6052    call m(idct_32x64_internal_8bpc)
6053    RET
6054
    ; DC-only: set up m0 (DC product), m1 (pw_2896x8), m2 (pw_16384), r3d (64 rows)
    ; and tx2q (continuation), then reuse the 32x8 function's .body.
    ; NOTE(review): .body of the 32x8 function is outside this chunk; presumably
    ; it mirrors the 64x16 .body for 32-byte rows.
6055.dconly:
6056    movd                    m1, [o(pw_2896x8)]
6057    pmulhrsw                m0, m1, [coeffq]
6058    movd                    m2, [o(pw_16384)]
    ; eobd is zero on this path, so this clears the first dword at coeffq; it has
    ; to precede the write to r3d below (eob is r3 under x86inc argument naming)
6059    mov               [coeffq], eobd
    ; second multiply by pw_2896x8, absent from the 64x16 and 64x64 DC-only paths
6060    pmulhrsw                m0, m1
6061    mov                    r3d, 64
6062    lea                   tx2q, [o(m(inv_txfm_add_dct_dct_32x64_8bpc).end)]
6063    jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body
6064
6065.end:
6066    RET
6067
6068
; Worker for the 32x64 inverse DCT, entered by `call` from
; inv_txfm_add_dct_dct_32x64_8bpc. Pass 1 is done here; pass 2 is handed over
; to idct_16x64_internal_8bpc's .pass2_loop.
; rsp-relative slots used here:
;   [rsp+gprsize*1+16*67]   eob - 136 (its sign selects the fast path)
;   [rsp+gprsize*2+16*67]   original coeffq during pass 1, then dst+8 for pass 2
; NOTE(review): LOAD_8ROWS/SAVE_7ROWS/SAVE_8ROWS and every label reached through
; m(...) are defined outside this chunk; the third LOAD_8ROWS argument is
; presumably the same is_rect2 pre-scaling switch that LOAD_4ROWS implements.
6069cglobal idct_32x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
6070    %undef cmp
6071
    ; r3d = number of pass-1 iterations: 2 when eob < 136, otherwise 4.
    ; cmovs tests the sign flag left by the sub; the two mov instructions in
    ; between do not modify flags.
6072    mov                    r4d, 2
6073    sub                   eobd, 136
6074    mov  [rsp+gprsize*1+16*67], eobd
6075    mov                    r3d, 4
6076    cmovs                  r3d, r4d
6077
6078%if ARCH_X86_32
6079    LEA                     r5, $$
6080%endif
6081
6082    mov  [rsp+gprsize*2+16*67], coeffq
6083
    ; ---- pass 1: each iteration advances coeffq by 16 bytes; input lines are
    ; 64 bytes apart.
6084.pass1_loop:
6085    LOAD_8ROWS     coeffq+64*1, 64*2, 1
6086    mova   [rsp+gprsize+16*19], m0                        ;in1
6087    mova   [rsp+gprsize+16*26], m1                        ;in3
6088    mova   [rsp+gprsize+16*23], m2                        ;in5
6089    mova   [rsp+gprsize+16*22], m3                        ;in7
6090    mova   [rsp+gprsize+16*21], m4                        ;in9
6091    mova   [rsp+gprsize+16*24], m5                        ;in11
6092    mova   [rsp+gprsize+16*25], m6                        ;in13
6093    mova   [rsp+gprsize+16*20], m7                        ;in15
6094
    ; tx2d = eob - 136; negative selects the reduced-input path
6095    mov                   tx2d, [rsp+gprsize*1+16*67]
6096    test                  tx2d, tx2d
6097    jl .fast
6098
    ; full path: all even lines (0, 4, ..., 28 and 2, 6, ..., 30) plus the odd
    ; lines 17..31 are loaded before the complete 8x32 main
6099.full:
6100    LOAD_8ROWS     coeffq+64*0, 64*4, 1
6101    call  m(idct_8x8_internal_8bpc).main
6102    SAVE_7ROWS    rsp+gprsize+16*3, 16
6103    LOAD_8ROWS     coeffq+64*2, 64*4, 1
6104    call m(idct_16x8_internal_8bpc).main
6105    mova                    m7, [rsp+gprsize+16*0]
6106    SAVE_8ROWS   rsp+gprsize+16*11, 16
6107
6108    LOAD_8ROWS    coeffq+64*17, 64*2, 1
6109    mova   [rsp+gprsize+16*33], m0                        ;in17
6110    mova   [rsp+gprsize+16*28], m1                        ;in19
6111    mova   [rsp+gprsize+16*29], m2                        ;in21
6112    mova   [rsp+gprsize+16*32], m3                        ;in23
6113    mova   [rsp+gprsize+16*31], m4                        ;in25
6114    mova   [rsp+gprsize+16*30], m5                        ;in27
6115    mova   [rsp+gprsize+16*27], m6                        ;in29
6116    mova   [rsp+gprsize+16*34], m7                        ;in31
6117
6118    call m(idct_8x32_internal_8bpc).main
6119    jmp .pass1_end
6120
    ; fast path: only lines 0, 4, 8, 12 and 2, 6, 10, 14 are loaded (256 bytes
    ; apart), m4-m7 are zeroed, and slots 27..34 (in17..in31) are not written
    ; before main_fast
6121.fast:
6122    LOAD_4ROWS          coeffq, 256, 1
6123    pxor                    m4, m4
6124    REPX          {mova x, m4}, m5, m6, m7
6125    call  m(idct_8x8_internal_8bpc).main
6126
6127    SAVE_7ROWS    rsp+gprsize+16*3, 16
6128    LOAD_4ROWS    coeffq+128*1, 256, 1
6129    pxor                    m4, m4
6130    REPX          {mova x, m4}, m5, m6, m7
6131    call m(idct_16x8_internal_8bpc).main
6132    mova                    m7, [rsp+gprsize+16*0]
6133    SAVE_8ROWS   rsp+gprsize+16*11, 16
6134
6135    call m(idct_8x32_internal_8bpc).main_fast
6136
    ; Drain four groups of eight vectors (registers, then scratch slots 11, 19,
    ; 27) through idct_8x8's .pass1_end and store them back over the coefficient
    ; buffer, 64 bytes apart. tx2q holds the label to return to after each trip.
6137.pass1_end:
6138    mova    [rsp+gprsize+16*0], m7
6139    lea                   tx2q, [o(m(idct_32x64_internal_8bpc).pass1_end1)]
6140    jmp   m(idct_8x8_internal_8bpc).pass1_end
6141
6142.pass1_end1:
6143    SAVE_8ROWS     coeffq+64*0, 64
6144    LOAD_8ROWS   rsp+gprsize+16*11, 16
6145    mova    [rsp+gprsize+16*0], m7
6146    lea                   tx2q, [o(m(idct_32x64_internal_8bpc).pass1_end2)]
6147    jmp   m(idct_8x8_internal_8bpc).pass1_end
6148
6149.pass1_end2:
6150    SAVE_8ROWS     coeffq+64*8, 64
6151    LOAD_8ROWS   rsp+gprsize+16*19, 16
6152    mova    [rsp+gprsize+16*0], m7
6153    lea                   tx2q, [o(m(idct_32x64_internal_8bpc).pass1_end3)]
6154    jmp   m(idct_8x8_internal_8bpc).pass1_end
6155
6156.pass1_end3:
6157    SAVE_8ROWS    coeffq+64*16, 64
6158    LOAD_8ROWS   rsp+gprsize+16*27, 16
6159    mova    [rsp+gprsize+16*0], m7
6160    lea                   tx2q, [o(m(idct_32x64_internal_8bpc).pass1_end4)]
6161    jmp   m(idct_8x8_internal_8bpc).pass1_end
6162
6163.pass1_end4:
6164    SAVE_8ROWS    coeffq+64*24, 64
6165
6166    add                 coeffq, 16
6167    dec                    r3d
6168    jg .pass1_loop
6169
    ; ---- pass 2: restore coeffq, set r3d = 4, park dst+8 in the pointer slot and
    ; tail-jump into the 16x64 pass-2 loop with r4 = address of its .end1.
    ; NOTE(review): how .pass2_loop uses r3d, r4 and the slot is not visible here.
6170.pass2:
6171    mov                 coeffq, [rsp+gprsize*2+16*67]
6172    mov                    r3d, 4
6173    lea                     r4, [dstq+8]
6174    mov  [rsp+gprsize*2+16*67], r4
6175    lea                     r4, [o(m(idct_16x64_internal_8bpc).end1)]
6176    jmp m(idct_16x64_internal_8bpc).pass2_loop
6177
6178
; inv_txfm_add_dct_dct_64x32_8bpc(dst, stride, coeff, eob)
; Entry point for the 64x32 DCT_DCT inverse transform + add (8 bpc); requests
; 16*197 bytes of stack. eob == 0 takes the DC-only path, everything else is
; delegated to idct_64x32_internal_8bpc.
6179cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx2
6180%if ARCH_X86_32
6181    LEA                     r5, $$
6182%endif
6183    test                  eobd, eobd
6184    jz .dconly
6185
6186    call m(idct_64x32_internal_8bpc)
6187    RET
6188
    ; DC-only: prepare the register state expected at the 64x16 function's .body
    ; (m0 = DC product, m1 = pw_2896x8, m2 = first rounding multiplier,
    ; r3d = rows, tx2q = continuation) and jump there.
6189.dconly:
6190    movd                    m1, [o(pw_2896x8)]
6191    pmulhrsw                m0, m1, [coeffq]
6192    movd                    m2, [o(pw_16384)]
    ; second multiply by pw_2896x8, absent from the 64x16 and 64x64 DC-only paths
6193    pmulhrsw                m0, m1
    ; eobd is zero on this path: clears the first dword at coeffq. Must precede
    ; the write to r3d below (eob is r3 under x86inc argument naming).
6194    mov               [coeffq], eobd
6195    mov                    r3d, 32
6196    lea                   tx2q, [o(m(inv_txfm_add_dct_dct_64x32_8bpc).end)]
6197    jmp m(inv_txfm_add_dct_dct_64x16_8bpc).body
6198
    ; also used as the continuation label by the 64x64 DC-only path
6199.end:
6200    RET
6201
; Worker for the 64x32 inverse DCT, entered by `call` from
; inv_txfm_add_dct_dct_64x32_8bpc. Pass 1 is done here; pass 2 reuses
; idct_32x32_internal_8bpc's .pass2_loop, once for each 32-byte-wide half.
; rsp-relative slots used here:
;   [rsp+gprsize*1+16*67]   eob - 136
;   [rsp+gprsize*2+16*67]   original coeffq
;   [rsp+gprsize*3+16*67]   original dstq
;   [rsp+gprsize*4+16*67]   address of the stack buffer (rsp+gprsize+16*69)
; NOTE(review): LOAD_8ROWS/SAVE_7ROWS/SAVE_8ROWS and every label reached through
; m(...) are defined outside this chunk; remarks about them are assumptions.
6202cglobal idct_64x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
6203    %undef cmp
6204
    ; r3d = number of pass-1 iterations: 2 when eob < 136, otherwise 4
    ; (cmovs uses the sign flag of the sub; mov leaves flags untouched)
6205    mov                    r4d, 2
6206    sub                   eobd, 136
6207    mov  [rsp+gprsize*1+16*67], eobd
6208    mov                    r3d, 4
6209    cmovs                  r3d, r4d
6210
6211%if ARCH_X86_32
6212    LEA                     r5, $$
6213%endif
6214
6215    mov  [rsp+gprsize*2+16*67], coeffq
6216    mov  [rsp+gprsize*3+16*67], dstq
6217    lea                   dstq, [rsp+gprsize+16*69]
6218    mov  [rsp+gprsize*4+16*67], dstq
6219
    ; ---- pass 1: same sequence as the 64x16 worker, but with 64-byte line
    ; spacing and the pre-scaling flag (third macro argument = 1) on every
    ; coefficient load. dstq points into the stack buffer during this pass.
6220.pass1_loop:
    ; lines 0, 8, 16, 24 in m0-m3, m4-m7 zeroed
6221    LOAD_4ROWS     coeffq+64*0, 64*8, 1
6222    pxor                    m4, m4
6223    REPX          {mova x, m4}, m5, m6, m7
6224    call  m(idct_8x8_internal_8bpc).main
6225    SAVE_7ROWS    rsp+gprsize+16*3, 16
6226
    ; lines 4, 12, 20, 28 (LOAD_4ROWS writes only m0-m3, so m4 stays zero)
6227    pxor                    m4, m4
6228    LOAD_4ROWS     coeffq+64*4, 64*8, 1
6229
6230    REPX          {mova x, m4}, m5, m6, m7
6231    call m(idct_16x8_internal_8bpc).main
6232    mova                    m7, [rsp+gprsize+16*0]
6233    SAVE_8ROWS   rsp+gprsize+16*11, 16
6234
    ; lines 2, 6, ..., 30 into scratch slots 19..26
6235    LOAD_8ROWS     coeffq+64*2, 64*4, 1
6236    mova   [rsp+gprsize+16*19], m0
6237    mova   [rsp+gprsize+16*26], m1
6238    mova   [rsp+gprsize+16*23], m2
6239    mova   [rsp+gprsize+16*22], m3
6240    mova   [rsp+gprsize+16*21], m4
6241    mova   [rsp+gprsize+16*24], m5
6242    mova   [rsp+gprsize+16*25], m6
6243    mova   [rsp+gprsize+16*20], m7
6244
6245    call m(idct_8x32_internal_8bpc).main_fast
6246    SAVE_8ROWS    rsp+gprsize+16*3, 16
6247
    ; odd lines 1..31 into scratch slots 35..65
6248    LOAD_8ROWS     coeffq+64*1, 64*2, 1
6249    mova   [rsp+gprsize+16*35], m0                        ;in1
6250    mova   [rsp+gprsize+16*49], m1                        ;in3
6251    mova   [rsp+gprsize+16*43], m2                        ;in5
6252    mova   [rsp+gprsize+16*41], m3                        ;in7
6253    mova   [rsp+gprsize+16*39], m4                        ;in9
6254    mova   [rsp+gprsize+16*45], m5                        ;in11
6255    mova   [rsp+gprsize+16*47], m6                        ;in13
6256    mova   [rsp+gprsize+16*37], m7                        ;in15
6257
6258    LOAD_8ROWS    coeffq+64*17, 64*2, 1
6259    mova   [rsp+gprsize+16*63], m0                        ;in17
6260    mova   [rsp+gprsize+16*53], m1                        ;in19
6261    mova   [rsp+gprsize+16*55], m2                        ;in21
6262    mova   [rsp+gprsize+16*61], m3                        ;in23
6263    mova   [rsp+gprsize+16*59], m4                        ;in25
6264    mova   [rsp+gprsize+16*57], m5                        ;in27
6265    mova   [rsp+gprsize+16*51], m6                        ;in29
6266    mova   [rsp+gprsize+16*65], m7                        ;in31
6267
6268    call m(idct_16x64_internal_8bpc).main
6269
    ; Drain eight groups of eight vectors (scratch slots 3, 11, ..., 59) through
    ; idct_8x8's .pass1_end; m7 is parked in slot 0 before each trip and tx2q
    ; holds the return label. The first four groups go to the coefficient buffer,
    ; the last four to the stack buffer at dstq, both with 64-byte spacing.
6270    LOAD_8ROWS    rsp+gprsize+16*3, 16
6271    mova    [rsp+gprsize+16*0], m7
6272    lea                   tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end)]
6273    jmp   m(idct_8x8_internal_8bpc).pass1_end
6274
6275.pass1_end:
6276    SAVE_8ROWS     coeffq+64*0, 64
6277    LOAD_8ROWS   rsp+gprsize+16*11, 16
6278    mova    [rsp+gprsize+16*0], m7
6279    lea                   tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end1)]
6280    jmp   m(idct_8x8_internal_8bpc).pass1_end
6281
6282.pass1_end1:
6283    SAVE_8ROWS     coeffq+64*8, 64
6284    LOAD_8ROWS   rsp+gprsize+16*19, 16
6285    mova    [rsp+gprsize+16*0], m7
6286    lea                   tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end2)]
6287    jmp   m(idct_8x8_internal_8bpc).pass1_end
6288
6289.pass1_end2:
6290    SAVE_8ROWS    coeffq+64*16, 64
6291    LOAD_8ROWS   rsp+gprsize+16*27, 16
6292    mova    [rsp+gprsize+16*0], m7
6293    lea                   tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end3)]
6294    jmp   m(idct_8x8_internal_8bpc).pass1_end
6295
6296.pass1_end3:
6297    SAVE_8ROWS    coeffq+64*24, 64
6298    LOAD_8ROWS   rsp+gprsize+16*35, 16
6299    mova    [rsp+gprsize+16*0], m7
6300    lea                   tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end4)]
6301    jmp   m(idct_8x8_internal_8bpc).pass1_end
6302
6303.pass1_end4:
6304    SAVE_8ROWS       dstq+64*0, 64
6305    LOAD_8ROWS   rsp+gprsize+16*43, 16
6306    mova    [rsp+gprsize+16*0], m7
6307    lea                   tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end5)]
6308    jmp   m(idct_8x8_internal_8bpc).pass1_end
6309
6310.pass1_end5:
6311    SAVE_8ROWS       dstq+64*8, 64
6312    LOAD_8ROWS   rsp+gprsize+16*51, 16
6313    mova    [rsp+gprsize+16*0], m7
6314    lea                   tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end6)]
6315    jmp   m(idct_8x8_internal_8bpc).pass1_end
6316
6317.pass1_end6:
6318    SAVE_8ROWS      dstq+64*16, 64
6319    LOAD_8ROWS   rsp+gprsize+16*59, 16
6320    mova    [rsp+gprsize+16*0], m7
6321    lea                   tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end7)]
6322    jmp   m(idct_8x8_internal_8bpc).pass1_end
6323
6324.pass1_end7:
6325    SAVE_8ROWS      dstq+64*24, 64
6326
6327    add                 coeffq, 16
6328    add                   dstq, 16
6329    dec                    r3d
6330    jg .pass1_loop
6331
    ; ---- pass 2, first run: input is the stack buffer, output goes to dst+32.
    ; eob - 136 is copied to [rsp+gprsize*1+16*35] (presumably where the 32x32
    ; loop looks for it), tx2q is pointed at this function's .pass2_end and
    ; r3d = 4 iterations are requested.
6332.pass2:
6333    mov                 coeffq, [rsp+gprsize*4+16*67]
6334    mov                   dstq, [rsp+gprsize*3+16*67]
6335    mov                   eobd, [rsp+gprsize*1+16*67]
6336    lea                   dstq, [dstq+32]
6337    mov  [rsp+gprsize*1+16*35], eobd
6338    lea                   tx2q, [o(m(idct_64x32_internal_8bpc).pass2_end)]
6339    mov                    r3d, 4
6340    jmp m(idct_32x32_internal_8bpc).pass2_loop
6341
    ; reached through tx2q from the 32x32 loop: park m7 in scratch slot 0 and
    ; continue at idct_8x32's .end2 with r3 = label to return to
6342.pass2_end:
6343    mova    [rsp+gprsize+16*0], m7
6344    lea                     r3, [o(m(idct_64x32_internal_8bpc).pass2_end1)]
6345    jmp  m(idct_8x32_internal_8bpc).end2
6346
    ; loop control for the first run: advance coeffq by 16*32 bytes and reload
    ; dstq and the counter from slots at 16*35
    ; NOTE(review): those two slots are not written in this chunk; presumably
    ; the 32x32 .pass2_loop stores them.
6347.pass2_end1:
6348    lea                   tx2q, [o(m(idct_64x32_internal_8bpc).pass2_end)]
6349    add                 coeffq, 16*32
6350    mov                   dstq, [rsp+gprsize*2+16*35]
6351    mov                    r3d, [rsp+gprsize*3+16*35]
6352    dec                    r3d
6353    jg m(idct_32x32_internal_8bpc).pass2_loop
6354
    ; ---- pass 2, second run: original dst and original coefficient buffer.
    ; tx2q now names the 32x32 function's own .pass2_end, so control does not
    ; come back to this function.
6355.pass2_end2:
6356    mov                   dstq, [rsp+gprsize*3+16*67]
6357    mov                 coeffq, [rsp+gprsize*2+16*67]
6358    lea                   tx2q, [o(m(idct_32x32_internal_8bpc).pass2_end)]
6359    mov                    r3d, 4
6360    jmp m(idct_32x32_internal_8bpc).pass2_loop
6361
6362
; inv_txfm_add_dct_dct_64x64_8bpc(dst, stride, coeff, eob)
; Entry point for the 64x64 DCT_DCT inverse transform + add (8 bpc); requests
; 16*197 bytes of stack. eob == 0 takes the DC-only path, everything else is
; delegated to idct_64x64_internal_8bpc.
6363cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx2
6364%if ARCH_X86_32
6365    LEA                     r5, $$
6366%endif
6367    test                  eobd, eobd
6368    jz .dconly
6369
6370    call m(idct_64x64_internal_8bpc)
6371    RET
6372
    ; DC-only: prepare the register state expected at the 64x16 function's .body
    ; (m0 = DC product, m1 = pw_2896x8, m2 = pw_8192, r3d = 64 rows,
    ; tx2q = continuation) and jump there.
6373.dconly:
6374    movd                    m1, [o(pw_2896x8)]
6375    pmulhrsw                m0, m1, [coeffq]
6376    movd                    m2, [o(pw_8192)]
    ; eobd is zero on this path: clears the first dword at coeffq. Must precede
    ; the write to r3d below (eob is r3 under x86inc argument naming).
6377    mov               [coeffq], eobd
6378    mov                    r3d, 64
    ; This function has no .end label of its own; it borrows the 64x32 one.
    ; Both functions are declared with the same cglobal parameters
    ; (4, 6, 8, 16*197), so the RET found there presumably expands to the
    ; epilogue this function needs as well.
6379    lea                   tx2q, [o(m(inv_txfm_add_dct_dct_64x32_8bpc).end)]
6380    jmp m(inv_txfm_add_dct_dct_64x16_8bpc).body
6381
6382cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
6383    %undef cmp
6384
6385    mov                    r5d, 4
6386    mov                    r4d, 2
6387    sub                   eobd, 136
6388    cmovns                 r4d, r5d
6389
6390%if ARCH_X86_32
6391    LEA                     r5, $$
6392%endif
6393
6394    mov  [rsp+gprsize*1+16*67], eobd
6395    mov                    r3d, r4d
6396    mov  [rsp+gprsize*4+16*67], coeffq
6397    mov  [rsp+gprsize*3+16*67], dstq
6398    lea                   dstq, [rsp+gprsize+16*69]
6399    mov  [rsp+gprsize*2+16*67], dstq
6400
6401.pass1_loop:
6402    LOAD_4ROWS     coeffq+64*0, 64*8
6403    pxor                    m4, m4
6404    REPX          {mova x, m4}, m5, m6, m7
6405    call  m(idct_8x8_internal_8bpc).main
6406    SAVE_7ROWS    rsp+gprsize+16*3, 16
6407
6408    pxor                    m4, m4
6409    LOAD_4ROWS     coeffq+64*4, 64*8
6410
6411    REPX          {mova x, m4}, m5, m6, m7
6412    call m(idct_16x8_internal_8bpc).main
6413    mova                    m7, [rsp+gprsize+16*0]
6414    SAVE_8ROWS   rsp+gprsize+16*11, 16
6415
6416    LOAD_8ROWS     coeffq+64*2, 64*4
6417    mova   [rsp+gprsize+16*19], m0
6418    mova   [rsp+gprsize+16*26], m1
6419    mova   [rsp+gprsize+16*23], m2
6420    mova   [rsp+gprsize+16*22], m3
6421    mova   [rsp+gprsize+16*21], m4
6422    mova   [rsp+gprsize+16*24], m5
6423    mova   [rsp+gprsize+16*25], m6
6424    mova   [rsp+gprsize+16*20], m7
6425
6426    call m(idct_8x32_internal_8bpc).main_fast
6427    SAVE_8ROWS    rsp+gprsize+16*3, 16
6428
6429    LOAD_8ROWS     coeffq+64*1, 64*2
6430    mova   [rsp+gprsize+16*35], m0                        ;in1
6431    mova   [rsp+gprsize+16*49], m1                        ;in3
6432    mova   [rsp+gprsize+16*43], m2                        ;in5
6433    mova   [rsp+gprsize+16*41], m3                        ;in7
6434    mova   [rsp+gprsize+16*39], m4                        ;in9
6435    mova   [rsp+gprsize+16*45], m5                        ;in11
6436    mova   [rsp+gprsize+16*47], m6                        ;in13
6437    mova   [rsp+gprsize+16*37], m7                        ;in15
6438
6439    LOAD_8ROWS    coeffq+64*17, 64*2
6440    mova   [rsp+gprsize+16*63], m0                        ;in17
6441    mova   [rsp+gprsize+16*53], m1                        ;in19
6442    mova   [rsp+gprsize+16*55], m2                        ;in21
6443    mova   [rsp+gprsize+16*61], m3                        ;in23
6444    mova   [rsp+gprsize+16*59], m4                        ;in25
6445    mova   [rsp+gprsize+16*57], m5                        ;in27
6446    mova   [rsp+gprsize+16*51], m6                        ;in29
6447    mova   [rsp+gprsize+16*65], m7                        ;in31
6448
6449    call m(idct_16x64_internal_8bpc).main
6450
6451    LOAD_8ROWS    rsp+gprsize+16*3, 16
6452    mova    [rsp+gprsize+16*0], m7
6453    mova                    m7, [o(pw_8192)]
6454    lea                   tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end)]
6455    jmp   m(idct_8x8_internal_8bpc).pass1_end1
6456
6457.pass1_end:
6458    SAVE_8ROWS     coeffq+64*0, 64
6459    LOAD_8ROWS   rsp+gprsize+16*11, 16
6460    mova    [rsp+gprsize+16*0], m7
6461    mova                    m7, [o(pw_8192)]
6462    lea                   tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end1)]
6463    jmp   m(idct_8x8_internal_8bpc).pass1_end1
6464
6465.pass1_end1:
        ; Second continuation of the pass-1 store chain: save the group that was
        ; just processed, then queue the group held in stack slots 19 onward.
        ; NOTE(review): assumes SAVE_8ROWS/LOAD_8ROWS move m0-m7 at base + i*stride
        ; and that the callee returns through tx2q - neither is visible here.
6466    SAVE_8ROWS     coeffq+64*8, 64                        ; write the finished group at coeffq+64*8, stride 64
6467    LOAD_8ROWS   rsp+gprsize+16*19, 16                    ; fetch the next group from the stack, 16 bytes apart
6468    mova    [rsp+gprsize+16*0], m7                        ; park m7 in stack slot 0
6469    mova                    m7, [o(pw_8192)]              ; m7 = pw_8192 constant for the callee
6470    lea                   tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end2)] ; address of the next continuation
6471    jmp   m(idct_8x8_internal_8bpc).pass1_end1
6472
6473.pass1_end2:
        ; Third continuation of the pass-1 store chain: save the group that was
        ; just processed, then queue the group held in stack slots 27 onward.
        ; NOTE(review): assumes SAVE_8ROWS/LOAD_8ROWS move m0-m7 at base + i*stride
        ; and that the callee returns through tx2q - neither is visible here.
6474    SAVE_8ROWS    coeffq+64*16, 64                        ; write the finished group at coeffq+64*16, stride 64
6475    LOAD_8ROWS   rsp+gprsize+16*27, 16                    ; fetch the next group from the stack, 16 bytes apart
6476    mova    [rsp+gprsize+16*0], m7                        ; park m7 in stack slot 0
6477    mova                    m7, [o(pw_8192)]              ; m7 = pw_8192 constant for the callee
6478    lea                   tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end3)] ; address of the next continuation
6479    jmp   m(idct_8x8_internal_8bpc).pass1_end1
6480
6481.pass1_end3:
        ; Fourth continuation of the pass-1 store chain: this is the last group
        ; written relative to coeffq; the following continuations write via dstq.
        ; NOTE(review): assumes SAVE_8ROWS/LOAD_8ROWS move m0-m7 at base + i*stride
        ; and that the callee returns through tx2q - neither is visible here.
6482    SAVE_8ROWS    coeffq+64*24, 64                        ; write the finished group at coeffq+64*24, stride 64
6483    LOAD_8ROWS   rsp+gprsize+16*35, 16                    ; fetch the next group from stack slot 35 onward, 16 bytes apart
6484    mova    [rsp+gprsize+16*0], m7                        ; park m7 in stack slot 0
6485    mova                    m7, [o(pw_8192)]              ; m7 = pw_8192 constant for the callee
6486    lea                   tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end4)] ; address of the next continuation
6487    jmp   m(idct_8x8_internal_8bpc).pass1_end1
6488
6489.pass1_end4:
        ; Fifth continuation of the pass-1 store chain: from here on the groups
        ; are written relative to dstq instead of coeffq.
        ; NOTE(review): .pass2 reloads dstq from the stack, so dstq presumably
        ; addresses an intermediate buffer during pass 1 - confirm against the
        ; setup that precedes .pass1_loop. SAVE_8ROWS/LOAD_8ROWS and the callee
        ; are likewise not visible here.
6490    SAVE_8ROWS       dstq+64*0, 64                        ; write the finished group at dstq+64*0, stride 64
6491    LOAD_8ROWS   rsp+gprsize+16*43, 16                    ; fetch the next group from stack slot 43 onward, 16 bytes apart
6492    mova    [rsp+gprsize+16*0], m7                        ; park m7 in stack slot 0
6493    mova                    m7, [o(pw_8192)]              ; m7 = pw_8192 constant for the callee
6494    lea                   tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end5)] ; address of the next continuation
6495    jmp   m(idct_8x8_internal_8bpc).pass1_end1
6496
6497.pass1_end5:
        ; Sixth continuation of the pass-1 store chain: save the group that was
        ; just processed via dstq, then queue the group in stack slots 51 onward.
        ; NOTE(review): assumes SAVE_8ROWS/LOAD_8ROWS move m0-m7 at base + i*stride
        ; and that the callee returns through tx2q - neither is visible here.
6498    SAVE_8ROWS       dstq+64*8, 64                        ; write the finished group at dstq+64*8, stride 64
6499    LOAD_8ROWS   rsp+gprsize+16*51, 16                    ; fetch the next group from the stack, 16 bytes apart
6500    mova    [rsp+gprsize+16*0], m7                        ; park m7 in stack slot 0
6501    mova                    m7, [o(pw_8192)]              ; m7 = pw_8192 constant for the callee
6502    lea                   tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end6)] ; address of the next continuation
6503    jmp   m(idct_8x8_internal_8bpc).pass1_end1
6504
6505.pass1_end6:
        ; Seventh continuation of the pass-1 store chain: save the group that was
        ; just processed via dstq, then queue the last group (stack slots 59 onward).
        ; NOTE(review): assumes SAVE_8ROWS/LOAD_8ROWS move m0-m7 at base + i*stride
        ; and that the callee returns through tx2q - neither is visible here.
6506    SAVE_8ROWS      dstq+64*16, 64                        ; write the finished group at dstq+64*16, stride 64
6507    LOAD_8ROWS   rsp+gprsize+16*59, 16                    ; fetch the final group from the stack, 16 bytes apart
6508    mova    [rsp+gprsize+16*0], m7                        ; park m7 in stack slot 0
6509    mova                    m7, [o(pw_8192)]              ; m7 = pw_8192 constant for the callee
6510    lea                   tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end7)] ; address of the last continuation
6511    jmp   m(idct_8x8_internal_8bpc).pass1_end1
6512
6513.pass1_end7:
        ; Final continuation of one pass-1 iteration: store the last group, move
        ; both buffer pointers forward and decide whether to run .pass1_loop again.
        ; NOTE(review): the initial value of r3d is set before the visible range.
6514    SAVE_8ROWS      dstq+64*24, 64                        ; write the finished group at dstq+64*24, stride 64
6515
6516    add                 coeffq, 16                        ; advance the coefficient pointer by 16 bytes
6517    add                   dstq, 16                        ; advance the dstq-based pointer by the same amount
6518    dec                    r3d                            ; one fewer iteration remaining
6519    jg .pass1_loop                                        ; loop while r3d > 0; otherwise fall through into .pass2
6520
6521.pass2:
        ; Entry to the second pass: reload the pointers kept on the stack and hand
        ; control to the shared idct_16x64 pass-2 loop, starting 32 bytes into dst.
        ; NOTE(review): the stack slots at 16*67 are written before the visible
        ; range, and pass2_loop is defined elsewhere; r4 is presumably the address
        ; it jumps back to after each column - confirm there.
6522    mov                   dstq, [rsp+gprsize*3+16*67]     ; dstq = value saved in slot gprsize*3
6523    mov                 coeffq, [rsp+gprsize*2+16*67]     ; coeffq = value saved in slot gprsize*2
6524    lea                   dstq, [dstq+32]                 ; begin at byte offset 32 of the destination
6525    mov                    r3d, 4                         ; iteration count for this half
6526    lea                     r4, [dstq+8]                  ; destination for the following iteration
6527    mov  [rsp+gprsize*2+16*67], r4                        ; slot gprsize*2 now holds that next destination
6528    lea                     r4, [o(m(idct_64x64_internal_8bpc).pass2_end)] ; continuation address
6529    jmp m(idct_16x64_internal_8bpc).pass2_loop
6530
6531.pass2_end:
        ; Continuation used during the first half of pass 2 (its address is put
        ; in r4 by .pass2 and .pass2_end1). Prepares registers and a shifted
        ; stack pointer, then tail-jumps into idct_8x32 .end2.
        ; NOTE(review): .end2 is defined elsewhere; r3 is presumably the address
        ; it jumps to when done, and it presumably expects m7 spilled at slot 0
        ; of the shifted frame - confirm there.
6532    LOAD_8ROWS   rsp+gprsize+16*35, 16                    ; load a group from stack slot 35 onward, 16 bytes apart
6533    lea                   dstq, [dstq+strideq*2]          ; move dstq down by two strides
6534    add                    rsp, 16*32                     ; shift the frame by 32 slots; undone in .pass2_end1
6535    mova    [rsp+gprsize+16*0], m7                        ; spill m7 to slot 0 of the shifted frame
6536    lea                     r3, [o(m(idct_64x64_internal_8bpc).pass2_end1)] ; continuation address
6537    jmp  m(idct_8x32_internal_8bpc).end2
6538
6539.pass2_end1:
        ; Bottom of one first-half pass-2 iteration: undo the stack shift made in
        ; .pass2_end, step to the next destination and loop while iterations remain.
        ; NOTE(review): r3d is reloaded from slot gprsize*3, which .pass2 read as a
        ; pointer; the counter is presumably stored there by pass2_loop (not
        ; visible here) - confirm.
6540    add                 coeffq, 16*32                     ; advance the coefficient pointer by 16*32 bytes
6541    sub                    rsp, 16*32                     ; restore rsp after the shift in .pass2_end
6542
6543    mov                   dstq, [rsp+gprsize*2+16*67]     ; dstq = destination saved for this iteration
6544    mov                    r3d, [rsp+gprsize*3+16*67]     ; r3d = loop counter reloaded from the stack
6545    lea                     r4, [dstq+8]                  ; destination for the iteration after this one
6546    mov  [rsp+gprsize*2+16*67], r4                        ; save it back to slot gprsize*2
6547    lea                     r4, [o(m(idct_64x64_internal_8bpc).pass2_end)] ; continuation address
6548
6549    dec                    r3d                            ; one fewer iteration remaining
6550    jg  m(idct_16x64_internal_8bpc).pass2_loop            ; loop while r3d > 0; otherwise fall through into .pass2_end2
6551
6552.pass2_end2:
        ; Start of the second half of pass 2: rewind the destination pointer and
        ; run the shared idct_16x64 pass-2 loop again, this time continuing at
        ; idct_16x64 .end1 instead of .pass2_end.
        ; The saved pointer starts at dst+32+8 in .pass2 and gains 8 on each pass
        ; through .pass2_end1; after four passes it is dst+72, so subtracting 72
        ; returns to the original dst (assuming pass2_loop leaves the slot alone).
        ; NOTE(review): slot gprsize*4 is written before the visible range.
6553    mov                 coeffq, [rsp+gprsize*4+16*67]     ; coeffq = value saved in slot gprsize*4
6554    mov                   dstq, [rsp+gprsize*2+16*67]     ; dstq = last saved next-destination pointer
6555    mov                    r3d, 4                         ; iteration count for this half
6556    sub                   dstq, 72                        ; rewind to the start of the destination row
6557    lea                     r4, [dstq+8]                  ; destination for the following iteration
6558    mov  [rsp+gprsize*2+16*67], r4                        ; save it in slot gprsize*2
6559    lea                     r4, [o(m(idct_16x64_internal_8bpc).end1)] ; continuation address for this half
6560    jmp m(idct_16x64_internal_8bpc).pass2_loop
6561