1; Copyright © 2018-2021, VideoLAN and dav1d authors
2; Copyright © 2018, Two Orioles, LLC
3; All rights reserved.
4;
5; Redistribution and use in source and binary forms, with or without
6; modification, are permitted provided that the following conditions are met:
7;
8; 1. Redistributions of source code must retain the above copyright notice, this
9;    list of conditions and the following disclaimer.
10;
11; 2. Redistributions in binary form must reproduce the above copyright notice,
12;    this list of conditions and the following disclaimer in the documentation
13;    and/or other materials provided with the distribution.
14;
15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
26%include "config.asm"
27%include "ext/x86/x86inc.asm"
28
29
30SECTION_RODATA 16
31
; pshufb control masks (byte indices). Expressed as source word indices:
;   deint_shuf  = 0,2,4,6,1,3,5,7  (even words first, then odd words)
;   deint_shuf1 = 0,4,1,5,2,6,3,7  (interleave low-half words with high-half words)
;   deint_shuf2 = 4,0,5,1,6,2,7,3  (same interleave, each pair swapped)
32deint_shuf:  db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
33
34deint_shuf1: db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
35deint_shuf2: db  8,  9,  0,  1, 10, 11,  2,  3, 12, 13,  4,  5, 14, 15,  6,  7
36
; COEF_PAIR a, b[, flags]: emit 16-byte word-pair constants (4 repeats each).
;   always:      pw_<a>_m<b>  = { a, -b}
;   flags != 2:  pw_<b>_<a>   = { b,  a}
;   flags != 0:  pw_m<a>_m<b> = {-a, -b}   (so flags = 2 emits this one as well)
37%macro COEF_PAIR 2-3 0 ; !0 = m%1_m%2, 2 = no %2_%1
38pw_%1_m%2:  times 4 dw  %1, -%2
39%if %3 != 2
40pw_%2_%1:   times 4 dw  %2,  %1
41%endif
42%if %3
43pw_m%1_m%2: times 4 dw -%1, -%2
44%endif
45%endmacro
46
47;adst4
; Word-pair multiplier constants used with pmaddwd by the 4-point ADST code
; (iadst_4x4 .main / iadst_8x4 .main). Each label holds the pair repeated
; four times to fill one 16-byte vector.
48pw_1321_3803:   times 4 dw  1321,  3803
49pw_2482_m1321:  times 4 dw  2482, -1321
50pw_3344_2482:   times 4 dw  3344,  2482
51pw_3344_m3803:  times 4 dw  3344, -3803
52pw_3344_m3344:  times 4 dw  3344, -3344
; Label now carries a trailing colon like every other constant in this table.
53pw_0_3344:      times 4 dw     0,  3344
54pw_m6688_m3803: times 4 dw -6688, -3803
55
; Coefficient pairs used by the transform butterflies. Each line expands
; (see COEF_PAIR) to pw_<a>_m<b> and, depending on the third argument:
;   (none) -> also pw_<b>_<a>
;   1      -> also pw_<b>_<a> and pw_m<a>_m<b>
;   2      -> also pw_m<a>_m<b>, but no pw_<b>_<a>
; The consumers in this file add pd_2048 and shift right by 12 after pmaddwd.
56COEF_PAIR 2896, 2896
57COEF_PAIR 1567, 3784
58COEF_PAIR  799, 4017
59COEF_PAIR 3406, 2276
60COEF_PAIR  401, 4076
61COEF_PAIR 1931, 3612
62COEF_PAIR 3166, 2598
63COEF_PAIR 3920, 1189
64COEF_PAIR 3784, 1567, 1
65COEF_PAIR  995, 3973
66COEF_PAIR 1751, 3703
67COEF_PAIR 3513, 2106
68COEF_PAIR 3857, 1380
69COEF_PAIR 4017,  799, 1
70COEF_PAIR  201, 4091
71COEF_PAIR 2440, 3290
72COEF_PAIR 3035, 2751
73COEF_PAIR 4052,  601
74COEF_PAIR 2276, 3406, 1
75COEF_PAIR 4076,  401, 2
76COEF_PAIR 2598, 3166, 2
77COEF_PAIR 3612, 1931, 2
78COEF_PAIR 1189, 3920, 2
79
; Broadcast constants (one 16-byte vector each).
;   pd_2048       : dword rounding bias added before the ">> 12" shifts.
;   pw_<n>        : word constants; with pmulhrsw, pw_2048 gives (x + 8) >> 4,
;                   pw_4096 gives (x + 4) >> 3, and so on.
;   pw_<c>x8      : pmulhrsw multipliers; multiplying by c*8 yields the
;                   rounded value of x * c / 4096.
;   pw_m<...>     : the negated value of the same-named constant.
80pd_2048:        times 4 dd  2048
81pw_2048:        times 8 dw  2048
82pw_m2048:       times 8 dw -2048
83pw_4096:        times 8 dw  4096
84pw_16384:       times 8 dw  16384
85pw_m16384:      times 8 dw  -16384
86pw_1697x16:     times 8 dw  1697*16
87pw_1697x8:      times 8 dw  1697*8
88pw_2896x8:      times 8 dw  2896*8
89pw_3344x8:      times 8 dw  3344*8
90pw_8192:        times 8 dw  8192
91pw_m8192:       times 8 dw -8192
92pw_5:           times 8 dw  5
93pw_201x8:       times 8 dw   201*8
94pw_4091x8:      times 8 dw  4091*8
95pw_m2751x8:     times 8 dw -2751*8
96pw_3035x8:      times 8 dw  3035*8
97pw_1751x8:      times 8 dw  1751*8
98pw_3703x8:      times 8 dw  3703*8
99pw_m1380x8:     times 8 dw -1380*8
100pw_3857x8:      times 8 dw  3857*8
101pw_995x8:       times 8 dw   995*8
102pw_3973x8:      times 8 dw  3973*8
103pw_m2106x8:     times 8 dw -2106*8
104pw_3513x8:      times 8 dw  3513*8
105pw_2440x8:      times 8 dw  2440*8
106pw_3290x8:      times 8 dw  3290*8
107pw_m601x8:      times 8 dw  -601*8
108pw_4052x8:      times 8 dw  4052*8
109
; Second group of x8 multipliers; none of them is referenced in the part of
; the file visible here.
110pw_4095x8:      times 8 dw  4095*8
111pw_101x8:       times 8 dw   101*8
112pw_2967x8:      times 8 dw  2967*8
113pw_m2824x8:     times 8 dw -2824*8
114pw_3745x8:      times 8 dw  3745*8
115pw_1660x8:      times 8 dw  1660*8
116pw_3822x8:      times 8 dw  3822*8
117pw_m1474x8:     times 8 dw -1474*8
118pw_3996x8:      times 8 dw  3996*8
119pw_897x8:       times 8 dw   897*8
120pw_3461x8:      times 8 dw  3461*8
121pw_m2191x8:     times 8 dw -2191*8
122pw_3349x8:      times 8 dw  3349*8
123pw_2359x8:      times 8 dw  2359*8
124pw_4036x8:      times 8 dw  4036*8
125pw_m700x8:      times 8 dw  -700*8
126pw_4065x8:      times 8 dw  4065*8
127pw_501x8:       times 8 dw   501*8
128pw_3229x8:      times 8 dw  3229*8
129pw_m2520x8:     times 8 dw -2520*8
130pw_3564x8:      times 8 dw  3564*8
131pw_2019x8:      times 8 dw  2019*8
132pw_3948x8:      times 8 dw  3948*8
133pw_m1092x8:     times 8 dw -1092*8
134pw_3889x8:      times 8 dw  3889*8
135pw_1285x8:      times 8 dw  1285*8
136pw_3659x8:      times 8 dw  3659*8
137pw_m1842x8:     times 8 dw -1842*8
138pw_3102x8:      times 8 dw  3102*8
139pw_2675x8:      times 8 dw  2675*8
140pw_4085x8:      times 8 dw  4085*8
141pw_m301x8:      times 8 dw  -301*8
142
143SECTION .text
144
; m(x): full mangled symbol name of function x for the current INIT_* suffix
;       (private prefix + "_" + x + SUFFIX).
145%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
146
; o(x): address expression for symbol x. On x86-64 the symbol is used
; directly; on x86-32 it is formed relative to r5, which INV_TXFM_FN loads
; with $$ (the start of the section) for position-independent access.
147%if ARCH_X86_64
148%define o(x) x
149%else
150%define o(x) r5-$$+x ; PIC
151%endif
152
; WRITE_4X4 src1, src2, tmp1, tmp2, tmp3, row1, row2, row3, row4
; Adds 16 word residuals to a 4x4 block of destination pixels and stores the
; result with unsigned byte saturation (packuswb).
;   m<src1> : residuals for row1 (low qword) and row2 (high qword)
;   m<src2> : residuals for row3 (low qword) and row4 (high qword)
;   row1-4  : destination row indices 0..3; bit 1 selects the base
;             (r2 = dstq + 2*stride, else dstq), bit 0 adds one stride.
; Clobbers r2 and m<tmp1>..m<tmp3>; src registers are left untouched.
153%macro WRITE_4X4 9  ;src[1-2], tmp[1-3], row[1-4]
154    lea                  r2, [dstq+strideq*2]
; Build the four row address expressions. The rotate by 5 exposes the row
; arguments as the first parameter; after the 4 further rotates inside the
; loop (9 in total, for 9 parameters) the original numbering is restored.
155%assign %%i 1
156%rotate 5
157%rep 4
158    %if %1 & 2
159        CAT_XDEFINE %%row_adr, %%i, r2   + strideq*(%1&1)
160    %else
161        CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1)
162    %endif
163    %assign %%i %%i + 1
164    %rotate 1
165%endrep
166
167    movd                 m%3, [%%row_adr1]        ;dst0
168    movd                 m%5, [%%row_adr2]        ;dst1
169    punpckldq            m%3, m%5                 ;high: dst1 :low: dst0
170    movd                 m%4, [%%row_adr3]        ;dst2
171    movd                 m%5, [%%row_adr4]        ;dst3
172    punpckldq            m%4, m%5                 ;high: dst3 :low: dst2
173
174    pxor                 m%5, m%5
175    punpcklbw            m%3, m%5                 ;extend byte to word
176    punpcklbw            m%4, m%5                 ;extend byte to word
177
178    paddw                m%3, m%1                 ;high: dst1 + out1 ;low: dst0 + out0
179    paddw                m%4, m%2                 ;high: dst3 + out3 ;low: dst2 + out2
180
181    packuswb             m%3, m%4                 ;high->low: dst3 + out3, dst2 + out2, dst1 + out1, dst0 + out0
182
; Peel the four packed rows (one dword each) out of the register and store them.
183    movd        [%%row_adr1], m%3                  ;store dst0 + out0
184    pshuflw              m%4, m%3, q1032
185    movd        [%%row_adr2], m%4                  ;store dst1 + out1
186    punpckhqdq           m%3, m%3
187    movd        [%%row_adr3], m%3                  ;store dst2 + out2
188    psrlq                m%3, 32
189    movd        [%%row_adr4], m%3                  ;store dst3 + out3
190%endmacro
191
; ITX4_END row1, row2, row3, row4[, rnd = 2048]
; Common tail of the 4x4 transforms: optionally scales m0/m1 with
; pmulhrsw by pw_<rnd> (rnd = 0 skips the scaling), adds them to the
; destination through WRITE_4X4 with the given row order, then returns
; with a plain `ret`. Clobbers m2-m4 and r2.
192%macro ITX4_END 4-5 2048 ; row[1-4], rnd
193%if %5
194    mova                 m2, [o(pw_%5)]
195    pmulhrsw             m0, m2
196    pmulhrsw             m1, m2
197%endif
198
199    WRITE_4X4            0, 1, 2, 3, 4, %1, %2, %3, %4
200    ret
201%endmacro
202
; ITX_MUL2X_PACK dst/src, tmp, rnd, coef1, coef2[, flags]
; m<dst/src> holds interleaved word pairs (a, b). Two pmaddwd products are
; formed per pair, biased with m<rnd> and shifted right by 12:
;   default : dst = a*coef2 + b*coef1,   tmp = a*coef1 - b*coef2
;   flags&1 : the two results are exchanged (swap)
;   flags&2 : coef1/coef2 are register numbers holding the multiplier
;             vectors instead of pw_* constant names
;   flags&4 : skip the final pack, leaving dword results in dst and tmp
; Without flags&4 the results are packed to words: dst in the low qword,
; tmp in the high qword of m<dst/src>.
203; flags: 1 = swap, 2: coef_regs, 4: no_pack
204%macro ITX_MUL2X_PACK 5-6 0 ; dst/src, tmp[1], rnd, coef[1-2], flags
205%if %6 & 2
206    pmaddwd              m%2, m%4, m%1
207    pmaddwd              m%1, m%5
208%elif %6 & 1
209    pmaddwd              m%2, m%1, [o(pw_%5_%4)]
210    pmaddwd              m%1, [o(pw_%4_m%5)]
211%else
212    pmaddwd              m%2, m%1, [o(pw_%4_m%5)]
213    pmaddwd              m%1, [o(pw_%5_%4)]
214%endif
215    paddd                m%2, m%3
216    paddd                m%1, m%3
217    psrad                m%2, 12
218    psrad                m%1, 12
; NASM gives '&' higher precedence than '==', so this tests (flags & 4) == 0.
219%if %6 & 4 == 0
220    packssdw             m%1, m%2
221%endif
222%endmacro
223
; IDCT4_1D_PACKED
; One 4-point inverse DCT pass on packed data.
;   in : m0 = in0 (low qword) | in1 (high), m1 = in2 (low) | in3 (high)
;   out: m0 = out0 (low) | out1 (high), m1 = out3 (low) | out2 (high)
; Clobbers m2 and m3 (m3 is loaded with pd_2048).
; The optional macro argument is accepted but not referenced in the body.
224%macro IDCT4_1D_PACKED 0-1   ;pw_2896x8
225    mova                 m3, [o(pd_2048)]
226    punpckhwd            m2, m0, m1            ;unpacked in1 in3
227    punpcklwd            m0, m1                ;unpacked in0 in2
228    ITX_MUL2X_PACK        2, 1, 3, 1567, 3784
229    ITX_MUL2X_PACK        0, 1, 3, 2896, 2896
230    psubsw               m1, m0, m2            ;high: out2 ;low: out3
231    paddsw               m0, m2                ;high: out1 ;low: out0
232%endmacro
233
; INV_TXFM_FN type1, type2, size, xmm/stack
; Declares the public entry point inv_txfm_add_<type1>_<type2>_<size>_8bpc
; (args: dst, stride, coeff, eob; tx2 is an extra named register).
; The entry point sets tx2q to the address of the second-pass code
; (i<type2>_<size>_internal_8bpc.pass2) and transfers control to the
; first-pass code (i<type1>_<size>_internal_8bpc), which ends with `jmp tx2q`.
; For dct_dct, eob == 0 skips that transfer and continues at the code the
; caller places right after this macro (the DC-only shortcut).
234%macro INV_TXFM_FN 4+ ; type1, type2, size, xmm/stack
235cglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 6, %4, dst, stride, coeff, eob, tx2
236    %define %%p1 m(i%1_%3_internal_8bpc)
; x86-32: r5 becomes the base used by o() for position-independent addressing.
237%if ARCH_X86_32
238    LEA                    r5, $$
239%endif
; has_epilogue comes from x86inc (presumably set when RET has registers or
; stack to restore -- confirm there). In that case the first pass has to be
; called so that control comes back here for RET.
240%if has_epilogue
241%ifidn %1_%2, dct_dct
242    test                 eobd, eobd
243    jz %%end
244%endif
245    lea                  tx2q, [o(m(i%2_%3_internal_8bpc).pass2)]
246    call %%p1
247    RET
248%%end:
249%else
; No epilogue: reach the first pass with a jump, so the `ret` at the end of
; the second pass returns straight to this function's caller.
250    lea                  tx2q, [o(m(i%2_%3_internal_8bpc).pass2)]
251%ifidn %1_%2, dct_dct
252    test                 eobd, eobd
253    jnz %%p1
254%else
; Emits one `jmp` only when (end - p1) is negative (bit 31 set); otherwise
; nothing is assembled and execution falls through past the alignment.
; Presumably this lets the entry point located directly before the internal
; function fall into it without a jump -- confirm against the file layout.
255    times ((%%end - %%p1) >> 31) & 1 jmp %%p1
256ALIGN function_align
257%%end:
258%endif
259%endif
260%endmacro
261
; INV_TXFM_4X4_FN type1, type2
; Emits the 4x4 entry point for the given transform pair (6 xmm registers).
; For dct_dct it also emits the eob == 0 shortcut that INV_TXFM_FN branches
; or falls through to: the first coefficient is broadcast, scaled twice with
; pmulhrsw by pw_2896x8 (x * 2896/4096 each time), duplicated into m1, and
; handed to iadst_4x4's .end2 tail, which rounds and adds it to dst.
262%macro INV_TXFM_4X4_FN 2 ; type1, type2
263    INV_TXFM_FN          %1, %2, 4x4, 6
264%ifidn %1_%2, dct_dct
265    pshuflw              m0, [coeffq], q0000
266    punpcklqdq           m0, m0
267    mova                 m1, [o(pw_2896x8)]
268    pmulhrsw             m0, m1
; eobd is known to be zero on this path, so this store clears the first coefficient.
269    mov            [coeffq], eobd                ;0
270    pmulhrsw             m0, m1
271    mova                 m1, m0
272    TAIL_CALL m(iadst_4x4_internal_8bpc).end2
273%endif
274%endmacro
275
276INIT_XMM ssse3
277; itx16 relies on dct_dct being the first function. If you change the order, adjust `itx8_start` in itx16.
278
; Public 4x4 entry points whose first pass is the DCT below.
279INV_TXFM_4X4_FN dct, dct
280INV_TXFM_4X4_FN dct, adst
281INV_TXFM_4X4_FN dct, flipadst
282INV_TXFM_4X4_FN dct, identity
283
; idct_4x4_internal_8bpc
; First pass: loads the 16 coefficients, runs one packed 4-point IDCT,
; rearranges the result into the "low: in0/in2, high: in1/in3" layout and
; jumps to the second pass selected by the entry point (tx2q).
; .pass2: runs the IDCT on m0/m1, zeroes the coefficient buffer and adds the
; rounded result to dst (ITX4_END returns to the caller).
284cglobal idct_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
285    mova                 m0, [coeffq+16*0]      ;high: in1 ;low: in0
286    mova                 m1, [coeffq+16*1]      ;high: in3 ;low in2
287
288    IDCT4_1D_PACKED
289
; Shuffle the first-pass output into the packed input layout for pass 2.
290    mova                 m2, [o(deint_shuf)]
291    shufps               m3, m0, m1, q1331
292    shufps               m0, m1, q0220
293    pshufb               m0, m2                 ;high: in1 ;low: in0
294    pshufb               m1, m3, m2             ;high: in3 ;low :in2
295    jmp                tx2q
296
297.pass2:
298    IDCT4_1D_PACKED
299
300    pxor                 m2, m2
301    mova      [coeffq+16*0], m2
302    mova      [coeffq+16*1], m2                 ;memset(coeff, 0, sizeof(*coeff) * sh * sw);
303
; m1 holds out3 (low) | out2 (high), hence the 3, 2 row order.
304    ITX4_END     0, 1, 3, 2
305
; Public 4x4 entry points whose first pass is the ADST below.
306INV_TXFM_4X4_FN adst, dct
307INV_TXFM_4X4_FN adst, adst
308INV_TXFM_4X4_FN adst, flipadst
309INV_TXFM_4X4_FN adst, identity
310
; iadst_4x4_internal_8bpc
; First pass: 4-point ADST (.main) on the loaded coefficients, then a word
; transpose into the packed layout expected by the second pass, then
; `jmp tx2q`.
; .pass2: ADST again, then the shared tail:
;   .end  : zero the coefficient buffer
;   .end2 : round (pw_2048), add to dst rows 0-3 in order, return
; .end/.end2 are also entered from other functions in this file.
311cglobal iadst_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
312    mova                 m0, [coeffq+16*0]
313    mova                 m1, [coeffq+16*1]
314    call .main
315    punpckhwd            m2, m0, m1
316    punpcklwd            m0, m1
317    punpckhwd            m1, m0, m2       ;high: in3 ;low :in2
318    punpcklwd            m0, m2           ;high: in1 ;low: in0
319    jmp                tx2q
320
321.pass2:
322    call .main
323
324.end:
325    pxor                 m2, m2
326    mova      [coeffq+16*0], m2
327    mova      [coeffq+16*1], m2
328
329.end2:
330    ITX4_END              0, 1, 2, 3
331
; .main: 4-point inverse ADST.
;   in : m0 = in0 (low) | in1 (high), m1 = in2 (low) | in3 (high)
;   out: m0 = out0 (low) | out1 (high), m1 = out2 (low) | out3 (high)
; Clobbers m2-m5.
332ALIGN function_align
333cglobal_label .main
334    punpcklwd            m2, m0, m1                ;unpacked in0 in2
335    punpckhwd            m0, m1                    ;unpacked in1 in3
336    mova                 m3, m0
337    pmaddwd              m1, m2, [o(pw_3344_m3344)];3344 * in0 - 3344 * in2
338    pmaddwd              m0, [o(pw_0_3344)]        ;3344 * in3
339    paddd                m1, m0                    ;t2
340    pmaddwd              m0, m2, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
341    pmaddwd              m2, [o(pw_2482_m1321)]    ;2482 * in0 - 1321 * in2
342    pmaddwd              m4, m3, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
343    pmaddwd              m5, m3, [o(pw_3344_m3803)];3344 * in1 - 3803 * in3
344    paddd                m4, m0                    ;t0 + t3
345    pmaddwd              m3, [o(pw_m6688_m3803)]   ;-2 * 3344 * in1 - 3803 * in3
346    mova                 m0, [o(pd_2048)]
347    paddd                m1, m0                    ;t2 + 2048
348    paddd                m2, m0
349    paddd                m0, m4                    ;t0 + t3 + 2048
350    paddd                m5, m2                    ;t1 + t3 + 2048
351    paddd                m2, m4
352    paddd                m2, m3                    ;t0 + t1 - t3 + 2048
353    REPX      {psrad x, 12}, m1, m0, m5, m2
354    packssdw             m0, m5                    ;high: out1 ;low: out0
355    packssdw             m1, m2                    ;high: out3 ;low: out2
356    ret
357
; Public 4x4 entry points whose first pass is the flipped ADST below.
358INV_TXFM_4X4_FN flipadst, dct
359INV_TXFM_4X4_FN flipadst, adst
360INV_TXFM_4X4_FN flipadst, flipadst
361INV_TXFM_4X4_FN flipadst, identity
362
; iflipadst_4x4_internal_8bpc
; Same arithmetic as the ADST (it calls iadst_4x4's .main); the flip is done
; by the reversed operand order in the first-pass transpose and by the
; reversed row order (3, 2, 1, 0) passed to ITX4_END in the second pass.
363cglobal iflipadst_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
364    mova                 m0, [coeffq+16*0]
365    mova                 m1, [coeffq+16*1]
366    call m(iadst_4x4_internal_8bpc).main
367    punpcklwd            m2, m1, m0
368    punpckhwd            m1, m0
369    punpcklwd            m0, m1, m2            ;high: in3 ;low :in2
370    punpckhwd            m1, m2                ;high: in1 ;low: in0
371    jmp                tx2q
372
373.pass2:
374    call m(iadst_4x4_internal_8bpc).main
375
; .end: zero the coefficient buffer before the add-to-destination tail.
376.end:
377    pxor                 m2, m2
378    mova      [coeffq+16*0], m2
379    mova      [coeffq+16*1], m2
380
381.end2:
382    ITX4_END              3, 2, 1, 0
383
; Public 4x4 entry points whose first pass is the identity transform below.
384INV_TXFM_4X4_FN identity, dct
385INV_TXFM_4X4_FN identity, adst
386INV_TXFM_4X4_FN identity, flipadst
387INV_TXFM_4X4_FN identity, identity
388
; iidentity_4x4_internal_8bpc
; Both passes scale every coefficient as x + round(x * 1697/4096)
; (pmulhrsw by pw_1697x8, then a saturating add). The first pass also
; transposes before jumping to the second pass; the second pass finishes in
; iadst_4x4's .end (clear coefficients, round, add to dst).
389cglobal iidentity_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
390    mova                 m0, [coeffq+16*0]
391    mova                 m1, [coeffq+16*1]
392    mova                 m3, [o(pw_1697x8)]
393    pmulhrsw             m2, m0, m3
394    pmulhrsw             m3, m1
395    paddsw               m0, m2
396    paddsw               m1, m3
397    punpckhwd            m2, m0, m1
398    punpcklwd            m0, m1
399    punpckhwd            m1, m0, m2            ;high: in3 ;low :in2
400    punpcklwd            m0, m2                ;high: in1 ;low: in0
401    jmp                tx2q
402
403.pass2:
404    mova                 m3, [o(pw_1697x8)]
405    pmulhrsw             m2, m3, m0
406    pmulhrsw             m3, m1
407    paddsw               m0, m2
408    paddsw               m1, m3
409    jmp m(iadst_4x4_internal_8bpc).end
410
; IWHT4_1D_PACKED
; One 4-point inverse Walsh-Hadamard pass using only adds, subtracts and a
; single arithmetic shift.
;   in : m0 = in0 (low qword) | in1 (high), m1 = in2 (low) | in3 (high)
;   out: out0 in the high qword of m0, out1 in the high qword of m1,
;        out2 in the low qword of m1, out3 in the low qword of m2
; Clobbers m3.
411%macro IWHT4_1D_PACKED 0
412    punpckhqdq           m3, m0, m1            ;low: in1 high: in3
413    punpcklqdq           m0, m1                ;low: in0 high: in2
414    psubw                m2, m0, m3            ;low: in0 - in1 high: in2 - in3
415    paddw                m0, m3                ;low: in0 + in1 high: in2 + in3
416    punpckhqdq           m2, m2                ;t2 t2
417    punpcklqdq           m0, m0                ;t0 t0
418    psubw                m1, m0, m2
419    psraw                m1, 1                 ;t4 t4
420    psubw                m1, m3                ;low: t1/out2 high: t3/out1
421    psubw                m0, m1                ;high: out0
422    paddw                m2, m1                ;low: out3
423%endmacro
424
425INIT_XMM sse2
; inv_txfm_add_wht_wht_4x4_8bpc(dst, stride, coeff)
; 4x4 inverse Walsh-Hadamard transform added to dst. Loads the 16
; coefficients, zeroes the coefficient buffer, shifts the inputs right by 2
; (arithmetic), runs two 1-D passes with a transpose in between and adds the
; result to dst without any rounding multiply (ITX4_END rnd = 0).
426cglobal inv_txfm_add_wht_wht_4x4_8bpc, 3, 3, 4, dst, stride, coeff
427    mova                 m0, [coeffq+16*0]
428    mova                 m1, [coeffq+16*1]
429    pxor                 m2, m2
430    mova      [coeffq+16*0], m2
431    mova      [coeffq+16*1], m2
432    psraw                m0, 2
433    psraw                m1, 2
434    IWHT4_1D_PACKED
; Transpose: gather out0/out1 (high qwords) and out2/out3 (low qwords) back
; into the packed input layout for the second pass.
435    punpckhwd            m0, m1
436    punpcklwd            m3, m1, m2
437    punpckhdq            m1, m0, m3
438    punpckldq            m0, m3
439    IWHT4_1D_PACKED
; m0 becomes out0 (low) | out3 (high); m1 is out2 (low) | out1 (high),
; matching the 0, 3, 2, 1 row order below.
440    shufpd               m0, m2, 0x01
441    ITX4_END              0, 3, 2, 1, 0
442
; IDCT8_1D_PACKED
; One 8-point inverse DCT pass on four packed registers (two inputs each).
;   in : m0-m3, unpacked below into the pairs (in1,in7), (in0,in4),
;        (in5,in3) and (in2,in6)
;   out: m0 = out0 | out1, m1 = out3 | out2, m2 = out4 | out5,
;        m3 = out7 | out6   (low | high qword)
; Clobbers m4 and m6 (m6 is loaded with pd_2048).
443%macro IDCT8_1D_PACKED 0
444    mova                 m6, [o(pd_2048)]
445    punpckhwd            m4, m0, m3                 ;unpacked in1 in7
446    punpcklwd            m0, m2                     ;unpacked in0 in4
447    punpckhwd            m2, m1                     ;unpacked in5 in3
448    punpcklwd            m1, m3                     ;unpacked in2 in6
449    ITX_MUL2X_PACK        4, 3, 6,  799, 4017       ;low: t7a high: t4a
450    ITX_MUL2X_PACK        2, 3, 6, 3406, 2276       ;low: t6a high: t5a
451    ITX_MUL2X_PACK        1, 3, 6, 1567, 3784       ;low: t3  high: t2
452    psubsw               m3, m4, m2                 ;low: t6a high: t5a
453    paddsw               m4, m2                     ;low: t7  high: t4
; Interleave t6a/t5a words so the next rotation can treat them as pairs.
454    pshufb               m3, [o(deint_shuf1)]
455    ITX_MUL2X_PACK        0, 2, 6, 2896, 2896       ;low: t0  high: t1
456    ITX_MUL2X_PACK        3, 2, 6, 2896, 2896       ;low: t6  high: t5
457    psubsw               m2, m0, m1                 ;low: tmp3 high: tmp2
458    paddsw               m0, m1                     ;low: tmp0 high: tmp1
459    punpcklqdq           m1, m4, m3                 ;low: t7   high: t6
460    punpckhqdq           m4, m3                     ;low: t4   high: t5
461    psubsw               m3, m0, m1                 ;low: out7 high: out6
462    paddsw               m0, m1                     ;low: out0 high: out1
463    paddsw               m1, m2, m4                 ;low: out3 high: out2
464    psubsw               m2, m4                     ;low: out4 high: out5
465%endmacro
466
; ITX_MULSUB_2W src1, src2, tmp1, tmp2, rnd, coef1, coef2[, dst2_in_tmp1]
; Rotation of two full word vectors (8 lanes each):
467;dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
468;dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
; dst1 is always returned in m<src1>. dst2 is returned in m<src2>, or in
; m<tmp1> when dst2_in_tmp1 is non-zero.
; When coef2 is a number below 8, coef1/coef2 are taken as register numbers
; already holding the multiplier vectors; otherwise they name the
; pw_<coef2>_<coef1> and pw_<coef1>_m<coef2> constants.
; m<rnd> must hold the dword rounding bias. Clobbers m<tmp1> and m<tmp2>.
469%macro ITX_MULSUB_2W 7-8 0 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2_in_tmp1
470    punpckhwd           m%4, m%1, m%2
471    punpcklwd           m%1, m%2
472%if %7 < 8
473    pmaddwd             m%2, m%7, m%1
474    pmaddwd             m%3, m%7, m%4
475%else
476    mova                m%2, [o(pw_%7_%6)]
477%if %8
478    pmaddwd             m%3, m%1, m%2
479    pmaddwd             m%2, m%4
480%else
481    pmaddwd             m%3, m%4, m%2
482    pmaddwd             m%2, m%1
483%endif
484%endif
485    paddd               m%3, m%5
486    paddd               m%2, m%5
487    psrad               m%3, 12
488    psrad               m%2, 12
489%if %8
490    packssdw            m%3, m%2
491%else
492    packssdw            m%2, m%3                 ;dst2
493%endif
; Second product. The scratch register used for the constant is whichever
; one does not hold dst2 at this point.
494%if %7 < 8
495    pmaddwd             m%4, m%6
496    pmaddwd             m%1, m%6
497%elif %8
498    mova                m%2, [o(pw_%6_m%7)]
499    pmaddwd             m%4, m%2
500    pmaddwd             m%1, m%2
501%else
502    mova                m%3, [o(pw_%6_m%7)]
503    pmaddwd             m%4, m%3
504    pmaddwd             m%1, m%3
505%endif
506    paddd               m%4, m%5
507    paddd               m%1, m%5
508    psrad               m%4, 12
509    psrad               m%1, 12
510    packssdw            m%1, m%4                 ;dst1
511%endmacro
512
; IDCT4_1D src1, src2, src3, src4, tmp1, tmp2, pd_2048
; 4-point inverse DCT on four full-width registers (8 lanes per register).
; in0..in3 arrive in m<src1>..m<src4> and out0..out3 are returned in the same
; registers. m<pd_2048> must already hold the rounding bias.
; Clobbers m<tmp1> and m<tmp2>.
513%macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048
514    ITX_MULSUB_2W        %2, %4, %5, %6, %7, 1567, 3784, 1 ;t2, t3
515    ITX_MULSUB_2W        %1, %3, %4, %6, %7, 2896, 2896, 1 ;t1, t0
516    psubsw              m%3, m%1, m%2                      ;out2
517    paddsw              m%2, m%1                           ;out1
518    paddsw              m%1, m%5, m%4                      ;out0
519    psubsw              m%4, m%5                           ;out3
520%endmacro
521
; WRITE_4X8 row1, row2, row3, row4
; Adds a 4x8 block of residuals to dst as two 4x4 halves: m0/m1 go to the
; first four rows, m2/m3 to the next four, both with the same row order.
; Advances dstq by four rows in between; clobbers m4-m6 and r2.
522%macro WRITE_4X8 4 ;row[1-4]
523    WRITE_4X4             0, 1, 4, 5, 6, %1, %2, %3, %4
524    lea                dstq, [dstq+strideq*4]
525    WRITE_4X4             2, 3, 4, 5, 6, %1, %2, %3, %4
526%endmacro
527
; INV_4X8
; Rearranges the first-pass output in m0-m3 with word/dword interleaves so
; that each register holds two packed inputs for the second pass, as listed
; in the per-line comments. Clobbers m4.
528%macro INV_4X8 0
529    punpckhwd            m4, m2, m3
530    punpcklwd            m2, m3
531    punpckhwd            m3, m0, m1
532    punpcklwd            m0, m1
533    punpckhdq            m1, m0, m2                  ;low: in2 high: in3
534    punpckldq            m0, m2                      ;low: in0 high: in1
535    punpckldq            m2, m3, m4                  ;low: in4 high: in5
536    punpckhdq            m3, m4                      ;low: in6 high: in7
537%endmacro
538
; INV_TXFM_4X8_FN type1, type2
; Emits the 4x8 entry point for the given transform pair (8 xmm registers).
; For dct_dct it also emits the eob == 0 shortcut: the first coefficient is
; broadcast, scaled three times with pmulhrsw by pw_2896x8 and once by
; pw_2048, copied to m0-m3, and added to dst through iadst_4x8's .end3.
539%macro INV_TXFM_4X8_FN 2 ; type1, type2
540    INV_TXFM_FN          %1, %2, 4x8, 8
541%ifidn %1_%2, dct_dct
542    pshuflw              m0, [coeffq], q0000
543    punpcklqdq           m0, m0
544    mova                 m1, [o(pw_2896x8)]
545    pmulhrsw             m0, m1
; eobd is zero on this path, so this store clears the first coefficient.
546    mov           [coeffq], eobd
547    pmulhrsw             m0, m1
548    pmulhrsw             m0, m1
549    pmulhrsw             m0, [o(pw_2048)]
550    mova                 m1, m0
551    mova                 m2, m0
552    mova                 m3, m0
553    TAIL_CALL m(iadst_4x8_internal_8bpc).end3
554%endif
555%endmacro
556
557INIT_XMM ssse3
; Public 4x8 entry points whose first pass is the DCT below.
558INV_TXFM_4X8_FN dct, dct
559INV_TXFM_4X8_FN dct, adst
560INV_TXFM_4X8_FN dct, flipadst
561INV_TXFM_4X8_FN dct, identity
562
; idct_4x8_internal_8bpc
; Loads the four coefficient vectors, pre-scaled by 2896/4096 (pmulhrsw by
; pw_2896x8).
; .pass1: 4-point IDCT on full registers (idct_8x4's .main), then the
;         shared transpose-and-dispatch code at iadst_4x8's .pass1_end.
; .pass2: packed 8-point IDCT (.main), swap the qword halves of m1 and m3,
;         then the shared rounding/store tail at iadst_4x8's .end2 with
;         m4 = pw_2048.
563cglobal idct_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
564    mova                 m3, [o(pw_2896x8)]
565    pmulhrsw             m0, m3, [coeffq+16*0]
566    pmulhrsw             m1, m3, [coeffq+16*1]
567    pmulhrsw             m2, m3, [coeffq+16*2]
568    pmulhrsw             m3,     [coeffq+16*3]
569
570.pass1:
571    call m(idct_8x4_internal_8bpc).main
572    jmp m(iadst_4x8_internal_8bpc).pass1_end
573
574.pass2:
575    call .main
; IDCT8_1D_PACKED leaves m1 as out3|out2 and m3 as out7|out6; swap halves
; so every register is in ascending row order for the store.
576    shufps               m1, m1, q1032
577    shufps               m3, m3, q1032
578    mova                 m4, [o(pw_2048)]
579    jmp m(iadst_4x8_internal_8bpc).end2
580
; .main: packed 8-point IDCT; see IDCT8_1D_PACKED for the register contract.
581ALIGN function_align
582cglobal_label .main
583    IDCT8_1D_PACKED
584    ret
585
586
; Public 4x8 entry points whose first pass is the ADST below.
587INV_TXFM_4X8_FN adst, dct
588INV_TXFM_4X8_FN adst, adst
589INV_TXFM_4X8_FN adst, flipadst
590INV_TXFM_4X8_FN adst, identity
591
; iadst_4x8_internal_8bpc
; Loads the four coefficient vectors pre-scaled by 2896/4096.
; .pass1     : 4-point ADST on full registers (iadst_8x4's .main)
; .pass1_end : transpose (INV_4X8) and jump to the selected second pass;
;              also entered from the dct and identity 4x8 first passes
; .pass2     : packed 8-point ADST (.main), then the shared tail:
;   .end  : build the scaling vector from the low qwords of m4 and m5
;   .end2 : pmulhrsw m0-m3 by m4 and zero the coefficient buffer
;   .end3 : add to dst (rows in order) and RET
593cglobal iadst_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
593    mova                 m3, [o(pw_2896x8)]
594    pmulhrsw             m0, m3, [coeffq+16*0]
595    pmulhrsw             m1, m3, [coeffq+16*1]
596    pmulhrsw             m2, m3, [coeffq+16*2]
597    pmulhrsw             m3,     [coeffq+16*3]
598
599.pass1:
600    call m(iadst_8x4_internal_8bpc).main
601
602.pass1_end:
603    INV_4X8
604    jmp                tx2q
605
606.pass2:
; Swap the qword halves of m0 and m1 to match the input order .main unpacks.
607    shufps               m0, m0, q1032
608    shufps               m1, m1, q1032
609    call .main
; m4 = +2048, m5 = -2048: .main returns the odd outputs negated in the high
; qwords (see its comments), so the high half is scaled by -2048.
610    mova                 m4, [o(pw_2048)]
611    pxor                 m5, m5
612    psubw                m5, m4
613
614.end:
615    punpcklqdq           m4, m5
616
617.end2:
618    pmulhrsw             m0, m4
619    pmulhrsw             m1, m4
620    pmulhrsw             m2, m4
621    pmulhrsw             m3, m4
622    pxor                 m5, m5
623    mova      [coeffq+16*0], m5
624    mova      [coeffq+16*1], m5
625    mova      [coeffq+16*2], m5
626    mova      [coeffq+16*3], m5
627
628.end3:
629    WRITE_4X8             0, 1, 2, 3
630    RET
631
; .main: packed 8-point inverse ADST.
;   in : m0-m3, unpacked below into (in7,in0), (in5,in2), (in3,in4), (in1,in6)
;   out: m0 = out0 | -out1, m1 = out2 | -out3, m2 = out4 | -out5,
;        m3 = out6 | -out7   (low | high qword)
; Clobbers m4-m7.
632ALIGN function_align
633cglobal_label .main
634    mova                 m6, [o(pd_2048)]
635    punpckhwd            m4, m3, m0                ;unpacked in7 in0
636    punpckhwd            m5, m2, m1                ;unpacked in5 in2
637    punpcklwd            m1, m2                    ;unpacked in3 in4
638    punpcklwd            m0, m3                    ;unpacked in1 in6
639    ITX_MUL2X_PACK        4, 2, 6,  401, 4076      ;low:  t0a   high:  t1a
640    ITX_MUL2X_PACK        5, 2, 6, 1931, 3612      ;low:  t2a   high:  t3a
641    ITX_MUL2X_PACK        1, 2, 6, 3166, 2598      ;low:  t4a   high:  t5a
642    ITX_MUL2X_PACK        0, 2, 6, 3920, 1189      ;low:  t6a   high:  t7a
643
644    psubsw               m3, m4, m1                ;low:  t4    high:  t5
645    paddsw               m4, m1                    ;low:  t0    high:  t1
646    psubsw               m2, m5, m0                ;low:  t6    high:  t7
647    paddsw               m5, m0                    ;low:  t2    high:  t3
648
649    shufps               m1, m3, m2, q1032
650    punpckhwd            m2, m1
651    punpcklwd            m3, m1
652    ITX_MUL2X_PACK        3, 0, 6, 1567, 3784, 1   ;low:  t5a   high:  t4a
653    ITX_MUL2X_PACK        2, 0, 6, 3784, 1567      ;low:  t7a   high:  t6a
654
655    psubsw               m1, m4, m5                ;low:  t2    high:  t3
656    paddsw               m4, m5                    ;low:  out0  high: -out7
657    psubsw               m5, m3, m2                ;low:  t7    high:  t6
658    paddsw               m3, m2                    ;low:  out6  high: -out1
659    shufps               m0, m4, m3, q3210         ;low:  out0  high: -out1
660    shufps               m3, m4, q3210             ;low:  out6  high: -out7
661
; Final stage: rotate the (t2,t3) and (t6,t7) pairs with the 2896 constants
; held in registers m2 and m7.
662    mova                 m2, [o(pw_2896_m2896)]
663    mova                 m7, [o(pw_2896_2896)]
664    shufps               m4, m1, m5, q1032         ;low:  t3    high:  t7
665    shufps               m1, m5, q3210             ;low:  t2    high:  t6
666    punpcklwd            m5, m1, m4
667    punpckhwd            m1, m4
668    pmaddwd              m4, m2, m1                ;-out5
669    pmaddwd              m2, m5                    ; out4
670    pmaddwd              m1, m7                    ; out2
671    pmaddwd              m5, m7                    ;-out3
672    REPX      {paddd x, m6}, m4, m2, m1, m5
673    REPX      {psrad x, 12}, m4, m2, m1, m5
674    packssdw             m1, m5                    ;low:  out2  high: -out3
675    packssdw             m2, m4                    ;low:  out4  high: -out5
676    ret
677
; Public 4x8 entry points whose first pass is the flipped ADST below.
678INV_TXFM_4X8_FN flipadst, dct
679INV_TXFM_4X8_FN flipadst, adst
680INV_TXFM_4X8_FN flipadst, flipadst
681INV_TXFM_4X8_FN flipadst, identity
682
; iflipadst_4x8_internal_8bpc
; Uses the same arithmetic kernels as the ADST (iadst_8x4's .main in pass 1,
; iadst_4x8's .main in pass 2) and flips by reversing the register order
; around them.
683cglobal iflipadst_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
684    mova                 m3, [o(pw_2896x8)]
685    pmulhrsw             m0, m3, [coeffq+16*0]
686    pmulhrsw             m1, m3, [coeffq+16*1]
687    pmulhrsw             m2, m3, [coeffq+16*2]
688    pmulhrsw             m3,     [coeffq+16*3]
689
690.pass1:
691    call m(iadst_8x4_internal_8bpc).main
692
; Transpose with the source registers taken in reverse order (m3..m0).
693    punpcklwd            m4, m3, m2
694    punpckhwd            m3, m2
695    punpcklwd            m5, m1, m0
696    punpckhwd            m1, m0
697    punpckldq            m2, m3, m1                  ;low: in4 high: in5
698    punpckhdq            m3, m1                      ;low: in6 high: in7
699    punpckldq            m0, m4, m5                  ;low: in0 high: in1
700    punpckhdq            m1, m4, m5                  ;low: in2 high: in3
701    jmp                tx2q
702
703.pass2:
704    shufps               m0, m0, q1032
705    shufps               m1, m1, q1032
706    call m(iadst_4x8_internal_8bpc).main
707
; Reverse the output order: registers m0..m3 become m3..m0 with their qword
; halves swapped.
708    mova                 m4, m0
709    mova                 m5, m1
710    pshufd               m0, m3, q1032
711    pshufd               m1, m2, q1032
712    pshufd               m2, m5, q1032
713    pshufd               m3, m4, q1032
; After the half swap the negated outputs sit in the low qwords, so here
; m4 = -2048 and m5 = +2048 (the opposite of the iadst pass 2) before the
; shared .end combines their low qwords.
714    mova                 m5, [o(pw_2048)]
715    pxor                 m4, m4
716    psubw                m4, m5
717    jmp m(iadst_4x8_internal_8bpc).end
718
; Public 4x8 entry points whose first pass is the identity transform below.
719INV_TXFM_4X8_FN identity, dct
720INV_TXFM_4X8_FN identity, adst
721INV_TXFM_4X8_FN identity, flipadst
722INV_TXFM_4X8_FN identity, identity
723
; iidentity_4x8_internal_8bpc
; Loads the four coefficient vectors pre-scaled by 2896/4096.
; .pass1: x + round(x * 1697/4096) on every lane, then the shared transpose
;         at iadst_4x8's .pass1_end.
; .pass2: no arithmetic of its own; goes to the shared tail at iadst_4x8's
;         .end2 with m4 = pw_4096, so the only scaling is that tail's
;         pmulhrsw, i.e. (x + 4) >> 3.
724cglobal iidentity_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
725    mova                 m3, [o(pw_2896x8)]
726    pmulhrsw             m0, m3, [coeffq+16*0]
727    pmulhrsw             m1, m3, [coeffq+16*1]
728    pmulhrsw             m2, m3, [coeffq+16*2]
729    pmulhrsw             m3,     [coeffq+16*3]
730
731.pass1:
732    mova                 m7, [o(pw_1697x8)]
733    pmulhrsw             m4, m7, m0
734    pmulhrsw             m5, m7, m1
735    pmulhrsw             m6, m7, m2
736    pmulhrsw             m7, m3
737    paddsw               m0, m4
738    paddsw               m1, m5
739    paddsw               m2, m6
740    paddsw               m3, m7
741    jmp m(iadst_4x8_internal_8bpc).pass1_end
742
743.pass2:
744    mova                 m4, [o(pw_4096)]
745    jmp m(iadst_4x8_internal_8bpc).end2
746
747
; WRITE_8X2 coef1, coef2, tmp1, tmp2, tmp3
; Adds two rows of eight word residuals to the two 8-pixel rows at dstq and
; dstq+strideq and stores them back with unsigned byte saturation.
; coef1/coef2 may each be a register number or any other operand
; (used verbatim as the paddw source, e.g. a memory reference).
; Clobbers m<tmp1>..m<tmp3>; dstq is not advanced.
748%macro WRITE_8X2 5       ;coefs[1-2], tmp[1-3]
749    movq                 m%3, [dstq        ]
750    movq                 m%4, [dstq+strideq]
751    pxor                 m%5, m%5
752    punpcklbw            m%3, m%5                 ;extend byte to word
753    punpcklbw            m%4, m%5                 ;extend byte to word
754%ifnum %1
755    paddw                m%3, m%1
756%else
757    paddw                m%3, %1
758%endif
759%ifnum %2
760    paddw                m%4, m%2
761%else
762    paddw                m%4, %2
763%endif
764    packuswb             m%3, m%4
765    movq      [dstq        ], m%3
766    punpckhqdq           m%3, m%3
767    movq      [dstq+strideq], m%3
768%endmacro
769
; WRITE_8X4 coef1, coef2, coef3, coef4, tmp1, tmp2, tmp3
; Adds four rows of eight residuals to dst as two WRITE_8X2 steps.
; dstq is advanced by two rows in between and is left pointing at the third
; row afterwards. Clobbers m<tmp1>..m<tmp3>.
770%macro WRITE_8X4 7      ;coefs[1-4], tmp[1-3]
771    WRITE_8X2             %1, %2, %5, %6, %7
772    lea                dstq, [dstq+strideq*2]
773    WRITE_8X2             %3, %4, %5, %6, %7
774%endmacro
775
; INV_TXFM_8X4_FN type1, type2
; Emits the 8x4 entry point for the given transform pair (8 xmm registers).
; For dct_dct it also emits the eob == 0 shortcut: the first coefficient is
; broadcast, scaled three times with pmulhrsw by pw_2896x8 and once by
; pw_2048, copied to m0-m3, and handed to iadst_8x4's .end2, which zeroes
; the coefficient buffer and adds the block to dst.
776%macro INV_TXFM_8X4_FN 2 ; type1, type2
777    INV_TXFM_FN          %1, %2, 8x4, 8
778%ifidn %1_%2, dct_dct
779    pshuflw              m0, [coeffq], q0000
780    punpcklqdq           m0, m0
781    mova                 m1, [o(pw_2896x8)]
782    pmulhrsw             m0, m1
783    pmulhrsw             m0, m1
784    mova                 m2, [o(pw_2048)]
785    pmulhrsw             m0, m1
786    pmulhrsw             m0, m2
787    mova                 m1, m0
788    mova                 m2, m0
789    mova                 m3, m0
790    TAIL_CALL m(iadst_8x4_internal_8bpc).end2
791%endif
792%endmacro
793
; Public 8x4 entry points whose first pass is the DCT below.
794INV_TXFM_8X4_FN dct, dct
795INV_TXFM_8X4_FN dct, adst
796INV_TXFM_8X4_FN dct, flipadst
797INV_TXFM_8X4_FN dct, identity
798
; idct_8x4_internal_8bpc
; Loads the four coefficient vectors pre-scaled by 2896/4096.
; First pass : packed 8-point IDCT (idct_4x8's .main), then a shuffle and
;              transpose into four full rows in0..in3, then `jmp tx2q`.
; .pass2     : 4-point IDCT on full registers (.main), then the shared
;              rounding/store tail at iadst_8x4's .end.
799cglobal idct_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
800    mova                 m3, [o(pw_2896x8)]
801    pmulhrsw             m0, m3, [coeffq+16*0]
802    pmulhrsw             m1, m3, [coeffq+16*1]
803    pmulhrsw             m2, m3, [coeffq+16*2]
804    pmulhrsw             m3,     [coeffq+16*3]
805
806    call m(idct_4x8_internal_8bpc).main
807
; m1 and m3 come back with their halves in descending order (out3|out2,
; out7|out6), so they use the pair-swapped mask deint_shuf2.
808    mova                 m4, [o(deint_shuf1)]
809    mova                 m5, [o(deint_shuf2)]
810    pshufb               m0, m4
811    pshufb               m1, m5
812    pshufb               m2, m4
813    pshufb               m3, m5
814    punpckhdq            m4, m0, m1
815    punpckldq            m0, m1
816    punpckhdq            m5, m2, m3
817    punpckldq            m2, m3
818    punpckhqdq           m1, m0, m2                      ;in1
819    punpcklqdq           m0, m2                          ;in0
820    punpckhqdq           m3, m4, m5                      ;in3
821    punpcklqdq           m2 ,m4, m5                      ;in2
822    jmp                tx2q
823
824.pass2:
825    call .main
826    jmp m(iadst_8x4_internal_8bpc).end
827
; .main: 4-point IDCT on m0-m3 (8 lanes each); clobbers m4-m6.
828ALIGN function_align
829cglobal_label .main
830    mova                 m6, [o(pd_2048)]
831    IDCT4_1D             0, 1, 2, 3, 4, 5, 6
832    ret
833
; Instantiate the 8x4 entry points whose first transform is ADST, one per
; second-transform type. INV_TXFM_8X4_FN is defined outside this chunk.
834INV_TXFM_8X4_FN adst, dct
835INV_TXFM_8X4_FN adst, adst
836INV_TXFM_8X4_FN adst, flipadst
837INV_TXFM_8X4_FN adst, identity
838
; iadst_8x4_internal_8bpc
;   Pass 1: scales the four 16-byte coefficient rows by pw_2896x8, runs the
;   4x8 ADST core, negates two of the interleaved results and transposes so
;   that m0-m3 hold in0-in3 for the second pass, then jumps to tx2q.
;   .pass2 runs the 4-point ADST in .main and falls through to the shared
;   output tail: .end (scale by pw_2048), .end2 (zero the coefficients),
;   .end3 (WRITE_8X4 + RET). Other 8x4 transforms in this file jump to .end.
839cglobal iadst_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
840    mova                 m3, [o(pw_2896x8)]
841    pmulhrsw             m0, m3, [coeffq+16*0]
842    pmulhrsw             m1, m3, [coeffq+16*1]
843    pmulhrsw             m2, m3, [coeffq+16*2]
844    pmulhrsw             m3,     [coeffq+16*3]         ;m3 held the constant until now
845
; NOTE(review): q1032 presumably swaps the low/high qwords of m0 and m1 to
; match the input layout expected by the 4x8 ADST core - confirm.
846    shufps               m0, m0, q1032
847    shufps               m1, m1, q1032
848    call m(iadst_4x8_internal_8bpc).main
849
; Interleave the results; m4 and m1 are negated (0 - x, saturating) before
; the dword/word unpacks that finish the transpose.
850    punpckhwd            m4, m0, m1
851    punpcklwd            m0, m1
852    punpckhwd            m1, m2, m3
853    punpcklwd            m2, m3
854    pxor                 m5, m5
855    psubsw               m3, m5, m1                    ;m3 = -m1
856    psubsw               m5, m4                        ;m5 = -m4
857    punpckhdq            m4, m5, m3
858    punpckldq            m5, m3
859    punpckhdq            m3, m0, m2
860    punpckldq            m0, m2
861    punpckhwd            m1, m0, m5      ;in1
862    punpcklwd            m0, m5          ;in0
863    punpcklwd            m2, m3, m4      ;in2
864    punpckhwd            m3, m4          ;in3
865    jmp              tx2q
866
; Second pass: 4-point ADST over m0-m3, then the shared output tail.
867.pass2:
868    call .main
869
; Shared tail: final scaling of m0-m3 by pw_2048.
870.end:
871    mova                 m4, [o(pw_2048)]
872    pmulhrsw             m0, m4
873    pmulhrsw             m1, m4
874    pmulhrsw             m2, m4
875    pmulhrsw             m3, m4
876
; Clear the four 16-byte coefficient rows that were consumed.
877.end2:
878    pxor                 m6, m6
879    mova      [coeffq+16*0], m6
880    mova      [coeffq+16*1], m6
881    mova      [coeffq+16*2], m6
882    mova      [coeffq+16*3], m6
; Output m0-m3 through WRITE_8X4 (m4-m6 are passed as its extra arguments).
883.end3:
884    WRITE_8X4             0, 1, 2, 3, 4, 5, 6
885    RET
886
887ALIGN function_align
; .main: 4-point inverse ADST applied to eight 16-bit columns.
;   In:  m0-m3 = in0-in3 (8 words each)
;   Out: m0-m3 = out0-out3
;   Clobbers m4-m7.
; Products are formed as 32-bit pmaddwd sums, rounded with pd_2048, shifted
; right by 12 and packed back to words with signed saturation. The low four
; columns (m0/m1 after interleaving) and the high four (m6/m7) are processed
; by two identical instruction sequences.
888cglobal_label .main
889    punpckhwd            m6, m0, m2                    ;unpacked in0 in2
890    punpcklwd            m0, m2                        ;unpacked in0 in2
891    punpckhwd            m7, m1, m3                    ;unpacked in1 in3
892    punpcklwd            m1, m3                        ;unpacked in1 in3
893
; out2 for both halves: 3344 * (in0 - in2 + in3)
894    mova                 m2, [o(pw_3344_m3344)]
895    mova                 m4, [o(pw_0_3344)]
896    pmaddwd              m3, m2, m6                    ;3344 * in0 - 3344 * in2
897    pmaddwd              m5, m4, m7                    ;3344 * in3
898    pmaddwd              m2, m0
899    pmaddwd              m4, m1
900    paddd                m3, m5
901    paddd                m2, m4
902    mova                 m4, [o(pd_2048)]
903    paddd                m3, m4                        ;t2 + 2048
904    paddd                m2, m4
905    psrad                m3, 12
906    psrad                m2, 12
907    packssdw             m2, m3                        ;out2
908
; out0/out1/out3 for the low four columns
909    pmaddwd              m4, m0, [o(pw_1321_3803)]     ;1321 * in0 + 3803 * in2
910    pmaddwd              m0, [o(pw_2482_m1321)]        ;2482 * in0 - 1321 * in2
911    pmaddwd              m3, m1, [o(pw_3344_2482)]     ;3344 * in1 + 2482 * in3
912    pmaddwd              m5, m1, [o(pw_3344_m3803)]    ;3344 * in1 - 3803 * in3
913    paddd                m3, m4                        ;t0 + t3
914
915    pmaddwd              m1, [o(pw_m6688_m3803)]       ;-2 * 3344 * in1 - 3803 * in3
916    mova                 m4, [o(pd_2048)]
917    paddd                m0, m4
918    paddd                m4, m3                        ;t0 + t3 + 2048
919    paddd                m5, m0                        ;t1 + t3 + 2048
920    paddd                m3, m0
921    paddd                m3, m1                        ;t0 + t1 - t3 + 2048
922
923    psrad                m4, 12                        ;out0
924    psrad                m5, 12                        ;out1
925    psrad                m3, 12                        ;out3
926    packssdw             m0, m4, m5                    ;low: out0  high: out1
927
; out0/out1/out3 for the high four columns (same sequence on m6/m7)
928    pmaddwd              m4, m6, [o(pw_1321_3803)]     ;1321 * in0 + 3803 * in2
929    pmaddwd              m6, [o(pw_2482_m1321)]        ;2482 * in0 - 1321 * in2
930    pmaddwd              m1, m7, [o(pw_3344_2482)]     ;3344 * in1 + 2482 * in3
931    pmaddwd              m5, m7, [o(pw_3344_m3803)]    ;3344 * in1 - 3803 * in3
932    paddd                m1, m4                        ;t0 + t3
933    pmaddwd              m7, [o(pw_m6688_m3803)]       ;-2 * 3344 * in1 - 3803 * in3
934
935    mova                 m4, [o(pd_2048)]
936    paddd                m6, m4
937    paddd                m4, m1                        ;t0 + t3 + 2048
938    paddd                m5, m6                        ;t1 + t3 + 2048
939    paddd                m1, m6
940    paddd                m1, m7                        ;t0 + t1 - t3 + 2048
941
942    psrad                m4, 12                        ;out0
943    psrad                m5, 12                        ;out1
944    psrad                m1, 12                        ;out3
945    packssdw             m3, m1                        ;out3
946    packssdw             m4, m5                        ;low: out0  high: out1
947
; Regroup the two halves so each register holds one full 8-word output row.
948    punpckhqdq           m1, m0, m4                    ;out1
949    punpcklqdq           m0, m4                        ;out0
950    ret
951
; Instantiate the 8x4 entry points whose first transform is flipped ADST.
; INV_TXFM_8X4_FN is defined outside this chunk.
952INV_TXFM_8X4_FN flipadst, dct
953INV_TXFM_8X4_FN flipadst, adst
954INV_TXFM_8X4_FN flipadst, flipadst
955INV_TXFM_8X4_FN flipadst, identity
956
; iflipadst_8x4_internal_8bpc
;   Pass 1 matches iadst_8x4_internal_8bpc up to the call into the 4x8 ADST
;   core; the interleave that follows takes the result registers in the
;   opposite order (m3,m2,m1,m0) before the transpose.
;   Pass 2 runs the regular 8x4 ADST .main, reverses the order of m0-m3 and
;   reuses the ADST output tail.
957cglobal iflipadst_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
958    mova                 m3, [o(pw_2896x8)]
959    pmulhrsw             m0, m3, [coeffq+16*0]
960    pmulhrsw             m1, m3, [coeffq+16*1]
961    pmulhrsw             m2, m3, [coeffq+16*2]
962    pmulhrsw             m3,     [coeffq+16*3]
963
964    shufps               m0, m0, q1032
965    shufps               m1, m1, q1032
966    call m(iadst_4x8_internal_8bpc).main
967
968    punpckhwd            m5, m3, m2
969    punpcklwd            m3, m2
970    punpckhwd            m2, m1, m0
971    punpcklwd            m1, m0
972
; m2 and m5 are negated (0 - x, saturating) before the remaining unpacks.
973    pxor                 m0, m0
974    psubsw               m4, m0, m2                    ;m4 = -m2
975    psubsw               m0, m5                        ;m0 = -m5
976    punpckhdq            m2, m0, m4
977    punpckldq            m0, m4
978    punpckhdq            m4, m3, m1
979    punpckldq            m3, m1
980    punpckhwd            m1, m0, m3      ;in1
981    punpcklwd            m0, m3          ;in0
982    punpckhwd            m3, m2, m4      ;in3
983    punpcklwd            m2, m4          ;in2
984    jmp                  tx2q
985
986.pass2:
987    call m(iadst_8x4_internal_8bpc).main
; Reverse the row order: (m0, m1, m2, m3) <- (m3, m2, m1, m0), using m4/m5
; as temporaries.
988    mova                 m4, m0
989    mova                 m5, m1
990    mova                 m0, m3
991    mova                 m1, m2
992    mova                 m2, m5
993    mova                 m3, m4
994    jmp m(iadst_8x4_internal_8bpc).end
995
; Instantiate the 8x4 entry points whose first transform is identity.
; INV_TXFM_8X4_FN is defined outside this chunk.
996INV_TXFM_8X4_FN identity, dct
997INV_TXFM_8X4_FN identity, adst
998INV_TXFM_8X4_FN identity, flipadst
999INV_TXFM_8X4_FN identity, identity
1000
; iidentity_8x4_internal_8bpc
;   Pass 1: scales the four coefficient rows by pw_2896x8, doubles them with
;   signed saturation, transposes into m0-m3 and jumps to tx2q.
;   Pass 2: computes x + pmulhrsw(x, pw_1697x8) for m0-m3 and reuses the
;   iadst_8x4 output tail.
1001cglobal iidentity_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
1002    mova                 m3, [o(pw_2896x8)]
1003    pmulhrsw             m0, m3, [coeffq+16*0]
1004    pmulhrsw             m1, m3, [coeffq+16*1]
1005    pmulhrsw             m2, m3, [coeffq+16*2]
1006    pmulhrsw             m3,     [coeffq+16*3]
1007    paddsw               m0, m0                        ;x *= 2 (saturating)
1008    paddsw               m1, m1
1009    paddsw               m2, m2
1010    paddsw               m3, m3
1011
; Transpose via word and dword interleaves.
1012    punpckhwd            m4, m0, m1
1013    punpcklwd            m0, m1
1014    punpckhwd            m1, m2, m3
1015    punpcklwd            m2, m3
1016    punpckhdq            m5, m4, m1
1017    punpckldq            m4, m1
1018    punpckhdq            m3, m0, m2
1019    punpckldq            m0, m2
1020    punpckhwd            m1, m0, m4      ;in1
1021    punpcklwd            m0, m4          ;in0
1022    punpcklwd            m2, m3, m5      ;in2
1023    punpckhwd            m3, m5          ;in3
1024    jmp                tx2q
1025
1026.pass2:
1027    mova                 m7, [o(pw_1697x8)]
1028    pmulhrsw             m4, m7, m0
1029    pmulhrsw             m5, m7, m1
1030    pmulhrsw             m6, m7, m2
1031    pmulhrsw             m7, m3                        ;constant no longer needed
1032    paddsw               m0, m4                        ;x + pmulhrsw(x, pw_1697x8)
1033    paddsw               m1, m5
1034    paddsw               m2, m6
1035    paddsw               m3, m7
1036    jmp m(iadst_8x4_internal_8bpc).end
1037
; INV_TXFM_8X8_FN type1, type2
;   Emits the 8x8 entry point for the given transform pair through
;   INV_TXFM_FN (defined outside this chunk). For dct_dct it additionally
;   emits a path that works from the first coefficient only: the word is
;   broadcast to all lanes of m0, scaled, and written to both 8x4 halves.
;   The .end/.loop/.end3 labels only exist in the dct_dct instantiation.
1038%macro INV_TXFM_8X8_FN 2 ; type1, type2
1039    INV_TXFM_FN          %1, %2, 8x8, 8, 16*4
1040%ifidn %1_%2, dct_dct
1041    pshuflw              m0, [coeffq], q0000           ;replicate word 0 across the low qword
1042    punpcklwd            m0, m0                        ;...and across the whole register
1043    mova                 m1, [o(pw_2896x8)]
1044    pmulhrsw             m0, m1
1045    mova                 m2, [o(pw_16384)]
; NOTE(review): presumably eobd is 0 on this path, so this store clears the
; coefficient that was just read - confirm against INV_TXFM_FN.
1046    mov            [coeffq], eobd
1047    pmulhrsw             m0, m2
1048    psrlw                m2, 3                         ;16384 >> 3 = 2048
1049    pmulhrsw             m0, m1
1050    pmulhrsw             m0, m2
1051.end:
1052    mov                 r3d, 2                         ;two WRITE_8X4 iterations
1053    lea                tx2q, [o(m(inv_txfm_add_dct_dct_8x8_8bpc).end3)]
1054.loop:
1055    WRITE_8X4             0, 0, 0, 0, 1, 2, 3
1056    lea                dstq, [dstq+strideq*2]
1057    dec                 r3d
1058    jg .loop
1059    jmp                tx2q                            ;continuation chosen by whoever entered the loop
1060.end3:
1061    RET
1062%endif
1063%endmacro
1064
; LOAD_8ROWS src, stride[, is_rect2]
;   Fills m0-m7 with eight consecutive 16-byte rows read from
;   [src + stride*0] ... [src + stride*7].
;   is_rect2 (optional, default 0): when non-zero, every row is multiplied
;   by pw_2896x8 with pmulhrsw as it is loaded instead of being copied as-is.
%macro LOAD_8ROWS 2-3 0 ; src, stride, is_rect2
%ifn %3
    ; plain copy of the eight rows
    mova                 m0, [%1+%2*0]
    mova                 m1, [%1+%2*1]
    mova                 m2, [%1+%2*2]
    mova                 m3, [%1+%2*3]
    mova                 m4, [%1+%2*4]
    mova                 m5, [%1+%2*5]
    mova                 m6, [%1+%2*6]
    mova                 m7, [%1+%2*7]
%else
    ; m7 carries the scale factor until its own row is processed last
    mova                 m7, [o(pw_2896x8)]
    pmulhrsw             m0, m7, [%1+%2*0]
    pmulhrsw             m1, m7, [%1+%2*1]
    pmulhrsw             m2, m7, [%1+%2*2]
    pmulhrsw             m3, m7, [%1+%2*3]
    pmulhrsw             m4, m7, [%1+%2*4]
    pmulhrsw             m5, m7, [%1+%2*5]
    pmulhrsw             m6, m7, [%1+%2*6]
    pmulhrsw             m7, [%1+%2*7]
%endif
%endmacro
1087
; IDCT8_1D_ODDHALF src1, src2, src3, src4, tmp1, tmp2, pd_2048
;   Odd half of the 8-point inverse DCT, built from three ITX_MULSUB_2W
;   rotations (macro defined outside this chunk) and saturating butterflies.
;   All arguments are register numbers. As consumed by idct_8x8 .main, the
;   results end up as m%1 = t4, m%2 = t5, m%3 = t6, m%4 = t7.
;   m%5 and m%6 are scratch; m%7 is passed through to ITX_MULSUB_2W as the
;   rounding constant.
1088%macro IDCT8_1D_ODDHALF 7 ; src[1-4], tmp[1-2], pd_2048
1089    ITX_MULSUB_2W         %1, %4, %5, %6, %7,  799, 4017    ;t4a, t7a
1090    ITX_MULSUB_2W         %3, %2, %5, %6, %7, 3406, 2276, 1 ;t5a, t6a
1091    psubsw               m%2, m%4, m%5                      ;t6a
1092    paddsw               m%4, m%5                           ;t7
1093    psubsw               m%5, m%1, m%3                      ;t5a
1094    paddsw               m%1, m%3                           ;t4
1095    ITX_MULSUB_2W         %2, %5, %3, %6, %7, 2896, 2896, 1 ;t5, t6
1096%endmacro
1097
; Instantiate the 8x8 entry points whose first transform is DCT.
1098INV_TXFM_8X8_FN dct, dct
1099INV_TXFM_8X8_FN dct, adst
1100INV_TXFM_8X8_FN dct, flipadst
1101INV_TXFM_8X8_FN dct, identity
1102
; idct_8x8_internal_8bpc
;   Pass 1: load eight coefficient rows, run the 8-point IDCT (.main), scale
;   by pw_16384, transpose the 8x8 block of words and jump to tx2q.
;   Pass 2: run .main again, scale by pw_2048, output both 8x4 halves with
;   WRITE_8X4, then jump to tx2q (set to .end4, which zeroes the
;   coefficients and returns).
;   Three 16-byte stack slots at [rsp+gprsize+16*{0,1,2}] hold rows that do
;   not fit in m0-m7. The .pass1_end*/.end* labels are shared with the other
;   8x8 transforms below, which jump into them with registers prepared.
1103cglobal idct_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
1104    LOAD_8ROWS          coeffq, 16
1105
1106.pass1:
1107    call .main
1108
1109.pass1_end:
1110    mova                    m7, [o(pw_16384)]
1111
; Entered with m7 = scale factor for the even rows.
1112.pass1_end1:
1113    REPX      {pmulhrsw x, m7}, m0, m2, m4, m6
1114    mova    [rsp+gprsize+16*1], m6                 ;spill row 6
1115
; Entered with m7 = scale factor for the odd rows; row 7 is in stack slot 0.
1116.pass1_end2:
1117    REPX      {pmulhrsw x, m7}, m1, m3, m5
1118    pmulhrsw                m7, [rsp+gprsize+16*0]
1119
; 8x8 word transpose. In: m0-m5 and m7 = rows 0-5 and 7, row 6 in stack slot 1.
; Out: m0-m7 = transposed rows 0-7 (row 7 is also left in stack slot 0).
1120cglobal_label .pass1_end3
1121    punpcklwd               m6, m1, m5             ;10 50 11 51 12 52 13 53
1122    punpckhwd               m1, m5                 ;14 54 15 55 16 56 17 57
1123    punpckhwd               m5, m0, m4             ;04 44 05 45 06 46 07 47
1124    punpcklwd               m0, m4                 ;00 40 01 41 02 42 03 43
1125    punpckhwd               m4, m3, m7             ;34 74 35 75 36 76 37 77
1126    punpcklwd               m3, m7                 ;30 70 31 71 32 72 33 73
1127    punpckhwd               m7, m1, m4             ;16 36 56 76 17 37 57 77
1128    punpcklwd               m1, m4                 ;14 34 54 74 15 35 55 75
1129    punpckhwd               m4, m6, m3             ;12 32 52 72 13 33 53 73
1130    punpcklwd               m6, m3                 ;10 30 50 70 11 31 51 71
1131    mova    [rsp+gprsize+16*2], m6
1132    mova                    m6, [rsp+gprsize+16*1]
1133    punpckhwd               m3, m2, m6             ;24 64 25 65 26 66 27 67
1134    punpcklwd               m2, m6                 ;20 60 21 61 22 62 23 63
1135    punpckhwd               m6, m5, m3             ;06 26 46 66 07 27 47 67
1136    punpcklwd               m5, m3                 ;04 24 44 64 05 25 45 65
1137    punpckhwd               m3, m0, m2             ;02 22 42 62 03 23 43 63
1138    punpcklwd               m0, m2                 ;00 20 40 60 01 21 41 61
1139
1140    punpckhwd               m2, m6, m7             ;07 17 27 37 47 57 67 77
1141    punpcklwd               m6, m7                 ;06 16 26 36 46 56 66 76
1142    mova    [rsp+gprsize+16*0], m2
1143    punpcklwd               m2, m3, m4             ;02 12 22 32 42 52 62 72
1144    punpckhwd               m3, m4                 ;03 13 23 33 43 53 63 73
1145    punpcklwd               m4, m5, m1             ;04 14 24 34 44 54 64 74
1146    punpckhwd               m5, m1                 ;05 15 25 35 45 55 65 75
1147    mova                    m7, [rsp+gprsize+16*2]
1148    punpckhwd               m1, m0, m7             ;01 11 21 31 41 51 61 71
1149    punpcklwd               m0, m7                 ;00 10 20 30 40 50 60 70
1150    mova                    m7, [rsp+gprsize+16*0]
1151    jmp                   tx2q
1152
1153.pass2:
1154    lea                   tx2q, [o(m(idct_8x8_internal_8bpc).end4)]
1155
1156.pass2_main:
1157    call .main
1158
1159.end:
1160    mova                    m7, [o(pw_2048)]
1161    REPX      {pmulhrsw x, m7}, m0, m2, m4, m6
1162    mova    [rsp+gprsize+16*1], m6                 ;row 6 -> slot 1
1163
; Entered with m7 = scale factor for the odd rows; row 7 is in stack slot 0.
1164.end2:
1165    REPX      {pmulhrsw x, m7}, m1, m3, m5
1166    pmulhrsw                m7, [rsp+gprsize+16*0]
1167    mova    [rsp+gprsize+16*2], m5                 ;row 5 -> slot 2
1168    mova    [rsp+gprsize+16*0], m7                 ;row 7 -> slot 0
1169
; Entered with rows 0-4 in m0-m4 and rows 5, 6, 7 in stack slots 2, 1, 0.
; m5-m7 are handed to WRITE_8X4 as its extra register arguments.
1170.end3:
1171    WRITE_8X4                0, 1, 2, 3, 5, 6, 7
1172    lea                   dstq, [dstq+strideq*2]
1173    WRITE_8X4                4, [rsp+gprsize+16*2], [rsp+gprsize+16*1], [rsp+gprsize+16*0], 5, 6, 7
1174    jmp                   tx2q
1175
; Zero the eight 16-byte coefficient rows and return.
1176.end4:
1177    pxor                    m7, m7
1178    REPX   {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7
1179    ret
1180
1181ALIGN function_align
; .main: 8-point inverse DCT on eight rows of words.
;   In:  m0-m7 = in0-in7
;   Out: m0-m6 = out0-out6, out7 in stack slot 0
;   The even inputs go through IDCT4_1D and the odd inputs through
;   IDCT8_1D_ODDHALF; the two halves are combined with saturating add/sub.
;   Stack slots are addressed as [rsp+gprsize*2+16*n] here while the callers
;   use [rsp+gprsize+16*n]. NOTE(review): the extra gprsize presumably
;   accounts for the return address pushed by the call - confirm.
1182cglobal_label .main
1183    mova  [rsp+gprsize*2+16*0], m7                 ;park in7
1184    mova  [rsp+gprsize*2+16*1], m3                 ;park in3
1185    mova  [rsp+gprsize*2+16*2], m1                 ;park in1
1186    mova                    m7, [o(pd_2048)]
1187    IDCT4_1D                 0, 2, 4, 6, 1, 3, 7
; Swap the even-half results (m2, m4, m6) onto the stack while bringing the
; parked odd inputs back into registers.
1188    mova                    m3, [rsp+gprsize*2+16*2]
1189    mova  [rsp+gprsize*2+16*2], m2
1190    mova                    m2, [rsp+gprsize*2+16*1]
1191    mova  [rsp+gprsize*2+16*1], m4
1192    mova                    m4, [rsp+gprsize*2+16*0]
1193    mova  [rsp+gprsize*2+16*0], m6
1194    IDCT8_1D_ODDHALF         3, 2, 5, 4, 1, 6, 7
1195    mova                    m6, [rsp+gprsize*2+16*0]
1196    psubsw                  m7, m0, m4                    ;out7
1197    paddsw                  m0, m4                        ;out0
1198    mova  [rsp+gprsize*2+16*0], m7                 ;out7 is returned on the stack
1199    mova                    m1, [rsp+gprsize*2+16*2]
1200    psubsw                  m4, m6, m3                    ;out4
1201    paddsw                  m3, m6                        ;out3
1202    mova                    m7, [rsp+gprsize*2+16*1]
1203    psubsw                  m6, m1, m5                    ;out6
1204    paddsw                  m1, m5                        ;out1
1205    psubsw                  m5, m7, m2                    ;out5
1206    paddsw                  m2, m7                        ;out2
1207    ret
1208
1209
; Instantiate the 8x8 entry points whose first transform is ADST.
1210INV_TXFM_8X8_FN adst, dct
1211INV_TXFM_8X8_FN adst, adst
1212INV_TXFM_8X8_FN adst, flipadst
1213INV_TXFM_8X8_FN adst, identity
1214
; iadst_8x8_internal_8bpc
;   Both passes run .main followed by a pass-specific finishing step
;   (.main_pass1_end / .main_pass2_end). Those leave the odd-numbered outputs
;   negated, so the even rows are scaled by +c and the odd rows by -c before
;   joining the shared idct_8x8 tails (c = pw_16384 in pass 1, pw_2048 in
;   pass 2).
1215cglobal iadst_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
1216    LOAD_8ROWS          coeffq, 16
1217
1218.pass1:
1219    call .main
1220    call .main_pass1_end
1221
1222.pass1_end:
1223    mova                    m7, [o(pw_16384)]
1224
1225.pass1_end1:
1226    REPX      {pmulhrsw x, m7}, m0, m2, m4, m6
1227    mova    [rsp+gprsize+16*1], m6                 ;spill row 6 for the transpose
1228    pxor                    m6, m6
1229    psubw                   m6, m7
1230    mova                    m7, m6                 ;m7 = -scale for the negated odd rows
1231    jmp m(idct_8x8_internal_8bpc).pass1_end2
1232
1233ALIGN function_align
1234.pass2:
1235    lea                   tx2q, [o(m(idct_8x8_internal_8bpc).end4)]
1236
1237.pass2_main:
1238    call .main
1239    call .main_pass2_end
1240
1241.end:
1242    mova                    m7, [o(pw_2048)]
1243    REPX      {pmulhrsw x, m7}, m0, m2, m4, m6
1244    mova    [rsp+gprsize+16*1], m6                 ;row 6 -> slot 1 for .end3
1245    pxor                    m6, m6
1246    psubw                   m6, m7
1247    mova                    m7, m6                 ;m7 = -scale for the negated odd rows
1248    jmp m(idct_8x8_internal_8bpc).end2
1249
1250ALIGN function_align
; .main: first stages of the 8-point inverse ADST.
;   In:  m0-m7 = in0-in7
;   Out: m0 = out0, m1 = -out1, m6 = out6, stack slot 0 = -out7, and the
;        intermediates m4 = t2, m3 = t3, m5 = t6, m2 = t7, which
;        .main_pass1_end or .main_pass2_end turn into out2..out5.
;   Stack slots are addressed as [rsp+gprsize*2+16*n], matching the callers'
;   [rsp+gprsize+16*n]. NOTE(review): presumably offset by the return
;   address of the call - confirm.
1251cglobal_label .main
1252    mova  [rsp+gprsize*2+16*0], m7                 ;park in7
1253    mova  [rsp+gprsize*2+16*1], m3                 ;park in3
1254    mova  [rsp+gprsize*2+16*2], m4                 ;park in4
1255    mova                    m7, [o(pd_2048)]
1256    ITX_MULSUB_2W            5, 2, 3, 4, 7, 1931, 3612    ;t3a, t2a
1257    ITX_MULSUB_2W            1, 6, 3, 4, 7, 3920, 1189    ;t7a, t6a
1258    paddsw                  m3, m2, m6                    ;t2
1259    psubsw                  m2, m6                        ;t6
1260    paddsw                  m4, m5, m1                    ;t3
1261    psubsw                  m5, m1                        ;t7
1262    ITX_MULSUB_2W            5, 2, 1, 6, 7, 3784, 1567    ;t6a, t7a
1263
; Exchange: park t6a/t7a/t2 on the stack and reload in4/in3/in7.
1264    mova                    m6, [rsp+gprsize*2+16*2]
1265    mova  [rsp+gprsize*2+16*2], m5
1266    mova                    m1, [rsp+gprsize*2+16*1]
1267    mova  [rsp+gprsize*2+16*1], m2
1268    mova                    m5, [rsp+gprsize*2+16*0]
1269    mova  [rsp+gprsize*2+16*0], m3
1270    ITX_MULSUB_2W            5, 0, 2, 3, 7,  401, 4076    ;t1a, t0a
1271    ITX_MULSUB_2W            1, 6, 2, 3, 7, 3166, 2598    ;t5a, t4a
1272    psubsw                  m2, m0, m6                    ;t4
1273    paddsw                  m0, m6                        ;t0
1274    paddsw                  m3, m5, m1                    ;t1
1275    psubsw                  m5, m1                        ;t5
1276    ITX_MULSUB_2W            2, 5, 1, 6, 7, 1567, 3784    ;t5a, t4a
1277
1278    mova                    m7, [rsp+gprsize*2+16*0]
1279    paddsw                  m1, m3, m4                    ;-out7
1280    psubsw                  m3, m4                        ;t3
1281    mova  [rsp+gprsize*2+16*0], m1                 ;-out7 is returned on the stack
1282    psubsw                  m4, m0, m7                    ;t2
1283    paddsw                  m0, m7                        ;out0
1284    mova                    m6, [rsp+gprsize*2+16*2]
1285    mova                    m7, [rsp+gprsize*2+16*1]
1286    paddsw                  m1, m5, m6                    ;-out1
1287    psubsw                  m5, m6                        ;t6
1288    paddsw                  m6, m2, m7                    ;out6
1289    psubsw                  m2, m7                        ;t7
1290    ret
1291ALIGN function_align
; .main_pass1_end: finishes the 8-point ADST for the first pass.
;   In:  m4 = t2, m3 = t3, m5 = t6, m2 = t7 (as left by .main)
;   Out: m2 = out2, m3 = -out3, m4 = out4, m5 = -out5
;   m1 and m6 are saved to stack slots 1/2 and restored before returning;
;   m7 is clobbered; m0 is untouched.
;   Each result is a 32-bit pmaddwd with +-2896 coefficients, rounded with
;   pd_2048 and shifted right by 12, whereas .main_pass2_end uses 16-bit
;   pmulhrsw for the same four sums/differences.
1292.main_pass1_end:
1293    mova  [rsp+gprsize*2+16*1], m1
1294    mova  [rsp+gprsize*2+16*2], m6
1295    punpckhwd               m1, m4, m3             ;t2 t3 pairs, high half
1296    punpcklwd               m4, m3                 ;t2 t3 pairs, low half
1297    punpckhwd               m7, m5, m2             ;t6 t7 pairs, high half
1298    punpcklwd               m5, m2                 ;t6 t7 pairs, low half
1299    mova                    m2, [o(pw_2896_2896)]
1300    mova                    m6, [o(pd_2048)]
1301    pmaddwd                 m3, m2, m7
1302    pmaddwd                 m2, m5
1303    paddd                   m3, m6
1304    paddd                   m2, m6
1305    psrad                   m3, 12
1306    psrad                   m2, 12
1307    packssdw                m2, m3                        ;out2
1308    mova                    m3, [o(pw_2896_m2896)]
1309    pmaddwd                 m7, m3
1310    pmaddwd                 m5, m3
1311    paddd                   m7, m6
1312    paddd                   m5, m6
1313    psrad                   m7, 12
1314    psrad                   m5, 12
1315    packssdw                m5, m7                        ;-out5
1316    mova                    m3, [o(pw_2896_2896)]
1317    pmaddwd                 m7, m3, m1
1318    pmaddwd                 m3, m4
1319    paddd                   m7, m6
1320    paddd                   m3, m6
1321    psrad                   m7, 12
1322    psrad                   m3, 12
1323    packssdw                m3, m7                        ;-out3
1324    mova                    m7, [o(pw_2896_m2896)]
1325    pmaddwd                 m1, m7
1326    pmaddwd                 m4, m7
1327    paddd                   m1, m6
1328    paddd                   m4, m6
1329    psrad                   m1, 12
1330    psrad                   m4, 12
1331    packssdw                m4, m1                        ;out4
1332    mova                    m1, [rsp+gprsize*2+16*1]
1333    mova                    m6, [rsp+gprsize*2+16*2]
1334    ret
1335ALIGN function_align
; .main_pass2_end: finishes the 8-point ADST for the second pass.
;   In:  m4 = t2, m3 = t3, m5 = t6, m2 = t7 (as left by .main)
;   Out: m2 = out2, m3 = -out3, m4 = out4, m5 = -out5
;   Saturating 16-bit sums/differences scaled by pw_2896x8 with pmulhrsw.
;   Clobbers m7; m0, m1 and m6 are untouched.
1336cglobal_label .main_pass2_end
1337    paddsw                  m7, m4, m3                    ;t2 + t3
1338    psubsw                  m4, m3                        ;t2 - t3
1339    paddsw                  m3, m5, m2                    ;t6 + t7
1340    psubsw                  m5, m2                        ;t6 - t7
1341    mova                    m2, [o(pw_2896x8)]
1342    pmulhrsw                m4, m2                        ;out4
1343    pmulhrsw                m5, m2                        ;-out5
1344    pmulhrsw                m7, m2                        ;-out3
1345    pmulhrsw                m2, m3                        ;out2
1346    mova                    m3, m7
1347    ret
1348
; Instantiate the 8x8 entry points whose first transform is flipped ADST.
1349INV_TXFM_8X8_FN flipadst, dct
1350INV_TXFM_8X8_FN flipadst, adst
1351INV_TXFM_8X8_FN flipadst, flipadst
1352INV_TXFM_8X8_FN flipadst, identity
1353
; iflipadst_8x8_internal_8bpc
;   Runs the same ADST core as iadst_8x8_internal_8bpc, then scales the
;   results (undoing the negation of the odd outputs) while reversing the
;   row order, and joins the shared idct_8x8 transpose / output tails.
;   State after the ADST core in both passes:
;     m0 = out0, m1 = -out1, m2 = out2, m3 = -out3, m4 = out4, m5 = -out5,
;     m6 = out6, stack slot 0 = -out7
1354cglobal iflipadst_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
1355    LOAD_8ROWS          coeffq, 16
1356
1357.pass1:
1358    call m(iadst_8x8_internal_8bpc).main
1359    call m(iadst_8x8_internal_8bpc).main_pass1_end
1360
1361.pass1_end:
1362    mova                    m7, [o(pw_m16384)]
1363
; Entered with m7 = negative scale factor. On exit to .pass1_end3:
;   m0 = out7, m1 = out6, m2 = out5, m3 = out4, m4 = out3, m5 = out2,
;   stack slot 1 = out1, m7 = out0 (all scaled).
1364.pass1_end1:
1365    pmulhrsw                m1, m7                 ;-out1 * -c
1366    mova    [rsp+gprsize+16*1], m1                 ;becomes row 6 of the transpose input
1367    mova                    m1, m6
1368    mova                    m6, m2
1369    pmulhrsw                m2, m5, m7             ;-out5 * -c
1370    mova                    m5, m6
1371    mova                    m6, m4
1372    pmulhrsw                m4, m3, m7             ;-out3 * -c
1373    mova                    m3, m6
1374    mova                    m6, m0
1375    mova                    m0, m7
1376    pxor                    m7, m7
1377    psubw                   m7, m0                 ;m7 = +c for the non-negated rows
1378    pmulhrsw                m0, [rsp+gprsize+16*0] ;-out7 * -c
1379    REPX      {pmulhrsw x, m7}, m1, m3, m5
1380    pmulhrsw                m7, m6
1381    jmp m(idct_8x8_internal_8bpc).pass1_end3
1382
1383ALIGN function_align
1384.pass2:
1385    lea                   tx2q, [o(m(idct_8x8_internal_8bpc).end4)]
1386
1387.pass2_main:
1388    call m(iadst_8x8_internal_8bpc).main
1389    call m(iadst_8x8_internal_8bpc).main_pass2_end
1390
; On exit to .end3: m0 = out7, m1 = out6, m2 = out5, m3 = out4, m4 = out3,
; stack slots 2, 1, 0 = out2, out1, out0 (all scaled by pw_2048).
1391.end:
1392    mova                    m7, [o(pw_2048)]
1393    REPX      {pmulhrsw x, m7}, m0, m2, m4, m6
1394    mova    [rsp+gprsize+16*2], m2                 ;out2 -> slot 2
1395    mova                    m2, m0
1396    pxor                    m0, m0
1397    psubw                   m0, m7                 ;m0 = -2048 for the negated rows
1398    mova                    m7, m2                 ;m7 = out0
1399    pmulhrsw                m1, m0
1400    pmulhrsw                m2, m5, m0
1401    mova    [rsp+gprsize+16*1], m1                 ;out1 -> slot 1
1402    mova                    m5, m4
1403    mova                    m1, m6
1404    pmulhrsw                m4, m3, m0
1405    pmulhrsw                m0, [rsp+gprsize+16*0]
1406    mova                    m3, m5
1407    mova    [rsp+gprsize+16*0], m7                 ;out0 -> slot 0
1408    jmp m(idct_8x8_internal_8bpc).end3
1409
; Instantiate the 8x8 entry points whose first transform is identity.
1410INV_TXFM_8X8_FN identity, dct
1411INV_TXFM_8X8_FN identity, adst
1412INV_TXFM_8X8_FN identity, flipadst
1413INV_TXFM_8X8_FN identity, identity
1414
; iidentity_8x8_internal_8bpc
;   Pass 1: loads the eight rows and goes straight to the shared 8x8
;   transpose without any scaling. Row 6 is spilled to stack slot 1 because
;   that is where .pass1_end3 reads it from.
;   Pass 2: multiplies all eight rows by pw_4096 with pmulhrsw, places rows
;   5, 6 and 7 in stack slots 2, 1 and 0, and joins the shared output tail.
1415cglobal iidentity_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
1416    LOAD_8ROWS          coeffq, 16
1417    mova    [rsp+gprsize+16*1], m6
1418    jmp   m(idct_8x8_internal_8bpc).pass1_end3
1419
1420ALIGN function_align
1421.pass2:
1422    lea                   tx2q, [o(m(idct_8x8_internal_8bpc).end4)]
1423
1424.end:
1425    pmulhrsw                m7, [o(pw_4096)]
1426    mova    [rsp+gprsize+16*0], m7                 ;row 7 -> slot 0, freeing m7
1427    mova                    m7, [o(pw_4096)]
1428    REPX      {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
1429    mova    [rsp+gprsize+16*2], m5                 ;row 5 -> slot 2
1430    mova    [rsp+gprsize+16*1], m6                 ;row 6 -> slot 1
1431    jmp m(idct_8x8_internal_8bpc).end3
1432
1433
; INV_TXFM_4X16_FN type1, type2
;   Emits the 4x16 entry point for the given transform pair through
;   INV_TXFM_FN. The dct_dct instantiation additionally gets a path that
;   broadcasts the first coefficient word to every lane of m0, scales it by
;   pw_2896x8, pw_16384, pw_2896x8 and pw_2048 in turn, and writes the result
;   to four consecutive 4x4 tiles. The .end label exists only in that
;   instantiation.
%macro INV_TXFM_4X16_FN 2 ; type1, type2
    INV_TXFM_FN          %1, %2, 4x16, 8
%ifidn %1_%2, dct_dct
    mova                  m1, [o(pw_2896x8)]
    pshuflw               m0, [coeffq], q0000     ; word 0 -> low four lanes
    mov             [coeffq], eobd                ; overwrite the coefficient just read
    punpcklwd             m0, m0                  ; ... -> all eight lanes
    pmulhrsw              m0, m1
    pmulhrsw              m0, [o(pw_16384)]
    pmulhrsw              m0, m1
    pmulhrsw              m0, [o(pw_2048)]
.end:
    ; first three tiles: write, then advance dst by four rows
%rep 3
    WRITE_4X4             0, 0, 1, 2, 3, 0, 1, 2, 3
    lea                dstq, [dstq+strideq*4]
%endrep
    ; last tile needs no pointer update
    WRITE_4X4             0, 0, 1, 2, 3, 0, 1, 2, 3
    RET
%endif
%endmacro
1456
; Instantiate the 4x16 entry points whose first transform is DCT.
1457INV_TXFM_4X16_FN dct, dct
1458INV_TXFM_4X16_FN dct, adst
1459INV_TXFM_4X16_FN dct, flipadst
1460INV_TXFM_4X16_FN dct, identity
1461
; idct_4x16_internal_8bpc
;   Pass 1 is done as two runs of a 4x8 first-pass routine whose address is
;   in r3: first on the odd 16-byte coefficient rows (1,3,5,7), then on the
;   even ones (0,2,4,6). tx2q is used as the continuation for each run, so
;   the caller's tx2q is kept on the stack in between. The .pass1 label is
;   shared: the ADST/flipADST 4x16 functions set r3 and jump here.
;   Pass 2 calls the 16-point IDCT of the 16x4 transform, scales by pw_2048
;   and outputs two 4x8 halves.
1462cglobal idct_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
1463    lea                  r3, [o(m(idct_4x8_internal_8bpc).pass1)]
1464
; Entered with r3 = address of the 4x8 first-pass routine to use.
1465.pass1:
1466    mova                 m0, [coeffq+16*1]
1467    mova                 m1, [coeffq+16*3]
1468    mova                 m2, [coeffq+16*5]
1469    mova                 m3, [coeffq+16*7]
1470    push               tx2q                        ;saved until .pass1_end
1471    lea                tx2q, [o(m(idct_4x16_internal_8bpc).pass1_2)]
1472    jmp                  r3
1473
; Continuation after the first run: store its results back into the odd
; rows and start the second run on the even rows.
1474.pass1_2:
1475    mova      [coeffq+16*1], m0
1476    mova      [coeffq+16*3], m1
1477    mova      [coeffq+16*5], m2
1478    mova      [coeffq+16*7], m3
1479    mova                 m0, [coeffq+16*0]
1480    mova                 m1, [coeffq+16*2]
1481    mova                 m2, [coeffq+16*4]
1482    mova                 m3, [coeffq+16*6]
1483    lea                tx2q, [o(m(idct_4x16_internal_8bpc).pass1_end)]
1484    jmp                  r3
1485
; Continuation after the second run: m0-m3 hold its results; the first
; run's results are reloaded into m4-m6, with the fourth staying in memory
; at [coeffq+16*7]. Everything is scaled by pw_16384.
1486.pass1_end:
1487    pop                tx2q
1488
1489    mova                 m4, [coeffq+16*1]
1490    mova                 m5, [coeffq+16*3]
1491    mova                 m6, [coeffq+16*5]
1492    mova                 m7, [o(pw_16384)]
1493    REPX   {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
1494
1495    pmulhrsw             m7, [coeffq+16*7]
1496    mova       [coeffq+16*7], m7
1497    jmp                tx2q
1498
1499.pass2:
1500    call m(idct_16x4_internal_8bpc).main
1501
1502.end:
1503    mova                  m7, [o(pw_2048)]
1504    REPX    {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
1505    pmulhrsw              m7, [coeffq+16*7]
1506    mova       [coeffq+16*4], m4
1507
; Spill m5/m6, output m0-m3, then output the second half from the spilled
; m4-m6 plus m7.
; NOTE(review): coeffq is copied to r3 before WRITE_4X8, presumably because
; that macro clobbers coeffq - confirm against its definition.
1508.end1:
1509    mova       [coeffq+16*5], m5
1510    mova       [coeffq+16*6], m6
1511    mov                   r3, coeffq
1512    WRITE_4X8              0, 1, 3, 2
1513
1514    mova                  m0, [r3+16*4]
1515    mova                  m1, [r3+16*5]
1516    mova                  m2, [r3+16*6]
1517    mova                  m3, m7
1518    lea                 dstq, [dstq+strideq*4]
1519    WRITE_4X8              0, 1, 3, 2
1520
; Zero the eight 16-byte coefficient rows (addressed through r3) and return.
1521.end2:
1522    pxor                  m7, m7
1523    REPX     {mova [r3+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7
1524    ret
1525
; Instantiate the 4x16 entry points whose first transform is ADST.
1526INV_TXFM_4X16_FN adst, dct
1527INV_TXFM_4X16_FN adst, adst
1528INV_TXFM_4X16_FN adst, flipadst
1529INV_TXFM_4X16_FN adst, identity
1530
; iadst_4x16_internal_8bpc
;   Pass 1 reuses the two-run driver in idct_4x16_internal_8bpc.pass1 with
;   r3 pointing at the 4x8 ADST first pass.
;   Pass 2 calls the 16-point ADST of the 16x4 transform, regroups the
;   results so positively and negatively signed outputs sit in separate
;   registers, scales them by +m7 / -m7 (m7 = pw_2048), pairs them up in
;   output order and writes two 4x8 halves. .end1 is shared with the
;   flipped variant, which enters it with m7 = pw_m2048.
1531cglobal iadst_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
1532    lea                   r3, [o(m(iadst_4x8_internal_8bpc).pass1)]
1533    jmp   m(idct_4x16_internal_8bpc).pass1
1534
1535.pass2:
1536    call m(iadst_16x4_internal_8bpc).main
1537    call m(iadst_16x4_internal_8bpc).main_pass2_end
1538
; Some results come back in [coeffq+16*6] and [coeffq+16*7]; slots
; [coeffq+16*4] and [coeffq+16*5] are used as spill space below.
1539    punpcklqdq            m6, m5, m4                ;low: -out5  high: -out7
1540    punpckhqdq            m4, m5                    ;low:  out8  high:  out10
1541    punpcklqdq            m5, m7, m2                ;low:  out4  high:  out6
1542    punpckhqdq            m2, m7                    ;low: -out9  high: -out11
1543    mova       [coeffq+16*4], m2
1544    mova       [coeffq+16*5], m6
1545    mova                  m2, [coeffq+16*6]
1546    mova                  m6, [coeffq+16*7]
1547    punpckhqdq            m1, m6, m0                ;low: -out13 high: -out15
1548    punpcklqdq            m0, m6                    ;low:  out0  high:  out2
1549    punpckhqdq            m6, m3, m2                ;low:  out12 high:  out14
1550    punpcklqdq            m2, m3                    ;low: -out1  high: -out3
1551
1552    mova                  m7, [o(pw_2048)]
1553
; Entered with m7 = scale for m0, m5, m4, m6; the negated scale is applied
; to m2, m1 and the two registers spilled at [coeffq+16*4] / [coeffq+16*5].
1554.end1:
1555    REPX    {pmulhrsw x, m7}, m0, m5, m4, m6
1556    pxor                  m3, m3
1557    psubw                 m3, m7                    ;m3 = -m7
1558    mova                  m7, [coeffq+16*4]
1559    REPX    {pmulhrsw x, m3}, m2, m7, m1
1560    pmulhrsw              m3, [coeffq+16*5]
1561    mova       [coeffq+16*7], m5
1562
1563    punpckhqdq            m5, m4, m7                ;low:  out10 high:  out11
1564    punpcklqdq            m4, m7                    ;low:  out8  high:  out9
1565    punpckhqdq            m7, m6, m1                ;low:  out14 high:  out15
1566    punpcklqdq            m6, m1                    ;low:  out12 high:  out13
1567    punpckhqdq            m1, m0, m2                ;low:  out2  high:  out3
1568    punpcklqdq            m0, m2                    ;low:  out0  high:  out1
1569    mova       [coeffq+16*4], m4
1570    mova                  m4, [coeffq+16*7]
1571    punpcklqdq            m2, m4, m3                ;low:  out4  high:  out5
1572    punpckhqdq            m4, m3                    ;low:  out6  high:  out7
1573    mova                  m3, m4
1574
; Output m0-m3, then the second half from [coeffq+16*4..6] plus m7.
; NOTE(review): coeffq is copied to r3 before WRITE_4X8, presumably because
; that macro clobbers coeffq - confirm against its definition.
1575.end2:
1576    mova       [coeffq+16*5], m5
1577    mova       [coeffq+16*6], m6
1578    mov                   r3, coeffq
1579    WRITE_4X8              0, 1, 2, 3
1580
1581    mova                  m0, [r3+16*4]
1582    mova                  m1, [r3+16*5]
1583    mova                  m2, [r3+16*6]
1584    mova                  m3, m7
1585    lea                 dstq, [dstq+strideq*4]
1586    WRITE_4X8              0, 1, 2, 3
1587
; Zero the eight 16-byte coefficient rows (addressed through r3) and return.
1588.end3:
1589    pxor                  m7, m7
1590    REPX     {mova [r3+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7
1591    ret
1592
1593
; Instantiate the 4x16 entry points whose first transform is flipped ADST.
1594INV_TXFM_4X16_FN flipadst, dct
1595INV_TXFM_4X16_FN flipadst, adst
1596INV_TXFM_4X16_FN flipadst, flipadst
1597INV_TXFM_4X16_FN flipadst, identity
1598
; iflipadst_4x16_internal_8bpc
;   Pass 1 reuses the two-run driver in idct_4x16_internal_8bpc.pass1 with
;   r3 pointing at the 4x8 flipped-ADST first pass.
;   Pass 2 calls the same 16-point ADST as iadst_4x16_internal_8bpc but
;   regroups with the low/high unpacks exchanged, so each register ends up
;   with the opposite sign convention; it therefore enters the shared
;   iadst_4x16 .end1 tail with m7 = pw_m2048 instead of pw_2048.
1599cglobal iflipadst_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
1600    lea                   r3, [o(m(iflipadst_4x8_internal_8bpc).pass1)]
1601    jmp   m(idct_4x16_internal_8bpc).pass1
1602
1603.pass2:
1604    call m(iadst_16x4_internal_8bpc).main
1605    call m(iadst_16x4_internal_8bpc).main_pass2_end
1606
1607    punpckhqdq            m6, m5, m4                ;low:  out5  high:  out7
1608    punpcklqdq            m4, m5                    ;low: -out8  high: -out10
1609    punpckhqdq            m5, m7, m2                ;low: -out4  high: -out6
1610    punpcklqdq            m2, m7                    ;low:  out9  high:  out11
1611    mova       [coeffq+16*4], m2
1612    mova       [coeffq+16*5], m6
1613    mova                  m2, [coeffq+16*6]
1614    mova                  m6, [coeffq+16*7]
1615    punpcklqdq            m1, m6, m0                ;low:  out13 high:  out15
1616    punpckhqdq            m0, m6                    ;low: -out0  high: -out2
1617    punpcklqdq            m6, m3, m2                ;low: -out12 high: -out14
1618    punpckhqdq            m2, m3                    ;low:  out1  high:  out3
1619
1620    mova                  m7, [o(pw_m2048)]         ;negative scale: signs are the reverse of the non-flipped case
1621    jmp   m(iadst_4x16_internal_8bpc).end1
1622
1623
1624INV_TXFM_4X16_FN identity, dct
1625INV_TXFM_4X16_FN identity, adst
1626INV_TXFM_4X16_FN identity, flipadst
1627INV_TXFM_4X16_FN identity, identity
1628
; IDTX16 src/dst, tmp, pw_1697x16[, pw_16384]
;   All arguments are register numbers. Computes, with signed saturation:
;     3 args: m%1 = 2 * m%1 + pmulhrsw(m%1, m%3)
;     4 args: m%1 =     m%1 + pmulhrsw(pmulhrsw(m%1, m%3), m%4)
;   i.e. the 4-argument form halves the correction term (with rounding)
;   instead of doubling the source. m%2 is clobbered.
1629%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16384]
1630    pmulhrsw            m%2, m%3, m%1
1631%if %0 == 4 ; if downshifting by 1
1632    pmulhrsw            m%2, m%4
1633%else
1634    paddsw              m%1, m%1
1635%endif
1636    paddsw              m%1, m%2
1637%endmacro
1638
; 4x16 inverse identity transform, 8 bpc.
; Pass 1 processes the eight 16-byte coefficient vectors in two batches of
; four (odd-numbered vectors first, then even-numbered ones) through the same
; .pass1 code, using tx2q as the continuation pointer. The caller's second-pass
; pointer is parked in r3 meanwhile.
1639cglobal iidentity_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
1640    mova                  m0, [coeffq+16*1]
1641    mova                  m6, [o(pw_1697x8)]
1642    mova                  m1, [coeffq+16*3]
1643    mova                  m2, [coeffq+16*5]
1644    mova                  m3, [coeffq+16*7]
1645    pcmpeqw               m7, m7                      ; m7 = all ones (-1 in every word)
1646    mov                   r3, tx2q                    ; save second-pass pointer
1647    lea                 tx2q, [o(.pass1_2)]           ; continuation after the first batch
; For each of m0..m3:  x = pavgw(pmulhrsw(x, m6), x), except that lanes whose
; input equals -1 (0xFFFF) are forced to zero by the pcmpeqw/pandn pair.
; NOTE(review): pavgw is an unsigned average, so the masking presumably guards
; the one input value for which it would give a wrong signed result - confirm.
1648.pass1:
1649    pmulhrsw              m4, m6, m0
1650    pmulhrsw              m5, m6, m1
1651    pavgw                 m4, m0
1652    pcmpeqw               m0, m7
1653    pavgw                 m5, m1
1654    pcmpeqw               m1, m7
1655    pandn                 m0, m4
1656    pmulhrsw              m4, m6, m2
1657    pandn                 m1, m5
1658    pmulhrsw              m5, m6, m3
1659    pavgw                 m4, m2
1660    pcmpeqw               m2, m7
1661    pavgw                 m5, m3
1662    pcmpeqw               m3, m7
1663    pandn                 m2, m4
1664    pandn                 m3, m5
    ; shared pass-1 tail of the 4x8 adst (outside this view); it is expected
    ; to continue at tx2q
1665    jmp m(iadst_4x8_internal_8bpc).pass1_end
; First batch done: store it back to the odd slots, load the even slots and
; run .pass1 again with .pass1_end as the continuation.
1666.pass1_2:
1667    mova       [coeffq+16*1], m0
1668    mova       [coeffq+16*3], m1
1669    mova       [coeffq+16*5], m2
1670    mova       [coeffq+16*7], m3
1671    mova                  m0, [coeffq+16*0]
1672    mova                  m1, [coeffq+16*2]
1673    mova                  m2, [coeffq+16*4]
1674    mova                  m3, [coeffq+16*6]
1675    lea                 tx2q, [o(.pass1_end)]
1676    jmp .pass1
; Both batches done: m0-m3 hold the second batch, m4-m6 are reloaded from the
; first batch; the vector in coeffq+16*7 stays in memory. Continue with the
; second-pass routine saved in r3.
1677.pass1_end:
1678    mova                  m4, [coeffq+16*1]
1679    mova                  m5, [coeffq+16*3]
1680    mova                  m6, [coeffq+16*5]
1681    jmp                   r3
; Pass 2: apply the 3-argument form of IDTX16 to all eight vectors (seven in
; registers, one in coeffq+16*7), then multiply everything by pw_2048.
1682.pass2:
1683    mova                  m7, [o(pw_1697x16)]
1684    mova       [coeffq+16*6], m6                      ; free m6 for use as IDTX16 scratch
1685    REPX    {IDTX16 x, 6, 7}, 0, 1, 2, 3, 4, 5
1686    mova                  m6, [coeffq+16*7]
1687    IDTX16                 6, 7, 7                    ; tmp aliases the multiplier: m7 is clobbered
1688    mova       [coeffq+16*7], m6
1689    mova                  m6, [coeffq+16*6]
    ; same computation as IDTX16, written out with the multiplier as a memory
    ; operand because m7 no longer holds it
1690    pmulhrsw              m7, m6, [o(pw_1697x16)]
1691    paddsw                m6, m6
1692    paddsw                m6, m7
1693    mova                  m7, [o(pw_2048)]
1694    REPX    {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
1695    pmulhrsw              m7, [coeffq+16*7]           ; eighth vector, scaled into m7
1696    mova       [coeffq+16*4], m4
    ; shared output tail of the 4x16 adst (outside this view)
1697    jmp m(iadst_4x16_internal_8bpc).end2
1698
1699
; INV_TXFM_16X4_FN type1, type2
; Emits the 16x4 entry point through INV_TXFM_FN (defined elsewhere in the
; file). For the dct_dct combination it additionally emits a DC-only path that
; adds one constant value to every pixel of the block.
1700%macro INV_TXFM_16X4_FN 2 ; type1, type2
1701    INV_TXFM_FN          %1, %2, 16x4, 8
1702%ifidn %1_%2, dct_dct
    ; movd loads only the low dword of each constant, so only the low lanes of
    ; m0 are meaningful until the broadcast below.
1703    movd                 m1, [o(pw_2896x8)]
1704    pmulhrsw             m0, m1, [coeffq]
1705    movd                 m2, [o(pw_16384)]
    ; NOTE(review): presumably eob is zero on this path, making this store a
    ; clear of the DC coefficient - confirm against INV_TXFM_FN.
1706    mov            [coeffq], eobd
1707    mov                 r2d, 2                        ; loop count: two rows per iteration
1708    lea                tx2q, [o(m(inv_txfm_add_dct_dct_16x4_8bpc).end)]
    ; Expects: m0 = dc, m1 = pw_2896x8, m2 = first scale factor, r2d = number
    ; of row pairs, tx2q = address to continue at after the loop.
1709.dconly:
1710    pmulhrsw             m0, m2
1711    movd                 m2, [o(pw_2048)]              ;intentionally rip-relative
1712    pmulhrsw             m0, m1
1713    pmulhrsw             m0, m2
    ; broadcast word 0 to all eight word lanes
1714    pshuflw              m0, m0, q0000
1715    punpcklwd            m0, m0
1716    pxor                 m5, m5                        ; zero, for byte-to-word unpacking
    ; Each iteration: load 16 pixels from each of two rows, widen to words,
    ; add the dc value, pack back with unsigned saturation and store.
1717.dconly_loop:
1718    mova                 m1, [dstq]
1719    mova                 m3, [dstq+strideq]
1720    punpckhbw            m2, m1, m5
1721    punpcklbw            m1, m5
1722    punpckhbw            m4, m3, m5
1723    punpcklbw            m3, m5
1724    paddw                m2, m0
1725    paddw                m1, m0
1726    paddw                m4, m0
1727    paddw                m3, m0
1728    packuswb             m1, m2
1729    packuswb             m3, m4
1730    mova             [dstq], m1
1731    mova     [dstq+strideq], m3
1732    lea                dstq, [dstq+strideq*2]
1733    dec                 r2d
1734    jg .dconly_loop
1735    jmp                tx2q
1736.end:
1737    RET
1738%endif
1739%endmacro
1740
; LOAD_7ROWS src, stride
; Loads seven vectors into m0..m6 from src + stride*0 .. src + stride*6.
; m7 is left untouched. Uses mova, so the addresses must be suitably aligned.
1741%macro LOAD_7ROWS 2 ;src, stride
1742    mova                 m0, [%1+%2*0]
1743    mova                 m1, [%1+%2*1]
1744    mova                 m2, [%1+%2*2]
1745    mova                 m3, [%1+%2*3]
1746    mova                 m4, [%1+%2*4]
1747    mova                 m5, [%1+%2*5]
1748    mova                 m6, [%1+%2*6]
1749%endmacro
1750
; SAVE_7ROWS dst, stride
; Stores m0..m6 to dst + stride*0 .. dst + stride*6 (counterpart of
; LOAD_7ROWS). m7 is not stored. Uses mova, so the addresses must be suitably
; aligned.
1751%macro SAVE_7ROWS 2 ;dst, stride
1752    mova          [%1+%2*0], m0
1753    mova          [%1+%2*1], m1
1754    mova          [%1+%2*2], m2
1755    mova          [%1+%2*3], m3
1756    mova          [%1+%2*4], m4
1757    mova          [%1+%2*5], m5
1758    mova          [%1+%2*6], m6
1759%endmacro
1760
; IDCT16_1D_PACKED_ODDHALF src1, src2, src3, src4, tmp1, tmp2, tmp3
; Odd half of a 16-point inverse DCT on packed data (two values per register,
; one in each qword half). All seven registers are overwritten.
; Results, as given by the per-line comments:
;   src1 = low: t8a   high: t9        src2 = low: t11   high: t10a
;   src3 = low: t12   high: t13a      src4 = low: t15a  high: t14
; tmp3 holds pd_2048 on exit. The rotations are done by ITX_MUL2X_PACK, which
; is defined elsewhere in the file.
1761%macro IDCT16_1D_PACKED_ODDHALF 7  ;src[1-4], tmp[1-3]
1762    punpckhwd            m%5, m%4, m%1                ;packed in13 in3
1763    punpcklwd            m%1, m%4                     ;packed in1  in15
1764    punpcklwd            m%4, m%3, m%2                ;packed in9  in7
1765    punpckhwd            m%2, m%3                     ;packed in5  in11
1766    mova                 m%7, [o(pd_2048)]
1767    ITX_MUL2X_PACK        %1, %6, %7,  401, 4076, 1    ;low: t8a   high: t15a
1768    ITX_MUL2X_PACK        %4, %6, %7, 3166, 2598, 1    ;low: t9a   high: t14a
1769    ITX_MUL2X_PACK        %2, %6, %7, 1931, 3612, 1    ;low: t10a  high: t13a
1770    ITX_MUL2X_PACK        %5, %6, %7, 3920, 1189, 1    ;low: t11a  high: t12a
1771    psubsw               m%6, m%1, m%4                 ;low: t9    high: t14
1772    paddsw               m%1, m%4                      ;low: t8    high: t15
1773    psubsw               m%4, m%5, m%2                 ;low: t10   high: t13
1774    paddsw               m%5, m%2                      ;low: t11   high: t12
    ; the second source register is free from here on and is reused to hold
    ; the deint_shuf2 byte-shuffle mask
1775    mova                 m%2, [o(deint_shuf2)]
1776    pshufb               m%6, m%2
1777    pshufb               m%4, m%2
1778    ITX_MUL2X_PACK        %6, %3, %7, 1567, 3784, 1    ;low: t9a   high: t14a
1779    ITX_MUL2X_PACK        %4, %3, %7, m3784, 1567, 1   ;low: t10a  high: t13a
1780    psubsw               m%3, m%1, m%5                 ;low: t11a  high: t12a
1781    paddsw               m%1, m%5                      ;low: t8a   high: t15a
1782    psubsw               m%5, m%6, m%4                 ;low: t10   high: t13
1783    paddsw               m%6, m%4                      ;low: t9    high: t14
1784    pshufb               m%3, m%2
1785    pshufb               m%5, m%2
1786    ITX_MUL2X_PACK        %3, %2, %7, 2896, 2896, 4    ;t12,  t11
1787    ITX_MUL2X_PACK        %5, %4, %7, 2896, 2896, 4    ;t13a, t10a
1788    packssdw             m%2, m%4                      ;low: t11   high: t10a
1789    packssdw             m%3, m%5                      ;low: t12   high: t13a
1790    punpckhqdq           m%4, m%1, m%6                 ;low: t15a  high: t14
1791    punpcklqdq           m%1, m%6                      ;low: t8a   high: t9
1792%endmacro
1793
; 16x4 inverse DCT, 8 bpc.
; The four lines below instantiate INV_TXFM_16X4_FN with dct as the first
; transform type and each of the four second transform types.
1794INV_TXFM_16X4_FN dct, dct
1795INV_TXFM_16X4_FN dct, adst
1796INV_TXFM_16X4_FN dct, flipadst
1797INV_TXFM_16X4_FN dct, identity
1798
; Pass 1: load coefficient vectors 0-6 into m0-m6 (.main reads the eighth one
; from coeffq+16*7 itself) and run the 16-point DCT core.
1799cglobal idct_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
1800    LOAD_7ROWS        coeffq, 16
1801    call .main
1802
; Interleave the packed results of .main. Eight vectors are live here, so
; coeffq+16*6 serves as the spill slot for the eighth.
1803.pass1_end:
1804    punpckhwd             m7, m0, m2                 ;packed out1,  out5
1805    punpcklwd             m0, m2                     ;packed out0,  out4
1806    punpcklwd             m2, m1, m3                 ;packed out3,  out7
1807    punpckhwd             m1, m3                     ;packed out2,  out6
1808    mova       [coeffq+16*6], m7
1809    mova                  m7, [coeffq+16*7]
1810    punpckhwd             m3, m4, m6                 ;packed out9,  out13
1811    punpcklwd             m4, m6                     ;packed out8,  out12
1812    punpcklwd             m6, m5, m7                 ;packed out11, out15
1813    punpckhwd             m5, m7                     ;packed out10, out14
1814
; Scale all eight vectors by pw_16384 with pmulhrsw (seven in registers, the
; eighth through its spill slot).
1815.pass1_end2:
1816    mova                  m7, [o(pw_16384)]
1817    REPX    {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
1818    pmulhrsw              m7, [coeffq+16*6]
1819    mova       [coeffq+16*6], m7
1820
; Finish the transpose. Entry state: m0-m6 plus the vector in coeffq+16*6.
; Exit state: m0-m3 = outputs 0-7, lines (0)-(3); m4, m5, m6 and
; [coeffq+16*7] = outputs 8-15, lines (0), (1), (2) and (3). Control then
; continues at tx2q.
1821.pass1_end3:
1822    punpckhwd             m7, m3, m6                 ;packed 9, 11, 13, 15 high
1823    punpcklwd             m3, m6                     ;packed 9, 11, 13, 15 low
1824    punpckhwd             m6, m4, m5                 ;packed 8, 10, 12, 14 high
1825    punpcklwd             m4, m5                     ;packed 8, 10, 12, 14 low
1826    punpckhwd             m5, m4, m3                 ;8, 9, 10, 11, 12, 13, 14, 15(1)
1827    punpcklwd             m4, m3                     ;8, 9, 10, 11, 12, 13, 14, 15(0)
1828    punpckhwd             m3, m6, m7                 ;8, 9, 10, 11, 12, 13, 14, 15(3)
1829    punpcklwd             m6, m7                     ;8, 9, 10, 11, 12, 13, 14, 15(2)
1830    mova       [coeffq+16*7], m3
1831    mova                  m3, [coeffq+16*6]
1832    punpckhwd             m7, m3, m2                 ;packed 1, 3, 5, 7 high
1833    punpcklwd             m3, m2                     ;packed 1, 3, 5, 7 low
1834    punpckhwd             m2, m0, m1                 ;packed 0, 2, 4, 6 high
1835    punpcklwd             m0, m1                     ;packed 0, 2, 4, 6 low
1836    punpckhwd             m1, m0, m3                 ;0, 1, 2, 3, 4, 5, 6, 7(1)
1837    punpcklwd             m0, m3                     ;0, 1, 2, 3, 4, 5, 6, 7(0)
1838    punpckhwd             m3, m2, m7                 ;0, 1, 2, 3, 4, 5, 6, 7(3)
1839    punpcklwd             m2, m7                     ;0, 1, 2, 3, 4, 5, 6, 7(2)
1840    jmp                 tx2q
1841
; Pass 2 is carried out as two 8x4 second passes, one per 8-column half.
1842.pass2:
1843    lea                 tx2q, [o(m(idct_8x4_internal_8bpc).pass2)]
1844
; Shared by the other 16x4 transforms, which enter here with tx2q pointing at
; their own 8x4 pass-2 routine.
1845.pass2_end:
    ; park the second half (its fourth vector is already in coeffq+16*7)
1846    mova       [coeffq+16*4], m4
1847    mova       [coeffq+16*5], m5
1848    mova       [coeffq+16*6], m6
1849    lea                   r3, [dstq+8]                ; destination of the second half
1850    call                tx2q                          ; first half, from m0-m3
1851
    ; second half: advance coeffq to the parked vectors, reload them and
    ; tail-jump into the same 8x4 routine
1852    add               coeffq, 16*4
1853    mova                  m0, [coeffq+16*0]
1854    mova                  m1, [coeffq+16*1]
1855    mova                  m2, [coeffq+16*2]
1856    mova                  m3, [coeffq+16*3]
1857    mov                 dstq, r3
1858    jmp                 tx2q
1859
; 16-point inverse DCT core on packed data.
; In:  m0-m6 = input vectors 0-6, [coeffq+16*7] = input vector 7.
; Out: m0-m6 and [coeffq+16*7] = packed outputs as given by the comments on
;      the final add/sub lines (m6 receives a copy of m7, [coeffq+16*7]
;      receives out15/out14).
; coeffq+16*4 .. coeffq+16*7 are used as scratch. The even half is computed by
; IDCT8_1D_PACKED (defined elsewhere in the file), the odd half by
; IDCT16_1D_PACKED_ODDHALF.
1860ALIGN function_align
1861cglobal_label .main
    ; split each pair of input vectors into even-indexed inputs (low qwords,
    ; kept in registers for the even half) and odd-indexed inputs (high qwords,
    ; parked in memory / m7 for the odd half)
1862    punpckhqdq            m7, m0, m1                 ;low:in1  high:in3
1863    punpcklqdq            m0, m1
1864    punpcklqdq            m1, m2, m3
1865    punpckhqdq            m3, m2                     ;low:in7  high:in5
1866    mova       [coeffq+16*4], m7
1867    mova       [coeffq+16*5], m3
1868    mova                  m7, [coeffq+16*7]
1869    punpcklqdq            m2, m4, m5
1870    punpckhqdq            m4, m5                     ;low:in9  high:in11
1871    punpcklqdq            m3, m6, m7
1872    punpckhqdq            m7, m6                     ;low:in15 high:in13
1873    mova       [coeffq+16*6], m4
1874    IDCT8_1D_PACKED
    ; swap: fetch the parked odd inputs and park three even-half results
    ; (m1-m3) in the same slots; m0 stays in its register
1875    mova                  m6, [coeffq+16*4]
1876    mova                  m4, [coeffq+16*5]
1877    mova                  m5, [coeffq+16*6]
1878    mova       [coeffq+16*4], m1
1879    mova       [coeffq+16*5], m2
1880    mova       [coeffq+16*6], m3
1881
1882    IDCT16_1D_PACKED_ODDHALF 6, 4, 5, 7, 1, 2, 3
1883
    ; final butterflies: even-half result +/- odd-half result
1884    mova                  m1, [coeffq+16*4]
1885    psubsw                m3, m0, m7                 ;low:out15 high:out14
1886    paddsw                m0, m7                     ;low:out0  high:out1
1887    psubsw                m7, m1, m5                 ;low:out12 high:out13
1888    paddsw                m1, m5                     ;low:out3  high:out2
1889    mova       [coeffq+16*7], m3
1890    mova                  m2, [coeffq+16*5]
1891    mova                  m3, [coeffq+16*6]
1892    psubsw                m5, m2, m4                 ;low:out11 high:out10
1893    paddsw                m2, m4                     ;low:out4  high:out5
1894    psubsw                m4, m3, m6                 ;low:out8  high:out9
1895    paddsw                m3, m6                     ;low:out7  high:out6
1896    mova                  m6, m7
1897    ret
1898
; 16x4 inverse ADST, 8 bpc.
; The four lines below instantiate INV_TXFM_16X4_FN with adst as the first
; transform type and each of the four second transform types.
1899INV_TXFM_16X4_FN adst, dct
1900INV_TXFM_16X4_FN adst, adst
1901INV_TXFM_16X4_FN adst, flipadst
1902INV_TXFM_16X4_FN adst, identity
1903
; Pass 1: load vectors 0-6 (.main reads the eighth from coeffq+16*7), run the
; 16-point ADST core and its pass-1 tail, then interleave the results. Several
; results are still negated at this point, as marked in the comments.
1904cglobal iadst_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
1905    LOAD_7ROWS        coeffq, 16
1906    call .main
1907    call .main_pass1_end
1908
1909    punpckhwd             m6, m7, m0                 ;packed -out11, -out15
1910    punpcklwd             m0, m7                     ;packed   out0,   out4
1911    punpcklwd             m7, m3, m4                 ;packed  -out3,  -out7
1912    punpckhwd             m4, m3                     ;packed   out8,  out12
1913    mova                  m1, [coeffq+16*6]
1914    punpcklwd             m3, m1, m5                 ;packed  -out1,  -out5
1915    punpckhwd             m5, m1                     ;packed  out10,  out14
1916    mova                  m1, [coeffq+16*7]
1917    mova       [coeffq+16*6], m3
1918    mova       [coeffq+16*7], m7
1919    punpckhwd             m3, m2, m1                 ;packed  -out9,  -out13
1920    punpcklwd             m1, m2                     ;packed   out2,   out6
1921
1922    mova                  m7, [o(pw_16384)]
1923
; Scale and fix signs. Entry: m7 = scale factor for m0, m1, m4, m5; the other
; four vectors (m3, m6, [coeffq+16*6], [coeffq+16*7]) are multiplied by its
; negation. The flipadst variant enters here with a different value in m7.
; On exit the eight vectors are laid out as .pass1_end3 of idct_16x4 expects
; them (m0-m6 plus coeffq+16*6).
1924.pass1_end:
1925    REPX    {pmulhrsw x, m7}, m0, m1, m4, m5
1926    pxor                  m2, m2
1927    psubw                 m2, m7                      ; m2 = -m7
1928    mova                  m7, [coeffq+16*6]
1929    REPX    {pmulhrsw x, m2}, m7, m3, m6
1930    pmulhrsw              m2, [coeffq+16*7]
1931    mova       [coeffq+16*6], m7
1932    jmp   m(idct_16x4_internal_8bpc).pass1_end3
1933
; Pass 2: two 8x4 adst second passes, driven by the shared idct_16x4 code.
1934.pass2:
1935    lea                 tx2q, [o(m(iadst_8x4_internal_8bpc).pass2)]
1936    jmp   m(idct_16x4_internal_8bpc).pass2_end
1937
; 16-point inverse ADST core on packed data (everything except the final
; stage, which .main_pass1_end or .main_pass2_end completes).
; In:  m0-m6 = input vectors 0-6, [coeffq+16*7] = input vector 7.
; Out (per the comments on the last add/sub lines):
;      m0 = low:out0  high:-out15      m3 = low:-out3 high:out12
;      [coeffq+16*6] = low:-out1 high:out14
;      [coeffq+16*7] = low:out2  high:-out13
;      m1 = t14a/t15a, m2 = t10/t11, m4 = t2a/t3a, m5 = t6/t7 (unfinished)
;      m6 = pd_2048 as loaded below (.main_pass1_end adds m6 as its rounding
;      term, so ITX_MUL2X_PACK is expected to leave it intact).
; coeffq+16*4 .. coeffq+16*7 are used as scratch. The rotations are done by
; ITX_MUL2X_PACK, which is defined elsewhere in the file.
1938ALIGN function_align
1939cglobal_label .main
1940    mova       [coeffq+16*6], m0
1941    pshufd                m0, m1, q1032
1942    pshufd                m2, m2, q1032
1943    punpckhwd             m1, m6, m0                 ;packed in13,  in2
1944    punpcklwd             m0, m6                     ;packed  in3, in12
1945    punpckhwd             m7, m5, m2                 ;packed in11,  in4
1946    punpcklwd             m2, m5                     ;packed  in5, in10
1947    mova                  m6, [o(pd_2048)]
1948    ITX_MUL2X_PACK         1, 5, 6,  995, 3973       ;low:t2   high:t3
1949    ITX_MUL2X_PACK         7, 5, 6, 1751, 3703       ;low:t4   high:t5
1950    ITX_MUL2X_PACK         2, 5, 6, 3513, 2106       ;low:t10  high:t11
1951    ITX_MUL2X_PACK         0, 5, 6, 3857, 1380       ;low:t12  high:t13
1952    psubsw                m5, m1, m2                 ;low:t10a high:t11a
1953    paddsw                m1, m2                     ;low:t2a  high:t3a
1954    psubsw                m2, m7, m0                 ;low:t12a high:t13a
1955    paddsw                m7, m0                     ;low:t4a  high:t5a
    ; punpcklqdq + punpckhwd idiom (used repeatedly below): copy the low qword
    ; of the source into the high qword of the destination, then interleave
    ; the high words of both, i.e. interleave the low-half words of the source
    ; with its own high-half words
1956    punpcklqdq            m0, m5
1957    punpckhwd             m0, m5                     ;packed t10a, t11a
1958    punpcklqdq            m5, m2
1959    punpckhwd             m2, m5                     ;packed t13a, t12a
1960    ITX_MUL2X_PACK         0, 5, 6, 3406, 2276       ;low:t10  high:t11
1961    ITX_MUL2X_PACK         2, 5, 6, 4017,  799, 1    ;low:t12  high:t13
    ; park t2a/t3a and t4a/t5a; fetch input vector 0 (saved on entry) and
    ; input vector 7
1962    mova       [coeffq+16*4], m1
1963    mova       [coeffq+16*5], m7
1964    mova                  m1, [coeffq+16*6]
1965    mova                  m7, [coeffq+16*7]
1966    pshufd                m1, m1, q1032
1967    pshufd                m3, m3, q1032
1968    punpckhwd             m5, m7, m1                 ;packed in15,  in0
1969    punpcklwd             m1, m7                     ;packed  in1, in14
1970    punpckhwd             m7, m4, m3                 ;packed  in9,  in6
1971    punpcklwd             m3, m4                     ;packed  in7,  in8
1972    ITX_MUL2X_PACK         5, 4, 6,  201, 4091       ;low:t0    high:t1
1973    ITX_MUL2X_PACK         7, 4, 6, 2440, 3290       ;low:t6    high:t7
1974    ITX_MUL2X_PACK         3, 4, 6, 3035, 2751       ;low:t8    high:t9
1975    ITX_MUL2X_PACK         1, 4, 6, 4052,  601       ;low:t14   high:t15
1976    psubsw                m4, m5, m3                 ;low:t8a   high:t9a
1977    paddsw                m5, m3                     ;low:t0a   high:t1a
1978    psubsw                m3, m7, m1                 ;low:t14a  high:t15a
1979    paddsw                m7, m1                     ;low:t6a   high:t7a
1980    punpcklqdq            m1, m4
1981    punpckhwd             m1, m4                     ;packed  t8a,  t9a
1982    punpcklqdq            m4, m3
1983    punpckhwd             m3, m4                     ;packed t15a, t14a
1984    ITX_MUL2X_PACK         1, 4, 6,  799, 4017       ;low:t8    high:t9
1985    ITX_MUL2X_PACK         3, 4, 6, 2276, 3406, 1    ;low:t14   high:t15
1986    paddsw                m4, m1, m2                 ;low:t12a  high:t13a
1987    psubsw                m1, m2                     ;low:t8a   high:t9a
1988    psubsw                m2, m0, m3                 ;low:t14a  high:t15a
1989    paddsw                m0, m3                     ;low:t10a  high:t11a
1990    punpcklqdq            m3, m1
1991    punpckhwd             m3, m1                     ;packed t12a, t13a
1992    punpcklqdq            m1, m2
1993    punpckhwd             m2, m1                     ;packed t15a, t14a
1994    ITX_MUL2X_PACK         3, 1, 6, 1567, 3784       ;low:t12   high:t13
1995    ITX_MUL2X_PACK         2, 1, 6, 3784, 1567, 1    ;low:t14   high:t15
1996    psubsw                m1, m3, m2                 ;low:t14a  high:t15a
1997    paddsw                m3, m2                     ;low:out2  high:-out13
1998    psubsw                m2, m4, m0                 ;low:t10   high:t11
1999    paddsw                m0, m4                     ;low:-out1 high:out14
    ; two finished result vectors go to memory; fetch the values parked earlier
2000    mova       [coeffq+16*6], m0
2001    mova       [coeffq+16*7], m3
2002    mova                  m0, [coeffq+16*4]
2003    mova                  m3, [coeffq+16*5]
2004    psubsw                m4, m5, m3                 ;low:t4    high:t5
2005    paddsw                m5, m3                     ;low:t0    high:t1
2006    psubsw                m3, m0, m7                 ;low:t6    high:t7
2007    paddsw                m0, m7                     ;low:t2    high:t3
2008    punpcklqdq            m7, m4
2009    punpckhwd             m7, m4                     ;packed t4, t5
2010    punpcklqdq            m4, m3
2011    punpckhwd             m3, m4                     ;packed t7, t6
2012    ITX_MUL2X_PACK         7, 4, 6, 1567, 3784       ;low:t4a   high:t5a
2013    ITX_MUL2X_PACK         3, 4, 6, 3784, 1567, 1    ;low:t6a   high:t7a
2014    psubsw                m4, m5, m0                 ;low:t2a   high:t3a
2015    paddsw                m0, m5                     ;low:out0  high:-out15
2016    psubsw                m5, m7, m3                 ;low:t6    high:t7
2017    paddsw                m3, m7                     ;low:-out3 high:out12
2018    ret
; Final ADST stage, pass-1 flavour: pmaddwd with 32-bit intermediates, the
; rounding term in m6 added, arithmetic shift right by 12, then a saturating
; pack back to words.
; In:  m1, m2, m4, m5 = unfinished values left by .main, m6 = rounding term
;      (pd_2048 as loaded by .main), m0 and m3 = finished outputs.
; Out: m2, m4, m7, m5 = packed outputs as given by the packssdw comments;
;      m0 and m3 are preserved through coeffq+16*4 and coeffq+16*5.
2019ALIGN function_align
2020.main_pass1_end:
2021    mova                  m7, [o(deint_shuf1)]
    ; m0 and m3 are needed for the multiplier constants: park their contents
2022    mova       [coeffq+16*4], m0
2023    mova       [coeffq+16*5], m3
2024    mova                  m0, [o(pw_2896_m2896)]
2025    mova                  m3, [o(pw_2896_2896)]
    ; deint_shuf1 interleaves the words of the low half with those of the high
    ; half, forming the word pairs consumed by pmaddwd
2026    pshufb                m1, m7                     ;t14a t15a
2027    pshufb                m2, m7                     ;t10  t11
2028    pshufb                m4, m7                     ;t2a  t3a
2029    pshufb                m5, m7                     ;t6   t7
2030    pmaddwd               m7, m0, m2
2031    pmaddwd               m2, m3
2032    paddd                 m7, m6
2033    paddd                 m2, m6
2034    psrad                 m7, 12
2035    psrad                 m2, 12
2036    packssdw              m2, m7                     ;low:out6  high:-out9
2037    pmaddwd               m7, m0, m4
2038    pmaddwd               m4, m3
2039    paddd                 m7, m6
2040    paddd                 m4, m6
2041    psrad                 m7, 12
2042    psrad                 m4, 12
2043    packssdw              m4, m7                     ;low:-out7 high:out8
2044    pmaddwd               m7, m3, m5
2045    pmaddwd               m5, m0
2046    paddd                 m7, m6
2047    paddd                 m5, m6
2048    psrad                 m7, 12
2049    psrad                 m5, 12
2050    packssdw              m7, m5                     ;low:out4  high:-out11
2051    pmaddwd               m5, m3, m1
2052    pmaddwd               m1, m0
2053    paddd                 m5, m6
2054    paddd                 m1, m6
2055    psrad                 m5, 12
2056    psrad                 m1, 12
2057    packssdw              m5, m1                     ;low:-out5 high:out10
    ; restore the two outputs parked at the top
2058    mova                  m0, [coeffq+16*4]
2059    mova                  m3, [coeffq+16*5]
2060    ret
; Final ADST stage, pass-2 flavour: saturating 16-bit add/sub followed by
; pmulhrsw with pw_2896x8, in place of the 32-bit pmaddwd sequence used by
; .main_pass1_end.
; In:  m1, m2, m4, m5 = unfinished values left by .main.
; Out: m7, m4, m5, m2 = packed outputs as given by the final four comments,
;      in the same arrangement that .main_pass1_end produces.
; m0 and m3 are not touched; m6 is overwritten.
2061ALIGN function_align
2062cglobal_label .main_pass2_end
2063    mova                  m7, [o(pw_2896x8)]
2064    punpckhqdq            m6, m2, m1                 ;low:t11   high:t15a
2065    punpcklqdq            m2, m1                     ;low:t10   high:t14a
2066    psubsw                m1, m2, m6
2067    paddsw                m2, m6
2068    punpckhqdq            m6, m4, m5                 ;low:t3a   high:t7
2069    punpcklqdq            m4, m5                     ;low:t2a   high:t6
2070    psubsw                m5, m4, m6
2071    paddsw                m4, m6
2072    pmulhrsw              m1, m7                     ;low:-out9 high:out10
2073    pmulhrsw              m2, m7                     ;low:out6  high:-out5
2074    pmulhrsw              m5, m7                     ;low:out8  high:-out11
2075    pmulhrsw              m4, m7                     ;low:-out7 high:out4
    ; regroup the qword halves into the output arrangement
2076    punpckhqdq            m7, m4, m5                 ;low:out4  high:-out11
2077    punpcklqdq            m4, m5                     ;low:-out7 high:out8
2078    punpckhqdq            m5, m2, m1                 ;low:-out5 high:out10
2079    punpcklqdq            m2, m1                     ;low:out6  high:-out9
2080    ret
2081
2082
; 16x4 inverse flipadst, 8 bpc.
; The four lines below instantiate INV_TXFM_16X4_FN with flipadst as the first
; transform type and each of the four second transform types.
2083INV_TXFM_16X4_FN flipadst, dct
2084INV_TXFM_16X4_FN flipadst, adst
2085INV_TXFM_16X4_FN flipadst, flipadst
2086INV_TXFM_16X4_FN flipadst, identity
2087
; Pass 1: same core as the 16x4 adst, but the interleave below takes the
; opposite low/high halves so the outputs come out in flipped order, and the
; scale factor handed to the shared .pass1_end is pw_m16384 where the adst
; uses pw_16384.
2088cglobal iflipadst_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
2089    LOAD_7ROWS        coeffq, 16
2090    call m(iadst_16x4_internal_8bpc).main
2091    call m(iadst_16x4_internal_8bpc).main_pass1_end
2092
2093    punpcklwd             m6, m7, m0                 ;packed  out11,  out15
2094    punpckhwd             m0, m7                     ;packed  -out0,  -out4
2095    punpckhwd             m7, m3, m4                 ;packed   out3,   out7
2096    punpcklwd             m4, m3                     ;packed  -out8, -out12
2097    mova                  m1, [coeffq+16*6]
2098    punpckhwd             m3, m1, m5                 ;packed   out1,   out5
2099    punpcklwd             m5, m1                     ;packed -out10, -out14
2100    mova                  m1, [coeffq+16*7]
2101    mova       [coeffq+16*6], m3
2102    mova       [coeffq+16*7], m7
2103    punpcklwd             m3, m2, m1                 ;packed   out9,  out13
2104    punpckhwd             m1, m2                     ;packed  -out2,  -out6
2105
2106    mova                  m7, [o(pw_m16384)]
2107    jmp   m(iadst_16x4_internal_8bpc).pass1_end
2108
; Pass 2: two 8x4 flipadst second passes, driven by the shared idct_16x4 code.
2109.pass2:
2110    lea                 tx2q, [o(m(iflipadst_8x4_internal_8bpc).pass2)]
2111    jmp   m(idct_16x4_internal_8bpc).pass2_end
2112
2113
; 16x4 inverse identity transform, 8 bpc.
; The four lines below instantiate INV_TXFM_16X4_FN with identity as the first
; transform type and each of the four second transform types.
2114INV_TXFM_16X4_FN identity, dct
2115INV_TXFM_16X4_FN identity, adst
2116INV_TXFM_16X4_FN identity, flipadst
2117INV_TXFM_16X4_FN identity, identity
2118
; Pass 1: every coefficient vector x becomes
;     x + pmulhrsw(pmulhrsw(x, pw_1697x16), pw_16384)      (saturating add)
; Only eight registers are available, so the vectors are handled in groups
; (5-7, then 2-4, then 0-1), with the first group parked in its own coeffq
; slots in between. The results are then interleaved and handed to
; .pass1_end3 of idct_16x4, bypassing the pw_16384 scaling in .pass1_end2.
2119cglobal iidentity_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
2120    mova                  m1, [coeffq+16*6]
2121    mova                  m0, [coeffq+16*5]
2122    mova                  m2, [coeffq+16*7]
2123    mova                  m6, [o(pw_1697x16)]
2124    mova                  m7, [o(pw_16384)]
2125    pmulhrsw              m4, m6, m1
2126    pmulhrsw              m3, m6, m0
2127    pmulhrsw              m5, m6, m2
2128    pmulhrsw              m4, m7
2129    pmulhrsw              m3, m7
2130    pmulhrsw              m5, m7
2131    paddsw                m1, m4
2132    paddsw                m0, m3
2133    paddsw                m5, m2                      ; result for vector 7 ends up in m5
2134    mova                  m2, [coeffq+16*2]
2135    mova                  m3, [coeffq+16*3]
2136    mova                  m4, [coeffq+16*4]
    ; park the scaled vectors 6, 5 and 7 back in their own slots
2137    mova       [coeffq+16*6], m1
2138    mova       [coeffq+16*5], m0
2139    mova       [coeffq+16*7], m5
2140    pmulhrsw              m0, m6, m2
2141    pmulhrsw              m1, m6, m3
2142    pmulhrsw              m5, m6, m4
2143    pmulhrsw              m0, m7
2144    pmulhrsw              m1, m7
2145    pmulhrsw              m5, m7
2146    paddsw                m2, m0
2147    paddsw                m3, m1
2148    paddsw                m4, m5
2149    mova                  m0, [coeffq+16*0]
2150    mova                  m1, [coeffq+16*1]
2151    pmulhrsw              m5, m6, m0
2152    pmulhrsw              m6, m1                      ; last use of the multiplier: m6 is overwritten
2153    pmulhrsw              m5, m7
2154    pmulhrsw              m6, m7
2155    paddsw                m0, m5
2156    paddsw                m1, m6
2157    mova                  m6, [coeffq+16*6]
2158    mova                  m5, [coeffq+16*5]
    ; interleave into the register/memory layout expected at .pass1_end3
    ; (m0-m6 plus the vector in coeffq+16*6)
2159    punpckhwd             m7, m0, m2                 ;packed out1,  out5
2160    punpcklwd             m0, m2                     ;packed out0,  out4
2161    punpckhwd             m2, m1, m3                 ;packed out3,  out7
2162    punpcklwd             m1, m3                     ;packed out2,  out6
2163    mova       [coeffq+16*6], m7
2164    mova                  m7, [coeffq+16*7]
2165    punpckhwd             m3, m4, m6                 ;packed out9,  out13
2166    punpcklwd             m4, m6                     ;packed out8,  out12
2167    punpckhwd             m6, m5, m7                 ;packed out11, out15
2168    punpcklwd             m5, m7                     ;packed out10, out14
2169    jmp   m(idct_16x4_internal_8bpc).pass1_end3
2170
; Pass 2: two 8x4 identity second passes, driven by the shared idct_16x4 code.
2171.pass2:
2172    lea                 tx2q, [o(m(iidentity_8x4_internal_8bpc).pass2)]
2173    jmp   m(idct_16x4_internal_8bpc).pass2_end
2174
2175
; SAVE_8ROWS dst, stride
; Stores m0..m7 to dst + stride*0 .. dst + stride*7. Uses mova, so the
; addresses must be suitably aligned.
2176%macro SAVE_8ROWS 2  ;dst, stride
2177    mova                 [%1+%2*0], m0
2178    mova                 [%1+%2*1], m1
2179    mova                 [%1+%2*2], m2
2180    mova                 [%1+%2*3], m3
2181    mova                 [%1+%2*4], m4
2182    mova                 [%1+%2*5], m5
2183    mova                 [%1+%2*6], m6
2184    mova                 [%1+%2*7], m7
2185%endmacro
2186
; INV_TXFM_8X16_FN type1, type2
; Emits the 8x16 entry point through INV_TXFM_FN (defined elsewhere in the
; file; the last argument, 16*16, is passed on to it). For the dct_dct
; combination it additionally emits a DC-only path that computes the scaled dc
; value and reuses the pixel loop of the 8x8 dct_dct function.
2187%macro INV_TXFM_8X16_FN 2 ; type1, type2
2188    INV_TXFM_FN          %1, %2, 8x16, 8, 16*16
2189%ifidn %1_%2, dct_dct
    ; broadcast the first coefficient word to all eight word lanes
2190    pshuflw              m0, [coeffq], q0000
2191    punpcklwd            m0, m0
2192    mova                 m1, [o(pw_2896x8)]
2193    pmulhrsw             m0, m1
2194    mova                 m2, [o(pw_16384)]
    ; NOTE(review): presumably eob is zero on this path, making this store a
    ; clear of the DC coefficient - confirm against INV_TXFM_FN.
2195    mov            [coeffq], eobd
2196    pmulhrsw             m0, m1
2197    pmulhrsw             m0, m2
2198    psrlw                m2, 3              ; pw_2048
2199    pmulhrsw             m0, m1
2200    pmulhrsw             m0, m2
    ; r3d is handed to the 8x8 loop (outside this view), presumably as its
    ; iteration count; tx2q is where that loop is expected to continue
    ; afterwards - verify against that loop.
2201    mov                 r3d, 4
2202    lea                tx2q, [o(m(inv_txfm_add_dct_dct_8x16_8bpc).end)]
2203    jmp m(inv_txfm_add_dct_dct_8x8_8bpc).loop
2204.end:
2205    RET
2206%endif
2207%endmacro
2208
; 8x16 inverse DCT, 8 bpc.
; The four lines below instantiate INV_TXFM_8X16_FN with dct as the first
; transform type and each of the four second transform types.
2209INV_TXFM_8X16_FN dct, dct
2210INV_TXFM_8X16_FN dct, adst
2211INV_TXFM_8X16_FN dct, flipadst
2212INV_TXFM_8X16_FN dct, identity
2213
; Pass 1 runs an 8x8 first pass (address in r3) twice: first on the
; odd-numbered 16-byte coefficient vectors, then on the even-numbered ones.
2214cglobal idct_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
2215    lea                    r3, [o(m(idct_8x8_internal_8bpc).pass1)]
2216
; Shared pass-1 driver; the adst and flipadst 8x16 functions jump here with
; their own 8x8 pass-1 routine in r3. LOAD_8ROWS is defined elsewhere in the
; file; the meaning of its third argument is not visible here.
2217.pass1:
2218    LOAD_8ROWS    coeffq+16*1, 32, 1
2219    mov   [rsp+gprsize+16*11], tx2q                   ; save the second-pass pointer on the stack
2220    lea                  tx2q, [o(m(idct_8x16_internal_8bpc).pass1_end)]
2221    jmp                    r3
2222
; First half done: store it back, load the other half, restore the real
; second-pass pointer and run the 8x8 first pass again.
2223.pass1_end:
2224    SAVE_8ROWS    coeffq+16*1, 32
2225    LOAD_8ROWS    coeffq+16*0, 32, 1
2226    mov                  tx2q, [rsp+gprsize+16*11]
2227    jmp                    r3
2228
2229.pass2:
2230    lea                  tx2q, [o(m(idct_8x16_internal_8bpc).end)]
2231
; Rearrange the inputs: m1, m3, m5, m7 are parked in coeffq slots 2, 6, 10, 14
; for the second core call; m0-m3 become the old m0, m2, m4, m6 and m4-m7 are
; loaded from slots 1, 5, 9, 13 (written by .pass1_end above).
2232.pass2_pre:
2233    mova       [coeffq+16*2 ], m1
2234    mova       [coeffq+16*6 ], m3
2235    mova       [coeffq+16*10], m5
2236    mova       [coeffq+16*14], m7
2237    mova                   m1, m2
2238    mova                   m2, m4
2239    mova                   m3, m6
2240    mova                   m4, [coeffq+16*1 ]
2241    mova                   m5, [coeffq+16*5 ]
2242    mova                   m6, [coeffq+16*9 ]
2243    mova                   m7, [coeffq+16*13]
2244
2245.pass2_main:
2246    call m(idct_8x8_internal_8bpc).main
2247
    ; keep m0-m6 of the first core call on the stack (m7 is not saved here;
    ; the handling of the eighth value depends on the called routines, which
    ; are outside this view)
2248    SAVE_7ROWS   rsp+gprsize+16*3, 16
2249    mova                   m0, [coeffq+16*2 ]
2250    mova                   m1, [coeffq+16*6 ]
2251    mova                   m2, [coeffq+16*10]
2252    mova                   m3, [coeffq+16*14]
2253    mova                   m4, [coeffq+16*3 ]
2254    mova                   m5, [coeffq+16*7 ]
2255    mova                   m6, [coeffq+16*11]
2256    mova                   m7, [coeffq+16*15]
2257    call m(idct_16x8_internal_8bpc).main
2258
    ; output the rows starting 8 lines below dstq first; r3 remembers the
    ; original dstq. The 8x8 .end routine is expected to continue at tx2q.
2259    mov                    r3, dstq
2260    lea                  dstq, [dstq+strideq*8]
2261    jmp  m(idct_8x8_internal_8bpc).end
2262
; Then the rows at the original dstq, from the values kept on the stack.
2263.end:
2264    LOAD_8ROWS   rsp+gprsize+16*3, 16
2265    mova   [rsp+gprsize+16*0], m7
2266    lea                  tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
2267    mov                  dstq, r3
2268    jmp  m(idct_8x8_internal_8bpc).end
2269
; Final step, shared by the other 8x16 transforms in this section: zero all 16
; coefficient vectors and return.
2270.end1:
2271    pxor                   m7, m7
2272    REPX  {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15
2273    ret
2274
; 8x16 inverse ADST, 8 bpc.
; The four lines below instantiate INV_TXFM_8X16_FN with adst as the first
; transform type and each of the four second transform types.
2275INV_TXFM_8X16_FN adst, dct
2276INV_TXFM_8X16_FN adst, adst
2277INV_TXFM_8X16_FN adst, flipadst
2278INV_TXFM_8X16_FN adst, identity
2279
; Pass 1: shared idct_8x16 driver, with the 8x8 adst first pass in r3.
2280cglobal iadst_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
2281    lea                    r3, [o(m(iadst_8x8_internal_8bpc).pass1)]
2282    jmp  m(idct_8x16_internal_8bpc).pass1
2283
2284.pass2:
2285    lea                  tx2q, [o(m(iadst_8x16_internal_8bpc).end)]
2286
; Set up the inputs of the 16-point ADST core of the 16x8 transform (outside
; this view): some go in m0-m7, the rest in stack slots rsp+gprsize+16*3 ..
; 16*10. The exact slot assignment is dictated by that routine.
2287.pass2_pre:
2288    mova    [rsp+gprsize+16*7], m0
2289    mova    [rsp+gprsize+16*8], m1
2290    mova    [rsp+gprsize+16*5], m6
2291    mova    [rsp+gprsize+16*6], m7
2292    mova                    m0, m2
2293    mova                    m1, m3
2294    mova                    m2, m4
2295    mova                    m3, m5
2296
2297.pass2_main:
2298    mova                    m4, [coeffq+16*1 ]
2299    mova                    m5, [coeffq+16*3 ]
2300    mova                    m6, [coeffq+16*13]
2301    mova                    m7, [coeffq+16*15]
2302    mova    [rsp+gprsize+16*3], m4
2303    mova    [rsp+gprsize+16*4], m5
2304    mova    [rsp+gprsize+16*9], m6
2305    mova    [rsp+gprsize+32*5], m7                    ; 32*5 is the same offset as 16*10
2306    mova                    m4, [coeffq+16*5 ]
2307    mova                    m5, [coeffq+16*7 ]
2308    mova                    m6, [coeffq+16*9 ]
2309    mova                    m7, [coeffq+16*11]
2310
2311    call m(iadst_16x8_internal_8bpc).main
2312    call m(iadst_16x8_internal_8bpc).main_pass2_end
2313
    ; output the rows starting 8 lines below dstq first; r3 remembers the
    ; original dstq. The 8x8 .end routine is expected to continue at tx2q.
2314    mov                    r3, dstq
2315    lea                  dstq, [dstq+strideq*8]
2316    jmp m(iadst_8x8_internal_8bpc).end
2317
; Then the rows at the original dstq, loaded from the stack, finishing in the
; coefficient-clearing tail of idct_8x16 (.end1).
2318.end:
2319    LOAD_8ROWS   rsp+gprsize+16*3, 16
2320    mova   [rsp+gprsize+16*0], m7
2321    lea                  tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
2322    mov                  dstq, r3
2323    jmp  m(iadst_8x8_internal_8bpc).end
2324
2325
; 8x16 inverse flipadst, 8 bpc.
; The four lines below instantiate INV_TXFM_8X16_FN with flipadst as the first
; transform type and each of the four second transform types.
2326INV_TXFM_8X16_FN flipadst, dct
2327INV_TXFM_8X16_FN flipadst, adst
2328INV_TXFM_8X16_FN flipadst, flipadst
2329INV_TXFM_8X16_FN flipadst, identity
2330
; Pass 1: shared idct_8x16 driver, with the 8x8 flipadst first pass in r3.
2331cglobal iflipadst_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
2332    lea                    r3, [o(m(iflipadst_8x8_internal_8bpc).pass1)]
2333    jmp  m(idct_8x16_internal_8bpc).pass1
2334
; Pass 2. Unlike the adst version, the first output step writes at the
; original dstq and the second one (.end) at dstq + 8*stride, which is
; computed into r3 up front.
2335.pass2:
2336    lea                   tx2q, [o(m(iflipadst_8x16_internal_8bpc).end)]
2337    lea                     r3, [dstq+strideq*8]
2338
; Input set-up for the 16-point ADST core; same register and stack layout as
; in iadst_8x16_internal_8bpc.
2339.pass2_pre:
2340    mova    [rsp+gprsize+16*7], m0
2341    mova    [rsp+gprsize+16*8], m1
2342    mova    [rsp+gprsize+16*5], m6
2343    mova    [rsp+gprsize+16*6], m7
2344    mova                    m0, m2
2345    mova                    m1, m3
2346    mova                    m2, m4
2347    mova                    m3, m5
2348
2349.pass2_main:
2350    mova                    m4, [coeffq+16*1 ]
2351    mova                    m5, [coeffq+16*3 ]
2352    mova                    m6, [coeffq+16*13]
2353    mova                    m7, [coeffq+16*15]
2354    mova    [rsp+gprsize+16*3], m4
2355    mova    [rsp+gprsize+16*4], m5
2356    mova    [rsp+gprsize+16*9], m6
2357    mova    [rsp+gprsize+32*5], m7                    ; 32*5 is the same offset as 16*10
2358    mova                    m4, [coeffq+16*5 ]
2359    mova                    m5, [coeffq+16*7 ]
2360    mova                    m6, [coeffq+16*9 ]
2361    mova                    m7, [coeffq+16*11]
2362
2363    call m(iadst_16x8_internal_8bpc).main
2364    call m(iadst_16x8_internal_8bpc).main_pass2_end
    ; the 8x8 flipadst .end routine is expected to continue at tx2q
2365    jmp  m(iflipadst_8x8_internal_8bpc).end
2366
; Second output step: values from the stack, written at the address saved in
; r3, finishing in the coefficient-clearing tail of idct_8x16 (.end1).
2367.end:
2368    LOAD_8ROWS    rsp+gprsize+16*3, 16
2369    mova    [rsp+gprsize+16*0], m7
2370    lea                   tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
2371    mov                   dstq, r3
2372    jmp  m(iflipadst_8x8_internal_8bpc).end
2373
2374
; Instantiate the 8x16 entry points whose first transform type is identity
; (INV_TXFM_8X16_FN is defined outside this excerpt).
2375INV_TXFM_8X16_FN identity, dct
2376INV_TXFM_8X16_FN identity, adst
2377INV_TXFM_8X16_FN identity, flipadst
2378INV_TXFM_8X16_FN identity, identity
2379
; iidentity_8x16_internal_8bpc: identity variant of the 8x16 inverse transform.
; Pass 1 sends two groups of eight coefficient rows through
; idct_8x8_internal_8bpc.pass1_end3; pass 2 (.pass2/.end/.end1) applies IDTX16
; and a pw_2048 pmulhrsw to two groups of eight rows before jumping to
; idct_8x8_internal_8bpc.end3.
2380cglobal iidentity_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    ; First group: rows starting at coeffq+16*1, step 32.
    ; NOTE(review): the meaning of LOAD_8ROWS' third argument (1) is not
    ; visible here - check the macro definition.
2381    LOAD_8ROWS    coeffq+16*1, 32, 1
    ; Save the caller-provided tx2q in r3 and substitute .pass1_end.
2382    mov                    r3, tx2q
2383    lea                  tx2q, [o(.pass1_end)]
2384    mova   [rsp+gprsize+16*1], m6
2385    jmp  m(idct_8x8_internal_8bpc).pass1_end3
2386
2387.pass1_end:
    ; Write the first group back, load the second group (coeffq+16*0, step 32)
    ; and restore the original tx2q before repeating the same jump.
2388    SAVE_8ROWS    coeffq+16*1, 32
2389    LOAD_8ROWS    coeffq+16*0, 32, 1
2390    mov                  tx2q, r3
2391    mova   [rsp+gprsize+16*1], m6
2392    jmp  m(idct_8x8_internal_8bpc).pass1_end3
2393
2394.pass2:
    ; tx2q = .end1, then fall through into .end for the first eight rows.
2395    lea                  tx2q, [o(.end1)]
2396
2397.end:
    ; m7 and m6 are spilled to slots 0 and 1 so m7 can hold pw_1697x16 and m6
    ; can be passed as the second IDTX16 argument.
    ; NOTE(review): IDTX16 is defined elsewhere; its arguments look like
    ; (register, temporary, coefficient register) - confirm.
2398    mova   [rsp+gprsize+16*0], m7
2399    mova   [rsp+gprsize+16*1], m6
2400    mova                   m7, [o(pw_1697x16)]
2401    REPX     {IDTX16 x, 6, 7}, 0, 1, 2, 3, 4, 5
    ; Bring back the spilled m6, park m5 in slot 2 and process the last two
    ; rows (the original m7 is reloaded into m5).
2402    mova                   m6, [rsp+gprsize+16*1]
2403    mova   [rsp+gprsize+16*2], m5
2404    IDTX16                  6, 5, 7
2405    mova                   m5, [rsp+gprsize+16*0]
2406    IDTX16                  5, 7, 7
    ; pmulhrsw of every row with pw_2048; the row parked in slot 2 is
    ; multiplied straight from memory into m7.
2407    mova                   m7, [o(pw_2048)]
2408    REPX     {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
2409    pmulhrsw               m7, [rsp+gprsize+16*2]
    ; Store m5, m6, m7 to stack slots 0, 1, 2 before jumping to .end3.
2410    mova   [rsp+gprsize+16*0], m5
2411    mova   [rsp+gprsize+16*1], m6
2412    mova   [rsp+gprsize+16*2], m7
2413    jmp  m(idct_8x8_internal_8bpc).end3
2414
2415.end1:
    ; Second group: load rows from coeffq+16*1 (step 32), set tx2q to
    ; idct_8x16 .end1, advance dstq by strideq*2 and rerun .end.
2416    LOAD_8ROWS    coeffq+16*1, 32
2417    lea                  tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
2418    lea                  dstq, [dstq+strideq*2]
2419    jmp .end
2420
2421
; INV_TXFM_16X8_FN type1, type2
; Expands INV_TXFM_FN for the 16x8 block size with the extra arguments 8 and
; 16*16 (NOTE(review): presumably register count and stack size - confirm
; against INV_TXFM_FN, which is defined outside this excerpt).
; For the dct_dct pair only, it additionally emits a short path that
; multiplies the value at [coeffq] by pw_2896x8 twice (pmulhrsw), loads
; pw_16384 into m2, stores eobd to [coeffq], sets r2d = 4 and jumps to the
; 16x4 .dconly code with tx2q pointing at the local .end (a plain RET).
2422%macro INV_TXFM_16X8_FN 2 ; type1, type2
2423    INV_TXFM_FN          %1, %2, 16x8, 8, 16*16
2424%ifidn %1_%2, dct_dct
2425    movd                 m1, [o(pw_2896x8)]
2426    pmulhrsw             m0, m1, [coeffq]
2427    movd                 m2, [o(pw_16384)]
    ; NOTE(review): eob is presumably 0 on this path, so this clears the
    ; coefficient that was just read - confirm against INV_TXFM_FN.
2428    mov            [coeffq], eobd
2429    pmulhrsw             m0, m1
2430    mov                 r2d, 4
2431    lea                tx2q, [o(m(inv_txfm_add_dct_dct_16x8_8bpc).end)]
2432    jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
2433.end:
2434    RET
2435%endif
2436%endmacro
2437
; Instantiate the 16x8 entry points whose first transform type is dct.
2438INV_TXFM_16X8_FN dct, dct
2439INV_TXFM_16X8_FN dct, adst
2440INV_TXFM_16X8_FN dct, flipadst
2441INV_TXFM_16X8_FN dct, identity
2442
; idct_16x8_internal_8bpc: DCT variant of the 16x8 inverse transform.
; Pass 1: rows from coeffq+16*0 (step 32) go through idct_8x8 .main and seven
; result rows are saved to stack slots 3.. ; rows from coeffq+16*1 (step 32)
; then go through the local .main. The results leave through
; idct_8x8_internal_8bpc.pass1_end in two groups.
; Pass 2: idct_8x8_internal_8bpc.pass2_main is used twice, the second time
; with dstq = dst + 8.
2443cglobal idct_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
2444    LOAD_8ROWS    coeffq+16*0, 32, 1
2445    call m(idct_8x8_internal_8bpc).main
2446    SAVE_7ROWS   rsp+gprsize+16*3, 16
2447
2448    LOAD_8ROWS    coeffq+16*1, 32, 1
2449    call  .main
    ; Save the caller-provided tx2q in r3 and substitute .pass1_end.
2450    mov                    r3, tx2q
2451    lea                  tx2q, [o(.pass1_end)]
2452    jmp  m(idct_8x8_internal_8bpc).pass1_end
2453
2454.pass1_end:
    ; Store the first group to coeffq+16*1 (step 32), reload the rows that
    ; .main left in stack slots 3.. , store m7 to slot 0 and restore tx2q.
2455    SAVE_8ROWS    coeffq+16*1, 32
2456    LOAD_8ROWS   rsp+gprsize+16*3, 16
2457    mova   [rsp+gprsize+16*0], m7
2458    mov                  tx2q, r3
2459    jmp  m(idct_8x8_internal_8bpc).pass1_end
2460
2461.pass2:
    ; tx2q = .end; r3 = dst + 8 for the second jump.
2462    lea                  tx2q, [o(.end)]
2463    lea                    r3, [dstq+8]
2464    jmp  m(idct_8x8_internal_8bpc).pass2_main
2465
2466.end:
    ; Second jump: rows saved at coeffq+16*1 (step 32), dstq = r3, and
    ; tx2q = idct_8x16_internal_8bpc.end1.
2467    LOAD_8ROWS    coeffq+16*1, 32
2468    lea                  tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
2469    mov                  dstq, r3
2470    jmp  m(idct_8x8_internal_8bpc).pass2_main
2471
2472
; idct_16x8_internal_8bpc.main
; Computes the t8..t15 terms (see inline comments) from m0-m7 and combines
; them with eight values already on the stack to form out0..out15.
; Stack offsets here use gprsize*2 where the callers use gprsize.
; NOTE(review): presumably this accounts for the return address pushed by
; `call` - confirm.
; Stack inputs (as read below): slots 3..9 and slot 0.
; On return: out0..out6 are in slots 3..9, out7 in slot 32*5 (= 16*10),
; out8..out15 are in m0..m7, and out15 is also stored in slot 0.
2473ALIGN function_align
2474cglobal_label .main
    ; Spill m2, m6, m5 so they can serve as temporaries / hold pd_2048.
2475    mova [rsp+gprsize*2+16*1], m2
2476    mova [rsp+gprsize*2+16*2], m6
2477    mova [rsp+gprsize*2+32*5], m5
2478
    ; m6 = pd_2048, passed as the fifth ITX_MULSUB_2W argument
    ; (NOTE(review): presumably the rounding term - macro is defined elsewhere).
2479    mova                   m6, [o(pd_2048)]
2480    ITX_MULSUB_2W           0, 7, 2, 5, 6,  401, 4076   ;t8a, t15a
2481    ITX_MULSUB_2W           4, 3, 2, 5, 6, 3166, 2598   ;t9a, t14a
2482    psubsw                 m2, m0, m4                   ;t9
2483    paddsw                 m0, m4                       ;t8
2484    psubsw                 m4, m7, m3                   ;t14
2485    paddsw                 m7, m3                       ;t15
2486    ITX_MULSUB_2W           4, 2, 3, 5, 6, 1567, 3784   ;t9a, t14a
    ; Swap the spilled inputs back in while parking the t14a/t9a/t15
    ; intermediates in the same slots.
2487    mova                   m3, [rsp+gprsize*2+16*1]
2488    mova                   m5, [rsp+gprsize*2+32*5]
2489    mova [rsp+gprsize*2+16*1], m2
2490    mova [rsp+gprsize*2+32*5], m4
2491    mova                   m2, [rsp+gprsize*2+16*2]
2492    mova [rsp+gprsize*2+16*2], m7
2493    ITX_MULSUB_2W           3, 5, 7, 4, 6, 1931, 3612   ;t10a, t13a
2494    ITX_MULSUB_2W           2, 1, 7, 4, 6, 3920, 1189   ;t11a, t12a
2495    psubsw                 m4, m2, m3                   ;t10
2496    paddsw                 m2, m3                       ;t11
2497    psubsw                 m3, m1, m5                   ;t13
2498    paddsw                 m1, m5                       ;t12
2499    ITX_MULSUB_2W           3, 4, 7, 5, 6, m3784, 1567  ;t10a, t13a
2500    mova                   m7, [rsp+gprsize*2+32*5]
2501    psubsw                 m6, m0, m2                   ;t11a
2502    paddsw                 m0, m2                       ;t8a
2503    paddsw                 m2, m7, m3                   ;t9
2504    psubsw                 m7, m3                       ;t10
    ; From here on each stack value s is combined as (s + t) / (s - t) to give
    ; a mirrored output pair (outN / out(15-N)).
2505    mova                   m5, [rsp+gprsize*2+16*0]
2506    psubsw                 m3, m5, m0                   ;out8
2507    paddsw                 m0, m5                       ;out7
2508    mova [rsp+gprsize*2+32*5], m0
2509    mova                   m5, [rsp+gprsize*2+16*9]
2510    psubsw                 m0, m5, m2                   ;out9
2511    paddsw                 m2, m5                       ;out6
2512    mova [rsp+gprsize*2+16*0], m0
2513    mova [rsp+gprsize*2+16*9], m2
2514    mova                   m0, [rsp+gprsize*2+16*1]
2515    mova                   m2, [rsp+gprsize*2+16*2]
2516    mova [rsp+gprsize*2+16*1], m3
2517    psubsw                 m5, m0, m4                   ;t13
2518    paddsw                 m0, m4                       ;t14
    ; m6 now holds t11a, so pd_2048 is reloaded into m3.
2519    mova                   m3, [o(pd_2048)]
2520    psubsw                 m4, m2, m1                   ;t12a
2521    paddsw                 m1, m2                       ;t15a
2522    mova [rsp+gprsize*2+16*2], m1
2523    ITX_MULSUB_2W           5, 7, 1, 2, 3, 2896, 2896   ;t10a, t13a
2524    ITX_MULSUB_2W           4, 6, 1, 2, 3, 2896, 2896   ;t11,  t12
2525    mova                   m3, [rsp+gprsize*2+16*8]
2526    psubsw                 m2, m3, m5                   ;out10
2527    paddsw                 m3, m5                       ;out5
2528    mova                   m5, [rsp+gprsize*2+16*7]
2529    mova [rsp+gprsize*2+16*8], m3
2530    psubsw                 m3, m5, m4                   ;out11
2531    paddsw                 m5, m4                       ;out4
2532    mova                   m4, [rsp+gprsize*2+16*6]
2533    mova [rsp+gprsize*2+16*7], m5
2534    paddsw                 m5, m4, m6                   ;out3
2535    psubsw                 m4, m6                       ;out12
2536    mova                   m6, [rsp+gprsize*2+16*5]
2537    mova [rsp+gprsize*2+16*6], m5
2538    psubsw                 m5, m6, m7                   ;out13
2539    paddsw                 m6, m7                       ;out2
2540    mova                   m7, [rsp+gprsize*2+16*4]
2541    mova [rsp+gprsize*2+16*5], m6
2542    psubsw                 m6, m7, m0                   ;out14
2543    paddsw                 m7, m0                       ;out1
2544    mova                   m1, [rsp+gprsize*2+16*2]
2545    mova                   m0, [rsp+gprsize*2+16*3]
2546    mova [rsp+gprsize*2+16*4], m7
2547    psubsw                 m7, m0, m1                   ;out15
2548    paddsw                 m0, m1                       ;out0
2549    mova [rsp+gprsize*2+16*3], m0
    ; Reload out9 / out8 into m1 / m0 and leave a copy of out15 in slot 0.
2550    mova                   m1, [rsp+gprsize*2+16*0]
2551    mova                   m0, [rsp+gprsize*2+16*1]
2552    mova [rsp+gprsize*2+16*0], m7
2553    ret
2554
; Instantiate the 16x8 entry points whose first transform type is adst.
2555INV_TXFM_16X8_FN adst, dct
2556INV_TXFM_16X8_FN adst, adst
2557INV_TXFM_16X8_FN adst, flipadst
2558INV_TXFM_16X8_FN adst, identity
2559
; iadst_16x8_internal_8bpc: ADST variant of the 16x8 inverse transform.
; Pass 1 multiplies all sixteen 16-byte coefficient rows by pw_2896x8
; (pmulhrsw), distributes them between stack slots and m0-m7 as .main expects,
; runs .main + .main_pass1_end, and leaves through
; iadst_8x8_internal_8bpc.pass1_end in two groups.
; Pass 2 uses iadst_8x8_internal_8bpc.pass2_main twice, the second time with
; dstq = dst + 8.
2560cglobal iadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
2561    mova                    m7, [o(pw_2896x8)]
    ; Rows 0, 1, 14, 15 -> stack slots 7, 8, 9, 32*5 (= 16*10).
2562    pmulhrsw                m0, m7, [coeffq+16*0 ]
2563    pmulhrsw                m1, m7, [coeffq+16*1 ]
2564    pmulhrsw                m2, m7, [coeffq+16*14]
2565    pmulhrsw                m3, m7, [coeffq+16*15]
2566    mova    [rsp+gprsize+16*7], m0
2567    mova    [rsp+gprsize+16*8], m1
2568    mova    [rsp+gprsize+16*9], m2
2569    mova    [rsp+gprsize+32*5], m3
    ; Rows 8, 9 -> slots 3, 4; rows 6, 7 -> slots 5, 6.
2570    pmulhrsw                m0, m7, [coeffq+16*6 ]
2571    pmulhrsw                m1, m7, [coeffq+16*7 ]
2572    pmulhrsw                m2, m7, [coeffq+16*8 ]
2573    pmulhrsw                m3, m7, [coeffq+16*9 ]
2574    mova    [rsp+gprsize+16*3], m2
2575    mova    [rsp+gprsize+16*4], m3
2576    mova    [rsp+gprsize+16*5], m0
2577    mova    [rsp+gprsize+16*6], m1
    ; Rows 2, 3, 4, 5, 10, 11, 12, 13 stay in m0-m7. m7 doubles as the scale
    ; factor, so its own row is multiplied last.
2578    pmulhrsw                m0, m7, [coeffq+16*2 ]
2579    pmulhrsw                m1, m7, [coeffq+16*3 ]
2580    pmulhrsw                m2, m7, [coeffq+16*4 ]
2581    pmulhrsw                m3, m7, [coeffq+16*5 ]
2582    pmulhrsw                m4, m7, [coeffq+16*10]
2583    pmulhrsw                m5, m7, [coeffq+16*11]
2584    pmulhrsw                m6, m7, [coeffq+16*12]
2585    pmulhrsw                m7,     [coeffq+16*13]
2586
2587    call .main
2588    call .main_pass1_end
    ; Save the caller-provided tx2q in r3 and substitute .pass1_end.
2589    mov                    r3, tx2q
2590    lea                  tx2q, [o(.pass1_end)]
2591    jmp m(iadst_8x8_internal_8bpc).pass1_end
2592
2593.pass1_end:
    ; Store the first group to coeffq+16*1 (step 32), reload the rows left in
    ; stack slots 3.. , store m7 to slot 0 and restore the original tx2q.
2594    SAVE_8ROWS    coeffq+16*1, 32
2595    LOAD_8ROWS   rsp+gprsize+16*3, 16
2596    mova   [rsp+gprsize+16*0], m7
2597    mov                  tx2q, r3
2598    jmp m(iadst_8x8_internal_8bpc).pass1_end
2599
2600.pass2:
    ; tx2q = .end; r3 = dst + 8 for the second jump.
2601    lea                  tx2q, [o(.end)]
2602    lea                    r3, [dstq+8]
2603    jmp m(iadst_8x8_internal_8bpc).pass2_main
2604
2605.end:
    ; Second jump: rows saved at coeffq+16*1 (step 32), dstq = r3, and
    ; tx2q = idct_8x16_internal_8bpc.end1.
2606    LOAD_8ROWS    coeffq+16*1, 32
2607    lea                  tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
2608    mov                  dstq, r3
2609    jmp m(iadst_8x8_internal_8bpc).pass2_main
2610
; iadst_16x8_internal_8bpc.main
; Shared 16-point ADST butterfly network. Inputs arrive partly in m0-m7 and
; partly in stack slots 3-9 and 32*5 (= 16*10); the inline "inN" comments show
; which slot is read as which input. Stack offsets use gprsize*2 where the
; callers use gprsize.
; NOTE(review): presumably this accounts for the return address pushed by
; `call` - confirm.
; The routine returns with intermediate values in m3-m6 and in stack slots
; 0-7, 9-12 and 14. .main_pass1_end / .main_pass2_end read slots 7, 9, 10, 11,
; 12, 14 and m3/m4 to finish the remaining outputs, so several "outN" labels
; below are attached to values before that final step.
2611ALIGN function_align
2612cglobal_label .main
    ; Spill m1, m2, m6 to slots 0, 1, 2 to free temporaries and make room for
    ; pd_2048 in m6.
2613    mova  [rsp+gprsize*2+16*0], m1
2614    mova  [rsp+gprsize*2+16*1], m2
2615    mova  [rsp+gprsize*2+16*2], m6
2616
    ; m6 = pd_2048, passed as the fifth ITX_MULSUB_2W argument for the whole
    ; routine until it is reused at the end.
2617    mova                    m6, [o(pd_2048)]
2618    ITX_MULSUB_2W            7, 0, 1, 2, 6,  995, 3973   ;t3,  t2
2619    ITX_MULSUB_2W            3, 4, 1, 2, 6, 3513, 2106   ;t11, t10
2620    psubsw                  m1, m0, m4                   ;t10a
2621    paddsw                  m0, m4                       ;t2a
2622    psubsw                  m4, m7, m3                   ;t11a
2623    paddsw                  m3, m7                       ;t3a
2624    ITX_MULSUB_2W            1, 4, 7, 2, 6, 3406, 2276   ;t11, t10
2625    mova                    m2, [rsp+gprsize*2+16*0]     ;in3
2626    mova                    m7, [rsp+gprsize*2+16*1]     ;in4
2627    mova  [rsp+gprsize*2+16*0], m1                       ;t11
2628    mova  [rsp+gprsize*2+16*1], m4                       ;t10
2629    mova                    m1, [rsp+gprsize*2+16*2]     ;in12
2630    mova  [rsp+gprsize*2+16*2], m0                       ;t2a
2631    ITX_MULSUB_2W            5, 7, 0, 4, 6, 1751, 3703   ;t5,  t4
2632    ITX_MULSUB_2W            2, 1, 0, 4, 6, 3857, 1380   ;t13, t12
2633    psubsw                  m0, m7, m1                   ;t12a
2634    paddsw                  m1, m7                       ;t4a
2635    psubsw                  m4, m5, m2                   ;t13a
2636    paddsw                  m5, m2                       ;t5a
2637    ITX_MULSUB_2W            4, 0, 7, 2, 6, 4017,  799   ;t12, t13
2638    mova                    m2, [rsp+gprsize*2+16*8]     ;in1
2639    mova                    m7, [rsp+gprsize*2+16*9]     ;in14
2640    mova  [rsp+gprsize*2+16*8], m4                       ;t12
2641    mova  [rsp+gprsize*2+16*9], m0                       ;t13
2642    mova                    m4, [rsp+gprsize*2+16*4]     ;in9
2643    mova                    m0, [rsp+gprsize*2+16*5]     ;in6
2644    mova  [rsp+gprsize*2+16*4], m1                       ;t4a
2645    mova  [rsp+gprsize*2+16*5], m5                       ;t5a
2646    ITX_MULSUB_2W            2, 7, 1, 5, 6, 4052,  601   ;t15, t14
2647    ITX_MULSUB_2W            4, 0, 1, 5, 6, 2440, 3290   ;t7,  t6
2648    psubsw                  m1, m0, m7                   ;t14a
2649    paddsw                  m0, m7                       ;t6a
2650    psubsw                  m5, m4, m2                   ;t15a
2651    paddsw                  m4, m2                       ;t7a
2652    ITX_MULSUB_2W            5, 1, 7, 2, 6, 2276, 3406   ;t14, t15
2653    mova                    m2, [rsp+gprsize*2+16*2]     ;t2a
2654    mova  [rsp+gprsize*2+16*2], m5                       ;t14
2655    psubsw                  m7, m2, m0                   ;t6
2656    paddsw                  m2, m0                       ;t2
2657    psubsw                  m0, m3, m4                   ;t7
2658    paddsw                  m3, m4                       ;t3
2659    ITX_MULSUB_2W            0, 7, 4, 5, 6, 3784, 1567   ;t6a, t7a
2660    mova                    m4, [rsp+gprsize*2+16*7]     ;in0
2661    mova                    m5, [rsp+gprsize*2+32*5]     ;in15
2662    mova  [rsp+gprsize*2+16*7], m3                       ;t3
2663    mova  [rsp+gprsize*2+32*5], m1                       ;t15
2664    mova                    m1, [rsp+gprsize*2+16*6]     ;in7
2665    mova                    m3, [rsp+gprsize*2+16*3]     ;in8
2666    mova  [rsp+gprsize*2+16*6], m7                       ;t7a
2667    mova  [rsp+gprsize*2+16*3], m0                       ;t6a
2668    ITX_MULSUB_2W            5, 4, 0, 7, 6,  201, 4091   ;t1,  t0
2669    ITX_MULSUB_2W            1, 3, 0, 7, 6, 3035, 2751   ;t9,  t8
2670    psubsw                  m0, m4, m3                   ;t8a
2671    paddsw                  m4, m3                       ;t0a
2672    psubsw                  m3, m5, m1                   ;t9a
2673    paddsw                  m5, m1                       ;t1a
2674    ITX_MULSUB_2W            0, 3, 1, 7, 6,  799, 4017   ;t9,  t8
2675    mova                    m1, [rsp+gprsize*2+16*4]     ;t4a
2676    mova                    m7, [rsp+gprsize*2+16*5]     ;t5a
2677    mova  [rsp+gprsize*2+16*4], m3                       ;t8
2678    mova  [rsp+gprsize*2+16*5], m0                       ;t9
2679    psubsw                  m0, m4, m1                   ;t4
2680    paddsw                  m4, m1                       ;t0
2681    psubsw                  m3, m5, m7                   ;t5
2682    paddsw                  m5, m7                       ;t1
2683    ITX_MULSUB_2W            0, 3, 1, 7, 6, 1567, 3784   ;t5a, t4a
    ; Final add/sub stage. Values written to slots 12, 7, 11, 10 here are read
    ; again by the pass-end helpers.
2684    mova                    m7, [rsp+gprsize*2+16*3]     ;t6a
2685    psubsw                  m1, m4, m2                   ;t2a
2686    paddsw                  m4, m2                       ;out0
2687    mova  [rsp+gprsize*2+16*3], m4                       ;out0
2688    mova                    m4, [rsp+gprsize*2+16*6]     ;t7a
2689    psubsw                  m2, m3, m7                   ;t6
2690    paddsw                  m3, m7                       ;-out3
2691    mova  [rsp+gprsize*2+16*6], m3                       ;-out3
2692    psubsw                  m3, m0, m4                   ;t7
2693    paddsw                  m0, m4                       ;out12
2694    mova [rsp+gprsize*2+16*12], m3
2695    mova                    m3, [rsp+gprsize*2+16*7]     ;t3
2696    mova [rsp+gprsize*2+16* 7], m2                       ;out4
2697    psubsw                  m2, m5, m3                   ;t3a
2698    paddsw                  m5, m3                       ;-out15
2699    mova [rsp+gprsize*2+16*11], m2
2700    mova                    m2, [rsp+gprsize*2+32*5]     ;t15
2701    mova [rsp+gprsize*2+16*10], m1                       ;-out7
2702    mova                    m1, [rsp+gprsize*2+16*0]     ;t11
2703    mova [rsp+gprsize*2+16*0 ], m5                       ;-out15
2704    mova                    m3, [rsp+gprsize*2+16*1]     ;t10
2705    mova [rsp+gprsize*2+16*1 ], m4                       ;-out11
2706    mova                    m4, [rsp+gprsize*2+16*2]     ;t14
2707    mova [rsp+gprsize*2+16*2 ], m0                       ;out12
2708    psubsw                  m0, m3, m4                   ;t14a
2709    paddsw                  m3, m4                       ;t10a
2710    psubsw                  m5, m1, m2                   ;t15a
2711    paddsw                  m1, m2                       ;t11a
2712    ITX_MULSUB_2W            5, 0, 2, 4, 6, 3784, 1567   ;t14, t15
2713    mova                    m2, [rsp+gprsize*2+16*4]     ;t8
2714    mova                    m4, [rsp+gprsize*2+16*5]     ;t9
2715    mova  [rsp+gprsize*2+16*4], m3                       ;t10a
2716    mova  [rsp+gprsize*2+16*5], m1                       ;t11a
2717    mova                    m3, [rsp+gprsize*2+16*8]     ;t12
2718    mova                    m1, [rsp+gprsize*2+16*9]     ;t13
2719    mova  [rsp+gprsize*2+16*8], m5                       ;t14
2720    mova  [rsp+gprsize*2+16*9], m0                       ;t15
2721    psubsw                  m5, m2, m3                   ;t12a
2722    paddsw                  m2, m3                       ;t8a
2723    psubsw                  m0, m4, m1                   ;t13a
2724    paddsw                  m4, m1                       ;t9a
2725    ITX_MULSUB_2W            5, 0, 1, 3, 6, 1567, 3784   ;t13, t12
    ; pd_2048 is no longer needed: m6 becomes a data register from here on.
2726    mova                    m6, [rsp+gprsize*2+16*4]     ;t10a
2727    mova                    m1, [rsp+gprsize*2+16*5]     ;t11a
2728    psubsw                  m3, m2, m6                   ;t10
2729    paddsw                  m2, m6                       ;-out1
2730    paddsw                  m6, m4, m1                   ;out14
2731    psubsw                  m4, m1                       ;t11
2732    mova [rsp+gprsize*2+16*14], m4
2733    mova [rsp+gprsize*2+16* 4], m2                       ;-out1
2734    mova                    m4, [rsp+gprsize*2+16*8]     ;t14
2735    mova                    m2, [rsp+gprsize*2+16*9]     ;t15
2736    mova [rsp+gprsize*2+16* 9], m3                       ;out6
2737    psubsw                  m3, m0, m4                   ;t14a
2738    paddsw                  m0, m4                       ;out2
2739    psubsw                  m4, m5, m2                   ;t15a
2740    paddsw                  m5, m2                       ;-out13
2741    mova [rsp+gprsize*2+16* 5], m0                       ;out2
2742    ret
; iadst_16x8_internal_8bpc.main_pass1_end
; Finishes the four output pairs that .main left unfinished. Each pair (a, b)
; is interleaved word-wise, multiplied with pw_2896_2896 and pw_2896_m2896 via
; pmaddwd, rounded by adding pd_2048, shifted right by 12 and packed back to
; words with signed saturation.
; Pairs, in order: (m3, m4), (slot 9, slot 14), (slot 7, slot 12),
; (slot 10, slot 11).
; m5 and m6 on entry are parked in slots 14 and 15 and restored at the end.
2743ALIGN function_align
2744.main_pass1_end:
    ; Fetch slot 14 first, because that slot is reused to park m5.
2745    mova                    m0, [rsp+gprsize*2+16*14]
2746    mova [rsp+gprsize*2+16*14], m5
2747    mova [rsp+gprsize*2+16*15], m6
2748    mova                    m5, [o(pw_2896_2896)]
2749    mova                    m6, [o(pw_2896_m2896)]
2750    mova                    m7, [o(pd_2048)]
    ; Pair 1: m3, m4 -> slot 8 (-out5) and m2 (out10).
2751    punpcklwd               m2, m3, m4
2752    punpckhwd               m3, m4
2753    pmaddwd                 m4, m5, m2
2754    pmaddwd                 m2, m6
2755    pmaddwd                 m1, m5, m3
2756    pmaddwd                 m3, m6
2757    REPX         {paddd x, m7}, m4, m2, m1, m3
2758    REPX         {psrad x, 12}, m4, m1, m2, m3
2759    packssdw                m4, m1                       ;-out5
2760    packssdw                m2, m3                       ;out10
2761    mova [rsp+gprsize*2+16* 8], m4
    ; Pair 2: slot 9 and the old slot-14 value (m0) -> slot 9 (out6), m1 (-out9).
2762    mova                    m3, [rsp+gprsize*2+16* 9]
2763    punpcklwd               m1, m3, m0
2764    punpckhwd               m3, m0
2765    pmaddwd                 m0, m5, m1
2766    pmaddwd                 m1, m6
2767    pmaddwd                 m4, m5, m3
2768    pmaddwd                 m3, m6
2769    REPX         {paddd x, m7}, m0, m1, m4, m3
2770    REPX         {psrad x, 12}, m0, m4, m1, m3
2771    packssdw                m0, m4                       ;out6
2772    packssdw                m1, m3                       ;-out9
2773    mova [rsp+gprsize*2+16* 9], m0
    ; Pair 3: slots 7 and 12 -> slot 7 (out4), m3 (-out11). m5 is overwritten
    ; here, so pair 4 takes pw_2896_2896 from memory.
2774    mova                    m0, [rsp+gprsize*2+16* 7]
2775    mova                    m4, [rsp+gprsize*2+16*12]
2776    punpcklwd               m3, m0, m4
2777    punpckhwd               m0, m4
2778    pmaddwd                 m4, m5, m3
2779    pmaddwd                 m3, m6
2780    pmaddwd                 m5, m0
2781    pmaddwd                 m0, m6
2782    REPX         {paddd x, m7}, m4, m3, m5, m0
2783    REPX         {psrad x, 12}, m4, m5, m3, m0
2784    packssdw                m4, m5                       ;out4
2785    packssdw                m3, m0                       ;-out11
2786    mova [rsp+gprsize*2+16* 7], m4
    ; Pair 4: slots 10 and 11 -> m0 (out8), slot 10 (-out7).
2787    mova                    m4, [rsp+gprsize*2+16*10]
2788    mova                    m5, [rsp+gprsize*2+16*11]
2789    punpcklwd               m0, m4, m5
2790    punpckhwd               m4, m5
2791    pmaddwd                 m5, m0, [o(pw_2896_2896)]
2792    pmaddwd                 m0, m6
2793    pmaddwd                 m6, m4
2794    pmaddwd                 m4, [o(pw_2896_2896)]
2795    REPX         {paddd x, m7}, m5, m0, m6, m4
2796    REPX         {psrad x, 12}, m0, m6, m5, m4
2797    packssdw                m0, m6                       ;out8
2798    packssdw                m5, m4                       ;-out7
2799    mova [rsp+gprsize*2+16*10], m5
    ; Restore the register outputs: m4 from slot 2, m5/m6 from the parking slots.
2800    mova                    m4, [rsp+gprsize*2+16* 2]    ;out12
2801    mova                    m5, [rsp+gprsize*2+16*14]    ;-out13
2802    mova                    m6, [rsp+gprsize*2+16*15]    ;out14
2803    ret
; iadst_16x8_internal_8bpc.main_pass2_end
; Pass-2 counterpart of .main_pass1_end: the same four pairs are finished with
; a saturating add/sub followed by pmulhrsw with pw_2896x8, instead of the
; pmaddwd / shift sequence.
; Pairs, in order: (slot 9, slot 14), (m3, m4), (slot 7, slot 12),
; (slot 10, slot 11). m5 and m6 are not touched here.
2804ALIGN function_align
2805cglobal_label .main_pass2_end
2806    mova                    m7, [o(pw_2896x8)]
    ; Slots 9 and 14 -> slot 9 (out6) and m1 (-out9).
2807    mova                    m1, [rsp+gprsize*2+16* 9]
2808    mova                    m2, [rsp+gprsize*2+16*14]
2809    paddsw                  m0, m1, m2
2810    psubsw                  m1, m2
2811    pmulhrsw                m0, m7                       ;out6
2812    pmulhrsw                m1, m7                       ;-out9
2813    mova [rsp+gprsize*2+16* 9], m0
    ; m3 and m4 -> m2 (out10) and slot 8 (-out5).
2814    psubsw                  m2, m3, m4
2815    paddsw                  m3, m4
2816    pmulhrsw                m2, m7                       ;out10
2817    pmulhrsw                m3, m7                       ;-out5
2818    mova [rsp+gprsize*2+16* 8], m3
    ; Slots 7 and 12 -> slot 7 (out4) and m3 (-out11).
2819    mova                    m3, [rsp+gprsize*2+16* 7]
2820    mova                    m4, [rsp+gprsize*2+16*12]
2821    paddsw                  m0, m3, m4
2822    psubsw                  m3, m4
2823    pmulhrsw                m0, m7                       ;out4
2824    pmulhrsw                m3, m7                       ;-out11
2825    mova [rsp+gprsize*2+16* 7], m0
    ; Slots 10 and 11 -> slot 10 (-out7) and m0 (out8).
2826    mova                    m0, [rsp+gprsize*2+16*10]
2827    paddsw                  m4, m0, [rsp+gprsize*2+16*11]
2828    psubsw                  m0, [rsp+gprsize*2+16*11]
2829    pmulhrsw                m4, m7                       ;-out7
2830    pmulhrsw                m0, m7                       ;out8
2831    mova [rsp+gprsize*2+16*10], m4
2832    mova                    m4, [rsp+gprsize*2+16*2 ]    ;out12
2833    ret
2834
; Instantiate the 16x8 entry points whose first transform type is flipadst.
2835INV_TXFM_16X8_FN flipadst, dct
2836INV_TXFM_16X8_FN flipadst, adst
2837INV_TXFM_16X8_FN flipadst, flipadst
2838INV_TXFM_16X8_FN flipadst, identity
2839
; iflipadst_16x8_internal_8bpc: flipped-ADST variant of the 16x8 inverse
; transform. Coefficient scaling and the .main / .main_pass1_end calls match
; the ADST routine. The difference is the hand-off: the register group is
; first saved to coeffq+16*0, the stack group is sent through
; iflipadst_8x8_internal_8bpc.pass1_end first, and the saved group second.
2840cglobal iflipadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
2841    mova                    m7, [o(pw_2896x8)]
    ; Rows 0, 1, 14, 15 -> stack slots 7, 8, 9, 32*5 (= 16*10).
2842    pmulhrsw                m0, m7, [coeffq+16*0 ]
2843    pmulhrsw                m1, m7, [coeffq+16*1 ]
2844    pmulhrsw                m2, m7, [coeffq+16*14]
2845    pmulhrsw                m3, m7, [coeffq+16*15]
2846    mova    [rsp+gprsize+16*7], m0
2847    mova    [rsp+gprsize+16*8], m1
2848    mova    [rsp+gprsize+16*9], m2
2849    mova    [rsp+gprsize+32*5], m3
    ; Rows 8, 9 -> slots 3, 4; rows 6, 7 -> slots 5, 6.
2850    pmulhrsw                m0, m7, [coeffq+16*6 ]
2851    pmulhrsw                m1, m7, [coeffq+16*7 ]
2852    pmulhrsw                m2, m7, [coeffq+16*8 ]
2853    pmulhrsw                m3, m7, [coeffq+16*9 ]
2854    mova    [rsp+gprsize+16*3], m2
2855    mova    [rsp+gprsize+16*4], m3
2856    mova    [rsp+gprsize+16*5], m0
2857    mova    [rsp+gprsize+16*6], m1
    ; Rows 2, 3, 4, 5, 10, 11, 12, 13 stay in m0-m7 (m7's own row last).
2858    pmulhrsw                m0, m7, [coeffq+16*2 ]
2859    pmulhrsw                m1, m7, [coeffq+16*3 ]
2860    pmulhrsw                m2, m7, [coeffq+16*4 ]
2861    pmulhrsw                m3, m7, [coeffq+16*5 ]
2862    pmulhrsw                m4, m7, [coeffq+16*10]
2863    pmulhrsw                m5, m7, [coeffq+16*11]
2864    pmulhrsw                m6, m7, [coeffq+16*12]
2865    pmulhrsw                m7,     [coeffq+16*13]
2866
2867    call m(iadst_16x8_internal_8bpc).main
2868    call m(iadst_16x8_internal_8bpc).main_pass1_end
2869
    ; m7 is taken from stack slot 0, the register group is saved to
    ; coeffq+16*0 (step 32), and the stack group (slots 3..) is loaded.
2870    mova                    m7, [rsp+gprsize+16*0]
2871    SAVE_8ROWS     coeffq+16*0, 32
2872    LOAD_8ROWS    rsp+gprsize+16*3, 16
2873    mova    [rsp+gprsize+16*0], m7
    ; Save the caller-provided tx2q in r3 and substitute .pass1_end.
2874    mov                     r3, tx2q
2875    lea                   tx2q, [o(.pass1_end)]
2876    jmp m(iflipadst_8x8_internal_8bpc).pass1_end
2877
2878.pass1_end:
    ; Store the finished group at coeffq+16*1, reload the group saved at
    ; coeffq+16*0, store m7 to slot 0 and restore the original tx2q.
2879    SAVE_8ROWS     coeffq+16*1, 32
2880    LOAD_8ROWS     coeffq+16*0, 32
2881    mova    [rsp+gprsize+16*0], m7
2882    mov                   tx2q, r3
2883    jmp m(iflipadst_8x8_internal_8bpc).pass1_end
2884
2885.pass2:
    ; tx2q = .end; r3 = dst + 8 for the second jump.
2886    lea                   tx2q, [o(.end)]
2887    lea                     r3, [dstq+8]
2888    jmp m(iflipadst_8x8_internal_8bpc).pass2_main
2889
2890.end:
    ; Second jump: rows at coeffq+16*1 (step 32), dstq = r3, and
    ; tx2q = idct_8x16_internal_8bpc.end1.
2891    LOAD_8ROWS     coeffq+16*1, 32
2892    lea                   tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
2893    mov                   dstq, r3
2894    jmp m(iflipadst_8x8_internal_8bpc).pass2_main
2895
2896
; Instantiate the 16x8 entry points whose first transform type is identity.
2897INV_TXFM_16X8_FN identity, dct
2898INV_TXFM_16X8_FN identity, adst
2899INV_TXFM_16X8_FN identity, flipadst
2900INV_TXFM_16X8_FN identity, identity
2901
; iidentity_16x8_internal_8bpc: identity variant of the 16x8 inverse transform.
; .pass1 runs twice. Each run handles eight 16-byte coefficient rows: every
; row r is first multiplied by pw_2896x8 (pmulhrsw) and then replaced by
;   r + pmulhrsw(pmulhrsw(r, pw_1697x16), pw_16384).
; The first run covers rows 8-15, the second rows 0-7 (coeffq is moved by
; +16*16 at entry and by -8*16 at the top of each run).
2902cglobal iidentity_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    ; coeffq += 16 rows; preload rows 9, 11, 13, 15 (relative to the original
    ; coeffq) into m4-m7.
2903    add                coeffq, 16*16
2904    mova                   m4, [coeffq-16*7]
2905    mova                   m5, [coeffq-16*5]
2906    mova                   m6, [coeffq-16*3]
2907    mova                   m7, [coeffq-16*1]
    ; Save the caller-provided tx2q in r3 and substitute .pass1_end.
2908    mov                    r3, tx2q
2909    lea                  tx2q, [o(.pass1_end)]
2910
2911.pass1:
    ; m0 = pw_2896x8, m2 = pw_1697x16, m3 = pw_16384.
2912    mova                   m0, [o(pw_2896x8)]
2913    mova                   m2, [o(pw_1697x16)]
2914    mova                   m3, [o(pw_16384)]
2915    sub                coeffq, 8*16
    ; Odd rows of this half (preloaded in m4-m7); results end up in m1, m4
    ; (then stack slot 0), m5 and m7.
2916    REPX     {pmulhrsw x, m0}, m4, m5, m6, m7
2917    pmulhrsw               m1, m2, m4
2918    pmulhrsw               m1, m3
2919    paddsw                 m1, m4 ; 1
2920    pmulhrsw               m4, m2, m5
2921    pmulhrsw               m4, m3
2922    paddsw                 m4, m5 ; 3
2923    pmulhrsw               m5, m2, m6
2924    pmulhrsw               m5, m3
2925    paddsw                 m5, m6 ; 5
2926    pmulhrsw               m6, m2, m7
2927    pmulhrsw               m6, m3
2928    paddsw                 m7, m6 ; 7
    ; Even rows are loaded straight from memory at coeffq+16*{6,4,2,0}.
    ; Result 3 is parked in stack slot 0 and result 6 in slot 1.
2929    pmulhrsw               m6, m0, [coeffq+16*6]
2930    mova   [rsp+gprsize+16*0], m4
2931    pmulhrsw               m4, m2, m6
2932    pmulhrsw               m4, m3
2933    paddsw                 m6, m4 ; 6
2934    pmulhrsw               m4, m0, [coeffq+16*4]
2935    mova   [rsp+gprsize+16*1], m6
2936    pmulhrsw               m6, m2, m4
2937    pmulhrsw               m6, m3
2938    paddsw                 m4, m6 ; 4
2939    pmulhrsw               m6, m0, [coeffq+16*2]
2940    pmulhrsw               m0,     [coeffq+16*0]
2941    pmulhrsw               m2, m6
2942    pmulhrsw               m2, m3
2943    paddsw                 m2, m6 ; 2
    ; m2 no longer holds pw_1697x16, so row 0 takes the constant from memory.
2944    pmulhrsw               m6, m0, [o(pw_1697x16)]
2945    pmulhrsw               m6, m3
    ; Result 3 returns to m3; result 6 stays in stack slot 1 for the callee.
2946    mova                   m3, [rsp+gprsize+16*0]
2947    paddsw                 m0, m6
2948    jmp   m(idct_8x8_internal_8bpc).pass1_end3
2949
2950.pass1_end:
    ; Between the two runs: store m4-m7 at coeffq+16*{1,3,5,7}, fetch the odd
    ; rows of the other half from coeffq-16*{7,5,3,1} and overwrite them with
    ; m0-m3. tx2q is restored from r3, so the second run leaves through the
    ; caller's continuation.
2951    mova        [coeffq+16*1], m4
2952    mova        [coeffq+16*3], m5
2953    mova        [coeffq+16*5], m6
2954    mova        [coeffq+16*7], m7
2955    mova                   m4, [coeffq-16*7]
2956    mova                   m5, [coeffq-16*5]
2957    mova                   m6, [coeffq-16*3]
2958    mova                   m7, [coeffq-16*1]
2959    mova        [coeffq-16*7], m0
2960    mova        [coeffq-16*5], m1
2961    mova        [coeffq-16*3], m2
2962    mova        [coeffq-16*1], m3
2963    mov                  tx2q, r3
2964    jmp .pass1
2965
2966.pass2:
    ; tx2q = .end; r3 = dst + 8 for the second jump.
2967    lea                  tx2q, [o(.end)]
2968    lea                    r3, [dstq+8]
2969    jmp  m(iidentity_8x8_internal_8bpc).end
2970
2971.end:
    ; Second jump: rows at coeffq+16*1 (step 32), dstq = r3, and
    ; tx2q = idct_8x16_internal_8bpc.end1.
2972    LOAD_8ROWS    coeffq+16*1, 32
2973    lea                  tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
2974    mov                  dstq, r3
2975    jmp  m(iidentity_8x8_internal_8bpc).end
2976
2977
; INV_TXFM_16X16_FN type1, type2
; Expands INV_TXFM_FN for the 16x16 block size with the extra arguments 8 and
; 16*16 (NOTE(review): presumably register count and stack size - confirm
; against INV_TXFM_FN, which is defined outside this excerpt).
; For the dct_dct pair only, it additionally emits a short path that
; multiplies the value at [coeffq] by pw_2896x8 once (pmulhrsw), loads pw_8192
; into m2, stores eobd to [coeffq], sets r2d = 8 and jumps to the 16x4 .dconly
; code with tx2q pointing at the local .end (a plain RET).
2978%macro INV_TXFM_16X16_FN 2 ; type1, type2
2979    INV_TXFM_FN          %1, %2, 16x16, 8, 16*16
2980%ifidn %1_%2, dct_dct
2981    movd                   m1, [o(pw_2896x8)]
2982    pmulhrsw               m0, m1, [coeffq]
2983    movd                   m2, [o(pw_8192)]
    ; NOTE(review): eob is presumably 0 on this path, so this clears the
    ; coefficient that was just read - confirm against INV_TXFM_FN.
2984    mov              [coeffq], eobd
2985    mov                   r2d, 8
2986    lea                  tx2q, [o(m(inv_txfm_add_dct_dct_16x16_8bpc).end)]
2987    jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
2988.end:
2989    RET
2990%endif
2991%endmacro
2992
; Instantiate the 16x16 entry points whose first transform type is dct.
2993INV_TXFM_16X16_FN dct, dct
2994INV_TXFM_16X16_FN dct, adst
2995INV_TXFM_16X16_FN dct, flipadst
2996INV_TXFM_16X16_FN dct, identity
2997
; idct_16x16_internal_8bpc: DCT variant of the 16x16 inverse transform.
; Pass 1 runs the idct_8x8 .main + idct_16x8 .main pair twice: first on the
; rows at coeffq+16*1 / +16*3 (step 64), then on coeffq+16*0 / +16*2. Each run
; leaves through idct_8x8_internal_8bpc.pass1_end1 in two groups, with m7
; loaded with pw_8192 before every jump.
; Pass 2 reuses idct_8x16_internal_8bpc.pass2_pre / .pass2_main, once per
; 8-byte-wide half of the destination.
2998cglobal idct_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
2999    LOAD_8ROWS     coeffq+16*1, 64
3000    call  m(idct_8x8_internal_8bpc).main
3001    SAVE_7ROWS    rsp+gprsize+16*3, 16
3002    LOAD_8ROWS     coeffq+16*3, 64
3003    call m(idct_16x8_internal_8bpc).main
    ; Save the caller-provided tx2q in r3; it is restored in .pass1_end2.
3004    mov                     r3, tx2q
3005    lea                   tx2q, [o(.pass1_end)]
3006    mova                    m7, [o(pw_8192)]
3007    jmp   m(idct_8x8_internal_8bpc).pass1_end1
3008
3009.pass1_end:
    ; Store the group at coeffq+16*17 (step 32), then send the group left in
    ; stack slots 3.. ; m7 is moved to slot 0 before being reloaded with
    ; pw_8192.
3010    SAVE_8ROWS    coeffq+16*17, 32
3011    LOAD_8ROWS    rsp+gprsize+16*3, 16
3012    mova    [rsp+gprsize+16*0], m7
3013    lea                   tx2q, [o(.pass1_end1)]
3014    mova                    m7, [o(pw_8192)]
3015    jmp   m(idct_8x8_internal_8bpc).pass1_end1
3016
3017.pass1_end1:
    ; Store the previous group at coeffq+16*1, then start the second run on
    ; the rows at coeffq+16*0 and coeffq+16*2 (step 64).
3018    SAVE_8ROWS     coeffq+16*1, 32
3019    LOAD_8ROWS     coeffq+16*0, 64
3020    call  m(idct_8x8_internal_8bpc).main
3021    SAVE_7ROWS    rsp+gprsize+16*3, 16
3022    LOAD_8ROWS     coeffq+16*2, 64
3023    call m(idct_16x8_internal_8bpc).main
3024    lea                   tx2q, [o(.pass1_end2)]
3025    mova                    m7, [o(pw_8192)]
3026    jmp   m(idct_8x8_internal_8bpc).pass1_end1
3027
3028.pass1_end2:
    ; Store the group at coeffq+16*16, reload the stack group and restore the
    ; original tx2q for the last pass-1 jump.
3029    SAVE_8ROWS    coeffq+16*16, 32
3030    LOAD_8ROWS    rsp+gprsize+16*3, 16
3031    mova    [rsp+gprsize+16*0], m7
3032    mov                   tx2q, r3
3033    mova                    m7, [o(pw_8192)]
3034    jmp   m(idct_8x8_internal_8bpc).pass1_end1
3035
3036.pass2:
    ; tx2q = .end, then reuse the 8x16 DCT pass-2 code.
3037    lea                   tx2q, [o(.end)]
3038    jmp  m(idct_8x16_internal_8bpc).pass2_pre
3039
3040.end:
    ; Reload eight rows from stack slots 3.. , store m7 to slot 0, take dstq
    ; from r3 and keep dstq + 8 in r3 for .end1.
    ; NOTE(review): r3 is presumably set by the idct_8x16 pass-2 code, which
    ; is outside this excerpt - confirm.
3041    LOAD_8ROWS    rsp+gprsize+16*3, 16
3042    mova    [rsp+gprsize+16*0], m7
3043    lea                   tx2q, [o(.end1)]
3044    mov                   dstq, r3
3045    lea                     r3, [dstq+8]
3046    jmp   m(idct_8x8_internal_8bpc).end
3047
3048.end1:
    ; Zero sixteen 16-byte rows at coeffq, i.e. the 256 bytes just consumed.
3049    pxor                    m7, m7
3050    REPX   {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15
3051
    ; Advance coeffq by 32*8 = 256 bytes and point dstq at r3 (dst + 8).
3052    add                 coeffq, 32*8
3053    mov                   dstq, r3
3054
    ; Load rows 0, 4, 8, 12 into m0-m3 and rows 1, 5, 9, 13 into m4-m7, then
    ; rerun the 8x16 pass-2 code with tx2q = idct_8x16_internal_8bpc.end.
3055    mova                    m0, [coeffq+16*0 ]
3056    mova                    m1, [coeffq+16*4 ]
3057    mova                    m2, [coeffq+16*8 ]
3058    mova                    m3, [coeffq+16*12]
3059    mova                    m4, [coeffq+16*1 ]
3060    mova                    m5, [coeffq+16*5 ]
3061    mova                    m6, [coeffq+16*9 ]
3062    mova                    m7, [coeffq+16*13]
3063    lea                   tx2q, [o(m(idct_8x16_internal_8bpc).end)]
3064    jmp  m(idct_8x16_internal_8bpc).pass2_main
3065
3066
; ITX_16X16_ADST_LOAD_ODD_COEFS
; Loads the sixteen odd-indexed 16-byte vectors of the coefficient buffer
; (coeffq+16*1, +16*3, ..., +16*31). Eight are copied to stack slots
; rsp+gprsize+16*3 .. 16*10 and eight are left in m0-m7:
;   slot 3 = c17, slot 4 = c19, slot 5 = c13, slot 6  = c15,
;   slot 7 = c1,  slot 8 = c3,  slot 9 = c29, slot 10 = c31
;   m0..m7 = c5, c7, c9, c11, c21, c23, c25, c27
; (cN = vector at coeffq+16*N). Clobbers m0-m7.
; NOTE(review): this is presumably the input layout expected by
; iadst_16x8_internal_8bpc.main, which every user in this chunk calls right
; after the macro -- confirm there.
3067%macro ITX_16X16_ADST_LOAD_ODD_COEFS 0
3068    mova                    m0, [coeffq+16*1 ]
3069    mova                    m1, [coeffq+16*3 ]
3070    mova                    m2, [coeffq+16*29]
3071    mova                    m3, [coeffq+16*31]
3072    mova    [rsp+gprsize+16*7], m0
3073    mova    [rsp+gprsize+16*8], m1
3074    mova    [rsp+gprsize+16*9], m2
    ; 32*5 == 16*10, i.e. the slot directly after 16*9.
3075    mova    [rsp+gprsize+32*5], m3
3076    mova                    m0, [coeffq+16*13]
3077    mova                    m1, [coeffq+16*15]
3078    mova                    m2, [coeffq+16*17]
3079    mova                    m3, [coeffq+16*19]
3080    mova    [rsp+gprsize+16*3], m2
3081    mova    [rsp+gprsize+16*4], m3
3082    mova    [rsp+gprsize+16*5], m0
3083    mova    [rsp+gprsize+16*6], m1
    ; The remaining eight vectors stay in registers.
3084    mova                    m0, [coeffq+16*5 ]
3085    mova                    m1, [coeffq+16*7 ]
3086    mova                    m2, [coeffq+16*9 ]
3087    mova                    m3, [coeffq+16*11]
3088    mova                    m4, [coeffq+16*21]
3089    mova                    m5, [coeffq+16*23]
3090    mova                    m6, [coeffq+16*25]
3091    mova                    m7, [coeffq+16*27]
3092%endmacro
3093
; ITX_16X16_ADST_LOAD_EVEN_COEFS
; Loads the sixteen even-indexed 16-byte vectors of the coefficient buffer
; (coeffq+16*0, +16*2, ..., +16*30). Eight are copied to stack slots
; rsp+gprsize+16*3 .. 16*10 and eight are left in m0-m7:
;   slot 3 = c16, slot 4 = c18, slot 5 = c12, slot 6  = c14,
;   slot 7 = c0,  slot 8 = c2,  slot 9 = c28, slot 10 = c30
;   m0..m7 = c4, c6, c8, c10, c20, c22, c24, c26
; (cN = vector at coeffq+16*N). Clobbers m0-m7.
; NOTE(review): this is presumably the input layout expected by
; iadst_16x8_internal_8bpc.main, which every user in this chunk calls right
; after the macro -- confirm there.
3094%macro ITX_16X16_ADST_LOAD_EVEN_COEFS 0
3095    mova                    m0, [coeffq+16*0 ]
3096    mova                    m1, [coeffq+16*2 ]
3097    mova                    m2, [coeffq+16*28]
3098    mova                    m3, [coeffq+16*30]
3099    mova    [rsp+gprsize+16*7], m0
3100    mova    [rsp+gprsize+16*8], m1
3101    mova    [rsp+gprsize+16*9], m2
    ; 32*5 == 16*10, i.e. the slot directly after 16*9.
3102    mova    [rsp+gprsize+32*5], m3
3103    mova                    m0, [coeffq+16*12]
3104    mova                    m1, [coeffq+16*14]
3105    mova                    m2, [coeffq+16*16]
3106    mova                    m3, [coeffq+16*18]
3107    mova    [rsp+gprsize+16*3], m2
3108    mova    [rsp+gprsize+16*4], m3
3109    mova    [rsp+gprsize+16*5], m0
3110    mova    [rsp+gprsize+16*6], m1
    ; The remaining eight vectors stay in registers.
3111    mova                    m0, [coeffq+16*4 ]
3112    mova                    m1, [coeffq+16*6 ]
3113    mova                    m2, [coeffq+16*8 ]
3114    mova                    m3, [coeffq+16*10]
3115    mova                    m4, [coeffq+16*20]
3116    mova                    m5, [coeffq+16*22]
3117    mova                    m6, [coeffq+16*24]
3118    mova                    m7, [coeffq+16*26]
3119%endmacro
3120
; 16x16 entry points for the type pairs whose first type is adst.
3121INV_TXFM_16X16_FN adst, dct
3122INV_TXFM_16X16_FN adst, adst
3123INV_TXFM_16X16_FN adst, flipadst
3124
; iadst_16x16_internal_8bpc
; Internal 16x16 inverse ADST, built from the 16x8 / 8x8 / 8x16 ADST routines
; (all defined outside this chunk).
;
; Control flow is continuation-based: a code address is placed in tx2q before
; jumping into a helper, which is expected to finish by jumping to tx2q.
; The caller's continuation is parked in r3 during pass 1.
;
; Pass 1 runs twice: on the odd-indexed 16-byte vectors of coeffq, then on
; the even-indexed ones. Each run produces two 8-row groups; the first three
; groups are written back to coeffq+16*17, +16*1 and +16*16 (stride 32) and
; the fourth is left in m0-m7 for the code at tx2q.
; Stack slot rsp+gprsize+16*0 is used to spill m7 while m7 holds pw_8192.
3125cglobal iadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
3126    ITX_16X16_ADST_LOAD_ODD_COEFS
3127    call m(iadst_16x8_internal_8bpc).main
3128    call m(iadst_16x8_internal_8bpc).main_pass1_end
3129
    ; Park the caller's continuation in r3 and continue at .pass1_end.
3130    mov                     r3, tx2q
3131    lea                   tx2q, [o(.pass1_end)]
3132    mova                    m7, [o(pw_8192)]
3133    jmp  m(iadst_8x8_internal_8bpc).pass1_end1
3134
3135.pass1_end:
    ; Store the group just produced, then push the group held in stack
    ; slots 3.. through the same pass1_end1 helper.
3136    SAVE_8ROWS    coeffq+16*17, 32
3137    LOAD_8ROWS    rsp+gprsize+16*3, 16
3138    mova    [rsp+gprsize+16*0], m7
3139    lea                   tx2q, [o(.pass1_end1)]
3140    mova                    m7, [o(pw_8192)]
3141    jmp  m(iadst_8x8_internal_8bpc).pass1_end1
3142
3143.pass1_end1:
    ; Second pass-1 run, on the even-indexed vectors.
3144    SAVE_8ROWS     coeffq+16*1, 32
3145    ITX_16X16_ADST_LOAD_EVEN_COEFS
3146    call m(iadst_16x8_internal_8bpc).main
3147    call m(iadst_16x8_internal_8bpc).main_pass1_end
3148
3149    lea                   tx2q, [o(.pass1_end2)]
3150    mova                    m7, [o(pw_8192)]
3151    jmp  m(iadst_8x8_internal_8bpc).pass1_end1
3152
3153.pass1_end2:
    ; Last group: restore the caller's continuation so that the helper
    ; continues there with this group in registers.
3154    SAVE_8ROWS    coeffq+16*16, 32
3155    LOAD_8ROWS    rsp+gprsize+16*3, 16
3156    mova    [rsp+gprsize+16*0], m7
3157    mov                   tx2q, r3
3158    mova                    m7, [o(pw_8192)]
3159    jmp  m(iadst_8x8_internal_8bpc).pass1_end1
3160
    ; Pass 2 reuses the 8x16 ADST pass 2, once per 8-pixel-wide half.
3161.pass2:
3162    lea                   tx2q, [o(.end)]
3163    jmp m(iadst_8x16_internal_8bpc).pass2_pre
3164
3165.end:
    ; Output the rows held in stack slots 3.. via the 8x8 .end code.
    ; NOTE(review): r3 is read here as the destination pointer, presumably
    ; saved by the 8x16 pass-2 code -- confirm. r3 is then advanced to dst+8
    ; for the second half.
3166    LOAD_8ROWS    rsp+gprsize+16*3, 16
3167    mova    [rsp+gprsize+16*0], m7
3168    lea                   tx2q, [o(.end1)]
3169    mov                   dstq, r3
3170    lea                     r3, [dstq+8]
3171    jmp  m(iadst_8x8_internal_8bpc).end
3172
3173.end1:
    ; Zero the 16 vectors (256 bytes) of the half just consumed.
3174    pxor                    m7, m7
3175    REPX   {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15
3176
    ; Move on to the second half: coefficients 32*8 bytes further, dst+8.
3177    add                 coeffq, 32*8
3178    mov                   dstq, r3
3179
    ; Four of the eight input vectors stay in m0-m3; the other four go to
    ; stack slots 5-8.
    ; NOTE(review): presumably the input layout of the 8x16 ADST
    ; .pass2_main -- confirm there.
3180    mova                    m4, [coeffq+16*0 ]
3181    mova                    m5, [coeffq+16*2 ]
3182    mova                    m0, [coeffq+16*4 ]
3183    mova                    m1, [coeffq+16*6 ]
3184    mova                    m2, [coeffq+16*8 ]
3185    mova                    m3, [coeffq+16*10]
3186    mova                    m6, [coeffq+16*12]
3187    mova                    m7, [coeffq+16*14]
3188    mova    [rsp+gprsize+16*7], m4
3189    mova    [rsp+gprsize+16*8], m5
3190    mova    [rsp+gprsize+16*5], m6
3191    mova    [rsp+gprsize+16*6], m7
    ; The remaining output and cleanup of this half is delegated to the
    ; 8x16 ADST routine's own .end.
3192    lea                   tx2q, [o(m(iadst_8x16_internal_8bpc).end)]
3193    jmp m(iadst_8x16_internal_8bpc).pass2_main
3194
3195
; 16x16 entry points for the type pairs whose first type is flipadst.
3196INV_TXFM_16X16_FN flipadst, dct
3197INV_TXFM_16X16_FN flipadst, adst
3198INV_TXFM_16X16_FN flipadst, flipadst
3199
; iflipadst_16x16_internal_8bpc
; Internal 16x16 inverse flipped ADST. It shares the ADST 16x8 core
; (iadst_16x8_internal_8bpc.main / .main_pass1_end) and differs from
; iadst_16x16_internal_8bpc in the pieces visible here: pw_m8192 is used
; instead of pw_8192, the flipadst 8x8 / 8x16 helpers are used, and the
; pass-1 groups are written to different coeffq positions and in a different
; order. All helpers are defined outside this chunk.
;
; Control flow is continuation-based: a code address is placed in tx2q before
; jumping into a helper, which is expected to finish by jumping to tx2q.
; The caller's continuation is parked in r3 during pass 1.
; Stack slot rsp+gprsize+16*0 is used to spill m7 while m7 holds pw_m8192.
3200cglobal iflipadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
3201    ITX_16X16_ADST_LOAD_ODD_COEFS
3202    call m(iadst_16x8_internal_8bpc).main
3203    call m(iadst_16x8_internal_8bpc).main_pass1_end
3204
    ; Park the caller's continuation in r3 and continue at .pass1_end.
3205    mov                     r3, tx2q
3206    lea                   tx2q, [o(.pass1_end)]
3207    mova                    m7, [o(pw_m8192)]
3208    jmp  m(iflipadst_8x8_internal_8bpc).pass1_end1
3209
3210.pass1_end:
    ; Store the group just produced, then push the group held in stack
    ; slots 3.. through the same pass1_end1 helper.
3211    SAVE_8ROWS     coeffq+16*1, 32
3212    LOAD_8ROWS    rsp+gprsize+16*3, 16
3213    mova    [rsp+gprsize+16*0], m7
3214    lea                   tx2q, [o(.pass1_end1)]
3215    mova                    m7, [o(pw_m8192)]
3216    jmp  m(iflipadst_8x8_internal_8bpc).pass1_end1
3217
3218.pass1_end1:
    ; Second pass-1 run, on the even-indexed vectors.
3219    SAVE_8ROWS    coeffq+16*17, 32
3220    ITX_16X16_ADST_LOAD_EVEN_COEFS
3221    call m(iadst_16x8_internal_8bpc).main
3222    call m(iadst_16x8_internal_8bpc).main_pass1_end
3223
    ; Reload m7 from stack slot 0 and park the register group at coeffq+16*0
    ; (it is read back in .pass1_end2), then process the stack group first.
3224    mova                    m7, [rsp+gprsize+16*0]
3225    SAVE_8ROWS     coeffq+16*0, 32
3226    LOAD_8ROWS    rsp+gprsize+16*3, 16
3227    mova    [rsp+gprsize+16*0], m7
3228    lea                   tx2q, [o(.pass1_end2)]
3229    mova                    m7, [o(pw_m8192)]
3230    jmp  m(iflipadst_8x8_internal_8bpc).pass1_end1
3231
3232.pass1_end2:
    ; Last group: fetch the rows parked at coeffq+16*0 and restore the
    ; caller's continuation so that the helper continues there.
3233    SAVE_8ROWS    coeffq+16*16, 32
3234    LOAD_8ROWS    coeffq+16* 0, 32
3235    mova    [rsp+gprsize+16*0], m7
3236    mov                   tx2q, r3
3237    mova                    m7, [o(pw_m8192)]
3238    jmp m(iflipadst_8x8_internal_8bpc).pass1_end1
3239
    ; Pass 2 reuses the 8x16 flipadst pass 2, once per 8-pixel-wide half.
    ; r3 keeps dst+8, the destination of the second half.
3240.pass2:
3241    lea                   tx2q, [o(.end)]
3242    lea                     r3, [dstq+8]
3243    jmp m(iflipadst_8x16_internal_8bpc).pass2_pre
3244
3245.end:
    ; Output the rows held in stack slots 3.. after advancing dst by two
    ; strides.
    ; NOTE(review): presumably the flipadst 8x8 .end code has already
    ; advanced dstq over the rows it wrote -- confirm there.
3246    LOAD_8ROWS    rsp+gprsize+16*3, 16
3247    mova    [rsp+gprsize+16*0], m7
3248    lea                   tx2q, [o(.end1)]
3249    lea                   dstq, [dstq+strideq*2]
3250    jmp  m(iflipadst_8x8_internal_8bpc).end
3251
3252.end1:
    ; Zero the 16 vectors (256 bytes) of the half just consumed.
3253    pxor                    m7, m7
3254    REPX   {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15
3255
    ; Move on to the coefficients of the second half, 32*8 bytes further.
3256    add                 coeffq, 32*8
3257
    ; Four of the eight input vectors stay in m0-m3; the other four go to
    ; stack slots 5-8.
    ; NOTE(review): presumably the input layout of the 8x16 flipadst
    ; .pass2_main -- confirm there.
3258    mova                    m4, [coeffq+16*0 ]
3259    mova                    m5, [coeffq+16*2 ]
3260    mova                    m0, [coeffq+16*4 ]
3261    mova                    m1, [coeffq+16*6 ]
3262    mova                    m2, [coeffq+16*8 ]
3263    mova                    m3, [coeffq+16*10]
3264    mova                    m6, [coeffq+16*12]
3265    mova                    m7, [coeffq+16*14]
3266    mova    [rsp+gprsize+16*7], m4
3267    mova    [rsp+gprsize+16*8], m5
3268    mova    [rsp+gprsize+16*5], m6
3269    mova    [rsp+gprsize+16*6], m7
3270
    ; Second half is written at dst+8 (kept in r3 since .pass2).
3271    lea                   tx2q, [o(.end2)]
3272    mov                   dstq, r3
3273    jmp m(iflipadst_8x16_internal_8bpc).pass2_main
3274
3275.end2:
    ; Output the remaining rows of the second half; the final cleanup is
    ; delegated to idct_8x16_internal_8bpc.end1.
3276    LOAD_8ROWS    rsp+gprsize+16*3, 16
3277    mova    [rsp+gprsize+16*0], m7
3278    lea                   tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
3279    lea                   dstq, [dstq+strideq*2]
3280    jmp  m(iflipadst_8x8_internal_8bpc).end
3281
3282
; IDTX16B src/dst, tmp, coef
; Scaling step used by pass 1 of the 16x16 identity transform, applied to the
; packed 16-bit words of m<src/dst>:
;   tmp     = pmulhrsw(coef, src)      ; rounded high multiply
;   tmp     = tmp >> 1                 ; arithmetic shift
;   src/dst = pavgw(src/dst, tmp)      ; rounded average
; All three arguments are register numbers; coef is the register holding
; pw_1697x16. tmp is clobbered. When tmp and coef name the same register
; (as in "IDTX16B 7, 6, 6") the constant is destroyed, so that form can only
; be the last use of the constant.
3283%macro IDTX16B 3 ; src/dst, tmp, pw_1697x16
3284    pmulhrsw            m%2, m%3, m%1
3285    psraw               m%2, 1
3286    pavgw               m%1, m%2
3287%endmacro
3288
; 16x16 entry points for the type pairs whose first type is identity.
3289INV_TXFM_16X16_FN identity, dct
3290INV_TXFM_16X16_FN identity, identity
3291
; iidentity_16x16_internal_8bpc
; Internal 16x16 identity transform.
;
; Pass 1 runs the .pass1 body four times, on the 8-row groups starting at
; coeffq+16*17, +16*16, +16*1 and +16*0 (rows 32 bytes apart). coeffq itself
; is stepped between runs and is back at its entry value for the last one
; (16*17 - 16 - 15*16 - 16 = 0). The first three results are written back in
; place; the fourth is left in registers for the caller's continuation, which
; is parked in r3 meanwhile.
; Pass 2 (.pass2/.end) scales 8 rows at a time with IDTX16 and pw_2048 and
; hands them to idct_8x8_internal_8bpc.end3 (defined outside this chunk).
; Control flow is continuation-based: helpers are expected to finish by
; jumping to tx2q.
3292cglobal iidentity_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
3293    add                 coeffq, 16*17
3294    mov                     r3, tx2q
3295    lea                   tx2q, [o(.pass1_end)]
3296
3297.pass1:
    ; Scale 8 rows with IDTX16B. Only 8 registers are available, so m5 is
    ; used as the temporary for the first six rows; the scaled row 6 is then
    ; spilled to stack slot 1 to free m7, and the constant in m6 is
    ; sacrificed as the temporary of the last row.
3298    mova                    m6, [o(pw_1697x16)]
3299    mova                    m7, [coeffq+32*6]
3300    mova                    m0, [coeffq+32*0]
3301    mova                    m1, [coeffq+32*1]
3302    mova                    m2, [coeffq+32*2]
3303    mova                    m3, [coeffq+32*3]
3304    mova                    m4, [coeffq+32*4]
3305    REPX     {IDTX16B x, 5, 6}, 7, 0, 1, 2, 3, 4
3306    mova                    m5, [coeffq+32*5]
3307    mova    [rsp+gprsize+16*1], m7
3308    IDTX16B                  5, 7, 6
3309    mova                    m7, [coeffq+32*7]
3310    IDTX16B                  7, 6, 6
    ; NOTE(review): pass1_end3 presumably reloads row 6 from stack slot 1
    ; before continuing at tx2q -- confirm in idct_8x8_internal_8bpc.
3311    jmp   m(idct_8x8_internal_8bpc).pass1_end3
3312
3313.pass1_end:
    ; Group at +16*17 done; next group is at +16*16.
3314    SAVE_8ROWS          coeffq, 32
3315    sub                 coeffq, 16
3316    lea                   tx2q, [o(.pass1_end1)]
3317    jmp .pass1
3318
3319.pass1_end1:
    ; Group at +16*16 done; next group is at +16*1.
3320    SAVE_8ROWS          coeffq, 32
3321    sub                 coeffq, 15*16
3322    lea                   tx2q, [o(.pass1_end2)]
3323    jmp .pass1
3324
3325.pass1_end2:
    ; Group at +16*1 done; the last group is at +16*0 and continues at the
    ; caller's continuation.
3326    SAVE_8ROWS          coeffq, 32
3327    sub                 coeffq, 16
3328    mov                   tx2q, r3
3329    jmp .pass1
3330
3331.pass2:
    ; r3 keeps dst+8, the destination of the second 8-pixel-wide half.
3332    lea                     r3, [dstq+8]
3333    lea                   tx2q, [o(.end1)]
3334
3335.end:
    ; Scale the 8 rows in m0-m7 with IDTX16 (defined outside this chunk) and
    ; multiply them by pw_2048 with pmulhrsw. m7 and m4 are spilled first so
    ; they can hold the IDTX16 constant and the temporary / pw_2048.
    ; On exit to .end3 the rows originally in m5, m6 and m7 sit in stack
    ; slots 2, 1 and 0 respectively, and the row originally in m4 is back in
    ; m4.
3336    mova    [rsp+gprsize+16*0], m7
3337    mova    [rsp+gprsize+16*1], m4
3338    mova                    m7, [o(pw_1697x16)]
3339    REPX      {IDTX16 x, 4, 7}, 5, 6, 0, 1, 2, 3
3340    mova                    m4, [o(pw_2048)]
3341    pmulhrsw                m5, m4
3342    pmulhrsw                m6, m4
3343    mova    [rsp+gprsize+16*2], m5
3344    mova                    m5, [rsp+gprsize+16*1]
3345    mova    [rsp+gprsize+16*1], m6
3346    IDTX16                   5, 6, 7
3347    mova                    m6, [rsp+gprsize+16*0]
3348    IDTX16                   6, 7, 7
3349    REPX      {pmulhrsw x, m4}, m0, m1, m2, m3, m6
3350    pmulhrsw                m4, m5
3351    mova    [rsp+gprsize+16*0], m6
3352    jmp   m(idct_8x8_internal_8bpc).end3
3353
3354.end1:
    ; Next 8 rows of the first half (group at coeffq+16*1), two strides
    ; further down.
    ; NOTE(review): presumably the 8x8 .end3 code has already advanced dstq
    ; over the rows it wrote -- confirm there.
3355    LOAD_8ROWS     coeffq+16*1, 32
3356    lea                   tx2q, [o(.end2)]
3357    lea                   dstq, [dstq+strideq*2]
3358    jmp .end
3359
3360.end2:
    ; Zero the 16 vectors (256 bytes) of the half just consumed.
3361    pxor                    m7, m7
3362    REPX   {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15
3363
    ; Second half: coefficients 32*8 bytes further, written at dst+8.
3364    add                 coeffq, 32*8
3365    LOAD_8ROWS          coeffq, 32
3366    lea                   tx2q, [o(.end3)]
3367    mov                   dstq, r3
3368    jmp .end
3369
3370.end3:
    ; Last 8 rows; the final cleanup is delegated to
    ; idct_8x16_internal_8bpc.end1.
3371    LOAD_8ROWS     coeffq+16*1, 32
3372    lea                   tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
3373    lea                   dstq, [dstq+strideq*2]
3374    jmp .end
3375
3376
; inv_txfm_add_dct_dct_8x32_8bpc(dst, stride, coeff, eob)
; Public entry point of the 8x32 DCT_DCT inverse transform + add.
; Declared through x86inc's cglobal with the numeric arguments 4, 6, 8 and
; 16*36 (argument count, GPR count, xmm register count and stack bytes in
; x86inc's cglobal syntax).
; eob == 0 takes the DC-only path; everything else is handled by
; idct_8x32_internal_8bpc.
3377cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
3378%if ARCH_X86_32
    ; NOTE(review): r5 = $$ is presumably the base register the o() macro
    ; uses to address constants on x86-32 -- confirm against the definition
    ; of o().
3379    LEA                     r5, $$
3380%endif
3381    test                  eobd, eobd
3382    jz .dconly
3383    call  m(idct_8x32_internal_8bpc)
3384    RET
3385
3386.dconly:
    ; Scale the DC coefficient with pmulhrsw by pw_2896x8, pw_8192,
    ; pw_2896x8 again and pw_2048 (derived from pw_8192 by the shift).
3387    movd                 m1, [o(pw_2896x8)]
3388    pmulhrsw             m0, m1, [coeffq]
3389    movd                 m2, [o(pw_8192)]
    ; eobd is zero on this path (see the test/jz above), so this clears the
    ; DC coefficient.
3390    mov            [coeffq], eobd
3391    pmulhrsw             m0, m2
3392    psrlw                m2, 2            ;pw_2048
3393    pmulhrsw             m0, m1
3394    pmulhrsw             m0, m2
    ; Broadcast the low word of m0 to all eight words.
3395    pshuflw              m0, m0, q0000
3396    punpcklwd            m0, m0
    ; r3d = 8 is consumed by the 8x8 .loop code (presumably its iteration
    ; count -- confirm there); tx2q = .end is where it continues afterwards.
3397    mov                 r3d, 8
3398    lea                tx2q, [o(.end)]
3399    jmp m(inv_txfm_add_dct_dct_8x8_8bpc).loop
3400
3401.end:
3402    RET
3403
3404
3405
; idct_8x32_internal_8bpc
; Internal 8x32 inverse DCT, reached with `call` (it ends in a plain `ret`).
;
; Pass 1 runs the 8x8 DCT on up to four groups of coefficients (coeffq+16*3,
; +16*2, +16*1, +16*0, stride 64) and scatters the results into stack slots
; rsp+gprsize+16*N; the trailing inN comments give the 32-point input index
; held by each slot. When eob <= 106 the groups holding in16..in31 are
; skipped (.fast) and the reduced .main_fast odd half is used.
; The 32-point transform is then assembled from the 8x8 main (in0,4,..,28),
; the 16x8 main (in2,6,..,30) and .main / .main_fast (odd inputs).
; The output stage (.pass2 onwards) clears the coefficient buffer and writes
; four groups of 8 rows through idct_8x8_internal_8bpc.end.
;
; Control flow is continuation-based: helpers defined outside this chunk are
; expected to finish by jumping to tx2q.
3406cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
3407    cmp                   eobd, 106
3408    jle .fast
3409
3410    LOAD_8ROWS     coeffq+16*3, 64
3411    call  m(idct_8x8_internal_8bpc).main
3412    mova                    m7, [o(pw_8192)]
3413    lea                   tx2q, [o(.pass1)]
3414    jmp   m(idct_8x8_internal_8bpc).pass1_end1
3415
3416.pass1:
3417    mova   [rsp+gprsize+16*9 ], m0                        ;in24
3418    mova   [rsp+gprsize+16*10], m4                        ;in28
3419    mova   [rsp+gprsize+16*17], m2                        ;in26
3420    mova   [rsp+gprsize+16*18], m6                        ;in30
3421    mova   [rsp+gprsize+16*31], m1                        ;in25
3422    mova   [rsp+gprsize+16*30], m3                        ;in27
3423    mova   [rsp+gprsize+16*27], m5                        ;in29
3424    mova   [rsp+gprsize+16*34], m7                        ;in31
3425    LOAD_8ROWS     coeffq+16*2, 64
3426    call  m(idct_8x8_internal_8bpc).main
3427    mova                    m7, [o(pw_8192)]
3428    lea                   tx2q, [o(.pass1_1)]
3429    jmp   m(idct_8x8_internal_8bpc).pass1_end1
3430
3431.pass1_1:
3432    mova   [rsp+gprsize+16*7 ], m0                        ;in16
3433    mova   [rsp+gprsize+16*8 ], m4                        ;in20
3434    mova   [rsp+gprsize+16*15], m2                        ;in18
3435    mova   [rsp+gprsize+16*16], m6                        ;in22
3436    mova   [rsp+gprsize+16*33], m1                        ;in17
3437    mova   [rsp+gprsize+16*28], m3                        ;in19
3438    mova   [rsp+gprsize+16*29], m5                        ;in21
3439    mova   [rsp+gprsize+16*32], m7                        ;in23
3440
    ; Both paths continue here: the full path falls through from .pass1_1,
    ; the eob <= 106 path jumps in from the top.
3441.fast:
3442    LOAD_8ROWS     coeffq+16*1, 64
3443    call  m(idct_8x8_internal_8bpc).main
3444    mova                    m7, [o(pw_8192)]
3445    lea                   tx2q, [o(.pass1_end)]
3446    jmp   m(idct_8x8_internal_8bpc).pass1_end1
3447
3448.pass1_end:
3449    mova   [rsp+gprsize+16*5 ], m0                        ;in8
3450    mova   [rsp+gprsize+16*6 ], m4                        ;in12
3451    mova   [rsp+gprsize+16*13], m2                        ;in10
3452    mova   [rsp+gprsize+16*14], m6                        ;in14
3453    mova   [rsp+gprsize+16*21], m1                        ;in9
3454    mova   [rsp+gprsize+16*24], m3                        ;in11
3455    mova   [rsp+gprsize+16*25], m5                        ;in13
3456    mova   [rsp+gprsize+16*20], m7                        ;in15
3457    LOAD_8ROWS     coeffq+16*0, 64
3458    call  m(idct_8x8_internal_8bpc).main
3459    mova                    m7, [o(pw_8192)]
3460    lea                   tx2q, [o(.pass1_end1)]
3461    jmp   m(idct_8x8_internal_8bpc).pass1_end1
3462
3463.pass1_end1:
    ; m0 is left untouched (presumably in0) and m1-m3 are set up with
    ; in4/in8/in12 as the first four inputs of the 8x8 main below.
3464    mova   [rsp+gprsize+16*11], m2                        ;in2
3465    mova   [rsp+gprsize+16*12], m6                        ;in6
3466    mova   [rsp+gprsize+16*19], m1                        ;in1
3467    mova   [rsp+gprsize+16*26], m3                        ;in3
3468    mova   [rsp+gprsize+16*23], m5                        ;in5
3469    mova   [rsp+gprsize+16*22], m7                        ;in7
3470    mova                    m1, m4                        ;in4
3471    mova                    m2, [rsp+gprsize+16*5 ]       ;in8
3472    mova                    m3, [rsp+gprsize+16*6 ]       ;in12
3473
3474    cmp                   eobd, 106
3475    jg .full
3476
    ; eob <= 106: the inputs that the full path loads from slots 7-10
    ; (in16..in28) and 15-18 (in18..in30) are replaced by zeros.
3477    pxor                    m4, m4
3478    REPX          {mova x, m4}, m5, m6, m7
3479    call  m(idct_8x8_internal_8bpc).main
3480    SAVE_7ROWS   rsp+gprsize+16*3 , 16
3481    mova                    m0, [rsp+gprsize+16*11]
3482    mova                    m1, [rsp+gprsize+16*12]
3483    mova                    m2, [rsp+gprsize+16*13]
3484    mova                    m3, [rsp+gprsize+16*14]
3485    pxor                    m4, m4
3486    REPX          {mova x, m4}, m5, m6, m7
3487    call m(idct_16x8_internal_8bpc).main
    ; NOTE(review): the 16x8 main presumably leaves its eighth result in
    ; stack slot 0; it is reloaded so that all 8 rows can be saved.
3488    mova                    m7, [rsp+gprsize+16*0]
3489    SAVE_8ROWS   rsp+gprsize+16*11, 16
3490
3491    call .main_fast
3492    jmp  .pass2
3493
3494.full:
3495    mova                    m4, [rsp+gprsize+16*7 ]       ;in16
3496    mova                    m5, [rsp+gprsize+16*8 ]       ;in20
3497    mova                    m6, [rsp+gprsize+16*9 ]       ;in24
3498    mova                    m7, [rsp+gprsize+16*10]       ;in28
3499    call  m(idct_8x8_internal_8bpc).main
3500    SAVE_7ROWS   rsp+gprsize+16*3 , 16
3501    LOAD_8ROWS   rsp+gprsize+16*11, 16
3502    call m(idct_16x8_internal_8bpc).main
    ; NOTE(review): the 16x8 main presumably leaves its eighth result in
    ; stack slot 0; it is reloaded so that all 8 rows can be saved.
3503    mova                    m7, [rsp+gprsize+16*0]
3504    SAVE_8ROWS   rsp+gprsize+16*11, 16
3505    call .main
3506
3507.pass2:
    ; r3 = address to continue at once the last 8-row group is written.
3508    lea                     r3, [o(.end6)]
3509
3510.end:
3511    mova   [rsp+gprsize+16*0 ], m7
3512    lea                   tx2q, [o(.end2)]
3513
3514.end1:
    ; Clear 32 vectors (512 bytes) of the coefficient buffer, then continue
    ; at tx2q.
3515    pxor                    m7, m7
3516    REPX   {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  \
3517                                     8,  9,  10, 11, 12, 13, 14, 15, \
3518                                     16, 17, 18, 19, 20, 21, 22, 23, \
3519                                     24, 25, 26, 27, 28, 29, 30, 31
3520
3521    jmp                   tx2q
3522
    ; .end2 - .end5: write the four 8-row groups. The first is already in
    ; registers; the others are loaded from stack slots 11, 19 and 27, each
    ; two strides further down.
    ; NOTE(review): presumably the 8x8 .end code has already advanced dstq
    ; over the rows it wrote -- confirm there.
3523.end2:
3524    lea                   tx2q, [o(.end3)]
3525    jmp   m(idct_8x8_internal_8bpc).end
3526
3527.end3:
3528    LOAD_8ROWS   rsp+gprsize+16*11, 16
3529    mova   [rsp+gprsize+16*0 ], m7
3530    lea                   dstq, [dstq+strideq*2]
3531    lea                   tx2q, [o(.end4)]
3532    jmp   m(idct_8x8_internal_8bpc).end
3533
3534.end4:
3535    LOAD_8ROWS   rsp+gprsize+16*19, 16
3536    mova   [rsp+gprsize+16*0 ], m7
3537    lea                   dstq, [dstq+strideq*2]
3538    lea                   tx2q, [o(.end5)]
3539    jmp   m(idct_8x8_internal_8bpc).end
3540
3541.end5:
    ; Last group: continue at the address kept in r3.
3542    LOAD_8ROWS   rsp+gprsize+16*27, 16
3543    mova   [rsp+gprsize+16*0 ], m7
3544    lea                   dstq, [dstq+strideq*2]
3545    mov                   tx2q, r3
3546    jmp   m(idct_8x8_internal_8bpc).end
3547
3548.end6:
3549    ret
3550
; .main_veryfast
; Reduced first stage of the odd half of the 32-point inverse DCT that reads
; only in1, in7, in5 and in3 (stack slots 19, 22, 23, 26); every other odd
; input is treated as zero. With a single non-zero input per butterfly the
; first rotation collapses to two pmulhrsw by pre-scaled (x8) constants.
; Results are written to the t16..t31 slots and the code joins the shared
; tail at .main2 with m0 = m3 = 0 and m1/m2 derived from in3.
; Slots are addressed with gprsize*2, i.e. one return address deeper than the
; rsp+gprsize addressing used for the same slot numbers in the body of
; idct_8x32_internal_8bpc.
; m7 holds pd_2048, the fifth argument of every ITX_MULSUB_2W below
; (presumably the rounding term -- the macro is defined outside this chunk).
3551ALIGN function_align
3552cglobal_label .main_veryfast
3553    mova                    m0, [rsp+gprsize*2+16*19]     ;in1
3554    pmulhrsw                m3, m0, [o(pw_4091x8)]        ;t30,t31
3555    pmulhrsw                m0, [o(pw_201x8)]             ;t16,t17
3556    mova                    m7, [o(pd_2048)]
3557    mova [rsp+gprsize*2+16*19], m0                        ;t16
3558    mova [rsp+gprsize*2+16*34], m3                        ;t31
3559    ITX_MULSUB_2W            3, 0, 1, 2, 7,  799, 4017    ;t17a, t30a
3560    mova [rsp+gprsize*2+16*20], m3                        ;t17a
3561    mova [rsp+gprsize*2+16*33], m0                        ;t30a
3562    mova                    m1, [rsp+gprsize*2+16*22]     ;in7
3563    pmulhrsw                m2, m1, [o(pw_3857x8)]        ;t28,t29
3564    pmulhrsw                m1, [o(pw_m1380x8)]           ;t18,t19
3565    mova [rsp+gprsize*2+16*22], m1                        ;t19
3566    mova [rsp+gprsize*2+16*31], m2                        ;t28
3567    ITX_MULSUB_2W            2, 1, 0, 3, 7, m4017, 799    ;t18a, t29a
3568    mova [rsp+gprsize*2+16*21], m2                        ;t18a
3569    mova [rsp+gprsize*2+16*32], m1                        ;t29a
3570    mova                    m0, [rsp+gprsize*2+16*23]     ;in5
3571    pmulhrsw                m3, m0, [o(pw_3973x8)]        ;t26, t27
3572    pmulhrsw                m0, [o(pw_995x8)]             ;t20, t21
3573    mova [rsp+gprsize*2+16*23], m0                        ;t20
3574    mova [rsp+gprsize*2+16*30], m3                        ;t27
3575    ITX_MULSUB_2W            3, 0, 1, 2, 7, 3406, 2276    ;t21a, t26a
3576    mova [rsp+gprsize*2+16*24], m3                        ;t21a
3577    mova [rsp+gprsize*2+16*29], m0                        ;t26a
3578    mova                    m2, [rsp+gprsize*2+16*26]     ;in3
    ; m0 and m3 are zeroed: they take the place of the in13-derived terms
    ; that .main_fast feeds into .main2.
3579    pxor                    m0, m0
3580    mova                    m3, m0
3581    pmulhrsw                m1, m2, [o(pw_4052x8)]
3582    pmulhrsw                m2, [o(pw_m601x8)]
3583    jmp .main2
3584
; .main_fast
; Reduced first stage of the odd half of the 32-point inverse DCT for the
; case where the bottom half of the input is zero: only in1..in15 (odd) are
; read, from stack slots 19-26. Each first-stage rotation therefore has a
; single non-zero input and is done with two pmulhrsw by pre-scaled (x8)
; constants instead of a full ITX_MULSUB_2W.
; Results are written to the t16..t31 slots; the last butterfly pair is left
; in m0-m3 and the code joins the shared tail at .main2.
; Slots are addressed with gprsize*2, i.e. one return address deeper than the
; rsp+gprsize addressing used by the body of idct_8x32_internal_8bpc, which
; reaches this label with `call`.
; m7 holds pd_2048, the fifth argument of every ITX_MULSUB_2W below
; (presumably the rounding term -- the macro is defined outside this chunk).
3585ALIGN function_align
3586cglobal_label .main_fast ;bottom half is zero
3587    mova                    m0, [rsp+gprsize*2+16*19]     ;in1
3588    mova                    m1, [rsp+gprsize*2+16*20]     ;in15
3589    pmulhrsw                m3, m0, [o(pw_4091x8)]        ;t31a
3590    pmulhrsw                m0, [o(pw_201x8)]             ;t16a
3591    pmulhrsw                m2, m1, [o(pw_3035x8)]        ;t30a
3592    pmulhrsw                m1, [o(pw_m2751x8)]           ;t17a
3593    mova                    m7, [o(pd_2048)]
3594    psubsw                  m4, m0, m1                    ;t17
3595    paddsw                  m0, m1                        ;t16
3596    psubsw                  m5, m3, m2                    ;t30
3597    paddsw                  m3, m2                        ;t31
3598    ITX_MULSUB_2W            5, 4, 1, 2, 7,  799, 4017    ;t17a, t30a
3599    mova [rsp+gprsize*2+16*19], m0                        ;t16
3600    mova [rsp+gprsize*2+16*20], m5                        ;t17a
3601    mova [rsp+gprsize*2+16*33], m4                        ;t30a
3602    mova [rsp+gprsize*2+16*34], m3                        ;t31
3603    mova                    m0, [rsp+gprsize*2+16*21]     ;in9
3604    mova                    m1, [rsp+gprsize*2+16*22]     ;in7
3605    pmulhrsw                m3, m0, [o(pw_3703x8)]
3606    pmulhrsw                m0, [o(pw_1751x8)]
3607    pmulhrsw                m2, m1, [o(pw_3857x8)]
3608    pmulhrsw                m1, [o(pw_m1380x8)]
3609    psubsw                  m4, m1, m0                    ;t18
3610    paddsw                  m0, m1                        ;t19
3611    psubsw                  m5, m2, m3                    ;t29
3612    paddsw                  m3, m2                        ;t28
3613    ITX_MULSUB_2W            5, 4, 1, 2, 7, m4017, 799    ;t18a, t29a
3614    mova [rsp+gprsize*2+16*21], m5                        ;t18a
3615    mova [rsp+gprsize*2+16*22], m0                        ;t19
3616    mova [rsp+gprsize*2+16*31], m3                        ;t28
3617    mova [rsp+gprsize*2+16*32], m4                        ;t29a
3618    mova                    m0, [rsp+gprsize*2+16*23]     ;in5
3619    mova                    m1, [rsp+gprsize*2+16*24]     ;in11
3620    pmulhrsw                m3, m0, [o(pw_3973x8)]
3621    pmulhrsw                m0, [o(pw_995x8)]
3622    pmulhrsw                m2, m1, [o(pw_3513x8)]
3623    pmulhrsw                m1, [o(pw_m2106x8)]
3624    psubsw                  m4, m0, m1                    ;t21
3625    paddsw                  m0, m1                        ;t20
3626    psubsw                  m5, m3, m2                    ;t26
3627    paddsw                  m3, m2                        ;t27
3628    ITX_MULSUB_2W            5, 4, 1, 2, 7, 3406, 2276    ;t21a, t26a
3629    mova [rsp+gprsize*2+16*23], m0                        ;t20
3630    mova [rsp+gprsize*2+16*24], m5                        ;t21a
3631    mova [rsp+gprsize*2+16*29], m4                        ;t26a
3632    mova [rsp+gprsize*2+16*30], m3                        ;t27
    ; Last pair (in13 / in3): the four products stay in m0-m3 and are
    ; combined at .main2.
3633    mova                    m0, [rsp+gprsize*2+16*25]     ;in13
3634    mova                    m2, [rsp+gprsize*2+16*26]     ;in3
3635    pmulhrsw                m3, m0, [o(pw_3290x8)]
3636    pmulhrsw                m0, [o(pw_2440x8)]
3637    pmulhrsw                m1, m2, [o(pw_4052x8)]
3638    pmulhrsw                m2, [o(pw_m601x8)]
3639    jmp .main2
3640
; .main
; Full first stage of the odd half of the 32-point inverse DCT: reads all
; sixteen odd inputs in1..in31 from stack slots 19-34 and, four inputs at a
; time, applies two ITX_MULSUB_2W rotations, one saturating add/sub
; butterfly and one further rotation, writing t16..t31 back to the same
; slots. The last group (in13, in3, in29, in19) is left in m0-m3 and
; execution falls through into the shared tail at .main2.
; Slots are addressed with gprsize*2, i.e. one return address deeper than the
; rsp+gprsize addressing used by the body of idct_8x32_internal_8bpc, which
; reaches this label with `call`.
; m7 holds pd_2048, the fifth argument of every ITX_MULSUB_2W below
; (presumably the rounding term -- the macro is defined outside this chunk).
3641ALIGN function_align
3642cglobal_label .main
3643    mova                    m7, [o(pd_2048)]
3644    mova                    m0, [rsp+gprsize*2+16*19]     ;in1
3645    mova                    m1, [rsp+gprsize*2+16*20]     ;in15
3646    mova                    m2, [rsp+gprsize*2+16*33]     ;in17
3647    mova                    m3, [rsp+gprsize*2+16*34]     ;in31
3648    ITX_MULSUB_2W            0, 3, 4, 5, 7,  201, 4091    ;t16a, t31a
3649    ITX_MULSUB_2W            2, 1, 4, 5, 7, 3035, 2751    ;t17a, t30a
3650    psubsw                  m4, m0, m2                    ;t17
3651    paddsw                  m0, m2                        ;t16
3652    psubsw                  m5, m3, m1                    ;t30
3653    paddsw                  m3, m1                        ;t31
3654    ITX_MULSUB_2W            5, 4, 1, 2, 7,  799, 4017    ;t17a, t30a
3655    mova [rsp+gprsize*2+16*19], m0                        ;t16
3656    mova [rsp+gprsize*2+16*20], m5                        ;t17a
3657    mova [rsp+gprsize*2+16*33], m4                        ;t30a
3658    mova [rsp+gprsize*2+16*34], m3                        ;t31
3659    mova                    m0, [rsp+gprsize*2+16*21]     ;in9
3660    mova                    m1, [rsp+gprsize*2+16*22]     ;in7
3661    mova                    m2, [rsp+gprsize*2+16*31]     ;in25
3662    mova                    m3, [rsp+gprsize*2+16*32]     ;in23
3663    ITX_MULSUB_2W            0, 3, 4, 5, 7, 1751, 3703    ;t18a, t29a
3664    ITX_MULSUB_2W            2, 1, 4, 5, 7, 3857, 1380    ;t19a, t28a
3665    psubsw                  m4, m2, m0                    ;t18
3666    paddsw                  m0, m2                        ;t19
3667    psubsw                  m5, m1, m3                    ;t29
3668    paddsw                  m3, m1                        ;t28
3669    ITX_MULSUB_2W            5, 4, 1, 2, 7, m4017, 799    ;t18a, t29a
3670    mova [rsp+gprsize*2+16*21], m5                        ;t18a
3671    mova [rsp+gprsize*2+16*22], m0                        ;t19
3672    mova [rsp+gprsize*2+16*31], m3                        ;t28
3673    mova [rsp+gprsize*2+16*32], m4                        ;t29a
3674    mova                    m0, [rsp+gprsize*2+16*23]     ;in5
3675    mova                    m1, [rsp+gprsize*2+16*24]     ;in11
3676    mova                    m2, [rsp+gprsize*2+16*29]     ;in21
3677    mova                    m3, [rsp+gprsize*2+16*30]     ;in27
3678    ITX_MULSUB_2W            0, 3, 4, 5, 7,  995, 3973    ;t20a, t27a
3679    ITX_MULSUB_2W            2, 1, 4, 5, 7, 3513, 2106    ;t21a, t26a
3680    psubsw                  m4, m0, m2                    ;t21
3681    paddsw                  m0, m2                        ;t20
3682    psubsw                  m5, m3, m1                    ;t26
3683    paddsw                  m3, m1                        ;t27
3684    ITX_MULSUB_2W            5, 4, 1, 2, 7, 3406, 2276    ;t21a, t26a
3685    mova [rsp+gprsize*2+16*23], m0                        ;t20
3686    mova [rsp+gprsize*2+16*24], m5                        ;t21a
3687    mova [rsp+gprsize*2+16*29], m4                        ;t26a
3688    mova [rsp+gprsize*2+16*30], m3                        ;t27
    ; Last group: only the two first-stage rotations are done here; the
    ; butterfly and second rotation follow at .main2.
3689    mova                    m0, [rsp+gprsize*2+16*25]     ;in13
3690    mova                    m1, [rsp+gprsize*2+16*26]     ;in3
3691    mova                    m2, [rsp+gprsize*2+16*27]     ;in29
3692    mova                    m3, [rsp+gprsize*2+16*28]     ;in19
3693    ITX_MULSUB_2W            0, 3, 4, 5, 7, 2440, 3290    ;t22a, t25a
3694    ITX_MULSUB_2W            2, 1, 4, 5, 7, 4052,  601    ;t23a, t24a
3695
3696.main2:
3697    psubsw                  m4, m2, m0                    ;t22
3698    paddsw                  m0, m2                        ;t23
3699    psubsw                  m5, m1, m3                    ;t25
3700    paddsw                  m3, m1                        ;t24
3701    ITX_MULSUB_2W            5, 4, 1, 2, 7, m2276, 3406   ;t22a, t25a
3702    mova                    m2, [rsp+gprsize*2+16*24]     ;t21a
3703    psubsw                  m1, m5, m2                    ;t21
3704    paddsw                  m5, m2                        ;t22
3705    mova [rsp+gprsize*2+16*25], m5                        ;t22
3706    mova                    m2, [rsp+gprsize*2+16*29]     ;t26a
3707    psubsw                  m5, m4, m2                    ;t26
3708    paddsw                  m4, m2                        ;t25
3709    mova [rsp+gprsize*2+16*28], m4                        ;t25
3710    ITX_MULSUB_2W            5, 1, 2, 4, 7, m3784, 1567   ;t21a, t26a
3711    mova [rsp+gprsize*2+16*24], m5                        ;t21a
3712    mova [rsp+gprsize*2+16*29], m1                        ;t26a
3713
3714    mova                    m1, [rsp+gprsize*2+16*23]     ;t20
3715    mova                    m5, [rsp+gprsize*2+16*30]     ;t27
3716    psubsw                  m2, m0, m1                    ;t20a
3717    paddsw                  m0, m1                        ;t23a
3718    psubsw                  m6, m3, m5                    ;t27a
3719    paddsw                  m3, m5                        ;t24a
3720    ITX_MULSUB_2W            6, 2, 1, 5, 7, m3784, 1567   ;t20, t27
3721    mova [rsp+gprsize*2+16*26], m0                        ;t23a
3722    mova [rsp+gprsize*2+16*27], m3                        ;t24a
3723    mova [rsp+gprsize*2+16*30], m2                        ;t27
3724
3725    mova                    m0, [rsp+gprsize*2+16*20]     ;t17a
3726    mova                    m1, [rsp+gprsize*2+16*21]     ;t18a
3727    mova                    m2, [rsp+gprsize*2+16*32]     ;t29a
3728    mova                    m3, [rsp+gprsize*2+16*33]     ;t30a
3729    psubsw                  m4, m0, m1                    ;t18
3730    paddsw                  m0, m1                        ;t17
3731    psubsw                  m5, m3, m2                    ;t29
3732    paddsw                  m3, m2                        ;t30
3733    ITX_MULSUB_2W            5, 4, 1, 2, 7, 1567, 3784    ;t18a, t29a
3734    mova [rsp+gprsize*2+16*20], m0                        ;t17
3735    mova [rsp+gprsize*2+16*21], m5                        ;t18a
3736    mova [rsp+gprsize*2+16*32], m4                        ;t29a
3737    mova [rsp+gprsize*2+16*33], m3                        ;t30
3738    mova                    m0, [rsp+gprsize*2+16*19]     ;t16
3739    mova                    m1, [rsp+gprsize*2+16*22]     ;t19
3740    mova                    m2, [rsp+gprsize*2+16*31]     ;t28
3741    mova                    m3, [rsp+gprsize*2+16*34]     ;t31
3742    psubsw                  m4, m0, m1                    ;t19a
3743    paddsw                  m0, m1                        ;t16a
3744    psubsw                  m5, m3, m2                    ;t28a
3745    paddsw                  m3, m2                        ;t31a
3746    ITX_MULSUB_2W            5, 4, 1, 2, 7, 1567, 3784    ;t19, t28
3747    mova                    m2, [rsp+gprsize*2+16*15]     ;tmp12
3748    psubsw                  m1, m5, m6                    ;t20a
3749    paddsw                  m5, m6                        ;t19a
3750    psubsw                  m6, m2, m5                    ;out19
3751    paddsw                  m2, m5                        ;out12
3752    mova                    m5, [rsp+gprsize*2+16*30]     ;t27
3753    mova [rsp+gprsize*2+16*22], m6                        ;out19
3754    mova [rsp+gprsize*2+16*15], m2                        ;out12
3755    psubsw                  m6, m4, m5                    ;t27a
3756    paddsw                  m4, m5                        ;t28a
3757    ITX_MULSUB_2W            6, 1, 2, 5, 7, 2896, 2896    ;t20, t27
3758    mova                    m2, [rsp+gprsize*2+16*6 ]     ;tmp3
3759    psubsw                  m5, m2, m4                    ;out28
3760    paddsw                  m2, m4                        ;out3
3761    mova                    m4, [rsp+gprsize*2+16*14]     ;tmp11
3762    mova [rsp+gprsize*2+16*31], m5                        ;out28
3763    mova [rsp+gprsize*2+16*6 ], m2                        ;out3
3764    psubsw                  m5, m4, m6                    ;out20
3765    paddsw                  m4, m6                        ;out11
3766    mova                    m2, [rsp+gprsize*2+16*7 ]     ;tmp4
3767    mova [rsp+gprsize*2+16*23], m5                        ;out20
3768    mova [rsp+gprsize*2+16*14], m4                        ;out11
3769    psubsw                  m5, m2, m1                    ;out27
3770    paddsw                  m2, m1                        ;out4
3771    mova                    m1, [rsp+gprsize*2+16*26]     ;t23a
3772    mova                    m4, [rsp+gprsize*2+16*27]     ;t24a
3773    mova [rsp+gprsize*2+16*30], m5                        ;out27
3774    mova [rsp+gprsize*2+16*7 ], m2                        ;out4
3775    psubsw                  m5, m0, m1                    ;t23
3776    paddsw                  m0, m1                        ;t16
3777    psubsw                  m2, m3, m4                    ;t24
3778    paddsw                  m3, m4                        ;t31
3779    ITX_MULSUB_2W            2, 5, 4, 6, 7, 2896, 2896    ;t23a, t24a
3780    mova                    m6, [rsp+gprsize*2+16*18]     ;tmp15
3781    psubsw                  m4, m6, m0                    ;out16
3782    paddsw                  m6, m0                        ;out15
3783    mova                    m0, [rsp+gprsize*2+16*3 ]     ;tmp0
3784    mova                    m1, [rsp+gprsize*2+16*11]     ;tmp8
3785    mova [rsp+gprsize*2+16*18], m6                        ;out15
3786    mova [rsp+gprsize*2+16*19], m4                        ;out16
3787    psubsw                  m6, m0, m3                    ;out31
3788    paddsw                  m0, m3                        ;out0
3789    psubsw                  m4, m1, m2                    ;out23
3790    paddsw                  m1, m2                        ;out8
3791    mova                    m3, [rsp+gprsize*2+16*10]     ;tmp7
3792    mova [rsp+gprsize*2+16*34], m6                        ;out31
3793    mova [rsp+gprsize*2+16*11], m1                        ;out8
3794    mova [rsp+gprsize*2+16*26], m4                        ;out23
3795    paddsw                  m6, m3, m5                    ;out7
3796    psubsw                  m3, m5                        ;out24
3797    mova                    m1, [rsp+gprsize*2+16*20]     ;t17
3798    mova                    m5, [rsp+gprsize*2+16*25]     ;t22
3799    mova                    m2, [rsp+gprsize*2+16*17]     ;tmp14
3800    mova [rsp+gprsize*2+16*27], m3                        ;out24
3801    psubsw                  m4, m1, m5                    ;t22a
3802    paddsw                  m1, m5                        ;t17a
3803    psubsw                  m3, m2, m1                    ;out17
3804    paddsw                  m2, m1                        ;out14
3805    mova                    m5, [rsp+gprsize*2+16*28]     ;t25
3806    mova                    m1, [rsp+gprsize*2+16*33]     ;t30
3807    mova [rsp+gprsize*2+16*17], m2                        ;out14
3808    mova [rsp+gprsize*2+16*20], m3                        ;out17
3809    psubsw                  m2, m1, m5                    ;t25a
3810    paddsw                  m1, m5                        ;t30a
3811    ITX_MULSUB_2W            2, 4, 3, 5, 7, 2896, 2896    ;t22, t25
3812    mova                    m5, [rsp+gprsize*2+16*4 ]     ;tmp1
3813    psubsw                  m3, m5, m1                    ;out30
3814    paddsw                  m5, m1                        ;out1
3815    mova                    m1, [rsp+gprsize*2+16*12]     ;tmp9
3816    mova [rsp+gprsize*2+16*33], m3                        ;out30
3817    mova [rsp+gprsize*2+16*4 ], m5                        ;out1
3818    psubsw                  m3, m1, m2                    ;out22
3819    paddsw                  m1, m2                        ;out9
3820    mova                    m5, [rsp+gprsize*2+16*9 ]     ;tmp6
3821    mova [rsp+gprsize*2+16*25], m3                        ;out22
3822    mova [rsp+gprsize*2+16*12], m1                        ;out9
3823    psubsw                  m3, m5, m4                    ;out25
3824    paddsw                  m5, m4                        ;out6
3825    mova                    m4, [rsp+gprsize*2+16*21]     ;t18a
3826    mova                    m1, [rsp+gprsize*2+16*24]     ;t21a
3827    mova                    m2, [rsp+gprsize*2+16*16]     ;tmp13
3828    mova [rsp+gprsize*2+16*28], m3                        ;out25
3829    mova [rsp+gprsize*2+16*9 ], m5                        ;out6
3830    paddsw                  m3, m4, m1                    ;t18
3831    psubsw                  m4, m1                        ;t21
3832    psubsw                  m5, m2, m3                    ;out18
3833    paddsw                  m2, m3                        ;out13
3834    mova                    m1, [rsp+gprsize*2+16*29]     ;t26a
3835    mova                    m3, [rsp+gprsize*2+16*32]     ;t29a
3836    mova [rsp+gprsize*2+16*21], m5                        ;out18
3837    mova [rsp+gprsize*2+16*16], m2                        ;out13
3838    psubsw                  m5, m3, m1                    ;t26
3839    paddsw                  m3, m1                        ;t29
3840    ITX_MULSUB_2W            5, 4, 1, 2, 7, 2896, 2896    ;t21a, t26a
3841    mova                    m2, [rsp+gprsize*2+16*5 ]     ;tmp2
3842    psubsw                  m1, m2, m3                    ;out29
3843    paddsw                  m2, m3                        ;out2
3844    mova                    m3, [rsp+gprsize*2+16*13]     ;tmp10
3845    mova [rsp+gprsize*2+16*32], m1                        ;out29
3846    psubsw                  m7, m3, m5                    ;out21
3847    paddsw                  m3, m5                        ;out10
3848    mova                    m5, [rsp+gprsize*2+16*8 ]     ;tmp5
3849    mova [rsp+gprsize*2+16*24], m7                        ;out21
3850    mova [rsp+gprsize*2+16*13], m3                        ;out10
3851    psubsw                  m1, m5, m4                    ;out26
3852    paddsw                  m5, m4                        ;out5
3853    mova                    m7, m6                        ;out7
3854    mova                    m3, [rsp+gprsize*2+16*6 ]     ;out3
3855    mova                    m4, [rsp+gprsize*2+16*7 ]     ;out4
3856    mova [rsp+gprsize*2+16*29], m1                        ;out26
3857    mova                    m6, [rsp+gprsize*2+16*9 ]     ;out6
3858    mova                    m1, [rsp+gprsize*2+16*4 ]     ;out1
3859    ret
3860
3861
; 32x8 inverse DCT/DCT, result added to dst (8 bpc).
; eob == 0 takes the DC-only shortcut below; any other eob runs the full
; transform in idct_32x8_internal_8bpc.
; The .body and .end labels are also used by
; inv_txfm_add_dct_dct_32x16_8bpc for its own DC-only path.
cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
%if ARCH_X86_32
    LEA                     r5, $$
%endif
    test                  eobd, eobd
    jz .dconly
    call  m(idct_32x8_internal_8bpc)
    RET

; DC-only setup: m0 = coeff[0] multiplied (pmulhrsw) by pw_2896x8,
; m2 = first rounding factor, r3d = row count, tx2q = exit label.
.dconly:
    movd                    m1, [o(pw_2896x8)]
    pmulhrsw                m0, m1, [coeffq]
    movd                    m2, [o(pw_8192)]
    ; eobd is zero on this path, so this store clears the DC coefficient
    mov               [coeffq], eobd
    ; row counter for .loop; presumably r3 is the register behind eob
    ; (4th cglobal argument), which is why it is written only after the
    ; store above - TODO confirm against x86inc argument mapping
    mov                    r3d, 8
    lea                   tx2q, [o(.end)]

; Shared DC tail. Expects m0 (partially scaled DC), m1 (pw_2896x8),
; m2 (rounding factor), r3d (rows to write) and tx2q (where to go when done).
.body:
    pmulhrsw                m0, m2
    movd                    m2, [o(pw_2048)]  ;intentionally rip-relative
    pmulhrsw                m0, m1
    pmulhrsw                m0, m2
    ; replicate word 0 of m0 into all eight words
    pshuflw                 m0, m0, q0000
    punpcklwd               m0, m0
    ; m5 = 0, used to zero-extend pixels to words
    pxor                    m5, m5

; One iteration per row: widen 32 pixels to words, add the DC value,
; pack back to bytes with unsigned saturation and store.
.loop:
    mova                    m1, [dstq+16*0]
    mova                    m3, [dstq+16*1]
    punpckhbw               m2, m1, m5
    punpcklbw               m1, m5
    punpckhbw               m4, m3, m5
    punpcklbw               m3, m5
    paddw                   m2, m0
    paddw                   m1, m0
    paddw                   m4, m0
    paddw                   m3, m0
    packuswb                m1, m2
    packuswb                m3, m4
    mova           [dstq+16*0], m1
    mova           [dstq+16*1], m3
    add                   dstq, strideq
    dec                    r3d
    jg .loop
    ; leave through the label supplied by whoever set up tx2q
    jmp                   tx2q

.end:
    RET
3910
3911
; Worker for the 32x8 DCT/DCT transform (reached via call, hence the
; rsp+gprsize addressing of the caller's stack area).
; Stage 1 feeds the coefficients to the 8-, 16- and 32-point helper kernels
; and leaves the intermediate rows in stack slots 16*3.. / 16*11.. / 16*19.. /
; 16*27.. . Stage 2 (.end .. .end8) walks those four groups of eight rows,
; tail-jumping through the idct_8x8 helpers with tx2q as the continuation and
; moving dstq 8 pixels to the right between groups.
cglobal idct_32x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    ; inputs at 64-byte stride starting at coeffq -> 8-point kernel
    LOAD_8ROWS     coeffq+16*0, 64
    call  m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS    rsp+gprsize+16*3, 16

    ; inputs at 64-byte stride starting at coeffq+16*2 -> 16-point kernel
    LOAD_8ROWS     coeffq+16*2, 64
    call m(idct_16x8_internal_8bpc).main
    ; m7 is reloaded from the scratch slot before all 8 rows are saved
    mova                    m7, [rsp+gprsize+16*0]
    SAVE_8ROWS   rsp+gprsize+16*11, 16

    ; odd inputs in1..in15, scattered into the slot order the 32-point
    ; kernel reads them from
    LOAD_8ROWS     coeffq+16*1, 32
    mova   [rsp+gprsize+16*19], m0                        ;in1
    mova   [rsp+gprsize+16*26], m1                        ;in3
    mova   [rsp+gprsize+16*23], m2                        ;in5
    mova   [rsp+gprsize+16*22], m3                        ;in7
    mova   [rsp+gprsize+16*21], m4                        ;in9
    mova   [rsp+gprsize+16*24], m5                        ;in11
    mova   [rsp+gprsize+16*25], m6                        ;in13
    mova   [rsp+gprsize+16*20], m7                        ;in15

    ; eob <= 106: in17..in31 are not loaded and main_fast is used instead
    ; (presumably it treats them as zero - TODO confirm in main_fast)
    cmp                   eobd, 106
    jg  .full
    call m(idct_8x32_internal_8bpc).main_fast
    jmp .pass2

.full:
    ; odd inputs in17..in31
    LOAD_8ROWS    coeffq+16*17, 32
    mova   [rsp+gprsize+16*33], m0                        ;in17
    mova   [rsp+gprsize+16*28], m1                        ;in19
    mova   [rsp+gprsize+16*29], m2                        ;in21
    mova   [rsp+gprsize+16*32], m3                        ;in23
    mova   [rsp+gprsize+16*31], m4                        ;in25
    mova   [rsp+gprsize+16*30], m5                        ;in27
    mova   [rsp+gprsize+16*27], m6                        ;in29
    mova   [rsp+gprsize+16*34], m7                        ;in31
    call m(idct_8x32_internal_8bpc).main

.pass2:
    ; park m7 in the scratch slot and continue in idct_8x32's .end1, which is
    ; expected to come back through tx2q (= .end)
    mova   [rsp+gprsize+16*0 ], m7
    lea                   tx2q, [o(.end)]
    jmp  m(idct_8x32_internal_8bpc).end1

; Group 0 (registers as left by the step above): first-pass tail with
; m7 = pw_8192, then .end1.
.end:
    mova                    m7, [o(pw_8192)]
    lea                   tx2q, [o(.end1)]
    jmp   m(idct_8x8_internal_8bpc).pass1_end1

.end1:
    ; r3 = dst position of the next 8-pixel-wide group
    lea                     r3, [dstq+8]
    lea                   tx2q, [o(.end2)]
    jmp   m(idct_8x8_internal_8bpc).pass2_main

; Group 1: rows saved at slot 16*11.
.end2:
    LOAD_8ROWS   rsp+gprsize+16*11, 16
    mova   [rsp+gprsize+16*0 ], m7
    mova                    m7, [o(pw_8192)]
    lea                   tx2q, [o(.end3)]
    jmp   m(idct_8x8_internal_8bpc).pass1_end1

.end3:
    mov                   dstq, r3
    add                     r3, 8
    lea                   tx2q, [o(.end4)]
    jmp   m(idct_8x8_internal_8bpc).pass2_main

; Group 2: rows saved at slot 16*19.
.end4:
    LOAD_8ROWS   rsp+gprsize+16*19, 16
    mova   [rsp+gprsize+16*0 ], m7
    mova                    m7, [o(pw_8192)]
    lea                   tx2q, [o(.end5)]
    jmp   m(idct_8x8_internal_8bpc).pass1_end1

.end5:
    mov                   dstq, r3
    add                     r3, 8
    lea                   tx2q, [o(.end6)]
    jmp   m(idct_8x8_internal_8bpc).pass2_main

; Group 3: rows saved at slot 16*27. No further group follows, so r3 is not
; advanced again.
.end6:
    LOAD_8ROWS   rsp+gprsize+16*27, 16
    mova   [rsp+gprsize+16*0 ], m7
    mova                    m7, [o(pw_8192)]
    lea                   tx2q, [o(.end7)]
    jmp   m(idct_8x8_internal_8bpc).pass1_end1

.end7:
    mov                   dstq, r3
    lea                   tx2q, [o(.end8)]
    jmp   m(idct_8x8_internal_8bpc).pass2_main

; Plain return to the caller of this function. Other functions in this file
; also load this label into tx2q when they only need "jmp tx2q" to act as ret.
.end8:
    ret
4004
4005
; 8x32 identity/identity inverse transform, result added to dst (8 bpc).
; Works in chunks of 8 coefficient rows: 2 chunks when eob < 107, otherwise 4.
cglobal inv_txfm_add_identity_identity_8x32_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
    ; r3d = chunk count: 2 by default, 4 when eob - 107 is non-negative
    mov                    r5d, 4
    mov                   tx2d, 2
    cmp                   eobd, 107
    cmovns                tx2d, r5d
    mov                    r3d, tx2d
%if ARCH_X86_32
    LEA                     r5, $$
%endif
    ; tx2q -> a bare "ret" (idct_32x8_internal's .end8), so helpers entered
    ; with call that finish via "jmp tx2q" return here (presumably both
    ; helpers below do - TODO confirm)
    lea                   tx2q, [o(m(idct_32x8_internal_8bpc).end8)]
.loop:
    LOAD_8ROWS     coeffq+16*0, 64
    ; add pw_5 to all eight rows; m6 is biased first and parked on the stack
    ; so the register can hold the constant for the remaining rows
    paddsw                  m6, [o(pw_5)]
    mova            [rsp+16*1], m6
    mova                    m6, [o(pw_5)]
    REPX        {paddsw x, m6}, m0, m1, m2, m3, m4, m5, m7
    call  m(idct_8x8_internal_8bpc).pass1_end3
    ; arithmetic shift right by 3 completes the rounding started by pw_5
    REPX        {psraw  x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7
    ; m5..m7 go to the stack slots the .end3 helper presumably reads
    mova            [rsp+16*2], m5
    mova            [rsp+16*1], m6
    mova            [rsp+16*0], m7
    call  m(idct_8x8_internal_8bpc).end3
    ; step dstq two more rows down (the helper is presumably expected to have
    ; advanced it already - TODO confirm)
    lea                   dstq, [dstq+strideq*2]
    ; clear the eight coefficient rows that were just consumed
    pxor                    m7, m7
    REPX   {mova [coeffq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
    add                 coeffq, 16
    dec                    r3d
    jg .loop
    RET
4035
; 32x8 identity/identity inverse transform, result added to dst (8 bpc).
; Processes the block as 8-pixel-wide tiles from left to right:
; 2 tiles when eob < 107, otherwise 4.
cglobal inv_txfm_add_identity_identity_32x8_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
    ; r3d = tile count: 2 by default, 4 when eob - 107 is non-negative
    mov                    r5d, 4
    mov                   tx2d, 2
    cmp                   eobd, 107
    cmovns                tx2d, r5d
    mov                    r3d, tx2d
%if ARCH_X86_32
    LEA                     r5, $$
%endif

.loop:
    LOAD_8ROWS     coeffq+16*0, 16
    ; scale all eight rows by pw_4096 (pmulhrsw); m6 is scaled first and
    ; parked on the stack so the register can hold the constant
    pmulhrsw                m6, [o(pw_4096)]
    mova            [rsp+16*1], m6
    mova                    m6, [o(pw_4096)]
    REPX      {pmulhrsw x, m6}, m0, m1, m2, m3, m4, m5, m7
    ; tx2q -> bare "ret" so the helper's final "jmp tx2q" returns here
    lea                   tx2q, [o(m(idct_32x8_internal_8bpc).end8)]
    call  m(idct_8x8_internal_8bpc).pass1_end3

    ; keep the tile's dst pointer: it is reloaded after the helper below
    mov             [rsp+16*3], dstq
    mova            [rsp+16*2], m5
    mova            [rsp+16*1], m6
    mova            [rsp+16*0], m7
    lea                   tx2q, [o(m(idct_8x8_internal_8bpc).end4)]
    call  m(idct_8x8_internal_8bpc).end3

    ; next tile: 8 coefficient rows further on, 8 pixels to the right
    add                 coeffq, 16*8
    mov                   dstq, [rsp+16*3]
    lea                   dstq, [dstq+8]
    ; The signed counter test is the only back-edge. dec does not write CF,
    ; so CF here is still the (clear) carry of the coeffq add above; any
    ; carry-based branch back to .loop would always be taken and the loop
    ; would never terminate.
    dec                    r3d
    jg .loop
    RET
4069
4070
; 16x32 inverse DCT/DCT, result added to dst (8 bpc).
; eob == 0 takes the DC-only shortcut, which reuses the 16x4 function's
; .dconly code; any other eob runs idct_16x32_internal_8bpc.
cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
%if ARCH_X86_32
    LEA                     r5, $$
%endif
    test                  eobd, eobd
    jz .dconly
    call  m(idct_16x32_internal_8bpc)
; also the continuation handed to the shared DC code through tx2q
.end:
    RET

.dconly:
    movd                    m1, [o(pw_2896x8)]
    pmulhrsw                m0, m1, [coeffq]
    movd                    m2, [o(pw_16384)]
    ; eobd is zero on this path, so this store clears the DC coefficient
    mov               [coeffq], eobd
    ; second multiply by pw_2896x8
    pmulhrsw                m0, m1
    ; value consumed by the 16x4 .dconly code, presumably its loop count.
    ; NOTE(review): r2 is presumably the register behind coeff (3rd cglobal
    ; argument), so this must stay after the last use of coeffq - confirm
    mov                    r2d, 16
    lea                   tx2q, [o(.end)]
    jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
4090
4091
; Worker for the 16x32 DCT/DCT transform (reached via call, hence the
; rsp+gprsize addressing of the caller's stack area).
;
; Pass 1 runs on bands of coefficients (in8~15, in0~7 and, when eob > 150,
; in16~23 and in24~31). For each band the two register sets produced by the
; 8-/16-point kernels are finished with idct_8x8's .pass1_end: the first set
; is written to coeffq+16*32.. (kept for the second half, see .end) and the
; second set is distributed to the registers / stack slots / coeffq scratch
; positions that the 32-point pass reads as in0..in31.
;
; Pass 2 is then run twice through idct_8x32_internal: once from .pass2 for
; the data gathered above, and once from .end for the data parked at
; coeffq+16*32.., with dstq moved 8 pixels to the right.
cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    ; band in8~in15
    LOAD_8ROWS     coeffq+16*1, 128, 1
    call  m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS    rsp+gprsize+16*3, 16
    LOAD_8ROWS     coeffq+16*5, 128, 1
    call m(idct_16x8_internal_8bpc).main
    lea                   tx2q, [o(.pass1_end)]
    jmp   m(idct_8x8_internal_8bpc).pass1_end

.pass1_end:
    SAVE_8ROWS    coeffq+16*33, 64               ;in8~in15
    LOAD_8ROWS    rsp+gprsize+16*3, 16
    mova    [rsp+gprsize+16*0], m7
    lea                   tx2q, [o(.pass1_end1)]
    jmp   m(idct_8x8_internal_8bpc).pass1_end

.pass1_end1:
    ; in8/in12 are parked in coefficient memory that has already been read;
    ; the rest go to the stack slots used by the 16- and 32-point kernels
    mova        [coeffq+16*1 ], m0                        ;in8
    mova        [coeffq+16*5 ], m4                        ;in12
    mova   [rsp+gprsize+16*13], m2                        ;in10
    mova   [rsp+gprsize+16*14], m6                        ;in14
    mova   [rsp+gprsize+16*21], m1                        ;in9
    mova   [rsp+gprsize+16*24], m3                        ;in11
    mova   [rsp+gprsize+16*25], m5                        ;in13
    mova   [rsp+gprsize+16*20], m7                        ;in15
    ; band in0~in7
    LOAD_8ROWS     coeffq+16*0, 128, 1
    call  m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS    rsp+gprsize+16*3, 16
    LOAD_8ROWS     coeffq+16*4, 128, 1
    call m(idct_16x8_internal_8bpc).main
    lea                   tx2q, [o(.pass1_end2)]
    jmp   m(idct_8x8_internal_8bpc).pass1_end

.pass1_end2:
    SAVE_8ROWS    coeffq+16*32, 64               ;in0~in7
    LOAD_8ROWS    rsp+gprsize+16*3, 16
    mova    [rsp+gprsize+16*0], m7
    lea                   tx2q, [o(.pass1_end3)]
    jmp   m(idct_8x8_internal_8bpc).pass1_end

.pass1_end3:
    ; m0 (in0) and m4 (in4) stay in registers for now
    mova   [rsp+gprsize+16*11], m2                        ;in2
    mova   [rsp+gprsize+16*12], m6                        ;in6
    mova   [rsp+gprsize+16*19], m1                        ;in1
    mova   [rsp+gprsize+16*26], m3                        ;in3
    mova   [rsp+gprsize+16*23], m5                        ;in5
    mova   [rsp+gprsize+16*22], m7                        ;in7

    ; eob <= 150: skip the in16~in31 bands and run the kernels with the
    ; upper half of their inputs set to zero
    cmp                   eobd, 150
    jg .full

    mova                    m1, m4                        ;in4
    mova                    m2, [coeffq+16*1 ]            ;in8
    mova                    m3, [coeffq+16*5 ]            ;in12
    pxor                    m4, m4
    REPX          {mova x, m4}, m5, m6, m7
    call  m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS    rsp+gprsize+16*3, 16
    mova                    m0, [rsp+gprsize+16*11]       ;in2
    mova                    m1, [rsp+gprsize+16*12]       ;in6
    mova                    m2, [rsp+gprsize+16*13]       ;in10
    mova                    m3, [rsp+gprsize+16*14]       ;in14
    pxor                    m4, m4
    REPX          {mova x, m4}, m5, m6, m7
    call m(idct_16x8_internal_8bpc).main
    mova                    m7, [rsp+gprsize+16*0]
    SAVE_8ROWS   rsp+gprsize+16*11, 16

    call m(idct_8x32_internal_8bpc).main_fast
    jmp  .pass2

.full:
    ; free m0/m4 for the remaining bands; reloaded in .pass1_end7
    mova        [coeffq+16*0 ], m0                        ;in0
    mova        [coeffq+16*4 ], m4                        ;in4

    ; band in16~in23
    LOAD_8ROWS     coeffq+16*2, 128, 1
    call  m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS    rsp+gprsize+16*3, 16
    LOAD_8ROWS     coeffq+16*6, 128, 1
    call m(idct_16x8_internal_8bpc).main
    lea                   tx2q, [o(.pass1_end4)]
    jmp   m(idct_8x8_internal_8bpc).pass1_end

.pass1_end4:
    SAVE_8ROWS    coeffq+16*34, 64               ;in16~in23
    LOAD_8ROWS    rsp+gprsize+16*3, 16
    mova    [rsp+gprsize+16*0], m7
    lea                   tx2q, [o(.pass1_end5)]
    jmp   m(idct_8x8_internal_8bpc).pass1_end

.pass1_end5:
    mova        [coeffq+16*2 ], m0                        ;in16
    mova        [coeffq+16*6 ], m4                        ;in20
    mova   [rsp+gprsize+16*15], m2                        ;in18
    mova   [rsp+gprsize+16*16], m6                        ;in22
    mova   [rsp+gprsize+16*33], m1                        ;in17
    mova   [rsp+gprsize+16*28], m3                        ;in19
    mova   [rsp+gprsize+16*29], m5                        ;in21
    mova   [rsp+gprsize+16*32], m7                        ;in23

    ; band in24~in31
    LOAD_8ROWS     coeffq+16*3, 128, 1
    call  m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS    rsp+gprsize+16*3, 16
    LOAD_8ROWS     coeffq+16*7, 128, 1
    call m(idct_16x8_internal_8bpc).main
    lea                   tx2q, [o(.pass1_end6)]
    jmp   m(idct_8x8_internal_8bpc).pass1_end

.pass1_end6:
    SAVE_8ROWS    coeffq+16*35, 64                        ;in24~in31
    LOAD_8ROWS    rsp+gprsize+16*3, 16
    mova    [rsp+gprsize+16*0], m7
    lea                   tx2q, [o(.pass1_end7)]
    jmp   m(idct_8x8_internal_8bpc).pass1_end

.pass1_end7:
    mova   [rsp+gprsize+16*17], m2                        ;in26
    mova   [rsp+gprsize+16*18], m6                        ;in30
    mova   [rsp+gprsize+16*31], m1                        ;in25
    mova   [rsp+gprsize+16*30], m3                        ;in27
    mova   [rsp+gprsize+16*27], m5                        ;in29
    mova   [rsp+gprsize+16*34], m7                        ;in31

    ; gather in0,4,..,28 for the 8-point kernel (m0/m4 are moved out of the
    ; way first because they currently hold in24/in28)
    mova                    m6, m0                        ;in24
    mova                    m7, m4                        ;in28
    mova                    m0, [coeffq+16*0 ]            ;in0
    mova                    m1, [coeffq+16*4 ]            ;in4
    mova                    m2, [coeffq+16*1 ]            ;in8
    mova                    m3, [coeffq+16*5 ]            ;in12
    mova                    m4, [coeffq+16*2 ]            ;in16
    mova                    m5, [coeffq+16*6 ]            ;in20
    call  m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS   rsp+gprsize+16*3 , 16
    ; slots 11..18 hold in2,6,..,30 for the 16-point kernel
    LOAD_8ROWS   rsp+gprsize+16*11, 16
    call m(idct_16x8_internal_8bpc).main
    mova                    m7, [rsp+gprsize+16*0]
    SAVE_8ROWS   rsp+gprsize+16*11, 16

    call m(idct_8x32_internal_8bpc).main

.pass2:
    ; keep eob and the dst position of the second half on the stack; both are
    ; reloaded in .end
    mov  [rsp+gprsize*1+16*35], eobd
    lea                     r3, [dstq+8]
    mov  [rsp+gprsize*2+16*35], r3
    ; r3 = continuation for idct_8x32's .end (presumably jumped to once the
    ; first half has been written - TODO confirm)
    lea                     r3, [o(.end)]
    jmp  m(idct_8x32_internal_8bpc).end

; Second half: same 32-point pass, fed from the pass-1 results that were saved
; at (original) coeffq+16*32.. with 64-byte stride per band.
.end:
    mov                   dstq, [rsp+gprsize*2+16*35]
    mov                   eobd, [rsp+gprsize*1+16*35]
    add                 coeffq, 16*32

    mova                    m0, [coeffq+16*4 ]            ;in1
    mova                    m1, [coeffq+16*12]            ;in3
    mova                    m2, [coeffq+16*20]            ;in5
    mova                    m3, [coeffq+16*28]            ;in7
    mova                    m4, [coeffq+16*5 ]            ;in9
    mova                    m5, [coeffq+16*13]            ;in11
    mova                    m6, [coeffq+16*21]            ;in13
    mova                    m7, [coeffq+16*29]            ;in15

    mova   [rsp+gprsize+16*19], m0                        ;in1
    mova   [rsp+gprsize+16*26], m1                        ;in3
    mova   [rsp+gprsize+16*23], m2                        ;in5
    mova   [rsp+gprsize+16*22], m3                        ;in7
    mova   [rsp+gprsize+16*21], m4                        ;in9
    mova   [rsp+gprsize+16*24], m5                        ;in11
    mova   [rsp+gprsize+16*25], m6                        ;in13
    mova   [rsp+gprsize+16*20], m7                        ;in15

    mova                    m0, [coeffq+16*0 ]            ;in0
    mova                    m1, [coeffq+16*16]            ;in4
    mova                    m2, [coeffq+16*1 ]            ;in8
    mova                    m3, [coeffq+16*17]            ;in12

    ; same eob threshold as in pass 1: low-eob blocks use zeroed upper inputs
    cmp                   eobd, 150
    jg .full1

    pxor                    m4, m4
    REPX          {mova x, m4}, m5, m6, m7
    call  m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS    rsp+gprsize+16*3, 16

    mova                    m0, [coeffq+16*8 ]            ;in2
    mova                    m1, [coeffq+16*24]            ;in6
    mova                    m2, [coeffq+16*9 ]            ;in10
    mova                    m3, [coeffq+16*25]            ;in14
    pxor                    m4, m4
    REPX          {mova x, m4}, m5, m6, m7
    call m(idct_16x8_internal_8bpc).main
    mova                    m7, [rsp+gprsize+16*0]
    SAVE_8ROWS   rsp+gprsize+16*11, 16

    call m(idct_8x32_internal_8bpc).main_fast
    jmp m(idct_8x32_internal_8bpc).pass2

.full1:
    mova                    m4, [coeffq+16*2 ]            ;in16
    mova                    m5, [coeffq+16*18]            ;in20
    mova                    m6, [coeffq+16*3 ]            ;in24
    mova                    m7, [coeffq+16*19]            ;in28
    call  m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS    rsp+gprsize+16*3, 16

    mova                    m0, [coeffq+16*8 ]            ;in2
    mova                    m1, [coeffq+16*24]            ;in6
    mova                    m2, [coeffq+16*9 ]            ;in10
    mova                    m3, [coeffq+16*25]            ;in14
    mova                    m4, [coeffq+16*10]            ;in18
    mova                    m5, [coeffq+16*26]            ;in22
    mova                    m6, [coeffq+16*11]            ;in26
    mova                    m7, [coeffq+16*27]            ;in30
    call m(idct_16x8_internal_8bpc).main
    mova                    m7, [rsp+gprsize+16*0]
    SAVE_8ROWS   rsp+gprsize+16*11, 16

    mova                    m0, [coeffq+16*6 ]            ;in17
    mova                    m1, [coeffq+16*14]            ;in19
    mova                    m2, [coeffq+16*22]            ;in21
    mova                    m3, [coeffq+16*30]            ;in23
    mova                    m4, [coeffq+16*7 ]            ;in25
    mova                    m5, [coeffq+16*15]            ;in27
    mova                    m6, [coeffq+16*23]            ;in29
    mova                    m7, [coeffq+16*31]            ;in31

    mova   [rsp+gprsize+16*33], m0                        ;in17
    mova   [rsp+gprsize+16*28], m1                        ;in19
    mova   [rsp+gprsize+16*29], m2                        ;in21
    mova   [rsp+gprsize+16*32], m3                        ;in23
    mova   [rsp+gprsize+16*31], m4                        ;in25
    mova   [rsp+gprsize+16*30], m5                        ;in27
    mova   [rsp+gprsize+16*27], m6                        ;in29
    mova   [rsp+gprsize+16*34], m7                        ;in31

    call m(idct_8x32_internal_8bpc).main
    jmp m(idct_8x32_internal_8bpc).pass2
4328
4329
; 32x16 inverse DCT/DCT, result added to dst (8 bpc).
; eob == 0 takes the DC-only shortcut, which reuses the 32x8 function's .body
; loop with a row count of 16. Otherwise idct_32x16_internal_8bpc runs the
; first pass and the second pass is done here, one 8-pixel-wide group at a
; time, with idct_8x16_internal_8bpc's .pass2.
cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
%if ARCH_X86_32
    LEA                     r5, $$
%endif
    test                  eobd, eobd
    jz .dconly

    ; group 0: the internal function returns with this group ready for pass 2
    call m(idct_32x16_internal_8bpc)
    call m(idct_8x16_internal_8bpc).pass2

    ; groups 1..3 share one pattern: advance coeffq, place dstq 8 pixels
    ; right of r3 (presumably the dst pointer kept by the 8x16 .pass2 code -
    ; TODO confirm), reload eight rows from the stack slots 16*11 / 16*19 /
    ; 16*27 (the ones the internal function wrote as rsp+gprsize+16*N from
    ; inside its call), stash m7 in the scratch slot, finish the first pass
    ; via idct_8x8's .pass1_end with tx2q pointing at a bare "ret", then run
    ; the second pass.
    add                 coeffq, 16*16
    lea                   dstq, [r3+8]
    LOAD_8ROWS       rsp+16*11, 16
    mova            [rsp+16*0], m7
    lea                   tx2q, [o(m(idct_32x16_internal_8bpc).end)]
    call  m(idct_8x8_internal_8bpc).pass1_end
    call m(idct_8x16_internal_8bpc).pass2

    add                 coeffq, 16*16
    lea                   dstq, [r3+8]
    LOAD_8ROWS       rsp+16*19, 16
    mova            [rsp+16*0], m7
    lea                   tx2q, [o(m(idct_32x16_internal_8bpc).end)]
    call  m(idct_8x8_internal_8bpc).pass1_end
    call m(idct_8x16_internal_8bpc).pass2

    add                 coeffq, 16*16
    lea                   dstq, [r3+8]
    LOAD_8ROWS       rsp+16*27, 16
    mova            [rsp+16*0], m7
    lea                   tx2q, [o(m(idct_32x16_internal_8bpc).end)]
    call  m(idct_8x8_internal_8bpc).pass1_end
    call m(idct_8x16_internal_8bpc).pass2
    RET

.dconly:
    movd                    m1, [o(pw_2896x8)]
    pmulhrsw                m0, m1, [coeffq]
    movd                    m2, [o(pw_16384)]
    ; eobd is zero on this path, so this store clears the DC coefficient
    mov               [coeffq], eobd
    ; second multiply by pw_2896x8
    pmulhrsw                m0, m1
    ; 16 rows for the shared row loop; set after the store above because r3
    ; is presumably the register behind eob - TODO confirm
    mov                    r3d, 16
    lea                   tx2q, [o(m(inv_txfm_add_dct_dct_32x8_8bpc).end)]
    jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body
4374
4375
; First pass of the 32x16 DCT/DCT transform (reached via call, hence the
; rsp+gprsize addressing of the caller's stack area).
; .pass1 is executed twice:
;   1. with coeffq advanced by 16 bytes and r3 = .pass1_end1: all four
;      8-row groups of the result are finished and written back to
;      coefficient memory (.pass1_end1 .. .pass1_end4);
;   2. with coeffq restored and r3 = .end: only the first group is finished
;      and the function returns; the remaining groups stay in the stack slots
;      16*11 / 16*19 / 16*27 for the caller to pick up.
cglobal idct_32x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    add                 coeffq, 16
    ; r3 = where to continue after the first .pass1_end of this round
    lea                     r3, [o(.pass1_end1)]
.pass1:
    ; inputs at 128-byte stride from coeffq -> 8-point kernel
    LOAD_8ROWS     coeffq+16*0, 128, 1
    call  m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS    rsp+gprsize+16*3, 16

    ; inputs at 128-byte stride from coeffq+16*4 -> 16-point kernel
    LOAD_8ROWS     coeffq+16*4, 128, 1
    call m(idct_16x8_internal_8bpc).main
    mova                    m7, [rsp+gprsize+16*0]
    SAVE_8ROWS   rsp+gprsize+16*11, 16

    ; odd inputs in1..in15, scattered into the slot order the 32-point
    ; kernel reads them from
    LOAD_8ROWS     coeffq+16*2, 64, 1
    mova   [rsp+gprsize+16*19], m0                        ;in1
    mova   [rsp+gprsize+16*26], m1                        ;in3
    mova   [rsp+gprsize+16*23], m2                        ;in5
    mova   [rsp+gprsize+16*22], m3                        ;in7
    mova   [rsp+gprsize+16*21], m4                        ;in9
    mova   [rsp+gprsize+16*24], m5                        ;in11
    mova   [rsp+gprsize+16*25], m6                        ;in13
    mova   [rsp+gprsize+16*20], m7                        ;in15

    ; odd inputs in17..in31 (always loaded here; there is no eob-based fast
    ; path in this function)
    LOAD_8ROWS    coeffq+16*34, 64, 1
    mova   [rsp+gprsize+16*33], m0                        ;in17
    mova   [rsp+gprsize+16*28], m1                        ;in19
    mova   [rsp+gprsize+16*29], m2                        ;in21
    mova   [rsp+gprsize+16*32], m3                        ;in23
    mova   [rsp+gprsize+16*31], m4                        ;in25
    mova   [rsp+gprsize+16*30], m5                        ;in27
    mova   [rsp+gprsize+16*27], m6                        ;in29
    mova   [rsp+gprsize+16*34], m7                        ;in31
    call m(idct_8x32_internal_8bpc).main

.pass1_end:
    ; park m7 in the scratch slot and finish group 0; the continuation is
    ; whatever r3 currently holds (.pass1_end1 or .end)
    mova   [rsp+gprsize+16*0 ], m7
    mov                   tx2q, r3
    jmp   m(idct_8x8_internal_8bpc).pass1_end

.pass1_end1:
    ; store group 0, then finish group 1 from slot 16*11
    SAVE_8ROWS     coeffq+16*0, 32
    LOAD_8ROWS   rsp+gprsize+16*11, 16
    mova   [rsp+gprsize+16*0 ], m7
    lea                   tx2q, [o(.pass1_end2)]
    jmp   m(idct_8x8_internal_8bpc).pass1_end

.pass1_end2:
    ; store group 1, then finish group 2 from slot 16*19
    SAVE_8ROWS    coeffq+16*16, 32
    LOAD_8ROWS   rsp+gprsize+16*19, 16
    mova   [rsp+gprsize+16*0 ], m7
    lea                   tx2q, [o(.pass1_end3)]
    jmp   m(idct_8x8_internal_8bpc).pass1_end

.pass1_end3:
    ; store group 2, then finish group 3 from slot 16*27
    SAVE_8ROWS    coeffq+16*32, 32
    LOAD_8ROWS   rsp+gprsize+16*27, 16
    mova   [rsp+gprsize+16*0 ], m7
    lea                   tx2q, [o(.pass1_end4)]
    jmp   m(idct_8x8_internal_8bpc).pass1_end

.pass1_end4:
    SAVE_8ROWS    coeffq+16*48, 32

    ; second round: undo the initial coeffq offset and make the next
    ; .pass1_end return to the caller
    sub                 coeffq, 16
    lea                     r3, [o(.end)]
    jmp .pass1

; Plain return; also used by other functions as a tx2q target when
; "jmp tx2q" should simply act as ret.
.end:
    ret
4445
4446
; ---------------------------------------------------------------------------
; inv_txfm_add_identity_identity_16x32_8bpc(dst, stride, coeff, eob)
;
; Entry point declared with 4 arguments, 6 GPRs, 8 vector registers and
; 16*4 bytes of stack.
;
; Each .loop iteration loads eight 16-byte coefficient rows at a 64-byte
; pitch, clears those rows in coeff, and runs them through the shared
; idct_8x8 tails (.pass1_end3, then .end3), scaling the rows with IDTX16 and
; pmulhrsw in between. After .end3 returns, dst is advanced by a further
; 2*stride and coeff by 16 bytes.
;
; Iterations per run (r3d), derived from eob with unsigned compares:
;     r3d = 4 - (eob < 43) - (eob < 150) - (eob < 278)          => 1..4
; The inner loop is run twice: first from coeff/dst, then from coeff+64*8 and
; dst+8. The saved count is overwritten with 0 after the first run, so the
; reload after the second run yields 0 and the routine returns.
;
; Stack slots:
;     [rsp+16*0 .. 16*2]      vector spills around the tail calls
;     [rsp+16*3]              dst+8 (start of the second run)
;     [rsp+gprsize+16*3]      iteration count (zeroed after the first run)
;     [rsp+gprsize*2+16*3]    coeff base pointer
; ---------------------------------------------------------------------------
4447cglobal inv_txfm_add_identity_identity_16x32_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
4448    mov                    r4d, eobd
4449    cmp                   eobd, 43                ;CF = (eob < 43), unsigned
4450    sbb                    r3d, r3d               ;r3d = -(eob < 43)
4451    cmp                    r4d, 150               ;CF = (eob < 150)
4452    sbb                    r3d, 0                 ;r3d -= (eob < 150)
4453    cmp                    r4d, 278               ;CF = (eob < 278)
4454    sbb                    r3d, -4                ;r3d += 4 - (eob < 278)  => 1..4
4455
4456%if ARCH_X86_32
4457    LEA                     r5, $$
4458%endif
        ; remember where the second run starts, the per-run iteration count
        ; and the coefficient base
4459    lea                     r4, [dstq+8]
4460    mov             [rsp+16*3], r4
4461    mov     [rsp+gprsize+16*3], r3d
4462    mov   [rsp+gprsize*2+16*3], coeffq
4463
4464.loop:
4465    LOAD_8ROWS          coeffq, 64, 1
4466    mova            [rsp+16*1], m6
        ; m6 is spilled above, so it can be reused as the zero source that
        ; clears the eight rows just loaded
4467    pxor                    m6, m6
4468    REPX   {mova [coeffq+64*x], m6}, 0,  1,  2,  3,  4,  5,  6,  7
        ; tx2q = address of idct_32x16's .end, handed to the idct_8x8 tail
        ; (presumably its jump-back target -- confirm against that routine)
4469    lea                   tx2q, [o(m(idct_32x16_internal_8bpc).end)]
4470    call  m(idct_8x8_internal_8bpc).pass1_end3
        ; spill m2-m4 so they can serve as the 2nd..4th IDTX16 operands
4471    mova            [rsp+16*0], m2
4472    mova            [rsp+16*1], m3
4473    mova            [rsp+16*2], m4
4474    mova                    m3, [o(pw_1697x16)]
4475    mova                    m4, [o(pw_16384)]
4476    REPX   {IDTX16 x, 2, 3, 4}, 5, 6, 7, 0, 1
4477    mova                    m2, [o(pw_8192)]
4478    REPX      {pmulhrsw x, m2}, m5, m6, m7, m0, m1
        ; handle the three spilled rows one at a time, parking finished rows
        ; (m7, m5, m6) in the stack slots that become free
4479    mova                    m2, [rsp+16*0]
4480    mova            [rsp+16*0], m7
4481    IDTX16                   2, 7, 3, 4
4482    mova                    m7, [rsp+16*2]
4483    mova            [rsp+16*2], m5
4484    IDTX16                   7, 5, 3, 4
4485    mova                    m5, [rsp+16*1]
4486    mova            [rsp+16*1], m6
        ; open-coded for the last spilled row (now in m5), result left in m3:
        ;   m3 = pmulhrsw(pmulhrsw(pw_1697x16, m5), pw_16384) + m5  (saturating)
        ; this consumes the constant in m3; m4 is then halved in place
4487    pmulhrsw                m3, m5
4488    pmulhrsw                m3, m4
4489    psrlw                   m4, 1 ; pw_8192
4490    paddsw                  m3, m5
        ; final scale of m2, m3 and m7; the last one lands in m4 and consumes
        ; the constant
4491    pmulhrsw                m2, m4
4492    pmulhrsw                m3, m4
4493    pmulhrsw                m4, m7
4494    call  m(idct_8x8_internal_8bpc).end3
4495    lea                   dstq, [dstq+strideq*2]
4496    add                 coeffq, 16
4497    dec                    r3d
4498    jg .loop
        ; run finished: set up coeff+64*8 / dst+8 and reload the count.
        ; The count slot is zeroed (dstq used as a scratch zero) so that the
        ; reload after the second run reads 0 and falls through to RET.
4499    mov                 coeffq, [rsp+gprsize*2+16*3]
4500    add                 coeffq, 64*8
4501    mov                    r3d, [rsp+gprsize+16*3]
4502    xor                   dstq, dstq
4503    mov     [rsp+gprsize+16*3], dstq
4504    mov                   dstq, [rsp+16*3]
4505    test                   r3d, r3d
4506    jnz .loop
4507    RET
4508
4509
; ---------------------------------------------------------------------------
; inv_txfm_add_identity_identity_32x16_8bpc(dst, stride, coeff, eob)
;
; Entry point declared with 4 arguments, 6 GPRs, 8 vector registers and
; 16*4 bytes of stack.
;
; Each .loop iteration loads eight 16-byte coefficient rows at a 32-byte
; pitch, doubles them (paddsw x, x), runs them through the shared idct_8x8
; tails (.pass1_end3, then .end3) with IDTX16 / pw_2048 scaling in between,
; and finally clears the eight rows in coeff.
;
; r3d is a bit mask that .loop_end consumes two bits at a time:
;     eob <  44          r3d = 12     (1100b)                  -> 2 iterations
;     44 <= eob < 151    r3d = 136    (1000 1000b)             -> 4 iterations
;     eob >= 151         r3d = 34952  (1000 1000 1000 1000b)   -> 8 iterations
; After the shift: zero means done; bit 1 set means "continue straight on"
; (coeff += 16, dst already advanced); bit 1 clear means "switch to the saved
; dst column" (dst = [rsp+16*3], which is then bumped by 8 for the next switch).
;
; Stack slots:
;     [rsp+16*0 .. 16*2]   vector spills around the tail calls
;     [rsp+16*3]           dst pointer for the next column switch
; ---------------------------------------------------------------------------
4510cglobal inv_txfm_add_identity_identity_32x16_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
4511    mov                    r4d, 12                ;1100b
4512    mov                    r5d, 136               ;1000 1000b
4513    cmp                   eobd, 44                ;if (eob > 43)
4514    cmovns                 r4d, r5d               ;  r4d = 136
4515    cmp                   eobd, 151               ;if (eob <= 150)
4516    mov                    r3d, 34952             ;1000 1000 1000 1000b (mov leaves flags intact)
4517    cmovs                  r3d, r4d               ;  r3d = r4d
4518
4519%if ARCH_X86_32
4520    LEA                     r5, $$
4521%endif
4522    lea                     r4, [dstq+8]
4523    mov             [rsp+16*3], r4
4524
4525.loop:
4526    LOAD_8ROWS          coeffq, 32, 1
4527    REPX         {paddsw x, x}, m0, m1, m2, m3, m4, m5, m6, m7
4528    mova            [rsp+16*1], m6
        ; tx2q = address of idct_32x16's .end, handed to the idct_8x8 tail
        ; (presumably its jump-back target -- confirm against that routine)
4529    lea                   tx2q, [o(m(idct_32x16_internal_8bpc).end)]
4530    call  m(idct_8x8_internal_8bpc).pass1_end3
        ; spill m5/m6 so they can serve as the 2nd/3rd IDTX16 operands
4531    mova            [rsp+16*1], m5
4532    mova            [rsp+16*2], m6
4533    mova                    m6, [o(pw_1697x16)]
4534    REPX      {IDTX16 x, 5, 6}, 7, 0, 1, 2, 3, 4
        ; m7 is scaled and parked first so it can be used as scratch below
4535    pmulhrsw                m7, [o(pw_2048)]
4536    mova                    m5, [rsp+16*1]
4537    mova            [rsp+16*0], m7
4538    IDTX16                   5, 7, 6
4539    mova                    m7, [rsp+16*2]
        ; last use of the constant in m6: here m6 is passed as both the 2nd
        ; and the 3rd operand
4540    IDTX16                   7, 6, 6
4541    mova                    m6, [o(pw_2048)]
4542    REPX      {pmulhrsw x, m6}, m0, m1, m2, m3, m4, m5, m7
4543    mova            [rsp+16*2], m5
4544    mova            [rsp+16*1], m7
4545    call  m(idct_8x8_internal_8bpc).end3
4546    lea                   dstq, [dstq+strideq*2]
        ; clear the eight coefficient rows consumed by this iteration
4547    pxor                    m7, m7
4548    REPX {mova [coeffq+32*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
4549
4550.loop_end:
4551    add                 coeffq, 16
4552    shr                    r3d, 2                 ;drop this iteration's two control bits
4553    jz .ret                                       ;mask exhausted
4554    test                   r3d, 2
4555    jnz .loop                                     ;bit 1 set: continue straight on
        ; bit 1 clear: advance coeff by (r3d & 1)*8 + 32*7 and switch dst to
        ; the saved column, storing dst+8 for the next switch
4556    mov                    r4d, r3d
4557    and                    r4d, 1
4558    lea                 coeffq, [coeffq+r4*8+32*7]
4559    mov                   dstq, [rsp+16*3]
4560    lea                     r4, [dstq+8]
4561    mov             [rsp+16*3], r4
4562    jmp .loop
4563
4564.ret:
4565    RET
4566
4567
; ---------------------------------------------------------------------------
; inv_txfm_add_dct_dct_32x32_8bpc(dst, stride, coeff, eob)
;
; Entry point declared with 4 arguments, 6 GPRs, 8 vector registers and
; 16*36 bytes of stack.
;
;   eob != 0 : calls idct_32x32_internal_8bpc, then returns.
;   eob == 0 : .dconly prepares m0/m2/r3d/tx2q and tail-jumps into the
;              32x8 routine's .body; that code is responsible for returning.
; ---------------------------------------------------------------------------
4568cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
4569%if ARCH_X86_32
4570    LEA                     r5, $$
4571%endif
4572    test                  eobd, eobd
4573    jz .dconly
4574
4575    call m(idct_32x32_internal_8bpc)
4576    RET
4577
4578.dconly:
        ; m0 = pmulhrsw(pw_2896x8, [coeff]); m2 = low dword of pw_8192
4579    movd                    m1, [o(pw_2896x8)]
4580    pmulhrsw                m0, m1, [coeffq]
4581    movd                    m2, [o(pw_8192)]
4582    mov               [coeffq], eobd              ;eobd is 0 on this path: clears the first coefficient dword
4583    mov                    r3d, 32                ;argument for the shared .body (presumably a row count -- confirm)
4584    lea                   tx2q, [o(m(inv_txfm_add_dct_dct_32x8_8bpc).end)]
4585    jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body
4586
4587
; ---------------------------------------------------------------------------
; idct_32x32_internal_8bpc
;
; Two-pass worker reached by `call` from inv_txfm_add_dct_dct_32x32_8bpc.
; It is declared with 0 arguments / 0 registers / 0 stack, so it works in the
; caller's frame; every stack access carries an extra gprsize for the return
; address pushed by that call.
;
; Pass 1 (.pass1_loop): r3d iterations, r3d = 2 if eob < 136, else 4. Each
;   iteration gathers inputs from coeff, runs the idct_8x8 / idct_16x8 /
;   idct_8x32 .main helpers, then writes four 8-row groups back into coeff at
;   +64*0, +64*8, +64*16 and +64*24 via the idct_8x8 .pass1_end1 tail, and
;   advances coeff by 16 bytes.
; Pass 2 (.pass2_loop): always 4 iterations, each stepping coeff by 16*32
;   bytes and dst by 8, finishing through idct_8x32's .end.
; Both passes take the reduced .fast/.fast1 path when eob - 136 is negative.
;
; Stack slots (relative to rsp inside this routine):
;     [rsp+gprsize*1+16*35]   eob - 136 (only its sign is tested)
;     [rsp+gprsize*2+16*35]   pass 1: coeff base; pass 2: dst of next iteration
;     [rsp+gprsize*3+16*35]   pass 2 loop counter
;     [rsp+gprsize+16*0]      m7 spill slot shared with the tails
;     [rsp+gprsize+16*3..]    intermediate rows (SAVE_7ROWS / SAVE_8ROWS)
;     [rsp+gprsize+16*19..34] odd inputs in1..in31 in the order the
;                             idct_8x32 .main helpers expect (see ;inN tags)
; ---------------------------------------------------------------------------
4588cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
4589    mov                    r4d, 2
4590    sub                   eobd, 136
4591    mov  [rsp+gprsize*1+16*35], eobd              ;keep eob-136 for the fast/full tests
4592    mov                    r3d, 4
4593    cmovs                  r3d, r4d               ;flags still from the sub: eob < 136 -> 2 iterations
4594
4595%if ARCH_X86_32
4596    LEA                     r5, $$
4597%endif
4598
4599    mov  [rsp+gprsize*2+16*35], coeffq
4600
4601.pass1_loop:
4602    LOAD_8ROWS     coeffq+64*1, 64*2
4603    mova   [rsp+gprsize+16*19], m0                        ;in1
4604    mova   [rsp+gprsize+16*26], m1                        ;in3
4605    mova   [rsp+gprsize+16*23], m2                        ;in5
4606    mova   [rsp+gprsize+16*22], m3                        ;in7
4607    mova   [rsp+gprsize+16*21], m4                        ;in9
4608    mova   [rsp+gprsize+16*24], m5                        ;in11
4609    mova   [rsp+gprsize+16*25], m6                        ;in13
4610    mova   [rsp+gprsize+16*20], m7                        ;in15
4611
        ; tx2d is only a scratch register here; it is reloaded with a label
        ; address before each tail jump below
4612    mov                   tx2d, [rsp+gprsize*1+16*35]
4613    test                  tx2d, tx2d
4614    jl .fast
4615
4616.full:
4617    LOAD_8ROWS     coeffq+64*0, 64*4
4618    call  m(idct_8x8_internal_8bpc).main
4619    SAVE_7ROWS    rsp+gprsize+16*3, 16
4620    LOAD_8ROWS     coeffq+64*2, 64*4
4621    call m(idct_16x8_internal_8bpc).main
4622    mova                    m7, [rsp+gprsize+16*0]
4623    SAVE_8ROWS   rsp+gprsize+16*11, 16
4624
4625    LOAD_8ROWS    coeffq+64*17, 64*2
4626    mova   [rsp+gprsize+16*33], m0                        ;in17
4627    mova   [rsp+gprsize+16*28], m1                        ;in19
4628    mova   [rsp+gprsize+16*29], m2                        ;in21
4629    mova   [rsp+gprsize+16*32], m3                        ;in23
4630    mova   [rsp+gprsize+16*31], m4                        ;in25
4631    mova   [rsp+gprsize+16*30], m5                        ;in27
4632    mova   [rsp+gprsize+16*27], m6                        ;in29
4633    mova   [rsp+gprsize+16*34], m7                        ;in31
4634
4635    call m(idct_8x32_internal_8bpc).main
4636    jmp .pass1_end
4637
4638.fast:
        ; reduced path: only four inputs are loaded for each helper and the
        ; other four registers are zeroed; in17..in31 are not stored at all
4639    mova                    m0, [coeffq+256*0]
4640    mova                    m1, [coeffq+256*1]
4641    mova                    m2, [coeffq+256*2]
4642    mova                    m3, [coeffq+256*3]
4643    pxor                    m4, m4
4644    REPX          {mova x, m4}, m5, m6, m7
4645    call  m(idct_8x8_internal_8bpc).main
4646
4647    SAVE_7ROWS    rsp+gprsize+16*3, 16
4648    mova                    m0, [coeffq+128*1]
4649    mova                    m1, [coeffq+128*3]
4650    mova                    m2, [coeffq+128*5]
4651    mova                    m3, [coeffq+128*7]
4652    pxor                    m4, m4
4653    REPX          {mova x, m4}, m5, m6, m7
4654    call m(idct_16x8_internal_8bpc).main
4655    mova                    m7, [rsp+gprsize+16*0]
4656    SAVE_8ROWS   rsp+gprsize+16*11, 16
4657
4658    call m(idct_8x32_internal_8bpc).main_fast
4659
        ; The four stages below share one shape: park m7, load pw_8192 into
        ; m7, point tx2q at the next local label and jump to the idct_8x8
        ; .pass1_end1 tail, which is expected to come back through tx2q.
4660.pass1_end:
4661    mova    [rsp+gprsize+16*0], m7
4662    mova                    m7, [o(pw_8192)]
4663    lea                   tx2q, [o(.pass1_end1)]
4664    jmp   m(idct_8x8_internal_8bpc).pass1_end1
4665
4666.pass1_end1:
4667    SAVE_8ROWS     coeffq+64*0, 64
4668    LOAD_8ROWS   rsp+gprsize+16*11, 16
4669    mova    [rsp+gprsize+16*0], m7
4670    mova                    m7, [o(pw_8192)]
4671    lea                   tx2q, [o(.pass1_end2)]
4672    jmp   m(idct_8x8_internal_8bpc).pass1_end1
4673
4674.pass1_end2:
4675    SAVE_8ROWS     coeffq+64*8, 64
4676    LOAD_8ROWS   rsp+gprsize+16*19, 16
4677    mova    [rsp+gprsize+16*0], m7
4678    mova                    m7, [o(pw_8192)]
4679    lea                   tx2q, [o(.pass1_end3)]
4680    jmp   m(idct_8x8_internal_8bpc).pass1_end1
4681
4682.pass1_end3:
4683    SAVE_8ROWS    coeffq+64*16, 64
4684    LOAD_8ROWS   rsp+gprsize+16*27, 16
4685    mova    [rsp+gprsize+16*0], m7
4686    mova                    m7, [o(pw_8192)]
4687    lea                   tx2q, [o(.pass1_end4)]
4688    jmp   m(idct_8x8_internal_8bpc).pass1_end1
4689
4690.pass1_end4:
4691    SAVE_8ROWS    coeffq+64*24, 64
4692
4693    add                 coeffq, 16
4694    dec                    r3d
4695    jg .pass1_loop
4696
4697
4698.pass2:
        ; restart from the coefficient base; the slot that held it is reused
        ; for the dst pointer of the next iteration inside the loop
4699    mov                 coeffq, [rsp+gprsize*2+16*35]
4700    mov                    r3d, 4
4701    lea                   tx2q, [o(.pass2_end)]
4702
4703.pass2_loop:
4704    mov  [rsp+gprsize*3+16*35], r3d
4705    lea                     r3, [dstq+8]
4706    mov  [rsp+gprsize*2+16*35], r3
4707
4708    mova                    m0, [coeffq+16*4 ]
4709    mova                    m1, [coeffq+16*12]
4710    mova                    m2, [coeffq+16*20]
4711    mova                    m3, [coeffq+16*28]
4712    mova                    m4, [coeffq+16*5 ]
4713    mova                    m5, [coeffq+16*13]
4714    mova                    m6, [coeffq+16*21]
4715    mova                    m7, [coeffq+16*29]
4716    mova   [rsp+gprsize+16*19], m0                        ;in1
4717    mova   [rsp+gprsize+16*26], m1                        ;in3
4718    mova   [rsp+gprsize+16*23], m2                        ;in5
4719    mova   [rsp+gprsize+16*22], m3                        ;in7
4720    mova   [rsp+gprsize+16*21], m4                        ;in9
4721    mova   [rsp+gprsize+16*24], m5                        ;in11
4722    mova   [rsp+gprsize+16*25], m6                        ;in13
4723    mova   [rsp+gprsize+16*20], m7                        ;in15
4724
        ; eobd (r3d) was just used as scratch; reload the saved eob-136
4725    mov                   eobd, [rsp+gprsize*1+16*35]
4726    test                  eobd, eobd
4727    jl .fast1
4728
4729.full1:
4730    mova                    m0, [coeffq+16*0 ]
4731    mova                    m1, [coeffq+16*16]
4732    mova                    m2, [coeffq+16*1 ]
4733    mova                    m3, [coeffq+16*17]
4734    mova                    m4, [coeffq+16*2 ]
4735    mova                    m5, [coeffq+16*18]
4736    mova                    m6, [coeffq+16*3 ]
4737    mova                    m7, [coeffq+16*19]
4738    call  m(idct_8x8_internal_8bpc).main
4739    SAVE_7ROWS    rsp+gprsize+16*3, 16
4740
4741    mova                    m0, [coeffq+16*8 ]
4742    mova                    m1, [coeffq+16*24]
4743    mova                    m2, [coeffq+16*9 ]
4744    mova                    m3, [coeffq+16*25]
4745    mova                    m4, [coeffq+16*10]
4746    mova                    m5, [coeffq+16*26]
4747    mova                    m6, [coeffq+16*11]
4748    mova                    m7, [coeffq+16*27]
4749    call m(idct_16x8_internal_8bpc).main
4750    mova                    m7, [rsp+gprsize+16*0]
4751    SAVE_8ROWS   rsp+gprsize+16*11, 16
4752
4753    mova                    m0, [coeffq+16*6 ]
4754    mova                    m1, [coeffq+16*14]
4755    mova                    m2, [coeffq+16*22]
4756    mova                    m3, [coeffq+16*30]
4757    mova                    m4, [coeffq+16*7 ]
4758    mova                    m5, [coeffq+16*15]
4759    mova                    m6, [coeffq+16*23]
4760    mova                    m7, [coeffq+16*31]
4761    mova   [rsp+gprsize+16*33], m0                        ;in17
4762    mova   [rsp+gprsize+16*28], m1                        ;in19
4763    mova   [rsp+gprsize+16*29], m2                        ;in21
4764    mova   [rsp+gprsize+16*32], m3                        ;in23
4765    mova   [rsp+gprsize+16*31], m4                        ;in25
4766    mova   [rsp+gprsize+16*30], m5                        ;in27
4767    mova   [rsp+gprsize+16*27], m6                        ;in29
4768    mova   [rsp+gprsize+16*34], m7                        ;in31
4769
4770    call m(idct_8x32_internal_8bpc).main
4771    jmp                   tx2q                    ;tx2q = .pass2_end when entered via .pass2
4772
4773.fast1:
        ; reduced path: four inputs per helper, remaining registers zeroed,
        ; in17..in31 not stored
4774    mova                    m0, [coeffq+16*0 ]
4775    mova                    m1, [coeffq+16*16]
4776    mova                    m2, [coeffq+16*1 ]
4777    mova                    m3, [coeffq+16*17]
4778    pxor                    m4, m4
4779    REPX          {mova x, m4}, m5, m6, m7
4780    call  m(idct_8x8_internal_8bpc).main
4781    SAVE_7ROWS    rsp+gprsize+16*3, 16
4782
4783    mova                    m0, [coeffq+16*8 ]
4784    mova                    m1, [coeffq+16*24]
4785    mova                    m2, [coeffq+16*9 ]
4786    mova                    m3, [coeffq+16*25]
4787    pxor                    m4, m4
4788    REPX          {mova x, m4}, m5, m6, m7
4789    call m(idct_16x8_internal_8bpc).main
4790    mova                    m7, [rsp+gprsize+16*0]
4791    SAVE_8ROWS   rsp+gprsize+16*11, 16
4792
4793    call m(idct_8x32_internal_8bpc).main_fast
4794    jmp                   tx2q
4795
4796.pass2_end:
        ; r3 = continuation label for idct_8x32's .end (presumably it jumps
        ; back through r3 once its work is done -- confirm there)
4797    lea                     r3, [o(.pass2_end1)]
4798    jmp  m(idct_8x32_internal_8bpc).end
4799
4800.pass2_end1:
        ; re-arm tx2q, step to the next 16*32-byte coefficient block and to
        ; the dst pointer saved at the top of the iteration
4801    lea                   tx2q, [o(.pass2_end)]
4802    add                 coeffq, 16*32
4803    mov                   dstq, [rsp+gprsize*2+16*35]
4804    mov                    r3d, [rsp+gprsize*3+16*35]
4805    dec                    r3d
4806    jg .pass2_loop
4807
4808    ret
4809
4810
; ---------------------------------------------------------------------------
; inv_txfm_add_identity_identity_32x32_8bpc(dst, stride, coeff, eob)
;
; Entry point declared with 4 arguments, 6 GPRs, 8 vector registers and
; 16*5 bytes of stack.
;
; n = 2 if eob < 136, else 4. The routine performs n outer rounds of n inner
; iterations each. An inner iteration loads eight 16-byte rows at a 64-byte
; pitch, runs them through the shared idct_8x8 tails (.pass1_end3, then
; .end3) with a pw_8192 scale in between, clears the rows in coeff, and moves
; on by 16 coefficient bytes. Between rounds coeff advances by 64*8 from the
; round's base and dst by 8 from the round's start.
;
; Stack slots:
;     [rsp+16*0 .. 16*2]      vector spills around the tail calls
;     [rsp+gprsize*0+16*3]    dst for the next round
;     [rsp+gprsize*1+16*3]    n (inner iteration count, reloaded each round)
;     [rsp+gprsize*2+16*3]    rounds remaining
;     [rsp+gprsize*3+16*3]    coeff base of the current round
; ---------------------------------------------------------------------------
4811cglobal inv_txfm_add_identity_identity_32x32_8bpc, 4, 6, 8, 16*5, dst, stride, coeff, eob, tx2
4812    mov                    r4d, 2
4813    cmp                   eobd, 136
4814    mov                    r3d, 4                 ;mov leaves the cmp flags intact
4815    cmovs                  r3d, r4d               ;eob < 136 -> n = 2
4816
4817%if ARCH_X86_32
4818    LEA                     r5, $$
4819%endif
4820
4821    lea                     r4, [dstq+8]
4822    mov   [rsp+gprsize*0+16*3], r4
4823    mov   [rsp+gprsize*1+16*3], r3d
4824    mov   [rsp+gprsize*2+16*3], r3d
4825    mov   [rsp+gprsize*3+16*3], coeffq
4826
4827.loop:
4828    LOAD_8ROWS          coeffq, 64
4829    mova            [rsp+16*1], m6
        ; tx2q = address of idct_32x16's .end, handed to the idct_8x8 tail
        ; (presumably its jump-back target -- confirm against that routine)
4830    lea                   tx2q, [o(m(idct_32x16_internal_8bpc).end)]
4831    call  m(idct_8x8_internal_8bpc).pass1_end3
        ; scale all eight rows by pw_8192; m7 is scaled from memory and parked
        ; first so the register can hold the constant for the others
4832    pmulhrsw                m7, [o(pw_8192)]
4833    mova            [rsp+16*0], m7
4834    mova                    m7, [o(pw_8192)]
4835    REPX      {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
4836    mova            [rsp+16*1], m6
4837    mova            [rsp+16*2], m5
4838    call  m(idct_8x8_internal_8bpc).end3
4839    lea                   dstq, [dstq+strideq*2]
4840
        ; clear the eight coefficient rows consumed by this iteration
4841    pxor                    m7, m7
4842    REPX   {mova [coeffq+64*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7
4843
4844    add                 coeffq, 16
4845    dec                    r3d
4846    jg .loop
4847
        ; round finished: stop when no rounds remain, otherwise set up the
        ; next one (dst += 8, coeff base += 64*8, inner count reloaded)
4848    mov                    r4d, [rsp+gprsize*2+16*3]
4849    dec                    r4d
4850    jle .ret
4851
4852    mov                   dstq, [rsp+gprsize*0+16*3]
4853    mov                 coeffq, [rsp+gprsize*3+16*3]
4854    mov   [rsp+gprsize*2+16*3], r4
4855    lea                     r3, [dstq+8]
4856    add                 coeffq, 64*8
4857    mov   [rsp+gprsize*0+16*3], r3
4858    mov                    r3d, [rsp+gprsize*1+16*3]
4859    mov   [rsp+gprsize*3+16*3], coeffq
4860    jmp .loop
4861
4862.ret:
4863    RET
4864
4865
; ---------------------------------------------------------------------------
; inv_txfm_add_dct_dct_16x64_8bpc(dst, stride, coeff, eob)
;
; Entry point declared with 4 arguments, 6 GPRs, 8 vector registers and
; 16*68 bytes of stack.
;
;   eob != 0 : calls idct_16x64_internal_8bpc, then returns through .end.
;   eob == 0 : .dconly prepares m0/m2/r2d/tx2q and tail-jumps into the 16x4
;              routine's .dconly, passing this routine's .end in tx2q.
; ---------------------------------------------------------------------------
4866cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2
4867%if ARCH_X86_32
4868    LEA                     r5, $$
4869%endif
4870    test                  eobd, eobd
4871    jz .dconly
4872    call m(idct_16x64_internal_8bpc)
4873.end:
4874    RET
4875
4876.dconly:
        ; m0 = pmulhrsw(pw_2896x8, [coeff]); m2 = low dword of pw_8192
4877    movd                    m1, [o(pw_2896x8)]
4878    pmulhrsw                m0, m1, [coeffq]
4879    movd                    m2, [o(pw_8192)]
4880    mov               [coeffq], eobd              ;eobd is 0 on this path: clears the first coefficient dword
        ; r2d = 32 is an argument for the shared tail (presumably a row
        ; count). NOTE(review): r2 presumably aliases coeff under x86inc's
        ; positional naming, which is why it is set only after the last
        ; coeffq access -- confirm.
4881    mov                    r2d, 32
4882    lea                   tx2q, [o(.end)]
4883    jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
4884
4885
; ---------------------------------------------------------------------------
; idct_16x64_internal_8bpc
;
; Two-pass worker reached by `call` from inv_txfm_add_dct_dct_16x64_8bpc.
; It is declared with 0 arguments / 0 registers / 0 stack, so it works in the
; caller's frame; every stack access carries an extra gprsize for the return
; address pushed by that call.
;
; Pass 1 (.pass1_loop): r3d iterations, r3d = 2 if eob < 151, else 4. Each
;   iteration runs the idct_8x8 and idct_16x8 .main helpers, sends both
;   halves through the idct_8x8 .pass1_end1 tail and stores them back into
;   coeff at +64*8 and +64*0, then advances coeff by 16 bytes.
; Pass 2 (.pass2_loop): 2 iterations, one per 8-pixel-wide column of dst.
;   Inputs are gathered from coeff, the .full or .fast path (chosen by the
;   sign of eob - 151) builds the intermediate rows, idct_8x32's .end2 is
;   entered with r3 = .end1, and .end1 finishes via the local .write helper.
;
; Stack slots (relative to rsp inside this routine):
;     [rsp+gprsize*1+16*67]   eob - 151 (only its sign is tested)
;     [rsp+gprsize*2+16*67]   pass 1: coeff base; pass 2: dst of next column
;     [rsp+gprsize*3+16*67]   pass 2 loop counter
;     [rsp+gprsize+16*0]      m7 spill slot shared with the tails
;     [rsp+gprsize+16*3..]    intermediate rows (SAVE_7ROWS / SAVE_8ROWS)
;     [rsp+gprsize+16*19..26] inputs for the idct_8x32 helpers
;     [rsp+gprsize+16*35..65] odd inputs for the local .main/.main_fast
;                             helpers (see the ;inN tags)
;
; r4 holds the address of .end1 across pass 2; .write uses r4 as a scratch
; counter, so .end1 reloads it afterwards.
; ---------------------------------------------------------------------------
4886cglobal idct_16x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
4887    mov                    r4d, 2
4888    sub                   eobd, 151
4889    mov  [rsp+gprsize*1+16*67], eobd              ;keep eob-151 for the fast/full test
4890    mov                    r3d, 4
4891    cmovs                  r3d, r4d               ;flags still from the sub: eob < 151 -> 2 iterations
4892
4893%if ARCH_X86_32
4894    LEA                     r5, $$
4895%endif
4896
4897    mov  [rsp+gprsize*2+16*67], coeffq
4898
4899.pass1_loop:
4900    LOAD_8ROWS     coeffq+64*0, 64*2
4901    call  m(idct_8x8_internal_8bpc).main
4902    SAVE_7ROWS    rsp+gprsize+16*3, 16
4903    LOAD_8ROWS     coeffq+64*1, 64*2
4904    call m(idct_16x8_internal_8bpc).main
        ; tail jump: the idct_8x8 .pass1_end1 code is expected to come back
        ; through tx2q, i.e. to the local .pass1_end
4905    mova                    m7, [o(pw_8192)]
4906    lea                   tx2q, [o(.pass1_end)]
4907    jmp   m(idct_8x8_internal_8bpc).pass1_end1
4908
4909.pass1_end:
4910    SAVE_8ROWS     coeffq+64*8, 64
4911    LOAD_8ROWS    rsp+gprsize+16*3, 16
4912    mova    [rsp+gprsize+16*0], m7
4913    mova                    m7, [o(pw_8192)]
4914    lea                   tx2q, [o(.pass1_end1)]
4915    jmp   m(idct_8x8_internal_8bpc).pass1_end1
4916
4917.pass1_end1:
4918    SAVE_8ROWS     coeffq+64*0, 64
4919
4920    add                 coeffq, 16
4921    dec                    r3d
4922    jg .pass1_loop
4923
        ; pass 2 setup: restart from the coefficient base, reuse its slot for
        ; the dst pointer of the second column, and keep .end1 in r4
4924    mov                 coeffq, [rsp+gprsize*2+16*67]
4925    mov                    r3d, 2
4926    lea                     r4, [dstq+8]
4927    mov  [rsp+gprsize*2+16*67], r4
4928    lea                     r4, [o(.end1)]
4929
4930.pass2_loop:
4931    mov  [rsp+gprsize*3+16*67], r3d
4932    mov                   eobd, [rsp+gprsize*1+16*67]
4933
4934    mova                    m0, [coeffq+16*4 ]            ;in1
4935    mova                    m1, [coeffq+16*12]            ;in3
4936    mova                    m2, [coeffq+16*20]            ;in5
4937    mova                    m3, [coeffq+16*28]            ;in7
4938    mova                    m4, [coeffq+16*5 ]            ;in9
4939    mova                    m5, [coeffq+16*13]            ;in11
4940    mova                    m6, [coeffq+16*21]            ;in13
4941    mova                    m7, [coeffq+16*29]            ;in15
4942    mova   [rsp+gprsize+16*35], m0                        ;in1
4943    mova   [rsp+gprsize+16*49], m1                        ;in3
4944    mova   [rsp+gprsize+16*43], m2                        ;in5
4945    mova   [rsp+gprsize+16*41], m3                        ;in7
4946    mova   [rsp+gprsize+16*39], m4                        ;in9
4947    mova   [rsp+gprsize+16*45], m5                        ;in11
4948    mova   [rsp+gprsize+16*47], m6                        ;in13
4949    mova   [rsp+gprsize+16*37], m7                        ;in15
4950
        ; common to both paths: m4 = 0 (zero source), m0/m1 = first two inputs
4951    pxor                    m4, m4
4952    mova                    m0, [coeffq+16*0]
4953    mova                    m1, [coeffq+16*1]
4954
4955    test                  eobd, eobd
4956    jl .fast
4957
4958.full:
4959    mova                    m2, [coeffq+16*2]
4960    mova                    m3, [coeffq+16*3]
4961
4962    REPX          {mova x, m4}, m5, m6, m7
4963    call  m(idct_8x8_internal_8bpc).main
4964    SAVE_7ROWS    rsp+gprsize+16*3, 16
4965
4966    pxor                    m4, m4
4967    mova                    m0, [coeffq+16*16]
4968    mova                    m1, [coeffq+16*17]
4969    mova                    m2, [coeffq+16*18]
4970    mova                    m3, [coeffq+16*19]
4971
4972    REPX          {mova x, m4}, m5, m6, m7
4973    call m(idct_16x8_internal_8bpc).main
4974    mova                    m7, [rsp+gprsize+16*0]
4975    SAVE_8ROWS   rsp+gprsize+16*11, 16
4976
        ; eight inputs for idct_8x32's .main_fast, stored in the slot order
        ; 19, 26, 23, 22, 21, 24, 25, 20
4977    mova                    m0, [coeffq+16*8 ]
4978    mova                    m1, [coeffq+16*24]
4979    mova                    m2, [coeffq+16*9 ]
4980    mova                    m3, [coeffq+16*25]
4981    mova                    m4, [coeffq+16*10]
4982    mova                    m5, [coeffq+16*26]
4983    mova                    m6, [coeffq+16*11]
4984    mova                    m7, [coeffq+16*27]
4985    mova   [rsp+gprsize+16*19], m0
4986    mova   [rsp+gprsize+16*26], m1
4987    mova   [rsp+gprsize+16*23], m2
4988    mova   [rsp+gprsize+16*22], m3
4989    mova   [rsp+gprsize+16*21], m4
4990    mova   [rsp+gprsize+16*24], m5
4991    mova   [rsp+gprsize+16*25], m6
4992    mova   [rsp+gprsize+16*20], m7
4993
4994    call m(idct_8x32_internal_8bpc).main_fast
4995    SAVE_8ROWS    rsp+gprsize+16*3, 16
4996
4997    mova                    m0, [coeffq+16*6 ]            ;in17
4998    mova                    m1, [coeffq+16*14]            ;in19
4999    mova                    m2, [coeffq+16*22]            ;in21
5000    mova                    m3, [coeffq+16*30]            ;in23
5001    mova                    m4, [coeffq+16*7 ]            ;in25
5002    mova                    m5, [coeffq+16*15]            ;in27
5003    mova                    m6, [coeffq+16*23]            ;in29
5004    mova                    m7, [coeffq+16*31]            ;in31
5005    mova   [rsp+gprsize+16*63], m0                        ;in17
5006    mova   [rsp+gprsize+16*53], m1                        ;in19
5007    mova   [rsp+gprsize+16*55], m2                        ;in21
5008    mova   [rsp+gprsize+16*61], m3                        ;in23
5009    mova   [rsp+gprsize+16*59], m4                        ;in25
5010    mova   [rsp+gprsize+16*57], m5                        ;in27
5011    mova   [rsp+gprsize+16*51], m6                        ;in29
5012    mova   [rsp+gprsize+16*65], m7                        ;in31
5013
5014    call .main
5015    jmp  .end
5016
5017.fast:
        ; reduced path: only m0/m1 carry data into each helper, every other
        ; register is zeroed, and in17..in31 are not stored
5018    REPX          {mova x, m4}, m2, m3, m5, m6, m7
5019    call  m(idct_8x8_internal_8bpc).main
5020    SAVE_7ROWS    rsp+gprsize+16*3, 16
5021
5022    pxor                    m4, m4
5023    mova                    m0, [coeffq+16*16]
5024    mova                    m1, [coeffq+16*17]
5025
5026    REPX          {mova x, m4}, m2, m3, m5, m6, m7
5027    call m(idct_16x8_internal_8bpc).main
5028    mova                    m7, [rsp+gprsize+16*0]
5029    SAVE_8ROWS   rsp+gprsize+16*11, 16
5030
5031    mova                    m0, [coeffq+16*8 ]
5032    mova                    m1, [coeffq+16*24]
5033    mova                    m2, [coeffq+16*9 ]
5034    mova                    m3, [coeffq+16*25]
5035    mova   [rsp+gprsize+16*19], m0                        ;in1
5036    mova   [rsp+gprsize+16*26], m1                        ;in3
5037    mova   [rsp+gprsize+16*23], m2                        ;in5
5038    mova   [rsp+gprsize+16*22], m3                        ;in7
5039
5040    call m(idct_8x32_internal_8bpc).main_veryfast
5041    SAVE_8ROWS    rsp+gprsize+16*3, 16
5042
5043    call .main_fast
5044
5045.end:
        ; reload the first row group, park m7, and enter idct_8x32's .end2
        ; with r3 = r4 = address of .end1 (presumably its continuation
        ; target -- confirm against that routine)
5046    LOAD_8ROWS   rsp+gprsize+16*3, 16
5047    mova    [rsp+gprsize+16*0], m7
5048    mov                     r3, r4
5049    jmp  m(idct_8x32_internal_8bpc).end2
5050
5051.end1:
        ; .write addresses the stack through r3 = rsp+gprsize+16*32, so its
        ; offsets stay valid despite the return address pushed by the call
5052    LOAD_8ROWS   rsp+gprsize+16*35, 16
5053    lea                   dstq, [dstq+strideq*2]
5054    lea                     r3, [rsp+16*32+gprsize]
5055    call .write
        ; next column: dst from the saved slot, slot bumped by 8, r4 re-armed
        ; with .end1 because .write used it as a counter
5056    mov                   dstq, [rsp+gprsize*2+16*67]
5057    mov                    r3d, [rsp+gprsize*3+16*67]
5058    lea                     r4, [dstq+8]
5059    mov  [rsp+gprsize*2+16*67], r4
5060    lea                     r4, [o(.end1)]
5061
5062    dec                    r3d
5063    jg .pass2_loop
5064    ret
        ; .write: in m0-m7 = first row group, r3 = scratch/row base.
        ; Zeroes the 16*32 bytes of coeff starting at the incoming coeffq and
        ; leaves coeffq advanced by 16*32, then outputs four row groups: the
        ; one in registers, followed by those at r3+16*11, +16*19 and +16*27.
        ; [r3+16*0 .. 16*2] are used as spill slots.
5065.write:
5066    mova             [r3+16*0], m7
5067    mov                     r4, -16*32
5068    pxor                    m7, m7
5069    sub                 coeffq, r4                ;coeffq += 16*32; r4 counts up to 0
5070.zero_loop:
5071    mova      [coeffq+r4+16*0], m7
5072    mova      [coeffq+r4+16*1], m7
5073    add                     r4, 16*2
5074    jl .zero_loop
5075    call .write_main2
5076    LOAD_8ROWS        r3+16*11, 16
5077    call .write_main
5078    LOAD_8ROWS        r3+16*19, 16
5079    call .write_main
5080    LOAD_8ROWS        r3+16*27, 16
        ; the last group falls through, so the ret below returns to .write's
        ; caller
5081.write_main:
5082    mova             [r3+16*0], m7
        ; .write_main2 expects the eighth row already parked at [r3+16*0]
5083.write_main2:
        ; scale all eight rows by pw_2048, then spill m5-m7 so registers
        ; 5, 6 and 7 can be passed to WRITE_8X4 as its trailing operands
5084    mova                    m7, [o(pw_2048)]
5085    REPX      {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
5086    pmulhrsw                m7, [r3+16*0]
5087    mova             [r3+16*2], m5
5088    mova             [r3+16*1], m6
5089    mova             [r3+16*0], m7
5090    WRITE_8X4                0, 1, 2, 3, 5, 6, 7
5091    lea                   dstq, [dstq+strideq*2]
5092    WRITE_8X4                4, [r3+16*2], [r3+16*1], [r3+16*0], 5, 6, 7
5093    lea                   dstq, [dstq+strideq*2]
5094    ret
5095
5096
; ---------------------------------------------------------------------------
; idct_16x64_internal_8bpc .main_fast
;
; Reduced entry that reads only the eight inputs in1..in15. Within this file
; section it is called from the .fast path of pass 2, which stores only those
; eight inputs. The slots are addressed with gprsize*2 because this code runs
; one call level deeper than the code that filled them at rsp+gprsize+16*N.
;
; Each stage multiplies a single input by two constants with pmulhrsw, stores
; both products, and then feeds the pair through ITX_MULSUB_2W (with
; m7 = pd_2048) to produce the two "a" terms named in the trailing comments.
; Results are written to slots 35..48 and 53..66, except that the first
; product of the in13 stage stays in m6 and the two in3 products are copied
; to m5 and m4 before control passes to .main2.
; ---------------------------------------------------------------------------
5097ALIGN function_align
5098cglobal_label .main_fast
5099    mova                    m0, [rsp+gprsize*2+16*35]     ;in1
5100    pmulhrsw                m3, m0, [o(pw_4095x8)]        ;t62,t63
5101    pmulhrsw                m0, [o(pw_101x8)]             ;t32,t33
5102    mova                    m7, [o(pd_2048)]
5103    mova [rsp+gprsize*2+16*35], m0                        ;t32
5104    mova [rsp+gprsize*2+16*66], m3                        ;t63
5105    ITX_MULSUB_2W            3, 0, 1, 2, 7,  401, 4076    ;t33a, t62a
5106    mova [rsp+gprsize*2+16*36], m3                        ;t33a
5107    mova [rsp+gprsize*2+16*65], m0                        ;t62a
5108
5109    mova                    m1, [rsp+gprsize*2+16*37]     ;in15
5110    pmulhrsw                m2, m1, [o(pw_3822x8)]        ;t60,t61
5111    pmulhrsw                m1, [o(pw_m1474x8)]           ;t34,t35
5112    mova [rsp+gprsize*2+16*38], m1                        ;t35
5113    mova [rsp+gprsize*2+16*63], m2                        ;t60
5114    ITX_MULSUB_2W            2, 1, 0, 3, 7, m4076, 401    ;t34a, t61a
5115    mova [rsp+gprsize*2+16*37], m2                        ;t34a
5116    mova [rsp+gprsize*2+16*64], m1                        ;t61a
5117
5118    mova                    m0, [rsp+gprsize*2+16*39]     ;in9
5119    pmulhrsw                m3, m0, [o(pw_3996x8)]        ;t58,t59
5120    pmulhrsw                m0, [o(pw_897x8)]             ;t36,t37
5121    mova [rsp+gprsize*2+16*39], m0                        ;t36
5122    mova [rsp+gprsize*2+16*62], m3                        ;t59
5123    ITX_MULSUB_2W            3, 0, 1, 2, 7, 3166, 2598    ;t37a, t58a
5124    mova [rsp+gprsize*2+16*40], m3                        ;t37a
5125    mova [rsp+gprsize*2+16*61], m0                        ;t58a
5126
5127    mova                    m1, [rsp+gprsize*2+16*41]     ;in7
5128    pmulhrsw                m2, m1, [o(pw_4036x8)]        ;t56,t57
5129    pmulhrsw                m1, [o(pw_m700x8)]            ;t38,t39
5130    mova [rsp+gprsize*2+16*42], m1                        ;t39
5131    mova [rsp+gprsize*2+16*59], m2                        ;t56
5132    ITX_MULSUB_2W            2, 1, 0, 3, 7, m2598, 3166   ;t38a, t57a
5133    mova [rsp+gprsize*2+16*41], m2                        ;t38a
5134    mova [rsp+gprsize*2+16*60], m1                        ;t57a
5135
5136    mova                    m0, [rsp+gprsize*2+16*43]     ;in5
5137    pmulhrsw                m3, m0, [o(pw_4065x8)]        ;t54,t55
5138    pmulhrsw                m0, [o(pw_501x8)]             ;t40,t41
5139    mova [rsp+gprsize*2+16*43], m0                        ;t40
5140    mova [rsp+gprsize*2+16*58], m3                        ;t55
5141    ITX_MULSUB_2W            3, 0, 1, 2, 7, 1931, 3612    ;t41a, t54a
5142    mova [rsp+gprsize*2+16*44], m3                        ;t41a
5143    mova [rsp+gprsize*2+16*57], m0                        ;t54a
5144
5145    mova                    m1, [rsp+gprsize*2+16*45]     ;in11
5146    pmulhrsw                m2, m1, [o(pw_3948x8)]        ;t52,t53
5147    pmulhrsw                m1, [o(pw_m1092x8)]           ;t42,t43
5148    mova [rsp+gprsize*2+16*46], m1                        ;t43
5149    mova [rsp+gprsize*2+16*55], m2                        ;t52
5150    ITX_MULSUB_2W            2, 1, 0, 3, 7, m3612, 1931   ;t42a, t53a
5151    mova [rsp+gprsize*2+16*45], m2                        ;t42a
5152    mova [rsp+gprsize*2+16*56], m1                        ;t53a
5153
5154    mova                    m0, [rsp+gprsize*2+16*47]     ;in13
5155    pmulhrsw                m3, m0, [o(pw_3889x8)]        ;t50,t51
5156    pmulhrsw                m0, [o(pw_1285x8)]            ;t44,t45
        ; unlike the stages above, this product is kept in m6 rather than
        ; stored to the stack
5157    mova                    m6, m0
5158    mova [rsp+gprsize*2+16*54], m3                        ;t51
5159    ITX_MULSUB_2W            3, 0, 1, 2, 7, 3920, 1189    ;t45a, t50a
5160    mova [rsp+gprsize*2+16*48], m3                        ;t45a
5161    mova [rsp+gprsize*2+16*53], m0                        ;t50a
5162
5163    mova                    m0, [rsp+gprsize*2+16*49]     ;in3
5164    pmulhrsw                m3, m0, [o(pw_4085x8)]        ;t48,t49
5165    pmulhrsw                m0, [o(pw_m301x8)]            ;t46,t47
        ; the last stage has no ITX_MULSUB_2W step here; its two products are
        ; passed to .main2 in registers (m4 and m5, with copies still in m3
        ; and m0)
5166    mova                    m4, m3
5167    mova                    m5, m0
5168
5169    jmp .main2
5170
5171ALIGN function_align
5172cglobal_label .main
5173    mova                    m0, [rsp+gprsize*2+16*35]     ;in1
5174    mova                    m1, [rsp+gprsize*2+16*65]     ;in31
5175    pmulhrsw                m3, m0, [o(pw_4095x8)]        ;t63a
5176    pmulhrsw                m0, [o(pw_101x8)]             ;t32a
5177    pmulhrsw                m2, m1, [o(pw_2967x8)]        ;t62a
5178    pmulhrsw                m1, [o(pw_m2824x8)]           ;t33a
5179    mova                    m7, [o(pd_2048)]
5180    psubsw                  m4, m0, m1                    ;t33
5181    paddsw                  m0, m1                        ;t32
5182    psubsw                  m5, m3, m2                    ;t62
5183    paddsw                  m3, m2                        ;t63
5184    ITX_MULSUB_2W            5, 4, 1, 2, 7,  401, 4076    ;t33a, t62a
5185    mova [rsp+gprsize*2+16*35], m0                        ;t32
5186    mova [rsp+gprsize*2+16*36], m5                        ;t33a
5187    mova [rsp+gprsize*2+16*65], m4                        ;t62a
5188    mova [rsp+gprsize*2+16*66], m3                        ;t63
5189
5190    mova                    m0, [rsp+gprsize*2+16*63]     ;in17
5191    mova                    m1, [rsp+gprsize*2+16*37]     ;in15
5192    pmulhrsw                m3, m0, [o(pw_3745x8)]        ;t61a
5193    pmulhrsw                m0, [o(pw_1660x8)]            ;t34a
5194    pmulhrsw                m2, m1, [o(pw_3822x8)]        ;t60a
5195    pmulhrsw                m1, [o(pw_m1474x8)]           ;t35a
5196    psubsw                  m4, m1, m0                    ;t34
5197    paddsw                  m0, m1                        ;t35
5198    psubsw                  m5, m2, m3                    ;t61
5199    paddsw                  m3, m2                        ;t60
5200    ITX_MULSUB_2W            5, 4, 1, 2, 7, m4076, 401    ;t34a, t61a
5201    mova [rsp+gprsize*2+16*37], m5                        ;t34a
5202    mova [rsp+gprsize*2+16*38], m0                        ;t35
5203    mova [rsp+gprsize*2+16*63], m3                        ;t60
5204    mova [rsp+gprsize*2+16*64], m4                        ;t61a
5205
5206    mova                    m0, [rsp+gprsize*2+16*39]     ;in9
5207    mova                    m1, [rsp+gprsize*2+16*61]     ;in23
5208    pmulhrsw                m3, m0, [o(pw_3996x8)]        ;t59a
5209    pmulhrsw                m0, [o(pw_897x8)]             ;t36a
5210    pmulhrsw                m2, m1, [o(pw_3461x8)]        ;t58a
5211    pmulhrsw                m1, [o(pw_m2191x8)]           ;t37a
5212    psubsw                  m4, m0, m1                    ;t37
5213    paddsw                  m0, m1                        ;t36
5214    psubsw                  m5, m3, m2                    ;t58
5215    paddsw                  m3, m2                        ;t59
5216    ITX_MULSUB_2W            5, 4, 1, 2, 7, 3166, 2598    ;t37a, t58a
5217    mova [rsp+gprsize*2+16*39], m0                        ;t36
5218    mova [rsp+gprsize*2+16*40], m5                        ;t37a
5219    mova [rsp+gprsize*2+16*61], m4                        ;t58a
5220    mova [rsp+gprsize*2+16*62], m3                        ;t59
5221
5222    mova                    m0, [rsp+gprsize*2+16*59]     ;in25
5223    mova                    m1, [rsp+gprsize*2+16*41]     ;in7
5224    pmulhrsw                m3, m0, [o(pw_3349x8)]        ;t57a
5225    pmulhrsw                m0, [o(pw_2359x8)]            ;t38a
5226    pmulhrsw                m2, m1, [o(pw_4036x8)]        ;t56a
5227    pmulhrsw                m1, [o(pw_m700x8)]            ;t39a
5228    psubsw                  m4, m1, m0                    ;t38
5229    paddsw                  m0, m1                        ;t39
5230    psubsw                  m5, m2, m3                    ;t57
5231    paddsw                  m3, m2                        ;t56
5232    ITX_MULSUB_2W            5, 4, 1, 2, 7, m2598, 3166   ;t38a, t57a
5233    mova [rsp+gprsize*2+16*41], m5                        ;t38a
5234    mova [rsp+gprsize*2+16*42], m0                        ;t39
5235    mova [rsp+gprsize*2+16*59], m3                        ;t56
5236    mova [rsp+gprsize*2+16*60], m4                        ;t57a
5237
5238    mova                    m0, [rsp+gprsize*2+16*43]     ;in5
5239    mova                    m1, [rsp+gprsize*2+16*57]     ;in27
5240    pmulhrsw                m3, m0, [o(pw_4065x8)]        ;t55a
5241    pmulhrsw                m0, [o(pw_501x8)]             ;t40a
5242    pmulhrsw                m2, m1, [o(pw_3229x8)]        ;t54a
5243    pmulhrsw                m1, [o(pw_m2520x8)]           ;t41a
5244    psubsw                  m4, m0, m1                    ;t41
5245    paddsw                  m0, m1                        ;t40
5246    psubsw                  m5, m3, m2                    ;t54
5247    paddsw                  m3, m2                        ;t55
5248    ITX_MULSUB_2W            5, 4, 1, 2, 7, 1931, 3612    ;t41a, t54a
5249    mova [rsp+gprsize*2+16*43], m0                        ;t40
5250    mova [rsp+gprsize*2+16*44], m5                        ;t41a
5251    mova [rsp+gprsize*2+16*57], m4                        ;t54a
5252    mova [rsp+gprsize*2+16*58], m3                        ;t55
5253
5254    mova                    m0, [rsp+gprsize*2+16*55]     ;in21
5255    mova                    m1, [rsp+gprsize*2+16*45]     ;in11
5256    pmulhrsw                m3, m0, [o(pw_3564x8)]        ;t53a
5257    pmulhrsw                m0, [o(pw_2019x8)]            ;t42a
5258    pmulhrsw                m2, m1, [o(pw_3948x8)]        ;t52a
5259    pmulhrsw                m1, [o(pw_m1092x8)]           ;t43a
5260    psubsw                  m4, m1, m0                    ;t42
5261    paddsw                  m0, m1                        ;t43
5262    psubsw                  m5, m2, m3                    ;t53
5263    paddsw                  m3, m2                        ;t52
5264    ITX_MULSUB_2W            5, 4, 1, 2, 7, m3612, 1931   ;t42a, t53a
5265    mova [rsp+gprsize*2+16*45], m5                        ;t42a
5266    mova [rsp+gprsize*2+16*46], m0                        ;t43
5267    mova [rsp+gprsize*2+16*55], m3                        ;t52
5268    mova [rsp+gprsize*2+16*56], m4                        ;t53a
5269
5270    mova                    m0, [rsp+gprsize*2+16*47]     ;in13
5271    mova                    m1, [rsp+gprsize*2+16*53]     ;in19
5272    pmulhrsw                m3, m0, [o(pw_3889x8)]        ;t51a
5273    pmulhrsw                m0, [o(pw_1285x8)]            ;t44a
5274    pmulhrsw                m2, m1, [o(pw_3659x8)]        ;t50a
5275    pmulhrsw                m1, [o(pw_m1842x8)]           ;t45a
5276    psubsw                  m4, m0, m1                    ;t45
5277    paddsw                  m0, m1                        ;t44
5278    psubsw                  m5, m3, m2                    ;t50
5279    paddsw                  m3, m2                        ;t51
5280    ITX_MULSUB_2W            5, 4, 1, 2, 7, 3920, 1189    ;t45a, t50a
5281    mova                    m6, m0
5282    mova [rsp+gprsize*2+16*48], m5                        ;t45a
5283    mova [rsp+gprsize*2+16*53], m4                        ;t50a
5284    mova [rsp+gprsize*2+16*54], m3                        ;t51
5285
5286    mova                    m0, [rsp+gprsize*2+16*51]     ;in29
5287    mova                    m1, [rsp+gprsize*2+16*49]     ;in3
5288    pmulhrsw                m3, m0, [o(pw_3102x8)]        ;t49a
5289    pmulhrsw                m0, [o(pw_2675x8)]            ;t46a
5290    pmulhrsw                m2, m1, [o(pw_4085x8)]        ;t48a
5291    pmulhrsw                m1, [o(pw_m301x8)]            ;t47a
5292    psubsw                  m5, m1, m0                    ;t46
5293    paddsw                  m0, m1                        ;t47
5294    psubsw                  m4, m2, m3                    ;t49
5295    paddsw                  m3, m2                        ;t48
5296
5297ALIGN function_align
5298.main2:
5299    ITX_MULSUB_2W            4, 5, 1, 2, 7, m1189, 3920   ;t46a, t49a
5300    mova                    m1, [rsp+gprsize*2+16*54]     ;t51
5301    psubsw                  m2, m0, m6                    ;t44a
5302    paddsw                  m0, m6                        ;t47a
5303    psubsw                  m6, m3, m1                    ;t51a
5304    paddsw                  m3, m1                        ;t48a
5305    mova [rsp+gprsize*2+16*50], m0                        ;t47a
5306    mova [rsp+gprsize*2+16*51], m3                        ;t48a
5307    ITX_MULSUB_2W            6, 2, 0, 3, 7, m2276, 3406   ;t44, t51
5308    mova [rsp+gprsize*2+16*47], m6                        ;t44
5309    mova [rsp+gprsize*2+16*54], m2                        ;t51
5310
5311    mova                    m0, [rsp+gprsize*2+16*48]     ;t45a
5312    mova                    m3, [rsp+gprsize*2+16*53]     ;t50a
5313    psubsw                  m2, m4, m0                    ;t45
5314    paddsw                  m4, m0                        ;t46
5315    psubsw                  m6, m5, m3                    ;t50
5316    paddsw                  m5, m3                        ;t49
5317    ITX_MULSUB_2W            6, 2, 0, 3, 7, m2276, 3406   ;t45a, t50a
5318    mova [rsp+gprsize*2+16*48], m6                        ;t45a
5319    mova [rsp+gprsize*2+16*49], m4                        ;t46
5320    mova [rsp+gprsize*2+16*52], m5                        ;t49
5321    mova [rsp+gprsize*2+16*53], m2                        ;t50a
5322
5323    mova                    m0, [rsp+gprsize*2+16*43]     ;t40
5324    mova                    m2, [rsp+gprsize*2+16*46]     ;t43
5325    mova                    m3, [rsp+gprsize*2+16*55]     ;t52
5326    mova                    m1, [rsp+gprsize*2+16*58]     ;t55
5327    psubsw                  m4, m0, m2                    ;t43a
5328    paddsw                  m0, m2                        ;t40a
5329    psubsw                  m5, m1, m3                    ;t52a
5330    paddsw                  m1, m3                        ;t55a
5331    ITX_MULSUB_2W            5, 4, 2, 3, 7, 3406, 2276    ;t43, t52
5332    mova [rsp+gprsize*2+16*43], m0                        ;t40a
5333    mova [rsp+gprsize*2+16*46], m5                        ;t43
5334    mova [rsp+gprsize*2+16*55], m4                        ;t52
5335    mova [rsp+gprsize*2+16*58], m1                        ;t55a
5336
5337    mova                    m0, [rsp+gprsize*2+16*44]     ;t41a
5338    mova                    m2, [rsp+gprsize*2+16*45]     ;t42a
5339    mova                    m3, [rsp+gprsize*2+16*56]     ;t53a
5340    mova                    m1, [rsp+gprsize*2+16*57]     ;t54a
5341    psubsw                  m4, m0, m2                    ;t42
5342    paddsw                  m0, m2                        ;t41
5343    psubsw                  m5, m1, m3                    ;t53
5344    paddsw                  m1, m3                        ;t54
5345    ITX_MULSUB_2W            5, 4, 2, 3, 7, 3406, 2276    ;t42a, t53a
5346    mova [rsp+gprsize*2+16*44], m0                        ;t41
5347    mova [rsp+gprsize*2+16*45], m5                        ;t42a
5348    mova [rsp+gprsize*2+16*56], m4                        ;t53a
5349    mova [rsp+gprsize*2+16*57], m1                        ;t54
5350
5351    mova                    m0, [rsp+gprsize*2+16*41]     ;t38a
5352    mova                    m2, [rsp+gprsize*2+16*40]     ;t37a
5353    mova                    m3, [rsp+gprsize*2+16*61]     ;t58a
5354    mova                    m1, [rsp+gprsize*2+16*60]     ;t57a
5355    psubsw                  m4, m0, m2                    ;t37
5356    paddsw                  m0, m2                        ;t38
5357    psubsw                  m5, m1, m3                    ;t58
5358    paddsw                  m1, m3                        ;t57
5359    ITX_MULSUB_2W            5, 4, 2, 3, 7, m4017, 799    ;t37a, t58a
5360    mova [rsp+gprsize*2+16*41], m0                        ;t38
5361    mova [rsp+gprsize*2+16*40], m5                        ;t37a
5362    mova [rsp+gprsize*2+16*61], m4                        ;t58a
5363    mova [rsp+gprsize*2+16*60], m1                        ;t57
5364
5365    mova                    m0, [rsp+gprsize*2+16*42]     ;t39
5366    mova                    m2, [rsp+gprsize*2+16*39]     ;t36
5367    mova                    m3, [rsp+gprsize*2+16*62]     ;t59
5368    mova                    m1, [rsp+gprsize*2+16*59]     ;t56
5369    psubsw                  m4, m0, m2                    ;t36a
5370    paddsw                  m0, m2                        ;t39a
5371    psubsw                  m5, m1, m3                    ;t59a
5372    paddsw                  m1, m3                        ;t56a
5373    ITX_MULSUB_2W            5, 4, 2, 3, 7, m4017, 799    ;t36, t59
5374    mova [rsp+gprsize*2+16*42], m0                        ;t39a
5375    mova [rsp+gprsize*2+16*39], m5                        ;t36
5376    mova [rsp+gprsize*2+16*62], m4                        ;t59
5377    mova [rsp+gprsize*2+16*59], m1                        ;t56a
5378
5379    mova                    m0, [rsp+gprsize*2+16*35]     ;t32
5380    mova                    m2, [rsp+gprsize*2+16*38]     ;t35
5381    mova                    m3, [rsp+gprsize*2+16*63]     ;t60
5382    mova                    m1, [rsp+gprsize*2+16*66]     ;t63
5383    psubsw                  m4, m0, m2                    ;t35a
5384    paddsw                  m0, m2                        ;t32a
5385    psubsw                  m5, m1, m3                    ;t60a
5386    paddsw                  m1, m3                        ;t63a
5387    ITX_MULSUB_2W            5, 4, 2, 3, 7,  799, 4017    ;t35, t60
5388    mova [rsp+gprsize*2+16*35], m0                        ;t32a
5389    mova [rsp+gprsize*2+16*38], m5                        ;t35
5390    mova [rsp+gprsize*2+16*63], m4                        ;t60
5391    mova [rsp+gprsize*2+16*66], m1                        ;t63a
5392
5393    mova                    m0, [rsp+gprsize*2+16*36]     ;t33a
5394    mova                    m2, [rsp+gprsize*2+16*37]     ;t34a
5395    mova                    m3, [rsp+gprsize*2+16*64]     ;t61a
5396    mova                    m1, [rsp+gprsize*2+16*65]     ;t62a
5397    psubsw                  m4, m0, m2                    ;t34
5398    paddsw                  m0, m2                        ;t33
5399    psubsw                  m5, m1, m3                    ;t61
5400    paddsw                  m1, m3                        ;t62
5401    ITX_MULSUB_2W            5, 4, 2, 3, 7,  799, 4017    ;t34a, t61a
5402
5403    mova                    m2, [rsp+gprsize*2+16*41]     ;t38
5404    mova                    m3, [rsp+gprsize*2+16*60]     ;t57
5405    psubsw                  m6, m0, m2                    ;t38a
5406    paddsw                  m0, m2                        ;t33a
5407    psubsw                  m2, m1, m3                    ;t57a
5408    paddsw                  m1, m3                        ;t62a
5409    mova [rsp+gprsize*2+16*36], m0                        ;t33a
5410    mova [rsp+gprsize*2+16*65], m1                        ;t62a
5411    ITX_MULSUB_2W            2, 6, 0, 3, 7, 1567, 3784    ;t38, t57
5412    mova [rsp+gprsize*2+16*41], m2                        ;t38
5413    mova [rsp+gprsize*2+16*60], m6                        ;t57
5414
5415    mova                    m2, [rsp+gprsize*2+16*40]     ;t37a
5416    mova                    m3, [rsp+gprsize*2+16*61]     ;t58a
5417    psubsw                  m0, m5, m2                    ;t37
5418    paddsw                  m5, m2                        ;t34
5419    psubsw                  m1, m4, m3                    ;t58
5420    paddsw                  m4, m3                        ;t61
5421    ITX_MULSUB_2W            1, 0, 2, 3, 7, 1567, 3784    ;t37a, t58a
5422    mova [rsp+gprsize*2+16*37], m5                        ;t34
5423    mova [rsp+gprsize*2+16*64], m4                        ;t61
5424    mova [rsp+gprsize*2+16*40], m1                        ;t37a
5425    mova [rsp+gprsize*2+16*61], m0                        ;t58a
5426
5427    mova                    m0, [rsp+gprsize*2+16*38]     ;t35
5428    mova                    m2, [rsp+gprsize*2+16*39]     ;t36
5429    mova                    m3, [rsp+gprsize*2+16*62]     ;t59
5430    mova                    m1, [rsp+gprsize*2+16*63]     ;t60
5431    psubsw                  m4, m0, m2                    ;t36a
5432    paddsw                  m0, m2                        ;t35a
5433    psubsw                  m5, m1, m3                    ;t59a
5434    paddsw                  m1, m3                        ;t60a
5435    ITX_MULSUB_2W            5, 4, 2, 3, 7, 1567, 3784    ;t36, t59
5436    mova [rsp+gprsize*2+16*38], m0                        ;t35a
5437    mova [rsp+gprsize*2+16*39], m5                        ;t36
5438    mova [rsp+gprsize*2+16*62], m4                        ;t59
5439    mova [rsp+gprsize*2+16*63], m1                        ;t60a
5440
5441    mova                    m0, [rsp+gprsize*2+16*35]     ;t32a
5442    mova                    m2, [rsp+gprsize*2+16*42]     ;t39a
5443    mova                    m3, [rsp+gprsize*2+16*59]     ;t56a
5444    mova                    m1, [rsp+gprsize*2+16*66]     ;t63a
5445    psubsw                  m4, m0, m2                    ;t39
5446    paddsw                  m0, m2                        ;t32
5447    psubsw                  m5, m1, m3                    ;t56
5448    paddsw                  m1, m3                        ;t63
5449    ITX_MULSUB_2W            5, 4, 2, 3, 7, 1567, 3784    ;t39a, t56a
5450    mova [rsp+gprsize*2+16*35], m0                        ;t32
5451    mova [rsp+gprsize*2+16*42], m5                        ;t39a
5452    mova [rsp+gprsize*2+16*59], m4                        ;t56a
5453    mova [rsp+gprsize*2+16*66], m1                        ;t63
5454
5455    mova                    m0, [rsp+gprsize*2+16*50]     ;t47a
5456    mova                    m2, [rsp+gprsize*2+16*43]     ;t40a
5457    mova                    m3, [rsp+gprsize*2+16*58]     ;t55a
5458    mova                    m1, [rsp+gprsize*2+16*51]     ;t48a
5459    psubsw                  m4, m0, m2                    ;t40
5460    paddsw                  m0, m2                        ;t47
5461    psubsw                  m5, m1, m3                    ;t55
5462    paddsw                  m1, m3                        ;t48
5463    ITX_MULSUB_2W            5, 4, 2, 3, 7, m3784, 1567   ;t40a, t55a
5464    mova [rsp+gprsize*2+16*50], m0                        ;t47
5465    mova [rsp+gprsize*2+16*43], m5                        ;t40a
5466    mova [rsp+gprsize*2+16*58], m4                        ;t55a
5467    mova [rsp+gprsize*2+16*51], m1                        ;t48
5468
5469    mova                    m0, [rsp+gprsize*2+16*49]     ;t46
5470    mova                    m2, [rsp+gprsize*2+16*44]     ;t41
5471    mova                    m3, [rsp+gprsize*2+16*57]     ;t54
5472    mova                    m1, [rsp+gprsize*2+16*52]     ;t49
5473    psubsw                  m4, m0, m2                    ;t41a
5474    paddsw                  m0, m2                        ;t46a
5475    psubsw                  m5, m1, m3                    ;t54a
5476    paddsw                  m1, m3                        ;t49a
5477    ITX_MULSUB_2W            5, 4, 2, 3, 7, m3784, 1567   ;t41, t54
5478    mova [rsp+gprsize*2+16*49], m0                        ;t46a
5479    mova [rsp+gprsize*2+16*44], m5                        ;t41
5480    mova [rsp+gprsize*2+16*57], m4                        ;t54
5481    mova [rsp+gprsize*2+16*52], m1                        ;t49a
5482
5483    mova                    m0, [rsp+gprsize*2+16*48]     ;t45a
5484    mova                    m2, [rsp+gprsize*2+16*45]     ;t42a
5485    mova                    m3, [rsp+gprsize*2+16*56]     ;t53a
5486    mova                    m1, [rsp+gprsize*2+16*53]     ;t50a
5487    psubsw                  m4, m0, m2                    ;t42
5488    paddsw                  m0, m2                        ;t45
5489    psubsw                  m5, m1, m3                    ;t53
5490    paddsw                  m1, m3                        ;t50
5491    ITX_MULSUB_2W            5, 4, 2, 3, 7, m3784, 1567   ;t42a, t53a
5492    mova [rsp+gprsize*2+16*48], m0                        ;t45
5493    mova [rsp+gprsize*2+16*45], m5                        ;t42a
5494    mova [rsp+gprsize*2+16*56], m4                        ;t53a
5495    mova [rsp+gprsize*2+16*53], m1                        ;t50
5496
5497    mova                    m0, [rsp+gprsize*2+16*47]     ;t44
5498    mova                    m2, [rsp+gprsize*2+16*46]     ;t43
5499    mova                    m3, [rsp+gprsize*2+16*55]     ;t52
5500    mova                    m1, [rsp+gprsize*2+16*54]     ;t51
5501    psubsw                  m4, m0, m2                    ;t43a
5502    paddsw                  m0, m2                        ;t44a
5503    psubsw                  m5, m1, m3                    ;t52a
5504    paddsw                  m1, m3                        ;t51a
5505    ITX_MULSUB_2W            5, 4, 2, 3, 7, m3784, 1567   ;t43, t52
5506
5507    mova                    m2, [rsp+gprsize*2+16*38]     ;t35a
5508    mova                    m3, [rsp+gprsize*2+16*31]     ;tmp[28]
5509    psubsw                  m6, m2, m0                    ;t44
5510    paddsw                  m2, m0                        ;t35
5511    psubsw                  m0, m3, m2                    ;out35
5512    paddsw                  m2, m3                        ;out28
5513    mova                    m3, [rsp+gprsize*2+16*63]     ;t60a
5514    mova [rsp+gprsize*2+16*38], m0                        ;out35
5515    mova [rsp+gprsize*2+16*31], m2                        ;out28
5516    psubsw                  m0, m3, m1                    ;t51
5517    paddsw                  m3, m1                        ;t60
5518    ITX_MULSUB_2W            0, 6, 1, 2, 7, 2896, 2896    ;t44a, t51a
5519    mova                    m2, [rsp+gprsize*2+16*6 ]     ;tmp[3]
5520    psubsw                  m1, m2, m3                    ;out60
5521    paddsw                  m2, m3                        ;out3
5522    mova                    m3, [rsp+gprsize*2+16*22]     ;tmp[19]
5523    mova [rsp+gprsize*2+16*63], m1                        ;out60
5524    mova [rsp+gprsize*2+16*6 ], m2                        ;out3
5525    psubsw                  m1, m3, m0                    ;out44
5526    paddsw                  m3, m0                        ;out19
5527    mova                    m2, [rsp+gprsize*2+16*15]     ;tmp[12]
5528
5529    mova                    m0, [rsp+gprsize*2+16*39]     ;t36
5530    mova [rsp+gprsize*2+16*47], m1                        ;out44
5531    mova [rsp+gprsize*2+16*22], m3                        ;out19
5532    mova                    m1, [rsp+gprsize*2+16*62]     ;t59
5533    psubsw                  m3, m2, m6                    ;out51
5534    paddsw                  m2, m6                        ;out12
5535    mova [rsp+gprsize*2+16*54], m3                        ;out51
5536    mova [rsp+gprsize*2+16*15], m2                        ;out12
5537    psubsw                  m2, m0, m5                    ;t43a
5538    paddsw                  m0, m5                        ;t36a
5539    mova                    m5, [rsp+gprsize*2+16*30]     ;tmp[27]
5540    psubsw                  m3, m1, m4                    ;t52a
5541    paddsw                  m1, m4                        ;t59a
5542    ITX_MULSUB_2W            3, 2, 4, 6, 7, 2896, 2896    ;t43, t52
5543    mova                    m4, [rsp+gprsize*2+16*7 ]     ;tmp[4 ]
5544    psubsw                  m6, m5, m0                    ;out36
5545    paddsw                  m5, m0                        ;out27
5546    psubsw                  m0, m4, m1                    ;out59
5547    paddsw                  m4, m1                        ;out4
5548    mova [rsp+gprsize*2+16*39], m6                        ;out36
5549    mova [rsp+gprsize*2+16*30], m5                        ;out27
5550    mova [rsp+gprsize*2+16*62], m0                        ;out59
5551    mova [rsp+gprsize*2+16*7 ], m4                        ;out4
5552    mova                    m0, [rsp+gprsize*2+16*23]     ;tmp[20]
5553    mova                    m5, [rsp+gprsize*2+16*14]     ;tmp[11]
5554    psubsw                  m4, m0, m3                    ;out43
5555    paddsw                  m0, m3                        ;out20
5556    psubsw                  m6, m5, m2                    ;out52
5557    paddsw                  m5, m2                        ;out11
5558    mova [rsp+gprsize*2+16*46], m4                        ;out43
5559    mova [rsp+gprsize*2+16*23], m0                        ;out20
5560    mova [rsp+gprsize*2+16*55], m6                        ;out52
5561    mova [rsp+gprsize*2+16*14], m5                        ;out11
5562
5563    mova                    m0, [rsp+gprsize*2+16*40]     ;t37a
5564    mova                    m5, [rsp+gprsize*2+16*45]     ;t42a
5565    mova                    m3, [rsp+gprsize*2+16*56]     ;t53a
5566    mova                    m1, [rsp+gprsize*2+16*61]     ;t58a
5567    mova                    m2, [rsp+gprsize*2+16*29]     ;tmp[26]
5568    psubsw                  m4, m0, m5                    ;t42
5569    paddsw                  m0, m5                        ;t37
5570    psubsw                  m5, m1, m3                    ;t53
5571    paddsw                  m1, m3                        ;t58
5572    ITX_MULSUB_2W            5, 4, 3, 6, 7, 2896, 2896    ;t42a, t53a
5573    mova                    m3, [rsp+gprsize*2+16*8 ]     ;tmp[5 ]
5574    psubsw                  m6, m2, m0                    ;out37
5575    paddsw                  m2, m0                        ;out26
5576    psubsw                  m0, m3, m1                    ;out58
5577    paddsw                  m3, m1                        ;out5
5578    mova [rsp+gprsize*2+16*40], m6                        ;out37
5579    mova [rsp+gprsize*2+16*29], m2                        ;out26
5580    mova [rsp+gprsize*2+16*61], m0                        ;out58
5581    mova [rsp+gprsize*2+16*8 ], m3                        ;out5
5582    mova                    m0, [rsp+gprsize*2+16*24]     ;tmp[21]
5583    mova                    m1, [rsp+gprsize*2+16*13]     ;tmp[10]
5584    psubsw                  m2, m0, m5                    ;out42
5585    paddsw                  m0, m5                        ;out21
5586    psubsw                  m3, m1, m4                    ;out53
5587    paddsw                  m1, m4                        ;out10
5588    mova [rsp+gprsize*2+16*45], m2                        ;out42
5589    mova [rsp+gprsize*2+16*24], m0                        ;out21
5590    mova [rsp+gprsize*2+16*56], m3                        ;out53
5591    mova [rsp+gprsize*2+16*13], m1                        ;out10
5592
5593    mova                    m0, [rsp+gprsize*2+16*41]     ;t38
5594    mova                    m5, [rsp+gprsize*2+16*44]     ;t41
5595    mova                    m3, [rsp+gprsize*2+16*57]     ;t54
5596    mova                    m1, [rsp+gprsize*2+16*60]     ;t57
5597    mova                    m2, [rsp+gprsize*2+16*28]     ;tmp[25]
5598    psubsw                  m4, m0, m5                    ;t41a
5599    paddsw                  m0, m5                        ;t38a
5600    psubsw                  m5, m1, m3                    ;t54a
5601    paddsw                  m1, m3                        ;t57a
5602    ITX_MULSUB_2W            5, 4, 3, 6, 7, 2896, 2896    ;t41a, t54a
5603    mova                    m3, [rsp+gprsize*2+16*9 ]     ;tmp[6 ]
5604    psubsw                  m6, m2, m0                    ;out38
5605    paddsw                  m2, m0                        ;out25
5606    psubsw                  m0, m3, m1                    ;out57
5607    paddsw                  m3, m1                        ;out6
5608    mova [rsp+gprsize*2+16*41], m6                        ;out38
5609    mova [rsp+gprsize*2+16*28], m2                        ;out25
5610    mova [rsp+gprsize*2+16*60], m0                        ;out57
5611    mova [rsp+gprsize*2+16*9 ], m3                        ;out6
5612    mova                    m0, [rsp+gprsize*2+16*25]     ;tmp[22]
5613    mova                    m1, [rsp+gprsize*2+16*12]     ;tmp[9 ]
5614    psubsw                  m2, m0, m5                    ;out41
5615    paddsw                  m0, m5                        ;out22
5616    psubsw                  m3, m1, m4                    ;out54
5617    paddsw                  m1, m4                        ;out9
5618    mova [rsp+gprsize*2+16*44], m2                        ;out41
5619    mova [rsp+gprsize*2+16*25], m0                        ;out22
5620    mova [rsp+gprsize*2+16*57], m3                        ;out54
5621    mova [rsp+gprsize*2+16*12], m1                        ;out9
5622
5623    mova                    m0, [rsp+gprsize*2+16*42]     ;t39a
5624    mova                    m5, [rsp+gprsize*2+16*43]     ;t40a
5625    mova                    m3, [rsp+gprsize*2+16*58]     ;t55a
5626    mova                    m1, [rsp+gprsize*2+16*59]     ;t56a
5627    mova                    m2, [rsp+gprsize*2+16*27]     ;tmp[24]
5628    psubsw                  m4, m0, m5                    ;t40
5629    paddsw                  m0, m5                        ;t39
5630    psubsw                  m5, m1, m3                    ;t55
5631    paddsw                  m1, m3                        ;t56
5632    ITX_MULSUB_2W            5, 4, 3, 6, 7, 2896, 2896    ;t40a, t55a
5633    mova                    m3, [rsp+gprsize*2+16*10]     ;tmp[7 ]
5634    psubsw                  m6, m2, m0                    ;out39
5635    paddsw                  m2, m0                        ;out24
5636    psubsw                  m0, m3, m1                    ;out56
5637    paddsw                  m3, m1                        ;out7
5638    mova [rsp+gprsize*2+16*42], m6                        ;out39
5639    mova [rsp+gprsize*2+16*27], m2                        ;out24
5640    mova [rsp+gprsize*2+16*59], m0                        ;out56
5641    mova [rsp+gprsize*2+16*10], m3                        ;out7
5642    mova                    m0, [rsp+gprsize*2+16*26]     ;tmp[23]
5643    mova                    m1, [rsp+gprsize*2+16*11]     ;tmp[8 ]
5644    psubsw                  m2, m0, m5                    ;out40
5645    paddsw                  m0, m5                        ;out23
5646    psubsw                  m3, m1, m4                    ;out55
5647    paddsw                  m1, m4                        ;out8
5648    mova [rsp+gprsize*2+16*43], m2                        ;out40
5649    mova [rsp+gprsize*2+16*26], m0                        ;out23
5650    mova [rsp+gprsize*2+16*58], m3                        ;out55
5651    mova [rsp+gprsize*2+16*11], m1                        ;out8
5652
5653    mova                    m0, [rsp+gprsize*2+16*37]     ;t34
5654    mova                    m5, [rsp+gprsize*2+16*48]     ;t45
5655    mova                    m3, [rsp+gprsize*2+16*53]     ;t50
5656    mova                    m1, [rsp+gprsize*2+16*64]     ;t61
5657    mova                    m2, [rsp+gprsize*2+16*32]     ;tmp[29]
5658    psubsw                  m4, m0, m5                    ;t45a
5659    paddsw                  m0, m5                        ;t34a
5660    psubsw                  m5, m1, m3                    ;t50a
5661    paddsw                  m1, m3                        ;t61a
5662    ITX_MULSUB_2W            5, 4, 3, 6, 7, 2896, 2896    ;t45, t50
5663    mova                    m3, [rsp+gprsize*2+16*5 ]     ;tmp[2 ]
5664    psubsw                  m6, m2, m0                    ;out34
5665    paddsw                  m2, m0                        ;out29
5666    psubsw                  m0, m3, m1                    ;out61
5667    paddsw                  m3, m1                        ;out2
5668    mova [rsp+gprsize*2+16*37], m6                        ;out34
5669    mova [rsp+gprsize*2+16*32], m2                        ;out29
5670    mova [rsp+gprsize*2+16*64], m0                        ;out61
5671    mova [rsp+gprsize*2+16*5 ], m3                        ;out2
5672    mova                    m0, [rsp+gprsize*2+16*21]     ;tmp[18]
5673    mova                    m1, [rsp+gprsize*2+16*16]     ;tmp[13]
5674    psubsw                  m2, m0, m5                    ;out45
5675    paddsw                  m0, m5                        ;out18
5676    psubsw                  m3, m1, m4                    ;out50
5677    paddsw                  m1, m4                        ;out13
5678    mova [rsp+gprsize*2+16*48], m2                        ;out45
5679    mova [rsp+gprsize*2+16*21], m0                        ;out18
5680    mova [rsp+gprsize*2+16*53], m3                        ;out50
5681    mova [rsp+gprsize*2+16*16], m1                        ;out13
5682
5683    mova                    m0, [rsp+gprsize*2+16*36]     ;t33a
5684    mova                    m5, [rsp+gprsize*2+16*49]     ;t46a
5685    mova                    m3, [rsp+gprsize*2+16*52]     ;t49a
5686    mova                    m1, [rsp+gprsize*2+16*65]     ;t62a
5687    mova                    m2, [rsp+gprsize*2+16*33]     ;tmp[30]
5688    psubsw                  m4, m0, m5                    ;t46
5689    paddsw                  m0, m5                        ;t33
5690    psubsw                  m5, m1, m3                    ;t49
5691    paddsw                  m1, m3                        ;t62
5692    ITX_MULSUB_2W            5, 4, 3, 6, 7, 2896, 2896    ;t46a, t49a
5693    mova                    m3, [rsp+gprsize*2+16*4 ]     ;tmp[1 ]
5694    psubsw                  m6, m2, m0                    ;out33
5695    paddsw                  m2, m0                        ;out30
5696    psubsw                  m0, m3, m1                    ;out62
5697    paddsw                  m3, m1                        ;out1
5698    mova [rsp+gprsize*2+16*36], m6                        ;out33
5699    mova [rsp+gprsize*2+16*33], m2                        ;out30
5700    mova [rsp+gprsize*2+16*65], m0                        ;out62
5701    mova [rsp+gprsize*2+16*4 ], m3                        ;out1
5702    mova                    m0, [rsp+gprsize*2+16*20]     ;tmp[17]
5703    mova                    m1, [rsp+gprsize*2+16*17]     ;tmp[14]
5704    psubsw                  m2, m0, m5                    ;out46
5705    paddsw                  m0, m5                        ;out17
5706    psubsw                  m3, m1, m4                    ;out49
5707    paddsw                  m1, m4                        ;out14
5708    mova [rsp+gprsize*2+16*49], m2                        ;out46
5709    mova [rsp+gprsize*2+16*20], m0                        ;out17
5710    mova [rsp+gprsize*2+16*52], m3                        ;out49
5711    mova [rsp+gprsize*2+16*17], m1                        ;out14
5712
5713    mova                    m0, [rsp+gprsize*2+16*35]     ;t32
5714    mova                    m5, [rsp+gprsize*2+16*50]     ;t47
5715    mova                    m3, [rsp+gprsize*2+16*51]     ;t48
5716    mova                    m1, [rsp+gprsize*2+16*66]     ;t63
5717    mova                    m2, [rsp+gprsize*2+16*34]     ;tmp[31]
5718    psubsw                  m4, m0, m5                    ;t47a
5719    paddsw                  m0, m5                        ;t32a
5720    psubsw                  m5, m1, m3                    ;t48a
5721    paddsw                  m1, m3                        ;t63a
5722    ITX_MULSUB_2W            5, 4, 3, 6, 7, 2896, 2896    ;t47, t48
5723    mova                    m3, [rsp+gprsize*2+16*3 ]     ;tmp[0 ]
5724    psubsw                  m6, m2, m0                    ;out32
5725    paddsw                  m2, m0                        ;out31
5726    psubsw                  m0, m3, m1                    ;out63
5727    paddsw                  m3, m1                        ;out0
5728    mova [rsp+gprsize*2+16*35], m6                        ;out32
5729    mova [rsp+gprsize*2+16*34], m2                        ;out31
5730    mova [rsp+gprsize*2+16*66], m0                        ;out63
5731    mova [rsp+gprsize*2+16*3 ], m3                        ;out0
5732    mova                    m0, [rsp+gprsize*2+16*19]     ;tmp[16]
5733    mova                    m1, [rsp+gprsize*2+16*18]     ;tmp[15]
5734    psubsw                  m2, m0, m5                    ;out47
5735    paddsw                  m0, m5                        ;out16
5736    psubsw                  m3, m1, m4                    ;out48
5737    paddsw                  m1, m4                        ;out15
5738    mova [rsp+gprsize*2+16*50], m2                        ;out47
5739    mova [rsp+gprsize*2+16*19], m0                        ;out16
5740    mova [rsp+gprsize*2+16*51], m3                        ;out48
5741    mova [rsp+gprsize*2+16*18], m1                        ;out15
5742    ret
5743
5744
; inv_txfm_add_dct_dct_64x16_8bpc(dst, stride, coeff, eob)
; 64x16 inverse DCT + add for 8-bit pixels.
;   eob != 0: the whole transform is done by idct_64x16_internal_8bpc.
;   eob == 0: DC-only shortcut; a single value is added to every pixel.
; .body/.loop are also the shared DC-only tail of the 64x32 and 64x64 entry
; points in this file, which jump to .body with
;   m0   = DC already multiplied by pw_2896x8 (once or twice)
;   m1   = pw_2896x8, m2 = first rounding constant
;   r3d  = number of rows to process
;   tx2q = address to jump to once all rows are written
cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 6, 8, 16*132, dst, stride, coeff, eob, tx2
%if ARCH_X86_32
    LEA                     r5, $$
%endif
    test                  eobd, eobd
    jz .dconly

    call m(idct_64x16_internal_8bpc)
    RET

.dconly:
    movd                    m2, [o(pw_8192)]
    movd                    m1, [o(pw_2896x8)]
    lea                   tx2q, [o(.end)]
    pmulhrsw                m0, m1, [coeffq]
    mov               [coeffq], eobd          ; eob is 0 on this path: clears the DC coefficient
    mov                    r3d, 16            ; row count; has to follow the store (r3 is also eob)

.body:
    pxor                    m7, m7            ; zero, used to widen bytes to words
    pmulhrsw                m0, m2
    movd                    m2, [o(pw_2048)]  ;intentionally rip-relative
    pmulhrsw                m0, m1
    pmulhrsw                m0, m2
    pshuflw                 m0, m0, q0000
    punpcklwd               m0, m0            ; low word of m0 now fills all eight words

.loop:
    ; one 64-byte row per iteration
    mova                    m1, [dstq+16*0]
    mova                    m3, [dstq+16*1]
    mova                    m5, [dstq+16*2]
    mova                    m6, [dstq+16*3]
    ; bytes 0-31: widen, add the DC term, pack back with unsigned saturation
    punpckhbw               m4, m3, m7
    punpckhbw               m2, m1, m7
    punpcklbw               m3, m7
    punpcklbw               m1, m7
    paddw                   m1, m0
    paddw                   m2, m0
    paddw                   m3, m0
    paddw                   m4, m0
    packuswb                m1, m2
    packuswb                m3, m4
    ; bytes 32-63: same again, reusing m2/m4 as the high halves
    punpckhbw               m4, m6, m7
    punpckhbw               m2, m5, m7
    punpcklbw               m6, m7
    punpcklbw               m5, m7
    paddw                   m5, m0
    paddw                   m2, m0
    paddw                   m6, m0
    paddw                   m4, m0
    packuswb                m5, m2
    packuswb                m6, m4
    mova           [dstq+16*0], m1
    mova           [dstq+16*1], m3
    mova           [dstq+16*2], m5
    mova           [dstq+16*3], m6
    add                   dstq, strideq
    dec                    r3d
    jg .loop
    jmp                   tx2q                ; back to whichever entry point set tx2q

.end:
    RET
5808
5809
; LOAD_4ROWS src, stride[, is_rect2]
; Loads four lines, `stride` bytes apart starting at `src`, into m0-m3.
; When is_rect2 is non-zero each line is multiplied by pw_2896x8 (pmulhrsw)
; as it is loaded; m3 holds the constant until its own line is fetched last.
%macro LOAD_4ROWS 2-3 0 ;src, stride, is_rect2

%if %3 == 0
    mova                 m3, [%1+%2*3]
    mova                 m2, [%1+%2*2]
    mova                 m1, [%1+%2*1]
    mova                 m0, [%1+%2*0]
%else
    mova                 m3, [o(pw_2896x8)]
    pmulhrsw             m2, m3, [%1+%2*2]
    pmulhrsw             m1, m3, [%1+%2*1]
    pmulhrsw             m0, m3, [%1+%2*0]
    pmulhrsw             m3, [%1+%2*3]
%endif
%endmacro
5825
; LOAD_4ROWS_H src, stride
; Companion of LOAD_4ROWS for the upper register half: loads four lines,
; `stride` bytes apart starting at `src`, into m4-m7 without any scaling.
%macro LOAD_4ROWS_H 2 ;src, stride
    mova                 m7, [%1+%2*3]
    mova                 m6, [%1+%2*2]
    mova                 m5, [%1+%2*1]
    mova                 m4, [%1+%2*0]
%endmacro
5832
; idct_64x16_internal_8bpc
; Two-pass worker reached with `call` from inv_txfm_add_dct_dct_64x16_8bpc.
; It is declared with no arguments, registers or stack of its own and works
; in memory addressed from rsp - presumably the 16*132 bytes reserved by that
; wrapper. Every offset carries +gprsize, presumably to step over the return
; address pushed by the call.
; NOTE: x86inc names the arguments r0..r4 in order, so eob is r3 and tx2 is r4;
; r3d/r3 is reused below as a loop counter and as a temporary dst copy.
; Stack slots used here:
;   [rsp+gprsize+16*0]           m7 spill around the shared helpers
;   [rsp+gprsize+16*3 .. 16*66]  64 working lines for the .main helpers
;   [rsp+gprsize*1+16*67]        pass-2 loop counter
;   [rsp+gprsize*2+16*67]        destination pointer (saved, then advanced)
;   [rsp+gprsize+16*68 ...]      pass-1 output taken from working lines 35-66
; Pass 1 runs twice, 16 bytes of every 32-byte coefficient line per turn, and
; only reads lines 0-31. Pass 2 runs four times over coeffq and four times
; over the stack copy, advancing dst by 8 bytes per turn.
5833cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
5834    mov                    r3d, 2                        ;pass-1 iteration count
5835    mov  [rsp+gprsize*2+16*67], dstq                     ;park the caller's dst
5836    lea                   dstq, [rsp+gprsize+16*68]      ;dstq = stack buffer during pass 1
5837
5838.pass1_loop:
    ; lines 0, 8, 16, 24 in m0-m3, m4-m7 zeroed
5839    LOAD_4ROWS     coeffq+32*0, 32*8
5840    pxor                    m4, m4
5841    REPX          {mova x, m4}, m5, m6, m7
5842    call  m(idct_8x8_internal_8bpc).main
5843    SAVE_7ROWS    rsp+gprsize+16*3, 16
5844
    ; lines 4, 12, 20, 28 in m0-m3, m4-m7 zeroed
5845    pxor                    m4, m4
5846    LOAD_4ROWS     coeffq+32*4, 32*8
5847
5848    REPX          {mova x, m4}, m5, m6, m7
5849    call m(idct_16x8_internal_8bpc).main
5850    mova                    m7, [rsp+gprsize+16*0]        ;m7 comes back through the spill slot
5851    SAVE_8ROWS   rsp+gprsize+16*11, 16
5852
    ; lines 2, 6, ..., 30 scattered over slots 19-26 (presumably the order
    ; idct_8x32 .main_fast expects; that routine is outside this chunk)
5853    LOAD_8ROWS     coeffq+32*2, 32*4
5854    mova   [rsp+gprsize+16*19], m0
5855    mova   [rsp+gprsize+16*26], m1
5856    mova   [rsp+gprsize+16*23], m2
5857    mova   [rsp+gprsize+16*22], m3
5858    mova   [rsp+gprsize+16*21], m4
5859    mova   [rsp+gprsize+16*24], m5
5860    mova   [rsp+gprsize+16*25], m6
5861    mova   [rsp+gprsize+16*20], m7
5862
5863    call m(idct_8x32_internal_8bpc).main_fast
5864    SAVE_8ROWS    rsp+gprsize+16*3, 16
5865
    ; odd lines 1-15, then 17-31, scattered over slots 35-66 for the
    ; 64-point stage
5866    LOAD_8ROWS     coeffq+32*1, 32*2
5867    mova   [rsp+gprsize+16*35], m0                        ;in1
5868    mova   [rsp+gprsize+16*49], m1                        ;in3
5869    mova   [rsp+gprsize+16*43], m2                        ;in5
5870    mova   [rsp+gprsize+16*41], m3                        ;in7
5871    mova   [rsp+gprsize+16*39], m4                        ;in9
5872    mova   [rsp+gprsize+16*45], m5                        ;in11
5873    mova   [rsp+gprsize+16*47], m6                        ;in13
5874    mova   [rsp+gprsize+16*37], m7                        ;in15
5875
5876    LOAD_8ROWS    coeffq+32*17, 32*2
5877    mova   [rsp+gprsize+16*63], m0                        ;in17
5878    mova   [rsp+gprsize+16*53], m1                        ;in19
5879    mova   [rsp+gprsize+16*55], m2                        ;in21
5880    mova   [rsp+gprsize+16*61], m3                        ;in23
5881    mova   [rsp+gprsize+16*59], m4                        ;in25
5882    mova   [rsp+gprsize+16*57], m5                        ;in27
5883    mova   [rsp+gprsize+16*51], m6                        ;in29
5884    mova   [rsp+gprsize+16*65], m7                        ;in31
5885
5886    call m(idct_16x64_internal_8bpc).main
5887
    ; Write-back chain: eight groups of 8 working lines each take a trip
    ; through idct_8x8 .pass1_end1 with m7 = pw_8192 and tx2q holding the
    ; label to continue at (what that routine does to m0-m7 is not visible
    ; here). Groups from slots 3-34 go back to coeffq, groups from slots
    ; 35-66 go to the stack buffer in dstq.
5888    LOAD_8ROWS    rsp+gprsize+16*3, 16
5889    mova    [rsp+gprsize+16*0], m7
5890    mova                    m7, [o(pw_8192)]
5891    lea                   tx2q, [o(.pass1_end)]
5892    jmp   m(idct_8x8_internal_8bpc).pass1_end1
5893
5894.pass1_end:
5895    SAVE_8ROWS     coeffq+32*0, 32
5896    LOAD_8ROWS   rsp+gprsize+16*11, 16
5897    mova    [rsp+gprsize+16*0], m7
5898    mova                    m7, [o(pw_8192)]
5899    lea                   tx2q, [o(.pass1_end1)]
5900    jmp   m(idct_8x8_internal_8bpc).pass1_end1
5901
5902.pass1_end1:
5903    SAVE_8ROWS     coeffq+32*8, 32
5904    LOAD_8ROWS   rsp+gprsize+16*19, 16
5905    mova    [rsp+gprsize+16*0], m7
5906    mova                    m7, [o(pw_8192)]
5907    lea                   tx2q, [o(.pass1_end2)]
5908    jmp   m(idct_8x8_internal_8bpc).pass1_end1
5909
5910.pass1_end2:
5911    SAVE_8ROWS    coeffq+32*16, 32
5912    LOAD_8ROWS   rsp+gprsize+16*27, 16
5913    mova    [rsp+gprsize+16*0], m7
5914    mova                    m7, [o(pw_8192)]
5915    lea                   tx2q, [o(.pass1_end3)]
5916    jmp   m(idct_8x8_internal_8bpc).pass1_end1
5917
5918.pass1_end3:
5919    SAVE_8ROWS    coeffq+32*24, 32
5920    LOAD_8ROWS   rsp+gprsize+16*35, 16
5921    mova    [rsp+gprsize+16*0], m7
5922    mova                    m7, [o(pw_8192)]
5923    lea                   tx2q, [o(.pass1_end4)]
5924    jmp   m(idct_8x8_internal_8bpc).pass1_end1
5925
5926.pass1_end4:
5927    SAVE_8ROWS       dstq+32*0, 32
5928    LOAD_8ROWS   rsp+gprsize+16*43, 16
5929    mova    [rsp+gprsize+16*0], m7
5930    mova                    m7, [o(pw_8192)]
5931    lea                   tx2q, [o(.pass1_end5)]
5932    jmp   m(idct_8x8_internal_8bpc).pass1_end1
5933
5934.pass1_end5:
5935    SAVE_8ROWS       dstq+32*8, 32
5936    LOAD_8ROWS   rsp+gprsize+16*51, 16
5937    mova    [rsp+gprsize+16*0], m7
5938    mova                    m7, [o(pw_8192)]
5939    lea                   tx2q, [o(.pass1_end6)]
5940    jmp   m(idct_8x8_internal_8bpc).pass1_end1
5941
5942.pass1_end6:
5943    SAVE_8ROWS      dstq+32*16, 32
5944    LOAD_8ROWS   rsp+gprsize+16*59, 16
5945    mova    [rsp+gprsize+16*0], m7
5946    mova                    m7, [o(pw_8192)]
5947    lea                   tx2q, [o(.pass1_end7)]
5948    jmp   m(idct_8x8_internal_8bpc).pass1_end1
5949
5950.pass1_end7:
5951    SAVE_8ROWS      dstq+32*24, 32
5952
    ; next 16-byte strip of every line, for both output buffers
5953    add                 coeffq, 16
5954    add                   dstq, 16
5955    dec                    r3d
5956    jg .pass1_loop
5957
5958.pass2:
5959    mov                   dstq, [rsp+gprsize*2+16*67]     ;real destination again
5960    sub                 coeffq, 32                        ;undo the two 16-byte steps of pass 1
5961    mov                    r3d, 4
5962
    ; First half of pass 2: input is the pass-1 output stored in coeffq.
5963.pass2_loop:
5964    mov  [rsp+gprsize*1+16*67], r3d                       ;counter to memory, r3 is reused below
5965
5966    LOAD_4ROWS     coeffq+16*0, 32*2
5967    LOAD_4ROWS_H   coeffq+16*1, 32*2
5968    call  m(idct_8x8_internal_8bpc).main
5969    SAVE_7ROWS    rsp+gprsize+16*3, 16
5970    LOAD_4ROWS     coeffq+16*2, 32*2
5971    LOAD_4ROWS_H   coeffq+16*3, 32*2
5972    call m(idct_16x8_internal_8bpc).main
5973
    ; idct_8x8 .end is entered twice with tx2q as the continuation: first
    ; with dstq moved 8 rows down, then (.end) with the saved lines and the
    ; original dstq (presumably it writes 8 output rows each time).
5974    mov                    r3, dstq
5975    lea                  tx2q, [o(.end)]
5976    lea                  dstq, [dstq+strideq*8]
5977    jmp  m(idct_8x8_internal_8bpc).end
5978
5979.end:
5980    LOAD_8ROWS   rsp+gprsize+16*3, 16
5981    mova   [rsp+gprsize+16*0], m7
5982    lea                  tx2q, [o(.end1)]
5983    mov                  dstq, r3
5984    jmp  m(idct_8x8_internal_8bpc).end
5985
5986.end1:
    ; clear the 16*16 bytes of coefficients just consumed
5987    pxor                   m7, m7
5988    REPX  {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15
5989
5990    add                 coeffq, 16*16
5991    mov                    r3d, [rsp+gprsize*1+16*67]
5992    mov                   dstq, [rsp+gprsize*2+16*67]
5993    add                   dstq, 8                         ;next 8-byte-wide destination strip
5994    mov  [rsp+gprsize*2+16*67], dstq
5995    dec                    r3d
5996    jg .pass2_loop
5997
    ; Second half of pass 2: same steps, reading from the stack buffer that
    ; pass 1 filled through dstq; that buffer is not cleared afterwards.
5998    mov                    r3d, 4
5999    lea                 coeffq, [rsp+gprsize+16*68]
6000.pass2_loop2:
6001    mov  [rsp+gprsize*1+16*67], r3d
6002
6003    LOAD_4ROWS     coeffq+16*0, 32*2
6004    LOAD_4ROWS_H   coeffq+16*1, 32*2
6005    call  m(idct_8x8_internal_8bpc).main
6006    SAVE_7ROWS    rsp+gprsize+16*3, 16
6007    LOAD_4ROWS     coeffq+16*2, 32*2
6008    LOAD_4ROWS_H   coeffq+16*3, 32*2
6009    call m(idct_16x8_internal_8bpc).main
6010
6011    mov                    r3, dstq
6012    lea                  tx2q, [o(.end2)]
6013    lea                  dstq, [dstq+strideq*8]
6014    jmp  m(idct_8x8_internal_8bpc).end
6015
6016.end2:
6017    LOAD_8ROWS   rsp+gprsize+16*3, 16
6018    mova   [rsp+gprsize+16*0], m7
6019    lea                  tx2q, [o(.end3)]
6020    mov                  dstq, r3
6021    jmp  m(idct_8x8_internal_8bpc).end
6022
6023.end3:
6024
6025    add                 coeffq, 16*16
6026    mov                    r3d, [rsp+gprsize*1+16*67]
6027    mov                   dstq, [rsp+gprsize*2+16*67]
6028    add                   dstq, 8
6029    mov  [rsp+gprsize*2+16*67], dstq
6030    dec                    r3d
6031    jg .pass2_loop2
6032    ret
6033
6034
; inv_txfm_add_dct_dct_32x64_8bpc(dst, stride, coeff, eob)
; 32x64 inverse DCT + add for 8-bit pixels.
;   eob != 0: handled by idct_32x64_internal_8bpc.
;   eob == 0: DC-only; the DC is multiplied by pw_2896x8 twice here and the
;             rest is done by the 32x8 function's .body with r3d = 64 rows,
;             which is handed .end in tx2q as the address to continue at.
cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2
%if ARCH_X86_32
    LEA                     r5, $$
%endif
    test                  eobd, eobd
    jz .dconly
    call m(idct_32x64_internal_8bpc)
.end:
    RET

.dconly:
    movd                    m2, [o(pw_16384)]
    movd                    m1, [o(pw_2896x8)]
    lea                   tx2q, [o(.end)]
    pmulhrsw                m0, m1, [coeffq]
    mov               [coeffq], eobd          ; eob is 0 on this path: clears the DC coefficient
    mov                    r3d, 64            ; row count; has to follow the store (r3 is also eob)
    pmulhrsw                m0, m1
    jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body
6054
6055
; idct_32x64_internal_8bpc
; Worker reached with `call` from inv_txfm_add_dct_dct_32x64_8bpc; it works
; in stack memory addressed from rsp (presumably the wrapper's 16*68 bytes,
; offset by gprsize for the return address).
; NOTE: x86inc names the arguments r0..r4 in order, so eob is r3 and tx2 is r4;
; that is why eob-136 is spilled before r3d becomes the loop counter.
; Stack slots used here:
;   [rsp+gprsize+16*0]           m7 spill around the shared helpers
;   [rsp+gprsize+16*3 .. 16*34]  32 working lines
;   [rsp+gprsize*1+16*67]        eob - 136 (negative selects the fast path)
;   [rsp+gprsize*2+16*67]        coeffq on entry; reused for dst+8 in pass 2
; Pass 1 runs 4 times (2 when eob < 136) over 16-byte strips of 64-byte
; coefficient lines and writes its result back over the coefficients.
; Pass 2 is idct_16x64's .pass2_loop, entered by jump.
6056cglobal idct_32x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
6057    mov                    r4d, 2
6058    sub                   eobd, 136
6059    mov  [rsp+gprsize*1+16*67], eobd
6060    mov                    r3d, 4
6061    cmovs                  r3d, r4d                       ;sign flag still from the sub: eob < 136 -> 2 turns
6062
6063%if ARCH_X86_32
6064    LEA                     r5, $$
6065%endif
6066
6067    mov  [rsp+gprsize*2+16*67], coeffq
6068
6069.pass1_loop:
    ; odd lines 1-15; the trailing 1 presumably enables the pw_2896x8
    ; pre-scale in LOAD_8ROWS, as is_rect2 does in LOAD_4ROWS
6070    LOAD_8ROWS     coeffq+64*1, 64*2, 1
6071    mova   [rsp+gprsize+16*19], m0                        ;in1
6072    mova   [rsp+gprsize+16*26], m1                        ;in3
6073    mova   [rsp+gprsize+16*23], m2                        ;in5
6074    mova   [rsp+gprsize+16*22], m3                        ;in7
6075    mova   [rsp+gprsize+16*21], m4                        ;in9
6076    mova   [rsp+gprsize+16*24], m5                        ;in11
6077    mova   [rsp+gprsize+16*25], m6                        ;in13
6078    mova   [rsp+gprsize+16*20], m7                        ;in15
6079
6080    mov                   tx2d, [rsp+gprsize*1+16*67]     ;tx2 as scratch: reload eob - 136
6081    test                  tx2d, tx2d
6082    jl .fast
6083
    ; eob >= 136: all 32 lines are read
6084.full:
6085    LOAD_8ROWS     coeffq+64*0, 64*4, 1
6086    call  m(idct_8x8_internal_8bpc).main
6087    SAVE_7ROWS    rsp+gprsize+16*3, 16
6088    LOAD_8ROWS     coeffq+64*2, 64*4, 1
6089    call m(idct_16x8_internal_8bpc).main
6090    mova                    m7, [rsp+gprsize+16*0]
6091    SAVE_8ROWS   rsp+gprsize+16*11, 16
6092
6093    LOAD_8ROWS    coeffq+64*17, 64*2, 1
6094    mova   [rsp+gprsize+16*33], m0                        ;in17
6095    mova   [rsp+gprsize+16*28], m1                        ;in19
6096    mova   [rsp+gprsize+16*29], m2                        ;in21
6097    mova   [rsp+gprsize+16*32], m3                        ;in23
6098    mova   [rsp+gprsize+16*31], m4                        ;in25
6099    mova   [rsp+gprsize+16*30], m5                        ;in27
6100    mova   [rsp+gprsize+16*27], m6                        ;in29
6101    mova   [rsp+gprsize+16*34], m7                        ;in31
6102
6103    call m(idct_8x32_internal_8bpc).main
6104    jmp .pass1_end
6105
    ; eob < 136: only lines 0-15 are read, the upper register half is zeroed
6106.fast:
6107    LOAD_4ROWS          coeffq, 256, 1
6108    pxor                    m4, m4
6109    REPX          {mova x, m4}, m5, m6, m7
6110    call  m(idct_8x8_internal_8bpc).main
6111
6112    SAVE_7ROWS    rsp+gprsize+16*3, 16
6113    LOAD_4ROWS    coeffq+128*1, 256, 1
6114    pxor                    m4, m4
6115    REPX          {mova x, m4}, m5, m6, m7
6116    call m(idct_16x8_internal_8bpc).main
6117    mova                    m7, [rsp+gprsize+16*0]
6118    SAVE_8ROWS   rsp+gprsize+16*11, 16
6119
6120    call m(idct_8x32_internal_8bpc).main_fast
6121
    ; Write-back chain: four groups of 8 working lines go through idct_8x8
    ; .pass1_end (tx2q = label to continue at) and are stored back over the
    ; coefficients at a 64-byte pitch.
6122.pass1_end:
6123    mova    [rsp+gprsize+16*0], m7
6124    lea                   tx2q, [o(.pass1_end1)]
6125    jmp   m(idct_8x8_internal_8bpc).pass1_end
6126
6127.pass1_end1:
6128    SAVE_8ROWS     coeffq+64*0, 64
6129    LOAD_8ROWS   rsp+gprsize+16*11, 16
6130    mova    [rsp+gprsize+16*0], m7
6131    lea                   tx2q, [o(.pass1_end2)]
6132    jmp   m(idct_8x8_internal_8bpc).pass1_end
6133
6134.pass1_end2:
6135    SAVE_8ROWS     coeffq+64*8, 64
6136    LOAD_8ROWS   rsp+gprsize+16*19, 16
6137    mova    [rsp+gprsize+16*0], m7
6138    lea                   tx2q, [o(.pass1_end3)]
6139    jmp   m(idct_8x8_internal_8bpc).pass1_end
6140
6141.pass1_end3:
6142    SAVE_8ROWS    coeffq+64*16, 64
6143    LOAD_8ROWS   rsp+gprsize+16*27, 16
6144    mova    [rsp+gprsize+16*0], m7
6145    lea                   tx2q, [o(.pass1_end4)]
6146    jmp   m(idct_8x8_internal_8bpc).pass1_end
6147
6148.pass1_end4:
6149    SAVE_8ROWS    coeffq+64*24, 64
6150
6151    add                 coeffq, 16                        ;next 16-byte strip
6152    dec                    r3d
6153    jg .pass1_loop
6154
    ; Hand over to idct_16x64's pass-2 loop: coeffq rewound to its entry
    ; value, r3d = 4 turns, the stack slot holds dst+8 and r4 holds the
    ; address of that routine's .end1 (how the loop uses them is defined
    ; outside this chunk).
6155.pass2:
6156    mov                 coeffq, [rsp+gprsize*2+16*67]
6157    mov                    r3d, 4
6158    lea                     r4, [dstq+8]
6159    mov  [rsp+gprsize*2+16*67], r4
6160    lea                     r4, [o(m(idct_16x64_internal_8bpc).end1)]
6161    jmp m(idct_16x64_internal_8bpc).pass2_loop
6162
6163
; inv_txfm_add_dct_dct_64x32_8bpc(dst, stride, coeff, eob)
; 64x32 inverse DCT + add for 8-bit pixels.
;   eob != 0: handled by idct_64x32_internal_8bpc.
;   eob == 0: DC-only; the DC is multiplied by pw_2896x8 twice here and the
;             rest is done by the 64x16 function's .body with r3d = 32 rows,
;             which jumps back to .end through tx2q.
; .end is also used as the exit of the 64x64 DC-only path.
cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx2
%if ARCH_X86_32
    LEA                     r5, $$
%endif
    test                  eobd, eobd
    jz .dconly
    call m(idct_64x32_internal_8bpc)
.end:
    RET

.dconly:
    movd                    m2, [o(pw_16384)]
    movd                    m1, [o(pw_2896x8)]
    lea                   tx2q, [o(.end)]
    pmulhrsw                m0, m1, [coeffq]
    mov               [coeffq], eobd          ; eob is 0 on this path: clears the DC coefficient
    mov                    r3d, 32            ; row count; has to follow the store (r3 is also eob)
    pmulhrsw                m0, m1
    jmp m(inv_txfm_add_dct_dct_64x16_8bpc).body
6183
6184
; idct_64x32_internal_8bpc
; Worker reached with `call` from inv_txfm_add_dct_dct_64x32_8bpc; it works
; in stack memory addressed from rsp (presumably the wrapper's 16*197 bytes,
; offset by gprsize for the return address).
; NOTE: x86inc names the arguments r0..r4 in order, so eob is r3 and tx2 is r4;
; eob-136 is spilled before r3d becomes the loop counter.
; Stack slots used here:
;   [rsp+gprsize+16*0]           m7 spill around the shared helpers
;   [rsp+gprsize+16*3 .. 16*66]  64 working lines
;   [rsp+gprsize*1+16*67]        eob - 136
;   [rsp+gprsize*2+16*67]        coeffq on entry
;   [rsp+gprsize*3+16*67]        dstq on entry
;   [rsp+gprsize*4+16*67]        address of the stack buffer below
;   [rsp+gprsize+16*69 ...]      pass-1 output taken from working lines 35-66
;   [rsp+gprsize*1..3+16*35]     parameters of idct_32x32's pass-2 loop
; Pass 1 runs 4 times (2 when eob < 136) over 16-byte strips of 64-byte
; coefficient lines, reading lines 0-31 only, each pre-scaled by pw_2896x8.
; Pass 2 reuses idct_32x32's .pass2_loop twice: first on the stack buffer
; with dst+32, then on coeffq with the original dst.
6185cglobal idct_64x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
6186    mov                    r4d, 2
6187    sub                   eobd, 136
6188    mov  [rsp+gprsize*1+16*67], eobd
6189    mov                    r3d, 4
6190    cmovs                  r3d, r4d                       ;sign flag still from the sub: eob < 136 -> 2 turns
6191
6192%if ARCH_X86_32
6193    LEA                     r5, $$
6194%endif
6195
6196    mov  [rsp+gprsize*2+16*67], coeffq
6197    mov  [rsp+gprsize*3+16*67], dstq
6198    lea                   dstq, [rsp+gprsize+16*69]      ;dstq = stack buffer during pass 1
6199    mov  [rsp+gprsize*4+16*67], dstq
6200
6201.pass1_loop:
    ; lines 0, 8, 16, 24 in m0-m3, m4-m7 zeroed
6202    LOAD_4ROWS     coeffq+64*0, 64*8, 1
6203    pxor                    m4, m4
6204    REPX          {mova x, m4}, m5, m6, m7
6205    call  m(idct_8x8_internal_8bpc).main
6206    SAVE_7ROWS    rsp+gprsize+16*3, 16
6207
    ; lines 4, 12, 20, 28 in m0-m3, m4-m7 zeroed
6208    pxor                    m4, m4
6209    LOAD_4ROWS     coeffq+64*4, 64*8, 1
6210
6211    REPX          {mova x, m4}, m5, m6, m7
6212    call m(idct_16x8_internal_8bpc).main
6213    mova                    m7, [rsp+gprsize+16*0]
6214    SAVE_8ROWS   rsp+gprsize+16*11, 16
6215
    ; lines 2, 6, ..., 30 scattered over slots 19-26
6216    LOAD_8ROWS     coeffq+64*2, 64*4, 1
6217    mova   [rsp+gprsize+16*19], m0
6218    mova   [rsp+gprsize+16*26], m1
6219    mova   [rsp+gprsize+16*23], m2
6220    mova   [rsp+gprsize+16*22], m3
6221    mova   [rsp+gprsize+16*21], m4
6222    mova   [rsp+gprsize+16*24], m5
6223    mova   [rsp+gprsize+16*25], m6
6224    mova   [rsp+gprsize+16*20], m7
6225
6226    call m(idct_8x32_internal_8bpc).main_fast
6227    SAVE_8ROWS    rsp+gprsize+16*3, 16
6228
    ; odd lines 1-15, then 17-31, scattered over slots 35-66 for the
    ; 64-point stage
6229    LOAD_8ROWS     coeffq+64*1, 64*2, 1
6230    mova   [rsp+gprsize+16*35], m0                        ;in1
6231    mova   [rsp+gprsize+16*49], m1                        ;in3
6232    mova   [rsp+gprsize+16*43], m2                        ;in5
6233    mova   [rsp+gprsize+16*41], m3                        ;in7
6234    mova   [rsp+gprsize+16*39], m4                        ;in9
6235    mova   [rsp+gprsize+16*45], m5                        ;in11
6236    mova   [rsp+gprsize+16*47], m6                        ;in13
6237    mova   [rsp+gprsize+16*37], m7                        ;in15
6238
6239    LOAD_8ROWS    coeffq+64*17, 64*2, 1
6240    mova   [rsp+gprsize+16*63], m0                        ;in17
6241    mova   [rsp+gprsize+16*53], m1                        ;in19
6242    mova   [rsp+gprsize+16*55], m2                        ;in21
6243    mova   [rsp+gprsize+16*61], m3                        ;in23
6244    mova   [rsp+gprsize+16*59], m4                        ;in25
6245    mova   [rsp+gprsize+16*57], m5                        ;in27
6246    mova   [rsp+gprsize+16*51], m6                        ;in29
6247    mova   [rsp+gprsize+16*65], m7                        ;in31
6248
6249    call m(idct_16x64_internal_8bpc).main
6250
    ; Write-back chain: eight groups of 8 working lines go through idct_8x8
    ; .pass1_end (tx2q = label to continue at). Groups from slots 3-34 are
    ; stored over the coefficients, groups from slots 35-66 into the stack
    ; buffer in dstq, both at a 64-byte pitch.
6251    LOAD_8ROWS    rsp+gprsize+16*3, 16
6252    mova    [rsp+gprsize+16*0], m7
6253    lea                   tx2q, [o(.pass1_end)]
6254    jmp   m(idct_8x8_internal_8bpc).pass1_end
6255
6256.pass1_end:
6257    SAVE_8ROWS     coeffq+64*0, 64
6258    LOAD_8ROWS   rsp+gprsize+16*11, 16
6259    mova    [rsp+gprsize+16*0], m7
6260    lea                   tx2q, [o(.pass1_end1)]
6261    jmp   m(idct_8x8_internal_8bpc).pass1_end
6262
6263.pass1_end1:
6264    SAVE_8ROWS     coeffq+64*8, 64
6265    LOAD_8ROWS   rsp+gprsize+16*19, 16
6266    mova    [rsp+gprsize+16*0], m7
6267    lea                   tx2q, [o(.pass1_end2)]
6268    jmp   m(idct_8x8_internal_8bpc).pass1_end
6269
6270.pass1_end2:
6271    SAVE_8ROWS    coeffq+64*16, 64
6272    LOAD_8ROWS   rsp+gprsize+16*27, 16
6273    mova    [rsp+gprsize+16*0], m7
6274    lea                   tx2q, [o(.pass1_end3)]
6275    jmp   m(idct_8x8_internal_8bpc).pass1_end
6276
6277.pass1_end3:
6278    SAVE_8ROWS    coeffq+64*24, 64
6279    LOAD_8ROWS   rsp+gprsize+16*35, 16
6280    mova    [rsp+gprsize+16*0], m7
6281    lea                   tx2q, [o(.pass1_end4)]
6282    jmp   m(idct_8x8_internal_8bpc).pass1_end
6283
6284.pass1_end4:
6285    SAVE_8ROWS       dstq+64*0, 64
6286    LOAD_8ROWS   rsp+gprsize+16*43, 16
6287    mova    [rsp+gprsize+16*0], m7
6288    lea                   tx2q, [o(.pass1_end5)]
6289    jmp   m(idct_8x8_internal_8bpc).pass1_end
6290
6291.pass1_end5:
6292    SAVE_8ROWS       dstq+64*8, 64
6293    LOAD_8ROWS   rsp+gprsize+16*51, 16
6294    mova    [rsp+gprsize+16*0], m7
6295    lea                   tx2q, [o(.pass1_end6)]
6296    jmp   m(idct_8x8_internal_8bpc).pass1_end
6297
6298.pass1_end6:
6299    SAVE_8ROWS      dstq+64*16, 64
6300    LOAD_8ROWS   rsp+gprsize+16*59, 16
6301    mova    [rsp+gprsize+16*0], m7
6302    lea                   tx2q, [o(.pass1_end7)]
6303    jmp   m(idct_8x8_internal_8bpc).pass1_end
6304
6305.pass1_end7:
6306    SAVE_8ROWS      dstq+64*24, 64
6307
    ; next 16-byte strip of every line, for both output buffers
6308    add                 coeffq, 16
6309    add                   dstq, 16
6310    dec                    r3d
6311    jg .pass1_loop
6312
    ; Pass 2, first run: input = stack buffer, output = dst + 32 bytes.
    ; eob-136 is copied next to the 16*35 slots (presumably where the
    ; 32x32 loop looks for it) before r3d, the same register, is set to 4.
6313.pass2:
6314    mov                 coeffq, [rsp+gprsize*4+16*67]
6315    mov                   dstq, [rsp+gprsize*3+16*67]
6316    mov                   eobd, [rsp+gprsize*1+16*67]
6317    lea                   dstq, [dstq+32]
6318    mov  [rsp+gprsize*1+16*35], eobd
6319    lea                   tx2q, [o(.pass2_end)]
6320    mov                    r3d, 4
6321    jmp m(idct_32x32_internal_8bpc).pass2_loop
6322
    ; continuation reached through tx2q after each turn of the 32x32 loop
6323.pass2_end:
6324    mova    [rsp+gprsize+16*0], m7
6325    lea                     r3, [o(.pass2_end1)]
6326    jmp  m(idct_8x32_internal_8bpc).end2
6327
6328.pass2_end1:
6329    lea                   tx2q, [o(.pass2_end)]
6330    add                 coeffq, 16*32
6331    mov                   dstq, [rsp+gprsize*2+16*35]
6332    mov                    r3d, [rsp+gprsize*3+16*35]     ;counter, presumably saved there by the 32x32 loop
6333    dec                    r3d
6334    jg m(idct_32x32_internal_8bpc).pass2_loop
6335
    ; Pass 2, second run: input = coefficients, output = original dst; the
    ; continuation is now idct_32x32's own .pass2_end, so control does not
    ; come back here.
6336.pass2_end2:
6337    mov                   dstq, [rsp+gprsize*3+16*67]
6338    mov                 coeffq, [rsp+gprsize*2+16*67]
6339    lea                   tx2q, [o(m(idct_32x32_internal_8bpc).pass2_end)]
6340    mov                    r3d, 4
6341    jmp m(idct_32x32_internal_8bpc).pass2_loop
6342
6343
; inv_txfm_add_dct_dct_64x64_8bpc(dst, stride, coeff, eob)
; 64x64 inverse DCT + add for 8-bit pixels.
;   eob != 0: handled by idct_64x64_internal_8bpc.
;   eob == 0: DC-only; the 64x16 function's .body does the work with
;             r3d = 64 rows and then jumps through tx2q to the 64x32
;             function's .end, which is declared with the same 16*197 stack
;             size as this one.
cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx2
%if ARCH_X86_32
    LEA                     r5, $$
%endif
    test                  eobd, eobd
    jz .dconly

    call m(idct_64x64_internal_8bpc)
    RET

.dconly:
    movd                    m2, [o(pw_8192)]
    movd                    m1, [o(pw_2896x8)]
    lea                   tx2q, [o(m(inv_txfm_add_dct_dct_64x32_8bpc).end)]
    pmulhrsw                m0, m1, [coeffq]
    mov               [coeffq], eobd          ; eob is 0 on this path: clears the DC coefficient
    mov                    r3d, 64            ; row count; has to follow the store (r3 is also eob)
    jmp m(inv_txfm_add_dct_dct_64x16_8bpc).body
6362
; idct_64x64_internal_8bpc
; Worker reached with `call` from inv_txfm_add_dct_dct_64x64_8bpc; it works
; in stack memory addressed from rsp (presumably the wrapper's 16*197 bytes,
; offset by gprsize for the return address).
; NOTE: x86inc names the arguments r0..r4 in order, so eob is r3 and tx2 is r4.
; Stack slots used here:
;   [rsp+gprsize+16*0]           m7 spill around the shared helpers
;   [rsp+gprsize+16*3 .. 16*66]  64 working lines
;   [rsp+gprsize*1+16*67]        eob - 136
;   [rsp+gprsize*2+16*67]        stack-buffer address, later the next dst
;   [rsp+gprsize*3+16*67]        dstq on entry; read back as the pass-2 counter
;   [rsp+gprsize*4+16*67]        coeffq on entry
;   [rsp+gprsize+16*69 ...]      pass-1 output taken from working lines 35-66
; Pass 1 runs 4 times (2 when eob < 136) over 16-byte strips of 64-byte
; coefficient lines, reading lines 0-31 only. Pass 2 reuses idct_16x64's
; .pass2_loop twice: on the stack buffer with dst+32, then on coeffq with
; the original dst.
6363cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
6364    mov                    r5d, 4
6365    mov                    r4d, 2
6366    sub                   eobd, 136
6367    cmovns                 r4d, r5d                       ;r4d = 4 when eob >= 136, else 2
6368
    ; r5 was just used as a temporary, so the x86-32 base is (re)loaded here
6369%if ARCH_X86_32
6370    LEA                     r5, $$
6371%endif
6372
6373    mov  [rsp+gprsize*1+16*67], eobd
6374    mov                    r3d, r4d                       ;pass-1 iteration count (overwrites eob)
6375    mov  [rsp+gprsize*4+16*67], coeffq
6376    mov  [rsp+gprsize*3+16*67], dstq
6377    lea                   dstq, [rsp+gprsize+16*69]      ;dstq = stack buffer during pass 1
6378    mov  [rsp+gprsize*2+16*67], dstq
6379
6380.pass1_loop:
    ; lines 0, 8, 16, 24 in m0-m3, m4-m7 zeroed
6381    LOAD_4ROWS     coeffq+64*0, 64*8
6382    pxor                    m4, m4
6383    REPX          {mova x, m4}, m5, m6, m7
6384    call  m(idct_8x8_internal_8bpc).main
6385    SAVE_7ROWS    rsp+gprsize+16*3, 16
6386
    ; lines 4, 12, 20, 28 in m0-m3, m4-m7 zeroed
6387    pxor                    m4, m4
6388    LOAD_4ROWS     coeffq+64*4, 64*8
6389
6390    REPX          {mova x, m4}, m5, m6, m7
6391    call m(idct_16x8_internal_8bpc).main
6392    mova                    m7, [rsp+gprsize+16*0]
6393    SAVE_8ROWS   rsp+gprsize+16*11, 16
6394
    ; lines 2, 6, ..., 30 scattered over slots 19-26
6395    LOAD_8ROWS     coeffq+64*2, 64*4
6396    mova   [rsp+gprsize+16*19], m0
6397    mova   [rsp+gprsize+16*26], m1
6398    mova   [rsp+gprsize+16*23], m2
6399    mova   [rsp+gprsize+16*22], m3
6400    mova   [rsp+gprsize+16*21], m4
6401    mova   [rsp+gprsize+16*24], m5
6402    mova   [rsp+gprsize+16*25], m6
6403    mova   [rsp+gprsize+16*20], m7
6404
6405    call m(idct_8x32_internal_8bpc).main_fast
6406    SAVE_8ROWS    rsp+gprsize+16*3, 16
6407
    ; odd lines 1-15, then 17-31, scattered over slots 35-66 for the
    ; 64-point stage
6408    LOAD_8ROWS     coeffq+64*1, 64*2
6409    mova   [rsp+gprsize+16*35], m0                        ;in1
6410    mova   [rsp+gprsize+16*49], m1                        ;in3
6411    mova   [rsp+gprsize+16*43], m2                        ;in5
6412    mova   [rsp+gprsize+16*41], m3                        ;in7
6413    mova   [rsp+gprsize+16*39], m4                        ;in9
6414    mova   [rsp+gprsize+16*45], m5                        ;in11
6415    mova   [rsp+gprsize+16*47], m6                        ;in13
6416    mova   [rsp+gprsize+16*37], m7                        ;in15
6417
6418    LOAD_8ROWS    coeffq+64*17, 64*2
6419    mova   [rsp+gprsize+16*63], m0                        ;in17
6420    mova   [rsp+gprsize+16*53], m1                        ;in19
6421    mova   [rsp+gprsize+16*55], m2                        ;in21
6422    mova   [rsp+gprsize+16*61], m3                        ;in23
6423    mova   [rsp+gprsize+16*59], m4                        ;in25
6424    mova   [rsp+gprsize+16*57], m5                        ;in27
6425    mova   [rsp+gprsize+16*51], m6                        ;in29
6426    mova   [rsp+gprsize+16*65], m7                        ;in31
6427
6428    call m(idct_16x64_internal_8bpc).main
6429
    ; Write-back chain: eight groups of 8 working lines go through idct_8x8
    ; .pass1_end1 with m7 = pw_8192 and tx2q = label to continue at. Groups
    ; from slots 3-34 are stored over the coefficients, groups from slots
    ; 35-66 into the stack buffer in dstq, both at a 64-byte pitch.
6430    LOAD_8ROWS    rsp+gprsize+16*3, 16
6431    mova    [rsp+gprsize+16*0], m7
6432    mova                    m7, [o(pw_8192)]
6433    lea                   tx2q, [o(.pass1_end)]
6434    jmp   m(idct_8x8_internal_8bpc).pass1_end1
6435
6436.pass1_end:
6437    SAVE_8ROWS     coeffq+64*0, 64
6438    LOAD_8ROWS   rsp+gprsize+16*11, 16
6439    mova    [rsp+gprsize+16*0], m7
6440    mova                    m7, [o(pw_8192)]
6441    lea                   tx2q, [o(.pass1_end1)]
6442    jmp   m(idct_8x8_internal_8bpc).pass1_end1
6443
6444.pass1_end1:
6445    SAVE_8ROWS     coeffq+64*8, 64
6446    LOAD_8ROWS   rsp+gprsize+16*19, 16
6447    mova    [rsp+gprsize+16*0], m7
6448    mova                    m7, [o(pw_8192)]
6449    lea                   tx2q, [o(.pass1_end2)]
6450    jmp   m(idct_8x8_internal_8bpc).pass1_end1
6451
6452.pass1_end2:
6453    SAVE_8ROWS    coeffq+64*16, 64
6454    LOAD_8ROWS   rsp+gprsize+16*27, 16
6455    mova    [rsp+gprsize+16*0], m7
6456    mova                    m7, [o(pw_8192)]
6457    lea                   tx2q, [o(.pass1_end3)]
6458    jmp   m(idct_8x8_internal_8bpc).pass1_end1
6459
6460.pass1_end3:
6461    SAVE_8ROWS    coeffq+64*24, 64
6462    LOAD_8ROWS   rsp+gprsize+16*35, 16
6463    mova    [rsp+gprsize+16*0], m7
6464    mova                    m7, [o(pw_8192)]
6465    lea                   tx2q, [o(.pass1_end4)]
6466    jmp   m(idct_8x8_internal_8bpc).pass1_end1
6467
6468.pass1_end4:
6469    SAVE_8ROWS       dstq+64*0, 64
6470    LOAD_8ROWS   rsp+gprsize+16*43, 16
6471    mova    [rsp+gprsize+16*0], m7
6472    mova                    m7, [o(pw_8192)]
6473    lea                   tx2q, [o(.pass1_end5)]
6474    jmp   m(idct_8x8_internal_8bpc).pass1_end1
6475
6476.pass1_end5:
6477    SAVE_8ROWS       dstq+64*8, 64
6478    LOAD_8ROWS   rsp+gprsize+16*51, 16
6479    mova    [rsp+gprsize+16*0], m7
6480    mova                    m7, [o(pw_8192)]
6481    lea                   tx2q, [o(.pass1_end6)]
6482    jmp   m(idct_8x8_internal_8bpc).pass1_end1
6483
6484.pass1_end6:
6485    SAVE_8ROWS      dstq+64*16, 64
6486    LOAD_8ROWS   rsp+gprsize+16*59, 16
6487    mova    [rsp+gprsize+16*0], m7
6488    mova                    m7, [o(pw_8192)]
6489    lea                   tx2q, [o(.pass1_end7)]
6490    jmp   m(idct_8x8_internal_8bpc).pass1_end1
6491
6492.pass1_end7:
6493    SAVE_8ROWS      dstq+64*24, 64
6494
    ; next 16-byte strip of every line, for both output buffers
6495    add                 coeffq, 16
6496    add                   dstq, 16
6497    dec                    r3d
6498    jg .pass1_loop
6499
    ; Pass 2, first run: input = stack buffer, output starts at dst + 32.
    ; The gprsize*2 slot switches role here from buffer address to "next
    ; destination" (dst+40); r4 carries the continuation for the 16x64 loop.
6500.pass2:
6501    mov                   dstq, [rsp+gprsize*3+16*67]
6502    mov                 coeffq, [rsp+gprsize*2+16*67]
6503    lea                   dstq, [dstq+32]
6504    mov                    r3d, 4
6505    lea                     r4, [dstq+8]
6506    mov  [rsp+gprsize*2+16*67], r4
6507    lea                     r4, [o(.pass2_end)]
6508    jmp m(idct_16x64_internal_8bpc).pass2_loop
6509
    ; continuation reached through r4 (presumably once per loop turn)
6510.pass2_end:
6511    LOAD_8ROWS   rsp+gprsize+16*35, 16
6512    lea                   dstq, [dstq+strideq*2]
6513    lea                     r3, [rsp+16*32+gprsize]
6514    mova    [rsp+gprsize+16*0], m7
6515    call m(idct_16x64_internal_8bpc).write
6516    mov                   dstq, [rsp+gprsize*2+16*67]     ;destination for the next turn
6517    mov                    r3d, [rsp+gprsize*3+16*67]     ;counter, presumably saved there by the 16x64 loop
6518    lea                     r4, [dstq+8]
6519    mov  [rsp+gprsize*2+16*67], r4
6520    lea                     r4, [o(.pass2_end)]
6521
6522    dec                    r3d
6523    jg  m(idct_16x64_internal_8bpc).pass2_loop
6524
    ; Pass 2, second run: input = coefficients. The slot now holds
    ; dst + 32 + 5*8 (set once in .pass2, advanced in each of the four
    ; .pass2_end visits, assuming the external loop leaves it alone), so
    ; subtracting 72 gives the original dst. The continuation is
    ; idct_16x64's own .end1, so control does not come back here.
6525.pass2_end2:
6526    mov                 coeffq, [rsp+gprsize*4+16*67]
6527    mov                   dstq, [rsp+gprsize*2+16*67]
6528    mov                    r3d, 4
6529    sub                   dstq, 72
6530    lea                     r4, [dstq+8]
6531    mov  [rsp+gprsize*2+16*67], r4
6532    lea                     r4, [o(m(idct_16x64_internal_8bpc).end1)]
6533    jmp m(idct_16x64_internal_8bpc).pass2_loop
6534