1; Copyright © 2020, VideoLAN and dav1d authors
2; Copyright © 2020, Two Orioles, LLC
3; All rights reserved.
4;
5; Redistribution and use in source and binary forms, with or without
6; modification, are permitted provided that the following conditions are met:
7;
8; 1. Redistributions of source code must retain the above copyright notice, this
9;    list of conditions and the following disclaimer.
10;
11; 2. Redistributions in binary form must reproduce the above copyright notice,
12;    this list of conditions and the following disclaimer in the documentation
13;    and/or other materials provided with the distribution.
14;
15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
26%include "config.asm"
27%include "ext/x86/x86inc.asm"
28
29%if ARCH_X86_64
30
31SECTION_RODATA 64
32int8_permA:  db  0,  1, 16, 17, 32, 33, 48, 49,  2,  3, 18, 19, 34, 35, 50, 51
33             db  4,  5, 20, 21, 36, 37, 52, 53,  6,  7, 22, 23, 38, 39, 54, 55
34             db  8,  9, 24, 25, 40, 41, 56, 57, 10, 11, 26, 27, 42, 43, 58, 59
35             db 12, 13, 28, 29, 44, 45, 60, 61, 14, 15, 30, 31, 46, 47, 62, 63
36int8_permB:  db  0,  1, 16, 17, 32, 33, 48, 49,  2,  3, 18, 19, 34, 35, 50, 51
37             db  8,  9, 24, 25, 40, 41, 56, 57, 10, 11, 26, 27, 42, 43, 58, 59
38             db  4,  5, 20, 21, 36, 37, 52, 53,  6,  7, 22, 23, 38, 39, 54, 55
39             db 12, 13, 28, 29, 44, 45, 60, 61, 14, 15, 30, 31, 46, 47, 62, 63
40int16_perm:  db  0,  1, 32, 33,  2,  3, 34, 35,  4,  5, 36, 37,  6,  7, 38, 39
41             db  8,  9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47
42             db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55
43             db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63
44dup16_perm:  db  0,  1,  0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7
45             db  8,  9,  8,  9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 14, 15
46             db 16, 17, 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23
47             db 24, 25, 24, 25, 26, 27, 26, 27, 28, 29, 28, 29, 30, 31, 30, 31
48idtx_16x4p:  db  0,  1,  4,  5, 16, 17, 20, 21,  2,  3,  6,  7, 18, 19, 22, 23
49             db 32, 33, 36, 37, 48, 49, 52, 53, 34, 35, 38, 39, 50, 51, 54, 55
50             db  8,  9, 12, 13, 24, 25, 28, 29, 10, 11, 14, 15, 26, 27, 30, 31
51             db 40, 41, 44, 45, 56, 57, 60, 61, 42, 43, 46, 47, 58, 59, 62, 63
52idct_8x32p:  db 60, 61,  4,  5, 32, 33,  0,  1, 28, 29, 36, 37, 56, 57,  8,  9
53             db 12, 13, 52, 53, 24, 25, 40, 41, 44, 45, 20, 21, 48, 49, 16, 17
54             db 62, 63,  2,  3,  6,  7, 58, 59, 54, 55, 10, 11, 14, 15, 50, 51
55             db 46, 47, 18, 19, 22, 23, 42, 43, 38, 39, 26, 27, 30, 31, 34, 35
56idct_16x32p: db  6,  7, 58, 59, 38, 39, 26, 27, 32, 33,  0,  1, 30, 31, 34, 35
57             db 46, 47, 18, 19, 22, 23, 42, 43, 24, 25, 40, 41, 44, 45, 20, 21
58             db 62, 63,  2,  3, 48, 49, 16, 17, 56, 57,  8,  9, 14, 15, 50, 51
59             db 54, 55, 10, 11, 60, 61,  4,  5, 12, 13, 52, 53, 28, 29, 36, 37
60end_16x32p:  db  0, 32,  1, 48,  2, 36,  3, 52, 16, 40, 17, 56, 18, 44, 19, 60
61             db  4, 33,  5, 49,  6, 37,  7, 53, 20, 41, 21, 57, 22, 45, 23, 61
62             db  8, 35,  9, 51, 10, 39, 11, 55, 24, 43, 25, 59, 26, 47, 27, 63
63             db 12, 34, 13, 50, 14, 38, 15, 54, 28, 42, 29, 58, 30, 46, 31, 62
64
65; packed 4-bit qword shuffle indices
66permA:       dq 0x1c0d0d1ce0d94040, 0x5849495868fb6262
67             dq 0x3e2f2f3ef1c85151, 0x7a6b6b7a79ea7373
68             dq 0x94858594a451c8d9, 0xd0c1c1d02c73eafb
69             dq 0xb6a7a7b6b540d9c8, 0xf2e3e3f23d62fbea
70permB:       dq 0x40acbd0fcadb0f40, 0x518e9f3ce8f99604
71             dq 0xc824352d56128751, 0xd906171e74301e15
72             dq 0x6271604b03472d62, 0x735342782165b426
73             dq 0xeaf9e8699f8ea573, 0xfbdbca5abdac3c37
74permC:       dq 0x9d409d041551c2e0, 0xbf62bf263773a486
75             dq 0xc88c8c15409dd3f1, 0xeaaeae3762bfb597
76             dq 0x04d9158c8cc84a68, 0x26fb37aeaeea2c0e
77             dq 0x5115049dd9045b79, 0x733726bffb263d1f
78permD:       dq 0x0cda098800041504, 0x0edb09b2028c3726
79             dq 0x0f11fa9c01150415, 0x0988f326039d2637
80             dq 0x05640f1108269d8c, 0x05290edb0aaebfae
81             dq 0x0005000509378c9d, 0xffffffff0bbfaebf
82
83pd_0to15:    dd  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
84gather8a:    dd  0,  2,  1,  3,  8, 10,  9, 11
85gather8b:    dd  0,  1,  4,  5,  8,  9, 12, 13
86gather8c:    dd  0,  4,  2,  6, 12,  8, 14, 10
87gather8d:    dd  0,  3,  1,  2,  8, 11,  9, 10
88
89int_shuf1:   db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
90int_shuf2:   db  8,  9,  0,  1, 10, 11,  2,  3, 12, 13,  4,  5, 14, 15,  6,  7
91int_shuf3:   db  0,  1,  8,  9,  4,  5, 12, 13,  2,  3, 10, 11,  6,  7, 14, 15
92int_shuf4:   db  8,  9,  0,  1, 12, 13,  4,  5, 10, 11,  2,  3, 14, 15,  6,  7
93deint_shuf:  db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
94int_mshift:  db 12, 20,  0,  0, 44, 52,  0,  0
95
96pb_32:           times 4 db 32
97pw_2048:         times 2 dw 2048
98pw_4096:         times 2 dw 4096
99pw_8192:         times 2 dw 8192
100pw_16384:        times 2 dw 16384
101pw_1697x16:      times 2 dw 1697*16
102pw_1697x8:       times 2 dw 1697*8
103pw_2896x8:       times 2 dw 2896*8
104pd_2048:         dd  2048
105
106%define pw_5          (permD+52)
107%define pd_m1         (permD+60)
108%define pw_3803_1321  (permD+44)
109%define pw_2482_3803  (permD+12)
110%define pw_2440_3290  (permD+ 4)
111%define pw_m3290_2440 (permD+28)
112%define pw_3857_1380  (permD+36)
113%define pw_m1380_3857 (permD+20)
114
115pw_8192_m8192:   dw   8192,  -8192
116pw_m8192_8192:   dw  -8192,   8192
117pw_16384_m16384: dw  16384, -16384
118pw_m16384_16384: dw -16384,  16384
119
120pw_m1321_2482:   dw  -1321,  2482
121pw_m3344_3344:   dw  -3344,  3344
122pw_2482_3344:    dw   2482,  3344
123pw_m3803_3344:   dw  -3803,  3344
124pd_3344:         dd   3344
125pw_m1321_m3344:  dw  -1321, -3344
126pw_2896_m2896:   dw   2896, -2896
127
128pw_1567_m3784:   dw   1567, -3784
129pw_3784_m1567:   dw   3784, -1567
130pw_4017_m799:    dw   4017,  -799
131pw_2276_m3406:   dw   2276, -3406
132pw_m799_m4017:   dw   -799, -4017
133pw_m3406_m2276:  dw  -3406, -2276
134
135%macro COEF_PAIR 2-3 0
136pw_%1_%2:   dw  %1,  %2
137pw_m%2_%1:  dw -%2,  %1
138%if %3
139pw_m%1_m%2: dw -%1, -%2
140%endif
141%endmacro
142
143COEF_PAIR 2896, 2896
144COEF_PAIR 1567, 3784, 1
145COEF_PAIR 3784, 1567
146COEF_PAIR  201, 4091
147COEF_PAIR  995, 3973
148COEF_PAIR 1751, 3703
149COEF_PAIR 3035, 2751
150COEF_PAIR 3513, 2106
151COEF_PAIR 4052,  601
152COEF_PAIR 3166, 2598, 1
153COEF_PAIR 3920, 1189, 1
154COEF_PAIR 2276, 3406
155COEF_PAIR 4017,  799
156
157%macro COEF_X8 1-*
158%rep %0
159    dw %1*8, %1*8
160    %rotate 1
161%endrep
162%endmacro
163
164pw_m2276x8: COEF_X8 -2276
165pw_3406x8:  COEF_X8  3406
166pw_4017x8:  COEF_X8  4017
167pw_799x8:   COEF_X8   799
168pw_3784x8:  COEF_X8  3784
169pw_1567x8:  COEF_X8  1567
170
171pw_4076x8:  COEF_X8  4076
172pw_401x8:   COEF_X8   401
173pw_m2598x8: COEF_X8 -2598
174pw_3166x8:  COEF_X8  3166
175pw_3612x8:  COEF_X8  3612
176pw_1931x8:  COEF_X8  1931
177pw_m1189x8: COEF_X8 -1189
178pw_3920x8:  COEF_X8  3920
179
180pw_4091x8:  COEF_X8  4091
181pw_201x8:   COEF_X8   201
182pw_m2751x8: COEF_X8 -2751
183pw_3035x8:  COEF_X8  3035
184pw_3703x8:  COEF_X8  3703
185pw_1751x8:  COEF_X8  1751
186pw_m1380x8: COEF_X8 -1380
187pw_3857x8:  COEF_X8  3857
188pw_3973x8:  COEF_X8  3973
189pw_995x8:   COEF_X8   995
190pw_m2106x8: COEF_X8 -2106
191pw_3513x8:  COEF_X8  3513
192pw_3290x8:  COEF_X8  3290
193pw_2440x8:  COEF_X8  2440
194pw_m601x8:  COEF_X8  -601
195pw_4052x8:  COEF_X8  4052
196
197pw_401_4076x8:   dw   401*8, 4076*8
198pw_m2598_3166x8: dw -2598*8, 3166*8
199pw_1931_3612x8:  dw  1931*8, 3612*8
200pw_m1189_3920x8: dw -1189*8, 3920*8
201pw_799_4017x8:   dw   799*8, 4017*8
202pw_m2276_3406x8: dw -2276*8, 3406*8
203
204pw_201_4091x8:   dw   201*8, 4091*8
205pw_m601_4052x8:  dw  -601*8, 4052*8
206pw_995_3973x8:   dw   995*8, 3973*8
207pw_m1380_3857x8: dw -1380*8, 3857*8
208pw_1751_3703x8:  dw  1751*8, 3703*8
209pw_m2106_3513x8: dw -2106*8, 3513*8
210pw_2440_3290x8:  dw  2440*8, 3290*8
211pw_m2751_3035x8: dw -2751*8, 3035*8
212
213pw_101_4095x8:   dw   101*8, 4095*8
214pw_m2824_2967x8: dw -2824*8, 2967*8
215pw_1660_3745x8:  dw  1660*8, 3745*8
216pw_m1474_3822x8: dw -1474*8, 3822*8
217pw_897_3996x8:   dw   897*8, 3996*8
218pw_m2191_3461x8: dw -2191*8, 3461*8
219pw_2359_3349x8:  dw  2359*8, 3349*8
220pw_m700_4036x8:  dw  -700*8, 4036*8
221pw_501_4065x8:   dw   501*8, 4065*8
222pw_m2520_3229x8: dw -2520*8, 3229*8
223pw_2019_3564x8:  dw  2019*8, 3564*8
224pw_m1092_3948x8: dw -1092*8, 3948*8
225pw_1285_3889x8:  dw  1285*8, 3889*8
226pw_m1842_3659x8: dw -1842*8, 3659*8
227pw_2675_3102x8:  dw  2675*8, 3102*8
228pw_m301_4085x8:  dw  -301*8, 4085*8
229
230idct64_mul: COEF_X8  4095,   101,  2967, -2824,  3745,  1660,  3822, -1474
231COEF_PAIR  401, 4076, 1
232COEF_PAIR  799, 4017
233            COEF_X8  -700,  4036,  2359,  3349, -2191,  3461,   897,  3996
234dw    -2598, -3166,  3166, -2598,  2598,  3166, -4017,  -799,   799, -4017
235            COEF_X8  4065,   501,  3229, -2520,  3564,  2019,  3948, -1092
236COEF_PAIR 1931, 3612, 1
237COEF_PAIR 3406, 2276
238            COEF_X8  -301,  4085,  2675,  3102, -1842,  3659,  1285,  3889
239dw    -1189, -3920,  3920, -1189,  1189,  3920, -2276, -3406,  3406, -2276
240
241SECTION .text
242
243%define o_base int8_permA+64*18
244%define o(x) (r5 - (o_base) + (x))
245%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
246
247; flags: 1 = swap, 2 = interleave (l), 4 = interleave (t), 8 = no_pack,
248;        16 = special_mul1, 32 = special_mul2
249%macro ITX_MUL2X_PACK 6-7 0 ; dst/src, tmp[1-2], rnd, coef[1-2], flags
250    mova                m%2, m%4
251%if %7 & 16
252    vpdpwssd            m%2, m%1, [o(pw_%5)] {bcstd}
253    mova                m%3, m%4
254%if %7 & 32
255    vpdpwssd            m%3, m%1, [o(pw_%6)] {bcstd}
256%else
257    vpdpwssd            m%3, m%1, m%6
258%endif
259%elif %7 & 32
260    vpdpwssd            m%2, m%1, m%5
261    mova                m%3, m%4
262    vpdpwssd            m%3, m%1, [o(pw_%6)] {bcstd}
263%elif %6 < 32
264    vpdpwssd            m%2, m%1, m%5
265    mova                m%3, m%4
266    vpdpwssd            m%3, m%1, m%6
267%elif %7 & 1
268    vpdpwssd            m%2, m%1, [o(pw_%5_%6)] {bcstd}
269    mova                m%3, m%4
270    vpdpwssd            m%3, m%1, [o(pw_m%6_%5)] {bcstd}
271%else
272    vpdpwssd            m%2, m%1, [o(pw_m%6_%5)] {bcstd}
273    mova                m%3, m%4
274    vpdpwssd            m%3, m%1, [o(pw_%5_%6)] {bcstd}
275%endif
276%if %7 & 2
277    psrld               m%2, 12
278    pslld               m%3, 4
279    vpshrdd             m%1, m%3, m%2, 16
280%elif %7 & 4
281    ; compared to using shifts (as above) this has better throughput,
282    ; but worse latency and requires setting up the opmask/index
283    ; registers, so only use this method for the larger transforms
284    pslld               m%1, m%2, 4
285    vpmultishiftqb  m%1{k7}, m13, m%3
286%else
287    psrad               m%2, 12
288    psrad               m%3, 12
289%if %7 & 8 == 0
290    packssdw            m%1, m%3, m%2
291%endif
292%endif
293%endmacro
294
295; flags: same as ITX_MUL2X_PACK
296%macro ITX_MUL4X_PACK 10-11 0 ; dst/src, tmp[1-2], coef_tmp[1-2], rnd, coef[1-4], flags
297%if %11 & 1
298    vpbroadcastd        m%4, [o(pw_%9_%10)]
299    vpbroadcastd    m%4{k1}, [o(pw_%7_%8)]
300    vpbroadcastd        m%5, [o(pw_m%10_%9)]
301    vpbroadcastd    m%5{k1}, [o(pw_m%8_%7)]
302%else
303    vpbroadcastd        m%4, [o(pw_m%10_%9)]
304    vpbroadcastd    m%4{k1}, [o(pw_m%8_%7)]
305    vpbroadcastd        m%5, [o(pw_%9_%10)]
306    vpbroadcastd    m%5{k1}, [o(pw_%7_%8)]
307%endif
308    ITX_MUL2X_PACK       %1, %2, %3, %6, %4, %5, %11
309%endmacro
310
311; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
312; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
313%macro ITX_MULSUB_2W 7-8 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2
314    punpcklwd           m%3, m%2, m%1
315    punpckhwd           m%2, m%1
316%if %7 < 32
317    mova                m%1, m%5
318    vpdpwssd            m%1, m%3, m%7
319    mova                m%4, m%5
320    vpdpwssd            m%4, m%2, m%7
321%else
322    mova                m%1, m%5
323    vpdpwssd            m%1, m%3, [o(pw_m%7_%6)] {bcstd}
324    mova                m%4, m%5
325    vpdpwssd            m%4, m%2, [o(pw_m%7_%6)] {bcstd}
326%endif
327    psrad               m%1, 12
328    psrad               m%4, 12
329    packssdw            m%1, m%4
330    mova                m%4, m%5
331%if %7 < 32
332    vpdpwssd            m%4, m%2, m%6
333    mova                m%2, m%5
334    vpdpwssd            m%2, m%3, m%6
335%else
336    vpdpwssd            m%4, m%2, [o(pw_%6_%7)] {bcstd}
337    mova                m%2, m%5
338    vpdpwssd            m%2, m%3, [o(pw_%6_%7)] {bcstd}
339%endif
340    psrad               m%4, 12
341    psrad               m%2, 12
342%if %0 == 8
343    packssdw            m%8, m%2, m%4
344%else
345    packssdw            m%2, m%4
346%endif
347%endmacro
348
349%macro WRAP_XMM 1+
350    %xdefine %%reset RESET_MM_PERMUTATION
351    INIT_XMM cpuname
352    DEFINE_MMREGS xmm
353    AVX512_MM_PERMUTATION
354    %1
355    %%reset
356%endmacro
357
358%macro WRAP_YMM 1+
359    INIT_YMM cpuname
360    %1
361    INIT_ZMM cpuname
362%endmacro
363
364%macro ITX4_END 4-5 2048 ; row[1-4], rnd
365%if %5
366    vpbroadcastd         m2, [o(pw_%5)]
367    pmulhrsw             m0, m2
368    pmulhrsw             m1, m2
369%endif
370    lea                  r2, [dstq+strideq*2]
371%assign %%i 1
372%rep 4
373    %if %1 & 2
374        CAT_XDEFINE %%row_adr, %%i, r2   + strideq*(%1&1)
375    %else
376        CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1)
377    %endif
378    %assign %%i %%i + 1
379    %rotate 1
380%endrep
381    movd                 m2, [%%row_adr1]
382    pinsrd               m2, [%%row_adr2], 1
383    movd                 m3, [%%row_adr3]
384    pinsrd               m3, [%%row_adr4], 1
385    pmovzxbw             m2, m2
386    pmovzxbw             m3, m3
387    paddw                m0, m2
388    paddw                m1, m3
389    packuswb             m0, m1
390    movd       [%%row_adr1], m0
391    pextrd     [%%row_adr2], m0, 1
392    pextrd     [%%row_adr3], m0, 2
393    pextrd     [%%row_adr4], m0, 3
394    ret
395%endmacro
396
397%macro INV_TXFM_FN 3 ; type1, type2, size
398cglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 6, 0, dst, stride, c, eob, tx2, base
399    %define %%p1 m(i%1_%3_internal_8bpc)
400    lea               baseq, [o_base]
401    ; Jump to the 1st txfm function if we're not taking the fast path, which
402    ; in turn performs an indirect jump to the 2nd txfm function.
403    lea tx2q, [m(i%2_%3_internal_8bpc).pass2]
404%ifidn %1_%2, dct_dct
405    test               eobd, eobd
406    jnz %%p1
407%else
408    ; jump to the 1st txfm function unless it's located directly after this
409    times ((%%end - %%p1) >> 31) & 1 jmp %%p1
410ALIGN function_align
411%%end:
412%endif
413%endmacro
414
415%macro INV_TXFM_4X4_FN 2 ; type1, type2
416    INV_TXFM_FN          %1, %2, 4x4
417%ifidn %1_%2, dct_dct
418    vpbroadcastw         m0, [cq]
419    vpbroadcastd         m1, [o(pw_2896x8)]
420    pmulhrsw             m0, m1
421    mov                [cq], eobd
422    pmulhrsw             m0, m1
423    mova                 m1, m0
424    jmp m(iadst_4x4_internal_8bpc).end2
425%endif
426%endmacro
427
428%macro IDCT4_1D_PACKED 0
429    vpbroadcastd         m4, [o(pd_2048)]
430    punpckhwd            m2, m1, m0
431    punpcklwd            m1, m0
432    ITX_MUL2X_PACK        2, 0, 3, 4, 1567, 3784
433    ITX_MUL2X_PACK        1, 0, 3, 4, 2896, 2896
434    paddsw               m0, m1, m2 ; out0 out1
435    psubsw               m1, m2     ; out3 out2
436%endmacro
437
438%macro IADST4_1D_PACKED 0
439    punpcklwd            m4, m1, m0 ; in2 in0
440    punpckhwd            m5, m1, m0 ; in3 in1
441.main2:
442    vpbroadcastd         m3, [o(pd_2048)]
443    mova                 m0, m3
444    vpdpwssd             m0, m4, [o(pw_3803_1321)] {bcstd}
445    mova                 m2, m3
446    vpdpwssd             m2, m4, [o(pw_m1321_2482)] {bcstd}
447    mova                 m1, m3
448    vpdpwssd             m1, m4, [o(pw_m3344_3344)] {bcstd}
449    vpdpwssd             m3, m4, [o(pw_2482_3803)] {bcstd}
450    vpdpwssd             m0, m5, [o(pw_2482_3344)] {bcstd}
451    vpdpwssd             m2, m5, [o(pw_m3803_3344)] {bcstd}
452    vpdpwssd             m1, m5, [o(pd_3344)] {bcstd}
453    vpdpwssd             m3, m5, [o(pw_m1321_m3344)] {bcstd}
454    REPX      {psrad x, 12}, m0, m2, m1, m3
455    packssdw             m0, m2 ; out0 out1
456    packssdw             m1, m3 ; out2 out3
457%endmacro
458
459INIT_XMM avx512icl
460INV_TXFM_4X4_FN dct, dct
461INV_TXFM_4X4_FN dct, adst
462INV_TXFM_4X4_FN dct, flipadst
463INV_TXFM_4X4_FN dct, identity
464
465cglobal idct_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
466    mova                 m0, [cq+16*0]
467    mova                 m1, [cq+16*1]
468    IDCT4_1D_PACKED
469    mova                 m2, [o(deint_shuf)]
470    shufps               m3, m0, m1, q1331
471    shufps               m0, m0, m1, q0220
472    pshufb               m0, m2
473    pshufb               m1, m3, m2
474    jmp                tx2q
475.pass2:
476    IDCT4_1D_PACKED
477    pxor              ymm16, ymm16
478    mova               [cq], ymm16
479    ITX4_END              0, 1, 3, 2
480
481INV_TXFM_4X4_FN adst, dct
482INV_TXFM_4X4_FN adst, adst
483INV_TXFM_4X4_FN adst, flipadst
484INV_TXFM_4X4_FN adst, identity
485
486cglobal iadst_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
487    mova                 m0, [cq+16*0]
488    mova                 m1, [cq+16*1]
489    call .main
490    punpckhwd            m3, m0, m1
491    punpcklwd            m0, m1
492    punpckhwd            m1, m0, m3
493    punpcklwd            m0, m3
494    jmp                tx2q
495.pass2:
496    call .main
497.end:
498    pxor              ymm16, ymm16
499    mova               [cq], ymm16
500.end2:
501    ITX4_END              0, 1, 2, 3
502ALIGN function_align
503.main:
504    IADST4_1D_PACKED
505    ret
506
507INV_TXFM_4X4_FN flipadst, dct
508INV_TXFM_4X4_FN flipadst, adst
509INV_TXFM_4X4_FN flipadst, flipadst
510INV_TXFM_4X4_FN flipadst, identity
511
512cglobal iflipadst_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
513    mova                 m0, [cq+16*0]
514    mova                 m1, [cq+16*1]
515    call m(iadst_4x4_internal_8bpc).main
516    punpcklwd            m2, m1, m0
517    punpckhwd            m1, m0
518    punpcklwd            m0, m1, m2
519    punpckhwd            m1, m2
520    jmp                tx2q
521.pass2:
522    call m(iadst_4x4_internal_8bpc).main
523.end:
524    pxor              ymm16, ymm16
525    mova               [cq], ymm16
526.end2:
527    ITX4_END              3, 2, 1, 0
528
529INV_TXFM_4X4_FN identity, dct
530INV_TXFM_4X4_FN identity, adst
531INV_TXFM_4X4_FN identity, flipadst
532INV_TXFM_4X4_FN identity, identity
533
534cglobal iidentity_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
535    mova                 m0, [cq+16*0]
536    mova                 m1, [cq+16*1]
537    vpbroadcastd         m3, [o(pw_1697x8)]
538    pmulhrsw             m2, m3, m0
539    pmulhrsw             m3, m1
540    paddsw               m0, m2
541    paddsw               m1, m3
542    punpckhwd            m2, m0, m1
543    punpcklwd            m0, m1
544    punpckhwd            m1, m0, m2
545    punpcklwd            m0, m2
546    jmp                tx2q
547.pass2:
548    vpbroadcastd         m3, [o(pw_1697x8)]
549    pmulhrsw             m2, m3, m0
550    pmulhrsw             m3, m1
551    paddsw               m0, m2
552    paddsw               m1, m3
553    jmp m(iadst_4x4_internal_8bpc).end
554
555%macro INV_TXFM_4X8_FN 2 ; type1, type2
556    INV_TXFM_FN          %1, %2, 4x8
557%ifidn %1_%2, dct_dct
558    movd               xmm1, [o(pw_2896x8)]
559    pmulhrsw           xmm0, xmm1, [cq]
560    movd               xmm2, [o(pw_2048)]
561    pmulhrsw           xmm0, xmm1
562    pmulhrsw           xmm0, xmm1
563    pmulhrsw           xmm0, xmm2
564    vpbroadcastw        ym0, xmm0
565    mova                ym1, ym0
566    jmp m(iadst_4x8_internal_8bpc).end3
567%endif
568%endmacro
569
570%macro IDCT8_1D_PACKED 0
571    punpckhwd            m5, m3, m0 ; in7 in1
572    punpckhwd            m4, m1, m2 ; in3 in5
573    punpcklwd            m3, m1     ; in6 in2
574    punpcklwd            m2, m0     ; in4 in0
575.main2:
576    vpbroadcastd         m6, [o(pd_2048)]
577    ITX_MUL2X_PACK        5, 0, 1, 6,  799, 4017, 3 ; t4a t7a
578    ITX_MUL2X_PACK        4, 0, 1, 6, 3406, 2276, 3 ; t5a t6a
579    ITX_MUL2X_PACK        3, 0, 1, 6, 1567, 3784    ; t3 t2
580    psubsw               m0, m5, m4 ; t5a t6a (interleaved)
581    paddsw               m4, m5     ; t4  t7  (interleaved)
582    ITX_MUL2X_PACK        2, 1, 5, 6, 2896, 2896    ; t0 t1
583    ITX_MUL2X_PACK        0, 1, 5, 6, 2896, 2896, 1 ; t6 t5
584%if mmsize > 16
585    vbroadcasti32x4      m1, [o(deint_shuf)]
586    pshufb               m4, m1
587%else
588    pshufb               m4, [o(deint_shuf)]
589%endif
590    psubsw               m1, m2, m3 ; tmp3 tmp2
591    paddsw               m3, m2     ; tmp0 tmp1
592    punpckhqdq           m2, m4, m0 ; t7 t6
593    punpcklqdq           m4, m0     ; t4 t5
594    paddsw               m0, m3, m2 ; out0 out1
595    psubsw               m3, m2     ; out7 out6
596    psubsw               m2, m1, m4 ; out4 out5
597    paddsw               m1, m4     ; out3 out2
598%endmacro
599
600%macro IADST8_1D_PACKED 1 ; pass
601    vpbroadcastd         m6, [o(pd_2048)]
602%if %1 == 1
603    ITX_MUL2X_PACK        0, 4, 5, 6,  401, 4076, 3 ; t1a t0a
604    ITX_MUL2X_PACK        1, 4, 5, 6, 1931, 3612, 2 ; t2a t3a
605    ITX_MUL2X_PACK        2, 4, 5, 6, 3166, 2598, 3 ; t5a t4a
606    ITX_MUL2X_PACK        3, 4, 5, 6, 3920, 1189, 2 ; t6a t7a
607    psubsw               m4, m0, m2 ; t5 t4
608    paddsw               m0, m2     ; t1 t0
609    psubsw               m5, m1, m3 ; t6 t7
610    paddsw               m1, m3     ; t2 t3
611    ITX_MUL2X_PACK        4, 2, 3, 6, 1567, 3784, 3 ; t5a t4a
612    ITX_MUL2X_PACK        5, 2, 3, 6, 3784, 1567, 2 ; t7a t6a
613%if mmsize > 16
614    vbroadcasti32x4      m2, [o(deint_shuf)]
615%else
616    mova                 m2, [o(deint_shuf)]
617%endif
618    vprord               m1, 16
619    psubsw               m3, m0, m1 ; t3 t2
620    paddsw               m0, m1     ; -out7  out0
621    psubsw               m1, m4, m5 ; t7 t6
622    paddsw               m4, m5     ;  out6 -out1
623    pshufb               m0, m2
624    pshufb               m4, m2
625    mova                 m2, m6
626    vpdpwssd             m2, m3, [o(pw_m2896_2896)] {bcstd}
627    mova                 m5, m6
628    vpdpwssd             m5, m1, [o(pw_m2896_2896)] {bcstd}
629    psrad                m2, 12
630    psrad                m5, 12
631    packssdw             m2, m5     ; out4 -out5
632    mova                 m5, m6
633    vpdpwssd             m5, m3, [o(pw_2896_2896)] {bcstd}
634    mova                 m3, m6
635    vpdpwssd             m3, m1, [o(pw_2896_2896)] {bcstd}
636    psrad                m5, 12
637    psrad                m3, 12
638    packssdw             m1, m3, m5 ; out2 -out3
639%else
640    punpckhwd            m0, m4, m3 ; 0 7
641    punpckhwd            m1, m5, m2 ; 2 5
642    punpcklwd            m2, m5     ; 4 3
643    punpcklwd            m3, m4     ; 6 1
644    ITX_MUL2X_PACK        0, 4, 5, 6,  401, 4076 ; t0a t1a
645    ITX_MUL2X_PACK        1, 4, 5, 6, 1931, 3612 ; t2a t3a
646    ITX_MUL2X_PACK        2, 4, 5, 6, 3166, 2598 ; t4a t5a
647    ITX_MUL2X_PACK        3, 4, 5, 6, 3920, 1189 ; t6a t7a
648    psubsw               m4, m0, m2 ; t4 t5
649    paddsw               m0, m2     ; t0 t1
650    psubsw               m5, m1, m3 ; t6 t7
651    paddsw               m1, m3     ; t2 t3
652    shufps               m2, m5, m4, q1032
653    punpckhwd            m4, m2
654    punpcklwd            m5, m2
655    ITX_MUL2X_PACK        4, 2, 3, 6, 1567, 3784    ; t4a t5a
656    ITX_MUL2X_PACK        5, 2, 3, 6, 3784, 1567, 1 ; t6a t7a
657    psubsw               m2, m0, m1 ; t2 t3
658    paddsw               m0, m1     ; out0 -out7
659    psubsw               m1, m4, m5 ; t6 t7
660    paddsw               m4, m5     ; -out1 out6
661    vpbroadcastd         m5, [o(pw_2896x8)]
662    punpckhqdq           m3, m2, m1 ; t3 t7
663    punpcklqdq           m2, m1     ; t2 t6
664    paddsw               m1, m2, m3 ; t2+t3 t6+t7
665    psubsw               m2, m3     ; t2-t3 t6-t7
666    punpckhqdq           m3, m4, m0 ; out6 -out7
667    punpcklqdq           m0, m4     ; out0 -out1
668    pmulhrsw             m2, m5     ; out4 -out5
669    pshufd               m1, m1, q1032
670    pmulhrsw             m1, m5     ; out2 -out3
671%endif
672%endmacro
673
674INIT_YMM avx512icl
675INV_TXFM_4X8_FN dct, dct
676INV_TXFM_4X8_FN dct, identity
677INV_TXFM_4X8_FN dct, adst
678INV_TXFM_4X8_FN dct, flipadst
679
680cglobal idct_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
681    vpermq               m0, [cq+32*0], q3120
682    vpermq               m1, [cq+32*1], q3120
683    vpbroadcastd         m2, [o(pw_2896x8)]
684    pmulhrsw             m0, m2
685    pmulhrsw             m1, m2
686    IDCT4_1D_PACKED
687    vbroadcasti32x4      m2, [o(deint_shuf)]
688    shufps               m3, m0, m1, q1331
689    shufps               m0, m0, m1, q0220
690    pshufb               m0, m2
691    pshufb               m1, m3, m2
692    jmp                tx2q
693.pass2:
694    vextracti32x4       xm2, m0, 1
695    vextracti32x4       xm3, m1, 1
696    call .main
697    vpbroadcastd         m4, [o(pw_2048)]
698    vinserti32x4         m0, m0, xm2, 1
699    vinserti32x4         m1, m1, xm3, 1
700    pshufd               m1, m1, q1032
701    jmp m(iadst_4x8_internal_8bpc).end2
702ALIGN function_align
703.main:
704    WRAP_XMM IDCT8_1D_PACKED
705    ret
706
707INV_TXFM_4X8_FN adst, dct
708INV_TXFM_4X8_FN adst, adst
709INV_TXFM_4X8_FN adst, flipadst
710INV_TXFM_4X8_FN adst, identity
711
712cglobal iadst_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
713    vpermq               m0, [cq+32*0], q3120
714    vpermq               m1, [cq+32*1], q3120
715    vpbroadcastd         m2, [o(pw_2896x8)]
716    pmulhrsw             m0, m2
717    pmulhrsw             m1, m2
718    call m(iadst_8x4_internal_8bpc).main
719    punpckhwd            m3, m0, m1
720    punpcklwd            m0, m1
721    punpckhwd            m1, m0, m3
722    punpcklwd            m0, m3
723    jmp                tx2q
724.pass2:
725    vextracti32x4       xm2, m0, 1
726    vextracti32x4       xm3, m1, 1
727    pshufd              xm4, xm0, q1032
728    pshufd              xm5, xm1, q1032
729    call .main_pass2
730    vpbroadcastd         m4, [o(pw_2048)]
731    vinserti32x4         m0, xm2, 1
732    vinserti32x4         m1, xm3, 1
733    pxor                 m5, m5
734    psubw                m5, m4
735.end:
736    punpcklqdq           m4, m5
737.end2:
738    pmulhrsw             m0, m4
739    pmulhrsw             m1, m4
740.end3:
741    vpbroadcastd         m3, strided
742    pmulld               m5, m3, [o(pd_0to15)]
743    kxnorb               k1, k1, k1
744    kmovb                k2, k1
745    vpgatherdd       m3{k1}, [dstq+m5]
746    pxor                 m4, m4
747    mova               [cq], zmm20
748    punpcklbw            m2, m3, m4
749    punpckhbw            m3, m4
750    paddw                m0, m2
751    paddw                m1, m3
752    packuswb             m0, m1
753    vpscatterdd [dstq+m5]{k2}, m0
754    RET
755ALIGN function_align
756.main_pass1:
757    punpckhwd           xm0, xm4, xm3 ; 0 7
758    punpckhwd           xm1, xm5, xm2 ; 2 5
759    punpcklwd           xm2, xm5      ; 4 3
760    punpcklwd           xm3, xm4      ; 6 1
761    WRAP_XMM IADST8_1D_PACKED 1
762    punpcklqdq          xm3, xm4, xm0 ; out6 -out7
763    punpckhqdq          xm0, xm4      ; out0 -out1
764    ret
765ALIGN function_align
766.main_pass2:
767    WRAP_XMM IADST8_1D_PACKED 2
768    ret
769
770INV_TXFM_4X8_FN flipadst, dct
771INV_TXFM_4X8_FN flipadst, adst
772INV_TXFM_4X8_FN flipadst, flipadst
773INV_TXFM_4X8_FN flipadst, identity
774
775cglobal iflipadst_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
776    vpermq               m0, [cq+32*0], q3120
777    vpermq               m1, [cq+32*1], q3120
778    vpbroadcastd         m2, [o(pw_2896x8)]
779    pmulhrsw             m0, m2
780    pmulhrsw             m1, m2
781    call m(iadst_8x4_internal_8bpc).main
782    punpcklwd            m3, m1, m0
783    punpckhwd            m1, m0
784    punpcklwd            m0, m1, m3
785    punpckhwd            m1, m3
786    jmp                tx2q
787.pass2:
788    vextracti32x4       xm2, m0, 1
789    vextracti32x4       xm3, m1, 1
790    pshufd              xm4, xm0, q1032
791    pshufd              xm5, xm1, q1032
792    call m(iadst_4x8_internal_8bpc).main_pass2
793    vpbroadcastd         m5, [o(pw_2048)]
794    vinserti32x4         m3, xm1, 1
795    vinserti32x4         m2, xm0, 1
796    pxor                 m4, m4
797    psubw                m4, m5
798    pshufd               m0, m3, q1032
799    pshufd               m1, m2, q1032
800    jmp m(iadst_4x8_internal_8bpc).end
801
802INIT_ZMM avx512icl
803INV_TXFM_4X8_FN identity, dct
804INV_TXFM_4X8_FN identity, adst
805INV_TXFM_4X8_FN identity, flipadst
806INV_TXFM_4X8_FN identity, identity
807
808cglobal iidentity_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
809    vpbroadcastd         m0, [o(pw_2896x8)]
810    pmulhrsw             m0, [cq]
811    mova                 m1, [o(int8_permB)]
812    vpbroadcastd         m2, [o(pw_1697x8)]
813    vpermb               m0, m1, m0
814    pmulhrsw             m2, m0
815    paddsw               m0, m2
816    vextracti32x8       ym1, m0, 1
817    jmp                tx2q
818.pass2:
819    vpbroadcastd        ym4, [o(pw_4096)]
820    jmp m(iadst_4x8_internal_8bpc).end2
821
822%macro INV_TXFM_4X16_FN 2 ; type1, type2
823    INV_TXFM_FN          %1, %2, 4x16
824%ifidn %1_%2, dct_dct
825    movsx               r6d, word [cq]
826    mov                [cq], eobd
827    imul                r6d, 181
828    add                 r6d, 128+256
829    sar                 r6d, 8+1
830    imul                r6d, 181
831    add                 r6d, 128+2048
832    sar                 r6d, 8+4
833    vpbroadcastw         m0, r6d
834    mova                 m1, m0
835    jmp m(iadst_4x16_internal_8bpc).end3
836%endif
837%endmacro
838
839%macro IDCT16_1D_PACKED 0
840    punpckhwd            m8, m7, m0 ; dct16 in15 in1
841    punpcklwd            m9, m4, m0 ; dct4  in2  in0
842    punpckhwd            m0, m3, m4 ; dct16 in7  in9
843    punpcklwd            m7, m1     ; dct8  in7  in1
844    punpckhwd            m1, m6     ; dct16 in3  in13
845    punpcklwd            m3, m5     ; dct8  in3  in5
846    punpckhwd            m5, m2     ; dct16 in11 in5
847    punpcklwd            m6, m2     ; dct4  in3  in1
848.main2:
849    vpbroadcastd        m10, [o(pd_2048)]
850.main3:
851    vpbroadcastq        m13, [o(int_mshift)]
852    vpcmpub              k7, m13, m10, 6 ; 0x33...
853    ITX_MUL2X_PACK        8, 2, 4, 10,  401, 4076, 5 ; t8a  t15a
854    ITX_MUL2X_PACK        0, 2, 4, 10, 3166, 2598, 5 ; t9a  t14a
855    ITX_MUL2X_PACK        5, 2, 4, 10, 1931, 3612, 5 ; t10a t13a
856    ITX_MUL2X_PACK        1, 2, 4, 10, 3920, 1189, 5 ; t11a t12a
857    ITX_MUL2X_PACK        7, 2, 4, 10,  799, 4017, 5 ; t4a  t7a
858    ITX_MUL2X_PACK        3, 2, 4, 10, 3406, 2276, 5 ; t5a  t6a
859.main4:
860    psubsw               m2, m8, m0 ; t9  t14
861    paddsw               m8, m0     ; t8  t15
862    psubsw               m4, m1, m5 ; t10 t13
863    paddsw               m1, m5     ; t11 t12
864    ITX_MUL2X_PACK        6, 0, 5, 10, 1567,  3784    ; t3   t2
865    psubsw               m0, m8, m1 ; t11a t12a
866    paddsw               m8, m1     ; t8a  t15a
867    psubsw               m1, m7, m3 ; t5a  t6a
868    paddsw               m7, m3     ; t4   t7
869.main5:
870    ITX_MUL2X_PACK        2, 3, 5, 10, 1567,  3784, 5 ; t9a  t14a
871    ITX_MUL2X_PACK        4, 3, 5, 10, m3784, 1567, 5 ; t10a t13a
872%if mmsize > 16
873    vbroadcasti32x4      m5, [o(deint_shuf)]
874%else
875    mova                 m5, [o(deint_shuf)]
876%endif
877    vpbroadcastd        m11, [o(pw_m2896_2896)]
878    vpbroadcastd        m12, [o(pw_2896_2896)]
879    paddsw               m3, m2, m4 ; t9   t14
880    psubsw               m2, m4     ; t10  t13
881    pshufb               m8, m5
882    pshufb               m7, m5
883    pshufb               m3, m5
884    ITX_MUL2X_PACK        9, 4,  5, 10, 11, 12    ; t0   t1
885    ITX_MUL2X_PACK        1, 4,  5, 10, 12, 11    ; t5   t6
886    ITX_MUL2X_PACK        0, 4,  5, 10, 11, 12, 8 ; t11  t12
887    ITX_MUL2X_PACK        2, 0, 11, 10, 11, 12, 8 ; t10a t13a
888    punpckhqdq           m2, m7, m1 ; t7 t6
889    punpcklqdq           m7, m1     ; t4 t5
890    psubsw               m1, m9, m6 ; dct4 out3 out2
891    paddsw               m9, m6     ; dct4 out0 out1
892    packssdw             m5, m11    ; t12  t13a
893    packssdw             m4, m0     ; t11  t10a
894    punpckhqdq           m0, m8, m3 ; t15a t14
895    punpcklqdq           m8, m3     ; t8a  t9
896    psubsw               m3, m9, m2 ; dct8 out7 out6
897    paddsw               m9, m2     ; dct8 out0 out1
898    psubsw               m2, m1, m7 ; dct8 out4 out5
899    paddsw               m1, m7     ; dct8 out3 out2
900    psubsw               m7, m9, m0 ; out15 out14
901    paddsw               m0, m9     ; out0  out1
902    psubsw               m6, m1, m5 ; out12 out13
903    paddsw               m1, m5     ; out3  out2
904    psubsw               m5, m2, m4 ; out11 out10
905    paddsw               m2, m4     ; out4  out5
906    psubsw               m4, m3, m8 ; out8  out9
907    paddsw               m3, m8     ; out7  out6
908%endmacro
909
910INV_TXFM_4X16_FN dct, dct
911INV_TXFM_4X16_FN dct, identity
912INV_TXFM_4X16_FN dct, adst
913INV_TXFM_4X16_FN dct, flipadst
914
915cglobal idct_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
916    mova                ym1, [cq+32*2]
917    vinserti32x8         m1, [cq+32*0], 1
918    mova                 m0, [o(int16_perm)]
919    mova                ym2, [cq+32*3]
920    vinserti32x8         m2, [cq+32*1], 1
921    vpbroadcastd         m4, [o(pd_2048)]
922    vpermb               m1, m0, m1 ; c0 a0 c1 a1 c2 a2 c3 a3
923    vpermb               m2, m0, m2 ; d0 b0 d1 b1 d2 b2 d3 b3
924    ITX_MUL2X_PACK        1, 0, 3, 4, 2896, 2896, 2
925    ITX_MUL2X_PACK        2, 0, 3, 4, 1567, 3784, 2
926    vpbroadcastd         m4, [o(pw_16384)]
927    psubsw               m3, m1, m2
928    paddsw               m1, m2     ; out0 out1
929    vprord               m3, 16     ; out2 out3
930    punpckldq            m0, m1, m3
931    punpckhdq            m1, m3
932    pmulhrsw             m0, m4
933    pmulhrsw             m1, m4
934    jmp                tx2q
935.pass2:
936    vextracti32x4       xm2, ym0, 1
937    vextracti32x4       xm3, ym1, 1
938    vextracti32x4       xm4, m0, 2
939    vextracti32x4       xm5, m1, 2
940    vextracti32x4       xm6, m0, 3
941    vextracti32x4       xm7, m1, 3
942    call .main
943    vinserti32x4        ym0, xm2, 1
944    vinserti32x4        ym1, xm3, 1
945    vinserti32x4        ym4, xm6, 1
946    vinserti32x4        ym5, xm7, 1
947    vinserti32x8         m0, ym4, 1
948    vinserti32x8         m1, ym5, 1
949    vpbroadcastd         m5, [o(pw_2048)]
950    pshufd               m1, m1, q1032
951    jmp m(iadst_4x16_internal_8bpc).end2
952ALIGN function_align
953.main:
954    WRAP_XMM IDCT16_1D_PACKED
955    ret
956
957INV_TXFM_4X16_FN adst, dct
958INV_TXFM_4X16_FN adst, adst
959INV_TXFM_4X16_FN adst, flipadst
960INV_TXFM_4X16_FN adst, identity
961
962cglobal iadst_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
963    mova                 m1, [o(permB)]
964    vpermq               m0, m1, [cq+64*0]
965    vpermq               m1, m1, [cq+64*1]
966    call m(iadst_16x4_internal_8bpc).main
967    vpbroadcastd         m3, [o(pw_16384)]
968    punpckhwd            m2, m0, m1
969    punpcklwd            m0, m1
970    pmulhrsw             m2, m3
971    pmulhrsw             m0, m3
972    punpckhwd            m1, m0, m2
973    punpcklwd            m0, m2
974    jmp                tx2q
975.pass2:
976    call .main
977    vpbroadcastd         m5, [o(pw_2048)]
978    psrlq               m10, 4
979    psubw                m6, m8, m5
980.end:
981    vpbroadcastd         m7, [o(pw_2896x8)]
982    paddsw              ym1, ym2, ym4
983    psubsw              ym2, ym4
984    vinserti32x8         m1, ym2, 1
985    pmulhrsw             m1, m7      ; -out7   out4   out6  -out5   out8  -out11 -out9   out10
986    psrlq                m0, m10, 4
987    vpermi2q             m0, m1, m3  ; 0 1   4 5   8 9   c d
988    vpermt2q             m1, m10, m3 ; 2 3   6 7   a b   e f
989    punpcklqdq           m5, m6
990.end2:
991    pmulhrsw             m0, m5
992    pmulhrsw             m1, m5
993.end3:
994    vpbroadcastd         m3, strided
995    pmulld               m5, m3, [o(pd_0to15)]
996    kxnorw               k1, k1, k1
997    kmovw                k2, k1
998    vpgatherdd       m3{k1}, [dstq+m5]
999    pxor                 m4, m4
1000    mova          [cq+64*0], m4
1001    mova          [cq+64*1], m4
1002    punpcklbw            m2, m3, m4
1003    punpckhbw            m3, m4
1004    paddw                m0, m2
1005    paddw                m1, m3
1006    packuswb             m0, m1
1007    vpscatterdd [dstq+m5]{k2}, m0
1008    RET
1009ALIGN function_align
1010.main:
1011    movu                 m3, [o(permB+1)]
1012    psrlq               m10, m3, 4
1013.main2:
1014    vpermi2q             m3, m0, m1  ; in15 in12 in13 in14 in11 in8  in9  in10
1015    vpermt2q             m0, m10, m1 ; in0  in3  in2  in1  in4  in7  in6  in5
1016    vpbroadcastd         m9, [o(pd_2048)]
1017    vpbroadcastq       ym13, [o(int_mshift)]
1018    kxnorb               k1, k1, k1
1019    punpckhwd            m4, m3, m0  ; in12 in3  in14 in1
1020    punpcklwd            m0, m3      ; in0  in15 in2  in13
1021    kshiftrb             k1, k1, 4
1022    vextracti32x8       ym3, m4, 1   ; in8  in7  in10 in5
1023    vextracti32x8       ym1, m0, 1   ; in4  in11 in6  in9
1024INIT_YMM avx512icl
1025    vpcmpub              k7, m13, m9, 6 ; 0x33...
1026    pxor                 m8, m8
1027    ITX_MUL4X_PACK        0, 2, 5, 6, 7, 9,  201, 4091,  995, 3973, 5
1028    ITX_MUL4X_PACK        1, 2, 5, 6, 7, 9, 1751, 3703, 2440, 3290, 5
1029    ITX_MUL4X_PACK        3, 2, 5, 6, 7, 9, 3035, 2751, 3513, 2106, 5
1030    ITX_MUL4X_PACK        4, 2, 5, 6, 7, 9, 3857, 1380, 4052,  601, 5
1031    psubsw               m2, m0, m3 ; t9a  t8a  t11a t10a
1032    paddsw               m0, m3     ; t1a  t0a  t3a  t2a
1033    psubsw               m3, m1, m4 ; t13a t12a t15a t14a
1034    paddsw               m4, m1     ; t5a  t4a  t7a  t6a
1035    ITX_MUL4X_PACK        2, 1, 5, 6, 7, 9,  799, 4017, 3406, 2276, 5
1036    psubw                m7, m8, m7
1037    ITX_MUL2X_PACK        3, 1, 5, 9, 7, 6, 4
1038    vpbroadcastd         m6, [o(pw_3784_m1567)]
1039    vpbroadcastd     m6{k1}, [o(pw_m3784_1567)]
1040    psubsw               m1, m0, m4 ; t5   t4   t7   t6
1041    paddsw               m0, m4     ; t1   t0   t3   t2
1042    psubsw               m4, m2, m3 ; t13a t12a t15a t14a
1043    paddsw               m2, m3     ; t9a  t8a  t11a t10a
1044    ITX_MUL2X_PACK        1, 3, 5, 9, 1567_3784, 6, 16 ; t4a t5a t7a t6a
1045    ITX_MUL2X_PACK        4, 3, 5, 9, 1567_3784, 6, 16 ; t12 t13 t15 t14
1046    vbroadcasti32x4      m5, [o(deint_shuf)]
1047    pshufb               m0, m5
1048    pshufb               m2, m5
1049    vshufi32x4           m3, m0, m2, 0x03  ; t3   t2   t11a t10a
1050    vinserti32x4         m0, xm2, 1        ; t1   t0   t9a  t8a
1051    vshufi32x4           m2, m1, m4, 0x03  ; t7a  t6a  t15  t14
1052    vinserti32x4         m1, xm4, 1        ; t4a  t5a  t12  t13
1053    pshufd               m2, m2, q1032     ; t6a  t7a  t14  t15
1054    psubsw               m4, m0, m3        ; t3a t2a t11 t10
1055    paddsw               m0, m3            ; -out15  out0   out14 -out1
1056    paddsw               m3, m1, m2        ;  out12 -out3  -out13  out2
1057    psubsw               m1, m2            ; t7 t6 t15a t14a
1058    punpckhqdq           m2, m4, m1        ; t2a t6  t10 t14a
1059    punpcklqdq           m4, m1            ; t3a t7  t11 t15a
1060INIT_ZMM avx512icl
1061    vinserti32x8         m3, ym0, 1        ; out12 -out3  -out13  out2  -out15  out0   out14 -out1
1062    ret
1063
1064INV_TXFM_4X16_FN flipadst, dct
1065INV_TXFM_4X16_FN flipadst, adst
1066INV_TXFM_4X16_FN flipadst, flipadst
1067INV_TXFM_4X16_FN flipadst, identity
1068
1069cglobal iflipadst_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
1070    mova                 m1, [o(permB)]
1071    vpermq               m0, m1, [cq+64*0]
1072    vpermq               m1, m1, [cq+64*1]
1073    call m(iadst_16x4_internal_8bpc).main
1074    vpbroadcastd         m3, [o(pw_16384)]
1075    punpcklwd            m2, m1, m0
1076    punpckhwd            m1, m0
1077    pmulhrsw             m2, m3
1078    pmulhrsw             m1, m3
1079    punpcklwd            m0, m1, m2
1080    punpckhwd            m1, m2
1081    jmp                tx2q
1082.pass2:
1083    call m(iadst_4x16_internal_8bpc).main
1084    vpbroadcastd         m6, [o(pw_2048)]
1085    psrlq               m10, 12
1086    psubw                m5, m8, m6
1087    jmp m(iadst_4x16_internal_8bpc).end
1088
1089INV_TXFM_4X16_FN identity, dct
1090INV_TXFM_4X16_FN identity, adst
1091INV_TXFM_4X16_FN identity, flipadst
1092INV_TXFM_4X16_FN identity, identity
1093
1094cglobal iidentity_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
1095    mova                 m2, [o(int16_perm)]
1096    vpermb               m1, m2, [cq+64*0]
1097    vpermb               m2, m2, [cq+64*1]
1098    vpbroadcastd         m4, [o(pw_1697x8)]
1099    vpbroadcastd         m0, [o(pd_m1)]
1100    pmulhrsw             m3, m4, m1    ; we want to do a signed avg, but pavgw is
1101    vpcmpw               k1, m1, m0, 4 ; unsigned. as long as both signs are equal
1102    pmulhrsw             m4, m2        ; it still works, but if the input is -1 the
1103    vpcmpw               k2, m2, m0, 4 ; pmulhrsw result will become 0 which causes
1104    vpavgw        m1{k1}{z}, m3        ; pavgw to output -32768 instead of 0 unless
1105    vpavgw        m2{k2}{z}, m4        ; we explicitly deal with that case here.
1106    punpckldq            m0, m1, m2
1107    punpckhdq            m1, m2
1108    jmp                tx2q
1109.pass2:
1110    vpbroadcastd         m3, [o(pw_1697x16)]
1111    vpbroadcastd         m5, [o(pw_2048)]
1112    pmulhrsw             m2, m3, m0
1113    pmulhrsw             m3, m1
1114    paddsw               m0, m0
1115    paddsw               m1, m1
1116    paddsw               m0, m2
1117    paddsw               m1, m3
1118    jmp m(iadst_4x16_internal_8bpc).end2
1119
1120%macro WRITE_8X4 4-7 strideq*1, strideq*2, r6 ; coefs[1-2], tmp[1-2], off[1-3]
1121    movq               xm%3, [dstq   ]
1122    movhps             xm%3, [dstq+%5]
1123    movq               xm%4, [dstq+%6]
1124    movhps             xm%4, [dstq+%7]
1125    pmovzxbw            m%3, xm%3
1126    pmovzxbw            m%4, xm%4
1127%ifnum %1
1128    paddw               m%3, m%1
1129%else
1130    paddw               m%3, %1
1131%endif
1132%ifnum %2
1133    paddw               m%4, m%2
1134%else
1135    paddw               m%4, %2
1136%endif
1137    packuswb            m%3, m%4
1138    vextracti32x4      xm%4, m%3, 1
1139    movq          [dstq   ], xm%3
1140    movhps        [dstq+%6], xm%3
1141    movq          [dstq+%5], xm%4
1142    movhps        [dstq+%7], xm%4
1143%endmacro
1144
1145%macro INV_TXFM_8X4_FN 2 ; type1, type2
1146    INV_TXFM_FN          %1, %2, 8x4
1147%ifidn %1_%2, dct_dct
1148    movd                xm1, [o(pw_2896x8)]
1149    pmulhrsw            xm0, xm1, [cq]
1150    movd                xm2, [o(pw_2048)]
1151    pmulhrsw            xm0, xm1
1152    pmulhrsw            xm0, xm1
1153    pmulhrsw            xm0, xm2
1154    vpbroadcastw         m0, xm0
1155    mova                 m1, m0
1156    jmp m(iadst_8x4_internal_8bpc).end3
1157%endif
1158%endmacro
1159
1160INIT_YMM avx512icl
1161INV_TXFM_8X4_FN dct, dct
1162INV_TXFM_8X4_FN dct, adst
1163INV_TXFM_8X4_FN dct, flipadst
1164INV_TXFM_8X4_FN dct, identity
1165
1166cglobal idct_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
1167    vpbroadcastd        xm3, [o(pw_2896x8)]
1168    pmulhrsw            xm0, xm3, [cq+16*0]
1169    pmulhrsw            xm1, xm3, [cq+16*1]
1170    pmulhrsw            xm2, xm3, [cq+16*2]
1171    pmulhrsw            xm3,      [cq+16*3]
1172    call m(idct_4x8_internal_8bpc).main
1173    vbroadcasti32x4      m4, [o(deint_shuf)]
1174    vinserti32x4         m3, m1, xm3, 1
1175    vinserti32x4         m1, m0, xm2, 1
1176    shufps               m0, m1, m3, q0220
1177    shufps               m1, m3, q1331
1178    pshufb               m0, m4
1179    pshufb               m1, m4
1180    jmp                tx2q
1181.pass2:
1182    IDCT4_1D_PACKED
1183    vpermq               m0, m0, q3120
1184    vpermq               m1, m1, q2031
1185    jmp m(iadst_8x4_internal_8bpc).end2
1186
1187INV_TXFM_8X4_FN adst, dct
1188INV_TXFM_8X4_FN adst, adst
1189INV_TXFM_8X4_FN adst, flipadst
1190INV_TXFM_8X4_FN adst, identity
1191
1192cglobal iadst_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
1193    vpbroadcastd        xm0, [o(pw_2896x8)]
1194    pshufd              xm4,      [cq+16*0], q1032
1195    pmulhrsw            xm3, xm0, [cq+16*3]
1196    pshufd              xm5,      [cq+16*1], q1032
1197    pmulhrsw            xm2, xm0, [cq+16*2]
1198    pmulhrsw            xm4, xm0
1199    pmulhrsw            xm5, xm0
1200    call m(iadst_4x8_internal_8bpc).main_pass1
1201    vinserti32x4         m0, xm2, 1
1202    vinserti32x4         m1, xm3, 1
1203    pxor                 m3, m3
1204    punpckhwd            m2, m0, m1
1205    punpcklwd            m0, m1
1206    psubsw               m3, m2
1207    punpckhwd            m1, m0, m3
1208    punpcklwd            m0, m3
1209    jmp                tx2q
1210.pass2:
1211    call .main
1212.end:
1213    vpermq               m0, m0, q3120
1214    vpermq               m1, m1, q3120
1215.end2:
1216    vpbroadcastd         m2, [o(pw_2048)]
1217    pmulhrsw             m0, m2
1218    pmulhrsw             m1, m2
1219.end3:
1220    pxor                 m2, m2
1221    mova               [cq], zmm18
1222    lea                  r6, [strideq*3]
1223    WRITE_8X4             0, 1, 4, 5
1224    RET
1225ALIGN function_align
1226.main:
1227    IADST4_1D_PACKED
1228    ret
1229
1230INV_TXFM_8X4_FN flipadst, dct
1231INV_TXFM_8X4_FN flipadst, adst
1232INV_TXFM_8X4_FN flipadst, flipadst
1233INV_TXFM_8X4_FN flipadst, identity
1234
1235cglobal iflipadst_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
1236    vpbroadcastd        xm0, [o(pw_2896x8)]
1237    pshufd              xm4,      [cq+16*0], q1032
1238    pmulhrsw            xm3, xm0, [cq+16*3]
1239    pshufd              xm5,      [cq+16*1], q1032
1240    pmulhrsw            xm2, xm0, [cq+16*2]
1241    pmulhrsw            xm4, xm0
1242    pmulhrsw            xm5, xm0
1243    call m(iadst_4x8_internal_8bpc).main_pass1
1244    vinserti32x4         m3, m3, xm1, 1
1245    vinserti32x4         m2, m2, xm0, 1
1246    punpckhwd            m1, m3, m2
1247    punpcklwd            m3, m2
1248    pxor                 m0, m0
1249    psubsw               m0, m1
1250    punpckhwd            m1, m0, m3
1251    punpcklwd            m0, m3
1252    jmp                tx2q
1253.pass2:
1254    call m(iadst_8x4_internal_8bpc).main
1255    mova                 m2, m1
1256    vpermq               m1, m0, q2031
1257    vpermq               m0, m2, q2031
1258    jmp m(iadst_8x4_internal_8bpc).end2
1259
1260INV_TXFM_8X4_FN identity, dct
1261INV_TXFM_8X4_FN identity, adst
1262INV_TXFM_8X4_FN identity, flipadst
1263INV_TXFM_8X4_FN identity, identity
1264
1265cglobal iidentity_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
1266    mova                xm2, [cq+16*0]
1267    mova                xm0, [cq+16*1]
1268    vinserti32x4         m2, [cq+16*2], 1
1269    vinserti32x4         m0, [cq+16*3], 1
1270    vpbroadcastd         m3, [o(pw_2896x8)]
1271    punpcklwd            m1, m2, m0
1272    punpckhwd            m2, m0
1273    pmulhrsw             m1, m3
1274    pmulhrsw             m2, m3
1275    punpcklwd            m0, m1, m2
1276    punpckhwd            m1, m2
1277    paddsw               m0, m0
1278    paddsw               m1, m1
1279    jmp                tx2q
1280.pass2:
1281    vpbroadcastd         m3, [o(pw_1697x8)]
1282    pmulhrsw             m2, m3, m0
1283    pmulhrsw             m3, m1
1284    paddsw               m0, m2
1285    paddsw               m1, m3
1286    jmp m(iadst_8x4_internal_8bpc).end
1287
1288%macro INV_TXFM_8X8_FN 2 ; type1, type2
1289    INV_TXFM_FN          %1, %2, 8x8
1290%ifidn %1_%2, dct_dct
1291INIT_ZMM avx512icl
1292    movsx               r6d, word [cq]
1293    mov                [cq], eobd
1294.dconly:
1295    imul                r6d, 181
1296    add                 r6d, 128+256
1297    sar                 r6d, 8+1
1298.dconly2:
1299    vpbroadcastd        ym2, strided
1300    imul                r6d, 181
1301    pmulld              ym5, ym2, [o(pd_0to15)]
1302    kxnorb               k1, k1, k1
1303    add                 r6d, 128+2048
1304    sar                 r6d, 8+4
1305    pxor                 m3, m3
1306    vpbroadcastw         m4, r6d
1307.dconly_loop:
1308    kmovb                k2, k1
1309    vpgatherdq       m2{k1}, [dstq+ym5]
1310    punpcklbw            m0, m2, m3
1311    punpckhbw            m1, m2, m3
1312    paddw                m0, m4
1313    paddw                m1, m4
1314    packuswb             m0, m1
1315    kmovb                k1, k2
1316    vpscatterdq [dstq+ym5]{k2}, m0
1317    lea                dstq, [dstq+strideq*8]
1318    sub                 r3d, 8
1319    jg .dconly_loop
1320    RET
1321INIT_YMM avx512icl
1322%endif
1323%endmacro
1324
1325INV_TXFM_8X8_FN dct, dct
1326INV_TXFM_8X8_FN dct, identity
1327INV_TXFM_8X8_FN dct, adst
1328INV_TXFM_8X8_FN dct, flipadst
1329
1330cglobal idct_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
1331    vpermq               m0, [cq+32*0], q3120 ; 0 1
1332    vpermq               m3, [cq+32*3], q3120 ; 6 7
1333    vpermq               m2, [cq+32*2], q3120 ; 4 5
1334    vpermq               m1, [cq+32*1], q3120 ; 2 3
1335    call .main
1336    shufps               m4, m0, m1, q0220
1337    shufps               m5, m0, m1, q1331
1338    shufps               m1, m2, m3, q0220
1339    shufps               m3, m2, m3, q1331
1340    vbroadcasti32x4      m0, [o(deint_shuf)]
1341    vpbroadcastd         m2, [o(pw_16384)]
1342    REPX   {pshufb   x, m0}, m4, m5, m1, m3
1343    REPX   {pmulhrsw x, m2}, m4, m5, m1, m3
1344    vinserti32x4         m0, m4, xm1, 1
1345    vshufi32x4           m2, m4, m1, 0x03
1346    vinserti32x4         m1, m5, xm3, 1
1347    vshufi32x4           m3, m5, m3, 0x03
1348    jmp                tx2q
1349.pass2:
1350    call .main
1351    vpbroadcastd         m4, [o(pw_2048)]
1352    vpermq               m0, m0, q3120
1353    vpermq               m1, m1, q2031
1354    vpermq               m2, m2, q3120
1355    vpermq               m3, m3, q2031
1356    jmp m(iadst_8x8_internal_8bpc).end2
1357ALIGN function_align
1358.main:
1359    IDCT8_1D_PACKED
1360    ret
1361
1362INV_TXFM_8X8_FN adst, dct
1363INV_TXFM_8X8_FN adst, adst
1364INV_TXFM_8X8_FN adst, flipadst
1365INV_TXFM_8X8_FN adst, identity
1366
1367cglobal iadst_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
1368    vpermq               m4, [cq+32*0], q1302 ; 1 0
1369    vpermq               m3, [cq+32*3], q3120 ; 6 7
1370    vpermq               m5, [cq+32*1], q1302 ; 3 2
1371    vpermq               m2, [cq+32*2], q3120 ; 4 5
1372    call .main_pass1
1373    vpbroadcastd         m5, [o(pw_16384_m16384)]
1374    punpcklwd            m4, m0, m1
1375    punpckhwd            m0, m1
1376    punpcklwd            m1, m2, m3
1377    punpckhwd            m2, m3
1378    punpcklwd            m3, m4, m0
1379    punpckhwd            m4, m0
1380    punpcklwd            m0, m1, m2
1381    punpckhwd            m1, m2
1382    REPX   {pmulhrsw x, m5}, m3, m4, m0, m1
1383    vshufi32x4           m2, m3, m0, 0x03
1384    vinserti32x4         m0, m3, xm0, 1
1385    vshufi32x4           m3, m4, m1, 0x03
1386    vinserti32x4         m1, m4, xm1, 1
1387    jmp                tx2q
1388.pass2:
1389    pshufd               m4, m0, q1032
1390    pshufd               m5, m1, q1032
1391    call .main_pass2
1392    vpbroadcastd         m5, [o(pw_2048)]
1393    vpbroadcastd        xm4, [o(pw_4096)]
1394    psubw                m4, m5 ; lower half = 2048, upper half = -2048
1395.end:
1396    REPX {vpermq x, x, q3120}, m0, m1, m2, m3
1397.end2:
1398    pmulhrsw             m0, m4
1399    pmulhrsw             m1, m4
1400.end3:
1401    pmulhrsw             m2, m4
1402    pmulhrsw             m3, m4
1403.end4:
1404    pxor                 m4, m4
1405    mova          [cq+32*0], m4
1406    mova          [cq+32*1], m4
1407    mova          [cq+32*2], m4
1408    mova          [cq+32*3], m4
1409    lea                  r6, [strideq*3]
1410    WRITE_8X4             0, 1, 4, 5
1411    lea                dstq, [dstq+strideq*4]
1412    WRITE_8X4             2, 3, 4, 5
1413    RET
1414ALIGN function_align
1415.main_pass1:
1416    punpckhwd            m0, m4, m3 ; 0 7
1417    punpckhwd            m1, m5, m2 ; 2 5
1418    punpcklwd            m2, m5     ; 4 3
1419    punpcklwd            m3, m4     ; 6 1
1420    IADST8_1D_PACKED 1
1421    punpcklqdq           m3, m4, m0        ; out6 -out7
1422    punpckhqdq           m0, m4            ; out0 -out1
1423    ret
1424ALIGN function_align
1425.main_pass2:
1426    IADST8_1D_PACKED 2
1427    ret
1428
1429INV_TXFM_8X8_FN flipadst, dct
1430INV_TXFM_8X8_FN flipadst, adst
1431INV_TXFM_8X8_FN flipadst, flipadst
1432INV_TXFM_8X8_FN flipadst, identity
1433
1434cglobal iflipadst_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
1435    vpermq               m4, [cq+32*0], q1302 ; 1 0
1436    vpermq               m3, [cq+32*3], q3120 ; 6 7
1437    vpermq               m5, [cq+32*1], q1302 ; 3 2
1438    vpermq               m2, [cq+32*2], q3120 ; 4 5
1439    call m(iadst_8x8_internal_8bpc).main_pass1
1440    vpbroadcastd         m5, [o(pw_m16384_16384)]
1441    punpckhwd            m4, m3, m2
1442    punpcklwd            m3, m2
1443    punpckhwd            m2, m1, m0
1444    punpcklwd            m1, m0
1445    punpckhwd            m0, m4, m3
1446    punpcklwd            m4, m3
1447    punpckhwd            m3, m2, m1
1448    punpcklwd            m2, m1
1449    REPX   {pmulhrsw x, m5}, m0, m4, m3, m2
1450    vinserti32x4         m1, m0, xm3, 1
1451    vshufi32x4           m3, m0, m3, 0x03
1452    vinserti32x4         m0, m4, xm2, 1
1453    vshufi32x4           m2, m4, m2, 0x03
1454    jmp                tx2q
1455.pass2:
1456    pshufd               m4, m0, q1032
1457    pshufd               m5, m1, q1032
1458    call m(iadst_8x8_internal_8bpc).main_pass2
1459    vpbroadcastd         m4, [o(pw_2048)]
1460    vpbroadcastd        xm5, [o(pw_4096)]
1461    psubw                m4, m5 ; lower half = -2048, upper half = 2048
1462    vpermq               m5, m3, q2031
1463    vpermq               m3, m0, q2031
1464    vpermq               m0, m2, q2031
1465    vpermq               m2, m1, q2031
1466    pmulhrsw             m1, m0, m4
1467    pmulhrsw             m0, m5, m4
1468    jmp m(iadst_8x8_internal_8bpc).end3
1469
1470INV_TXFM_8X8_FN identity, dct
1471INV_TXFM_8X8_FN identity, adst
1472INV_TXFM_8X8_FN identity, flipadst
1473INV_TXFM_8X8_FN identity, identity
1474
1475cglobal iidentity_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
1476    mova                xm3, [cq+16*0]
1477    mova                xm2, [cq+16*1]
1478    vinserti32x4         m3, [cq+16*4], 1
1479    vinserti32x4         m2, [cq+16*5], 1
1480    mova                xm4, [cq+16*2]
1481    mova                xm0, [cq+16*3]
1482    vinserti32x4         m4, [cq+16*6], 1
1483    vinserti32x4         m0, [cq+16*7], 1
1484    punpcklwd            m1, m3, m2
1485    punpckhwd            m3, m2
1486    punpcklwd            m2, m4, m0
1487    punpckhwd            m4, m0
1488    punpckldq            m0, m1, m2
1489    punpckhdq            m1, m2
1490    punpckldq            m2, m3, m4
1491    punpckhdq            m3, m4
1492    jmp                tx2q
1493.pass2:
1494    vpbroadcastd         m4, [o(pw_4096)]
1495    jmp m(iadst_8x8_internal_8bpc).end
1496
1497%macro INV_TXFM_8X16_FN 2 ; type1, type2
1498    INV_TXFM_FN          %1, %2, 8x16
1499%ifidn %1_%2, dct_dct
1500    movsx               r6d, word [cq]
1501    mov                [cq], eobd
1502    imul                r6d, 181
1503    mov                 r3d, 16
1504    add                 r6d, 128
1505    sar                 r6d, 8
1506    jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly
1507%endif
1508%endmacro
1509
1510%macro ITX_8X16_LOAD_COEFS 0
1511    vpbroadcastd         m4, [o(pw_2896x8)]
1512    pmulhrsw             m0, m4, [cq+32*0]
1513    add                  cq, 32*4
1514    pmulhrsw             m7, m4, [cq+32*3]
1515    pmulhrsw             m1, m4, [cq-32*3]
1516    pmulhrsw             m6, m4, [cq+32*2]
1517    pmulhrsw             m2, m4, [cq-32*2]
1518    pmulhrsw             m5, m4, [cq+32*1]
1519    pmulhrsw             m3, m4, [cq-32*1]
1520    pmulhrsw             m4,     [cq+32*0]
1521%endmacro
1522
1523INIT_ZMM avx512icl
1524INV_TXFM_8X16_FN dct, dct
1525INV_TXFM_8X16_FN dct, identity
1526INV_TXFM_8X16_FN dct, adst
1527INV_TXFM_8X16_FN dct, flipadst
1528
1529cglobal idct_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
1530    mova                 m3, [o(permB)]
1531    vpermq               m0, m3, [cq+64*0]
1532    vpbroadcastd         m4, [o(pw_2896x8)]
1533    vpermq               m1, m3, [cq+64*1]
1534    vpermq               m2, m3, [cq+64*2]
1535    vpermq               m3, m3, [cq+64*3]
1536    REPX   {pmulhrsw x, m4}, m0, m1, m2, m3
1537    call m(idct_16x8_internal_8bpc).main
1538    vpbroadcastd         m5, [o(pw_16384)]
1539    punpckhwd            m4, m0, m2 ; b0 f0 b1 f1 b2 f2 b3 f3
1540    punpcklwd            m0, m2     ; a0 e0 a1 e1 a2 e2 a3 e3
1541    punpckhwd            m2, m1, m3 ; c0 g0 c1 g1 c2 g2 c3 g3
1542    punpcklwd            m1, m3     ; d0 h0 d1 h1 d2 h2 d3 h3
1543    REPX   {pmulhrsw x, m5}, m4, m0, m2, m1
1544    punpckhwd            m3, m0, m4 ; a2 b2 e2 f2 a3 b3 e3 f3
1545    punpcklwd            m0, m4     ; a0 b0 e0 f0 a1 b1 e1 f1
1546    punpckhwd            m4, m2, m1 ; c2 d2 g2 h2 c3 d3 g3 h3
1547    punpcklwd            m2, m1     ; c0 d0 g0 h0 c1 d1 g1 h1
1548    punpckhdq            m1, m0, m2 ;  1  5  9 13
1549    punpckldq            m0, m2     ;  0  4  8 12
1550    punpckldq            m2, m3, m4 ;  2  6 10 14
1551    punpckhdq            m3, m4     ;  3  7 11 15
1552    jmp                tx2q
1553.pass2:
1554    vprord               m5, [o(int16_perm)], 16
1555    vshufi32x4           m2, m2, q1320     ;  2 10 14  6
1556    vshufi32x4           m4, m1, m3, q2310 ;  1  5 15 11
1557    vshufi32x4           m1, m3, q0132     ;  9 13  7  3
1558    vpermb               m9, m5, m0
1559    vpermb               m7, m5, m2
1560    vpermb               m8, m5, m4
1561    vpermb               m0, m5, m1
1562    vextracti32x8       ym6, m9, 1
1563    vextracti32x8       ym3, m7, 1
1564    vextracti32x8       ym5, m8, 1
1565    vextracti32x8       ym1, m0, 1
1566    call .main2
1567    mova                ym8, [o(gather8a)]
1568    lea                  r3, [dstq+strideq*4]
1569    pmovzxdq             m9, ym8
1570    pshufd              ym8, ym8, q1230
1571    vpermt2q             m0, m9, m4
1572    vpermt2q             m1, m9, m5
1573    vpermt2q             m2, m9, m6
1574    vpermt2q             m3, m9, m7
1575.end:
1576    vpbroadcastd         m7, [o(pw_2048)]
1577.end2:
1578    pmulhrsw             m0, m7
1579    pmulhrsw             m1, m7
1580.end3:
1581    pmulhrsw             m2, m7
1582    pmulhrsw             m3, m7
1583.end4:
1584    vpbroadcastd        ym6, strided
1585    kxnorb               k1, k1, k1
1586    pxor                 m4, m4
1587    pmulld              ym8, ym6
1588    kmovb                k2, k1
1589    vpgatherdq       m6{k1}, [dstq+ym8]
1590    kmovb                k1, k2
1591    vpgatherdq       m7{k2}, [r3+ym8]
1592    mova          [cq+64*0], m4
1593    mova          [cq+64*1], m4
1594    kmovb                k2, k1
1595    mova          [cq+64*2], m4
1596    mova          [cq+64*3], m4
1597    punpcklbw            m5, m6, m4
1598    punpckhbw            m6, m4
1599    paddw                m0, m5
1600    paddw                m1, m6
1601    packuswb             m0, m1
1602    vpscatterdq [dstq+ym8]{k1}, m0
1603    punpcklbw            m6, m7, m4
1604    punpckhbw            m7, m4
1605    paddw                m2, m6
1606    paddw                m3, m7
1607    packuswb             m2, m3
1608    vpscatterdq [r3+ym8]{k2}, m2
1609    RET
1610ALIGN function_align
1611.main:
1612    WRAP_YMM IDCT16_1D_PACKED
1613    ret
1614
1615INV_TXFM_8X16_FN adst, dct
1616INV_TXFM_8X16_FN adst, adst
1617INV_TXFM_8X16_FN adst, flipadst
1618INV_TXFM_8X16_FN adst, identity
1619
1620cglobal iadst_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
1621    call m(iadst_16x8_internal_8bpc).main_pass1
1622    vbroadcasti32x4      m6, [o(int_shuf1)]
1623    vpbroadcastd         m7, [o(pw_16384_m16384)]
1624    punpckhwd            m3, m0, m4 ; a0 b0 a1 b1 a2 b2 a3 b3
1625    punpcklwd            m4, m0     ; g0 h0 g1 h1 g2 h2 g3 h3
1626    pshufb               m5, m1, m6 ; c0 d0 c1 d1 c2 d2 c3 d3
1627    pshufb               m2, m6     ; e0 f0 e1 f1 e2 f2 e3 f3
1628.pass1_end:
1629    REPX   {pmulhrsw x, m7}, m3, m5, m4, m2
1630    punpckldq            m0, m3, m5 ; a0 b0 c0 d0 a1 b1 c1 d1
1631    punpckhdq            m3, m5     ; a2 b2 c2 d2 a3 b3 c3 d3
1632    punpckhdq            m5, m2, m4 ; e2 f2 g2 h2 e3 f3 g3 h3
1633    punpckldq            m2, m4     ; e0 f0 g0 h0 e1 f1 g1 h1
1634    punpckhqdq           m1, m0, m2
1635    punpcklqdq           m0, m2
1636    punpcklqdq           m2, m3, m5
1637    punpckhqdq           m3, m5
1638    jmp                tx2q
1639.pass2:
1640    call .main_pass2
1641    vpbroadcastd         m6, [o(pw_2048)]
1642    psrlq               m10, 4
1643    psubw                m7, m8, m6
1644.pass2_end:
1645    vpbroadcastd         m5, [o(pw_2896x8)]
1646    paddsw               m1, m2, m4
1647    psubsw               m2, m4
1648    pmulhrsw             m1, m5      ; -out7   out4   out6  -out5
1649    pmulhrsw             m5, m2      ;  out8  -out11 -out9   out10
1650    mova                ym8, [o(gather8c)]
1651    lea                  r3, [dstq+strideq]
1652    psrlq                m2, m10, 4
1653    vpermi2q             m2, m0, m3  ;  1  3 13 15
1654    vpermt2q             m0, m10, m3 ;  0  2 12 14
1655    psrlq                m3, m10, 8
1656    vpermi2q             m3, m1, m5  ;  5  7  9 11
1657    psrlq               m10, 12
1658    vpermt2q             m1, m10, m5 ;  4  6  8 10
1659    pmulhrsw             m0, m6
1660    pmulhrsw             m1, m6
1661    jmp m(idct_8x16_internal_8bpc).end3
1662ALIGN function_align
1663.main_pass1:
1664    vpbroadcastd         m2, [o(pw_2896x8)]
1665    pmulhrsw             m5, m2, [cq+64*0]
1666    pmulhrsw             m3, m2, [cq+64*3]
1667    pmulhrsw             m1, m2, [cq+64*1]
1668    pmulhrsw             m2,     [cq+64*2]
1669    movu                 m4, [o(permA+3)]
1670    psrlq               m10, m4, 4
1671    mova                 m6, m4
1672    vpermi2q             m4, m5, m3  ; in0  in12 in2  in14
1673    vpermt2q             m5, m10, m3 ; in15 in3  in13 in1
1674    vpermi2q             m6, m1, m2  ; in4  in8  in6  in10
1675    vpermt2q             m1, m10, m2 ; in11 in7  in9  in5
1676    jmp .main
1677ALIGN function_align
1678.main_pass2:
1679    mova                 m4, [o(permC)]
1680    psrlq                m5, m4, 4
1681    vpermi2q             m4, m0, m2  ; in0  in12 in2  in14
1682    psrlq                m6, m5, 4
1683    vpermi2q             m5, m1, m3  ; in15 in3  in13 in1
1684    psrlq               m10, m6, 4
1685    vpermi2q             m6, m0, m2  ; in4  in8  in6  in10
1686    vpermt2q             m1, m10, m3 ; in11 in7  in9  in5
1687.main:
1688    vpbroadcastd         m9, [o(pd_2048)]
1689    vpbroadcastq        m13, [o(int_mshift)]
1690    kxnorb               k1, k1, k1
1691    punpcklwd            m0, m4, m5  ; in0  in15 in2  in13
1692    punpckhwd            m4, m5      ; in12 in3  in14 in1
1693    punpcklwd            m5, m6, m1  ; in4  in11 in6  in9
1694    punpckhwd            m6, m1      ; in8  in7  in10 in5
1695    vpcmpub              k7, m13, m9, 6 ; 0x33...
1696    pxor                 m8, m8
1697    ITX_MUL4X_PACK        0, 1, 2, 3, 7, 9,  201, 4091,  995, 3973, 5
1698    ITX_MUL4X_PACK        6, 1, 2, 3, 7, 9, 3035, 2751, 3513, 2106, 5
1699    ITX_MUL4X_PACK        4, 1, 2, 3, 7, 9, 3857, 1380, 4052,  601, 5
1700    ITX_MUL4X_PACK        5, 1, 2, 3, 7, 9, 1751, 3703, 2440, 3290, 5
1701    psubsw               m2, m0, m6 ; t9a  t8a  t11a t10a
1702    paddsw               m0, m6     ; t1a  t0a  t3a  t2a
1703    psubsw               m3, m5, m4 ; t13a t12a t15a t14a
1704    paddsw               m5, m4     ; t5a  t4a  t7a  t6a
1705    ITX_MUL4X_PACK        2, 4, 1, 6, 7, 9,  799, 4017, 3406, 2276, 5
1706    psubw                m7, m8, m7
1707    ITX_MUL2X_PACK        3, 4, 1, 9, 7, 6, 4
1708    vpbroadcastd         m6, [o(pw_3784_m1567)]
1709    vpbroadcastd     m6{k1}, [o(pw_m3784_1567)]
1710    psubsw               m1, m0, m5 ; t5   t4   t7   t6
1711    paddsw               m0, m5     ; t1   t0   t3   t2
1712    psubsw               m4, m2, m3 ; t13a t12a t15a t14a
1713    paddsw               m2, m3     ; t9a  t8a  t11a t10a
1714    ITX_MUL2X_PACK        1, 3, 5, 9, 1567_3784, 6, 16 ; t5a t4a t6a t7a
1715    ITX_MUL2X_PACK        4, 3, 5, 9, 1567_3784, 6, 16 ; t13 t12 t14 t15
1716    vbroadcasti32x4      m5, [o(deint_shuf)]
1717    pshufb               m0, m5
1718    pshufb               m2, m5
1719    vshufi32x4           m3, m0, m2, q3232 ; t3   t2   t11a t10a
1720    vinserti32x8         m0, ym2, 1        ; t1   t0   t9a  t8a
1721    vshufi32x4           m2, m1, m4, q3232 ; t6a  t7a  t14  t15
1722    vinserti32x8         m1, ym4, 1        ; t5a  t4a  t13  t12
1723    pshufd               m2, m2, q1032     ; t7a  t6a  t15  t14
1724    psubsw               m4, m0, m3        ; t3a t2a t11 t10
1725    paddsw               m0, m3            ; -out15  out0   out14 -out1
1726    paddsw               m3, m1, m2        ;  out12 -out3  -out13  out2
1727    psubsw               m1, m2            ; t7 t6 t15a t14a
1728    punpckhqdq           m2, m4, m1        ; t2a t6  t10 t14a
1729    punpcklqdq           m4, m1            ; t3a t7  t11 t15a
1730    ret
1731
1732INV_TXFM_8X16_FN flipadst, dct
1733INV_TXFM_8X16_FN flipadst, adst
1734INV_TXFM_8X16_FN flipadst, flipadst
1735INV_TXFM_8X16_FN flipadst, identity
1736
1737cglobal iflipadst_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
1738    call m(iadst_16x8_internal_8bpc).main_pass1
1739    vbroadcasti32x4      m6, [o(int_shuf2)]
1740    vpbroadcastd         m7, [o(pw_m16384_16384)]
1741    punpcklwd            m3, m0, m4 ; a0 b0 a1 b1 a2 b2 a3 b3
1742    punpckhwd            m4, m0     ; g0 h0 g1 h1 g2 h2 g3 h3
1743    pshufb               m5, m2, m6 ; c0 d0 c1 d1 c2 d2 c3 d3
1744    pshufb               m2, m1, m6 ; e0 f0 e1 f1 e2 f2 e3 f3
1745    jmp m(iadst_8x16_internal_8bpc).pass1_end
1746.pass2:
1747    call m(iadst_8x16_internal_8bpc).main_pass2
1748    vpbroadcastd         m7, [o(pw_2048)]
1749    psrlq               m10, 36
1750    psubw                m6, m8, m7
1751    jmp m(iadst_8x16_internal_8bpc).pass2_end
1752
1753INV_TXFM_8X16_FN identity, dct
1754INV_TXFM_8X16_FN identity, adst
1755INV_TXFM_8X16_FN identity, flipadst
1756INV_TXFM_8X16_FN identity, identity
1757
1758cglobal iidentity_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
1759    mova                 m0, [o(int16_perm)]
1760    vpermb               m3, m0, [cq+64*0] ; a0 b0 a1 b1 a2 b2 a3 b3
1761    vpermb               m2, m0, [cq+64*1] ; c0 d0 c1 d1 c2 d2 c3 d3
1762    vpermb               m4, m0, [cq+64*2] ; e0 f0 e1 f1 e2 f2 e3 f3
1763    vpermb               m0, m0, [cq+64*3] ; g0 h0 g1 h1 g2 h2 g3 h3
1764    vpbroadcastd         m5, [o(pw_2896x8)]
1765    punpckldq            m1, m3, m2        ; a0 b0 c0 d0 a1 b1 c1 d1
1766    punpckhdq            m3, m2            ; a2 b2 c2 d2 a3 b3 c3 d3
1767    punpckldq            m2, m4, m0        ; e0 f0 g0 h0 a1 f1 g1 h1
1768    punpckhdq            m4, m0            ; e2 f2 g2 h2 e3 f3 g3 h3
1769    REPX   {pmulhrsw x, m5}, m1, m2, m3, m4
1770    punpcklqdq           m0, m1, m2        ; a0 b0 c0 d0 e0 f0 g0 h0
1771    punpckhqdq           m1, m2            ; a1 b1 c1 d1 e1 f1 g1 h1
1772    punpcklqdq           m2, m3, m4        ; a2 b2 c2 d2 e2 f2 g2 h2
1773    punpckhqdq           m3, m4            ; a3 b3 c3 d3 e3 f3 g3 h3
1774    jmp                tx2q
1775.pass2:
1776    vpbroadcastd         m7, [o(pw_1697x16)]
1777    mova                ym8, [o(gather8b)]
1778    lea                  r3, [dstq+strideq*2]
1779    pmulhrsw             m4, m7, m0
1780    pmulhrsw             m5, m7, m1
1781    pmulhrsw             m6, m7, m2
1782    pmulhrsw             m7, m3
1783    REPX      {paddsw x, x}, m0, m1, m2, m3
1784    paddsw               m0, m4
1785    paddsw               m1, m5
1786    paddsw               m2, m6
1787    paddsw               m3, m7
1788    jmp m(idct_8x16_internal_8bpc).end
1789
1790%macro WRITE_16X2 6 ; coefs[1-2], tmp[1-2], offset[1-2]
1791    pmovzxbw            m%3, [dstq+%5]
1792%ifnum %1
1793    paddw               m%3, m%1
1794%else
1795    paddw               m%3, %1
1796%endif
1797    pmovzxbw            m%4, [dstq+%6]
1798%ifnum %2
1799    paddw               m%4, m%2
1800%else
1801    paddw               m%4, %2
1802%endif
1803    packuswb            m%3, m%4
1804    vpermq              m%3, m%3, q3120
1805    mova          [dstq+%5], xm%3
1806    vextracti32x4 [dstq+%6], m%3, 1
1807%endmacro
1808
1809%macro INV_TXFM_16X4_FN 2 ; type1, type2
1810    INV_TXFM_FN          %1, %2, 16x4
1811%ifidn %1_%2, dct_dct
1812    movsx               r6d, word [cq]
1813    mov                [cq], eobd
1814    jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly2
1815%endif
1816%endmacro
1817
1818INIT_ZMM avx512icl
1819INV_TXFM_16X4_FN dct, dct
1820INV_TXFM_16X4_FN dct, adst
1821INV_TXFM_16X4_FN dct, flipadst
1822INV_TXFM_16X4_FN dct, identity
1823
1824cglobal idct_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
1825    mova                xm0, [cq+16*0]
1826    mova                xm1, [cq+16*1]
1827    mova                xm2, [cq+16*2]
1828    mova                xm3, [cq+16*3]
1829    mova                xm4, [cq+16*4]
1830    mova                xm5, [cq+16*5]
1831    mova                xm6, [cq+16*6]
1832    mova                xm7, [cq+16*7]
1833    call m(idct_4x16_internal_8bpc).main
1834    vpbroadcastd         m8, [o(pw_16384)]
1835    vinserti32x4        ym1, xm3, 1 ; 3 2   7 6
1836    vinserti32x4        ym5, xm7, 1 ; b a   f e
1837    vinserti32x4        ym0, xm2, 1 ; 0 1   4 5
1838    vinserti32x4        ym4, xm6, 1 ; 8 9   c d
1839    vinserti32x8         m1, ym5, 1 ; 3 2   7 6   b a   f e
1840    vinserti32x8         m0, ym4, 1 ; 0 1   4 5   8 9   c d
1841    pmulhrsw             m1, m8
1842    pmulhrsw             m0, m8
1843    pshufd               m1, m1, q1032
1844    punpckhwd            m2, m0, m1
1845    punpcklwd            m0, m1
1846    punpckhwd            m1, m0, m2
1847    punpcklwd            m0, m2
1848    jmp                tx2q
1849.pass2:
1850    IDCT4_1D_PACKED
1851    mova                 m2, [o(permA)]
1852    jmp m(iadst_16x4_internal_8bpc).end
1853
1854INV_TXFM_16X4_FN adst, dct
1855INV_TXFM_16X4_FN adst, adst
1856INV_TXFM_16X4_FN adst, flipadst
1857INV_TXFM_16X4_FN adst, identity
1858
1859cglobal iadst_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
1860    mova                 m0, [cq+64*0]
1861    mova                 m1, [cq+64*1]
1862    movshdup             m3, [o(permB)]
1863    psrlq               m10, m3, 4
1864    call m(iadst_4x16_internal_8bpc).main2
1865    vpbroadcastd         m6, [o(pw_16384_m16384)]
1866    psrlq                m0, m10, 4
1867    psrlq               m10, 8
1868.pass1_end:
1869    punpcklwd           ym5, ym4, ym2
1870    punpckhwd           ym4, ym2
1871    vinserti32x8         m5, ym4, 1
1872    mova                 m1, m9
1873    vpdpwssd             m1, m5, [o(pw_m2896_2896)] {1to16}
1874    mova                 m4, m9
1875    vpdpwssd             m4, m5, [o(pw_2896_2896)] {1to16}
1876    psrad                m1, 12
1877    psrad                m4, 12
1878    packssdw             m1, m4 ;  out8  -out7  -out9   out6  -out11  out4   out10 -out5
1879    vpermi2q             m0, m1, m3  ; 0 1   4 5   8 9   c d
1880    vpermt2q             m1, m10, m3 ; 2 3   6 7   a b   e f
1881    punpckhwd            m2, m0, m1
1882    punpcklwd            m0, m1
1883    punpckhwd            m1, m0, m2
1884    punpcklwd            m0, m2
1885    pmulhrsw             m0, m6
1886    pmulhrsw             m1, m6
1887    jmp                tx2q
1888.pass2:
1889    call .main
1890    movu                 m2, [o(permA+1)]
1891.end:
1892    vpbroadcastd         m3, [o(pw_2048)]
1893    pmulhrsw             m0, m3
1894    pmulhrsw             m1, m3
1895.end2:
1896    psrlq                m3, m2, 4
1897    vpermi2q             m2, m0, m1
1898    vpermi2q             m3, m0, m1
1899.end3:
1900    lea                  r3, [dstq+strideq*2]
1901    mova                xm1, [dstq+strideq*0]
1902    vinserti32x4        ym1, [dstq+strideq*1], 1
1903    vinserti32x4         m1, [r3  +strideq*0], 2
1904    vinserti32x4         m1, [r3  +strideq*1], 3
1905    pxor                 m4, m4
1906    mova          [cq+64*0], m4
1907    mova          [cq+64*1], m4
1908    punpcklbw            m0, m1, m4
1909    punpckhbw            m1, m4
1910    paddw                m0, m2
1911    paddw                m1, m3
1912    packuswb             m0, m1
1913    mova          [dstq+strideq*0], xm0
1914    vextracti32x4 [dstq+strideq*1], ym0, 1
1915    vextracti32x4 [r3  +strideq*0], m0, 2
1916    vextracti32x4 [r3  +strideq*1], m0, 3
1917    RET
1918ALIGN function_align
1919.main:
1920    IADST4_1D_PACKED
1921    ret
1922
1923INV_TXFM_16X4_FN flipadst, dct
1924INV_TXFM_16X4_FN flipadst, adst
1925INV_TXFM_16X4_FN flipadst, flipadst
1926INV_TXFM_16X4_FN flipadst, identity
1927
1928cglobal iflipadst_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
1929    mova                 m0, [cq+64*0]
1930    mova                 m1, [cq+64*1]
1931    movshdup             m3, [o(permB)]
1932    psrlq               m10, m3, 4
1933    call m(iadst_4x16_internal_8bpc).main2
1934    vpbroadcastd         m6, [o(pw_m16384_16384)]
1935    psrlq                m0, m10, 12
1936    psrlq               m10, 16
1937    jmp m(iadst_16x4_internal_8bpc).pass1_end
1938.pass2:
1939    call m(iadst_16x4_internal_8bpc).main
1940    movu                m2, [o(permA+2)]
1941    jmp m(iadst_16x4_internal_8bpc).end
1942
1943INV_TXFM_16X4_FN identity, dct
1944INV_TXFM_16X4_FN identity, adst
1945INV_TXFM_16X4_FN identity, flipadst
1946INV_TXFM_16X4_FN identity, identity
1947
1948cglobal iidentity_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
1949    mova                 m1, [cq+64*0]
1950    mova                 m2, [cq+64*1]
1951    vpbroadcastd         m3, [o(pw_1697x16)]
1952    vpbroadcastd         m4, [o(pw_16384)]
1953    mova                 m5, [o(idtx_16x4p)]
1954    shufps               m0, m1, m2, q2020
1955    shufps               m1, m2, q3131
1956    pmulhrsw             m2, m3, m0
1957    pmulhrsw             m3, m1
1958    pmulhrsw             m2, m4
1959    pmulhrsw             m3, m4
1960    paddsw               m0, m2
1961    paddsw               m1, m3
1962    vpermb               m0, m5, m0
1963    vpermb               m1, m5, m1
1964    jmp                tx2q
1965.pass2:
1966    vpbroadcastd         m3, [o(pw_1697x8)]
1967    pmulhrsw             m2, m3, m0
1968    pmulhrsw             m3, m1
1969    paddsw               m0, m2
1970    paddsw               m1, m3
1971    movu                 m2, [o(permA+1)]
1972    jmp m(iadst_16x4_internal_8bpc).end
1973
1974%macro INV_TXFM_16X8_FN 2 ; type1, type2
1975    INV_TXFM_FN          %1, %2, 16x8
1976%ifidn %1_%2, dct_dct
1977    movsx               r6d, word [cq]
1978    mov                [cq], eobd
1979    mov                 r3d, 8
1980.dconly:
1981    imul                r6d, 181
1982    add                 r6d, 128
1983    sar                 r6d, 8
1984.dconly2:
1985    imul                r6d, 181
1986    add                 r6d, 128+256
1987    sar                 r6d, 8+1
1988.dconly3:
1989    imul                r6d, 181
1990    lea                  r2, [strideq*3]
1991    add                 r6d, 128+2048
1992    sar                 r6d, 8+4
1993    pxor                 m2, m2
1994    vpbroadcastw         m3, r6d
1995.dconly_loop:
1996    mova                xm1, [dstq+strideq*0]
1997    vinserti32x4        ym1, [dstq+strideq*1], 1
1998    vinserti32x4         m1, [dstq+strideq*2], 2
1999    vinserti32x4         m1, [dstq+r2       ], 3
2000    punpcklbw            m0, m1, m2
2001    punpckhbw            m1, m2
2002    paddw                m0, m3
2003    paddw                m1, m3
2004    packuswb             m0, m1
2005    mova          [dstq+strideq*0], xm0
2006    vextracti32x4 [dstq+strideq*1], ym0, 1
2007    vextracti32x4 [dstq+strideq*2], m0, 2
2008    vextracti32x4 [dstq+r2       ], m0, 3
2009    lea                dstq, [dstq+strideq*4]
2010    sub                 r3d, 4
2011    jg .dconly_loop
2012    RET
2013%endif
2014%endmacro
2015
2016%macro ITX_16X8_LOAD_COEFS 1 ; shuf_odd
2017    vpbroadcastd         m8, [o(pw_2896x8)]
2018    vpermq               m0, [cq+32*0], q3120
2019    add                  cq, 32*4
2020    vpermq               m7, [cq+32*3], q%1
2021    vpermq               m1, [cq-32*3], q%1
2022    vpermq               m6, [cq+32*2], q3120
2023    vpermq               m2, [cq-32*2], q3120
2024    vpermq               m5, [cq+32*1], q%1
2025    vpermq               m3, [cq-32*1], q%1
2026    vpermq               m4, [cq+32*0], q3120
2027    REPX   {pmulhrsw x, m8}, m0, m7, m1, m6, m2, m5, m3, m4
2028%endmacro
2029
2030INV_TXFM_16X8_FN dct, dct
2031INV_TXFM_16X8_FN dct, identity
2032INV_TXFM_16X8_FN dct, adst
2033INV_TXFM_16X8_FN dct, flipadst
2034
2035cglobal idct_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
2036    vpbroadcastd         m1, [o(pw_2896x8)]
2037    vpermq               m0, [cq+64*0], q3120
2038    vpermq               m2, [cq+64*1], q3120
2039    vpermq               m4, [cq+64*2], q3120
2040    vpermq               m6, [cq+64*3], q3120
2041    REPX   {pmulhrsw x, m1}, m0, m2, m4, m6
2042    vextracti32x8       ym1, m0, 1
2043    vextracti32x8       ym3, m2, 1
2044    vextracti32x8       ym5, m4, 1
2045    vextracti32x8       ym7, m6, 1
2046    call m(idct_8x16_internal_8bpc).main
2047    vbroadcasti32x4      m8, [o(int_shuf1)]
2048    vbroadcasti32x4      m9, [o(int_shuf2)]
2049    vinserti32x8         m0, ym2, 1 ; a0 a1 a2 a3 b0 b1 b2 b3
2050    vinserti32x8         m1, ym3, 1 ; d0 d1 d2 d3 c0 c1 c2 c3
2051    vinserti32x8         m4, ym6, 1 ; i0 i1 i2 i3 j0 j1 j2 j3
2052    vinserti32x8         m5, ym7, 1 ; l0 l1 l2 l3 k0 k1 k2 k3
2053    vpbroadcastd         m2, [o(pw_16384)]
2054    pshufb               m0, m8     ; a0 b0 a1 b1 a2 b2 a3 b3
2055    pshufb               m1, m9     ; c0 d0 c1 d1 c2 d2 c3 d3
2056    pshufb               m6, m4, m8 ; i0 j0 i1 j1 i2 j2 i3 j3
2057    pshufb               m7, m5, m9 ; m0 n0 m1 n1 m2 n2 m3 n3
2058    REPX   {pmulhrsw x, m2}, m0, m1, m6, m7
2059    punpckldq            m2, m0, m1 ; a0 b0 c0 d0 a1 b1 c1 d1
2060    punpckhdq            m3, m0, m1 ; a2 b2 c2 d2 a3 b3 c3 d3
2061    punpckldq            m4, m6, m7 ; i0 j0 k0 l0 i1 j1 k1 l1
2062    punpckhdq            m5, m6, m7 ; i2 j2 k2 l2 i3 j3 k3 l3
2063    jmp                tx2q
2064.pass2:
2065    vshufi32x4           m0, m2, m4, q2020 ; 0 1
2066    vshufi32x4           m2, m4, q3131     ; 4 5
2067    vshufi32x4           m1, m3, m5, q2020 ; 2 3
2068    vshufi32x4           m3, m5, q3131     ; 6 7
2069    call .main
2070    movshdup             m4, [o(permC)]
2071    psrlq                m6, m4, 4
2072    vpermq               m5, m4, q1032
2073    vpermi2q             m4, m0, m2 ; a2 a3   b2 b3   e2 e3   f2 f3
2074    vpermt2q             m0, m6, m2 ; a0 a1   b0 b1   e0 e1   f0 f1
2075    psrlq                m6, m5, 4
2076    vpermi2q             m5, m1, m3 ; c2 c3   d2 d3   g2 g3   h2 h3
2077    vpermt2q             m1, m6, m3 ; c0 c1   d0 d1   g0 g1   h0 h1
2078    vpbroadcastd         m6, [o(pw_2048)]
2079.end:
2080    REPX   {pmulhrsw x, m6}, m0, m4, m1, m5
2081.end2:
2082    lea                  r3, [dstq+strideq*4]
2083    lea                  r4, [strideq*3]
2084    mova                xm3, [dstq+strideq*0]
2085    mova                xm6, [dstq+strideq*2]
2086    vinserti32x4        ym3, [dstq+strideq*1], 1
2087    vinserti32x4        ym6, [dstq+r4       ], 1
2088    vinserti32x4         m3, [r3  +strideq*0], 2
2089    vinserti32x4         m6, [r3  +strideq*2], 2
2090    vinserti32x4         m3, [r3  +strideq*1], 3
2091    vinserti32x4         m6, [r3  +r4       ], 3
2092    pxor                 m7, m7
2093    mova          [cq+64*0], m7
2094    mova          [cq+64*1], m7
2095    mova          [cq+64*2], m7
2096    mova          [cq+64*3], m7
2097    punpcklbw            m2, m3, m7
2098    punpckhbw            m3, m7
2099    paddw                m0, m2
2100    paddw                m4, m3
2101    packuswb             m0, m4
2102    mova          [dstq+strideq*0], xm0
2103    vextracti32x4 [dstq+strideq*1], ym0, 1
2104    vextracti32x4 [r3  +strideq*0], m0, 2
2105    vextracti32x4 [r3  +strideq*1], m0, 3
2106    punpcklbw            m3, m6, m7
2107    punpckhbw            m6, m7
2108    paddw                m1, m3
2109    paddw                m5, m6
2110    packuswb             m1, m5
2111    mova          [dstq+strideq*2], xm1
2112    vextracti32x4 [dstq+r4       ], ym1, 1
2113    vextracti32x4 [r3  +strideq*2], m1, 2
2114    vextracti32x4 [r3  +r4       ], m1, 3
2115    RET
2116ALIGN function_align
2117.main:
2118    IDCT8_1D_PACKED
2119    ret
2120
2121INV_TXFM_16X8_FN adst, dct
2122INV_TXFM_16X8_FN adst, adst
2123INV_TXFM_16X8_FN adst, flipadst
2124INV_TXFM_16X8_FN adst, identity
2125
2126cglobal iadst_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
2127    call m(iadst_8x16_internal_8bpc).main_pass1
2128    vpbroadcastd         m7, [o(pw_16384_m16384)]
2129    psrlq               m10, 4
2130.pass1_end:
2131    punpcklwd            m5, m4, m2
2132    punpckhwd            m4, m2
2133    mova                 m1, m9
2134    vpdpwssd             m1, m5, [o(pw_m2896_2896)] {1to16}
2135    mova                 m6, m9
2136    vpdpwssd             m6, m5, [o(pw_2896_2896)] {1to16}
2137    mova                 m2, m9
2138    vpdpwssd             m2, m4, [o(pw_m2896_2896)] {1to16}
2139    vpdpwssd             m9, m4, [o(pw_2896_2896)] {1to16}
2140    psrad                m1, 12
2141    psrad                m6, 12
2142    packssdw             m1, m6 ;  out8  -out7  -out9   out6
2143    psrad                m2, 12
2144    psrad                m9, 12
2145    packssdw             m2, m9 ; -out11  out4   out10 -out5
2146    psrlq                m4, m10, 4
2147    vpermi2q             m4, m0, m2
2148    vpermt2q             m0, m10, m2
2149    psrlq                m5, m10, 8
2150    vpermi2q             m5, m1, m3
2151    psrlq               m10, 12
2152    vpermt2q             m1, m10, m3
2153    punpcklwd            m3, m4, m5 ; a0 c0 a1 c1 a2 c2 a3 c3
2154    punpckhwd            m4, m5     ; b0 d0 b1 d1 b2 d2 b3 d3
2155    punpcklwd            m5, m1, m0 ; i0 k0 i1 k1 2i k2 i3 k3
2156    punpckhwd            m1, m0     ; j0 l0 j1 l1 j2 l2 j3 l3
2157    punpcklwd            m2, m3, m4 ; a0 b0 c0 d0 a1 b1 c1 d1
2158    punpckhwd            m3, m4     ; a2 b2 c2 d2 a3 b3 c3 d3
2159    punpcklwd            m4, m5, m1 ; i0 j0 k0 l0 i1 j1 k1 l1
2160    punpckhwd            m5, m1     ; i2 j2 k2 l2 i3 j3 k3 l3
2161    REPX   {pmulhrsw x, m7}, m2, m3, m4, m5
2162    jmp                tx2q
2163.pass2:
2164    vshufi32x4           m0, m2, m4, q2020
2165    vshufi32x4           m2, m4, q3131     ; 4 5
2166    vshufi32x4           m1, m3, m5, q2020
2167    vshufi32x4           m3, m5, q3131     ; 6 7
2168    pshufd               m4, m0, q1032     ; 1 0
2169    pshufd               m5, m1, q1032     ; 3 2
2170    call .main_pass2
2171    pmulhrsw             m0, m6
2172    pmulhrsw             m1, m6
2173    psrlq                m6, m4, 4
2174    mova                 m5, m4
2175    vpermi2q             m4, m0, m2
2176    vpermt2q             m0, m6, m2
2177    vpermi2q             m5, m1, m3
2178    vpermt2q             m1, m6, m3
2179    jmp m(idct_16x8_internal_8bpc).end2
2180ALIGN function_align
2181.main_pass1:
2182    vpbroadcastd         m4, [o(pw_2896x8)]
2183    pmulhrsw             m3, m4, [cq+64*0]
2184    pmulhrsw             m1, m4, [cq+64*3]
2185    pmulhrsw             m2, m4, [cq+64*1]
2186    pmulhrsw             m4, [cq+64*2]
2187    mova                 m5, [o(int16_perm)]
2188    kxnorb               k1, k1, k1
2189    vpblendmd        m0{k1}, m1, m3 ; 0 7
2190    vmovdqa32        m3{k1}, m1     ; 6 1
2191    vpblendmd        m1{k1}, m4, m2 ; 2 5
2192    vmovdqa32        m2{k1}, m4     ; 4 3
2193    REPX  {vpermb x, m5, x}, m0, m1, m2, m3
2194    IADST8_1D_PACKED 1
2195    ret
2196ALIGN function_align
2197.main_pass2:
2198    IADST8_1D_PACKED 2
2199    movshdup             m4, [o(permC)]
2200    pxor                 m5, m5
2201    psubd                m5, m6
2202    packssdw             m6, m5
2203    pmulhrsw             m2, m6
2204    pmulhrsw             m3, m6
2205    ret
2206
2207INV_TXFM_16X8_FN flipadst, dct
2208INV_TXFM_16X8_FN flipadst, adst
2209INV_TXFM_16X8_FN flipadst, flipadst
2210INV_TXFM_16X8_FN flipadst, identity
2211
2212cglobal iflipadst_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
2213    call m(iadst_8x16_internal_8bpc).main_pass1
2214    vpbroadcastd         m7, [o(pw_m16384_16384)]
2215    psrlq               m10, 20
2216    jmp m(iadst_16x8_internal_8bpc).pass1_end
2217.pass2:
2218    vshufi32x4           m0, m2, m4, q2020
2219    vshufi32x4           m2, m4, q3131     ; 4 5
2220    vshufi32x4           m1, m3, m5, q2020
2221    vshufi32x4           m3, m5, q3131     ; 6 7
2222    pshufd               m4, m0, q1032     ; 1 0
2223    pshufd               m5, m1, q1032     ; 3 2
2224    call m(iadst_16x8_internal_8bpc).main_pass2
2225    pmulhrsw             m5, m6, m0
2226    pmulhrsw             m0, m6, m1
2227    psrlq                m1, m4, 12
2228    psrlq                m4, 8
2229    mova                 m7, m4
2230    vpermi2q             m4, m0, m3
2231    vpermt2q             m0, m1, m3
2232    vpermi2q             m1, m5, m2
2233    vpermt2q             m5, m7, m2
2234    jmp m(idct_16x8_internal_8bpc).end2
2235
2236INV_TXFM_16X8_FN identity, dct
2237INV_TXFM_16X8_FN identity, adst
2238INV_TXFM_16X8_FN identity, flipadst
2239INV_TXFM_16X8_FN identity, identity
2240
2241cglobal iidentity_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
2242    vpbroadcastd         m0, [o(pw_2896x8)]
2243    pmulhrsw             m3, m0, [cq+64*0]
2244    pmulhrsw             m4, m0, [cq+64*1]
2245    pmulhrsw             m5, m0, [cq+64*2]
2246    pmulhrsw             m0,     [cq+64*3]
2247    vpbroadcastd         m7, [o(pw_1697x16)]
2248    vpbroadcastd         m8, [o(pw_16384)]
2249    shufps               m2, m3, m4, q2020 ; a0 a1 a4 a5 e0 e1 e4 e5
2250    shufps               m3, m4, q3131     ; a2 a3 a6 a7 e2 e3 e6 e7
2251    shufps               m4, m5, m0, q2020 ; i0 i1 i4 i5 m0 m1 m4 m5
2252    shufps               m5, m0, q3131     ; i2 i3 i6 i7 m2 m3 m6 m7
2253    mova                 m9, [o(int8_permA)]
2254    pmulhrsw             m0, m7, m2
2255    pmulhrsw             m1, m7, m3
2256    pmulhrsw             m6, m7, m4
2257    pmulhrsw             m7, m5
2258    REPX   {pmulhrsw x, m8}, m0, m1, m6, m7
2259    paddsw               m2, m0
2260    paddsw               m3, m1
2261    paddsw               m4, m6
2262    paddsw               m5, m7
2263    REPX  {vpermb x, m9, x}, m2, m3, m4, m5
2264    jmp                tx2q
2265.pass2:
2266    mova                 m7, [o(permB)]
2267    vpbroadcastd         m6, [o(pw_4096)]
2268    vpermq               m0, m7, m2
2269    vpermq               m4, m7, m4
2270    vpermq               m1, m7, m3
2271    vpermq               m5, m7, m5
2272    jmp m(idct_16x8_internal_8bpc).end
2273
2274%macro INV_TXFM_16X16_FN 2 ; type1, type2
2275    INV_TXFM_FN          %1, %2, 16x16
2276%ifidn %1_%2, dct_dct
2277    movsx               r6d, word [cq]
2278    mov                [cq], eobd
2279    imul                r6d, 181
2280    mov                 r3d, 16
2281    add                 r6d, 128+512
2282    sar                 r6d, 8+2
2283    jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly3
2284%endif
2285%endmacro
2286
2287INV_TXFM_16X16_FN dct, dct
2288INV_TXFM_16X16_FN dct, identity
2289INV_TXFM_16X16_FN dct, adst
2290INV_TXFM_16X16_FN dct, flipadst
2291
2292cglobal idct_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
2293    mova                 m7, [o(permB)]
2294    vpermq               m0, m7, [cq+64*0]
2295    vpermq               m1, m7, [cq+64*1]
2296    vpermq               m2, m7, [cq+64*2]
2297    vpermq               m3, m7, [cq+64*3]
2298    vpermq               m4, m7, [cq+64*4]
2299    vpermq               m5, m7, [cq+64*5]
2300    vpermq               m6, m7, [cq+64*6]
2301    vpermq               m7, m7, [cq+64*7]
2302    call .main
2303    vbroadcasti32x4     m12, [o(int_shuf1)]
2304    vbroadcasti32x4     m11, [o(int_shuf2)]
2305    vpbroadcastd        m13, [o(pw_8192)]
2306    pshufb               m0, m12
2307    pshufb               m8, m1, m11
2308    pshufb               m2, m12
2309    pshufb               m9, m3, m11
2310    pshufb               m4, m12
2311    pshufb              m10, m5, m11
2312    pshufb               m6, m12
2313    pshufb              m11, m7, m11
2314    REPX  {pmulhrsw x, m13}, m0, m8, m2, m9, m4, m10, m6, m11
2315    punpckhdq            m1, m0, m8
2316    punpckldq            m0, m8
2317    punpckhdq            m3, m2, m9
2318    punpckldq            m2, m9
2319    punpckhdq            m5, m4, m10
2320    punpckldq            m4, m10
2321    punpckhdq            m7, m6, m11
2322    punpckldq            m6, m11
2323    jmp                tx2q
2324.pass2:
2325    vshufi32x4           m8, m4, m6, q3232 ; i8 ic m8 mc
2326    vinserti32x8         m4, ym6, 1        ; i0 i4 m0 m4
2327    vshufi32x4           m6, m0, m2, q3232 ; a8 ac e8 ec
2328    vinserti32x8         m0, ym2, 1        ; a0 a4 e0 e4
2329    vshufi32x4           m9, m5, m7, q3232 ; ia ie ma me
2330    vinserti32x8         m5, ym7, 1        ; i2 i6 m2 m6
2331    vshufi32x4           m7, m1, m3, q3232 ; aa ae ea ee
2332    vinserti32x8         m1, ym3, 1        ; a2 a6 e2 e6
2333    vshufi32x4           m2, m0, m4, q3131 ;  4  5
2334    vshufi32x4           m0, m4, q2020     ;  0  1
2335    vshufi32x4           m4, m6, m8, q2020 ;  8  9
2336    vshufi32x4           m6, m8, q3131     ; 12 13
2337    vshufi32x4           m3, m1, m5, q3131 ;  6  7
2338    vshufi32x4           m1, m5, q2020     ;  2  3
2339    vshufi32x4           m5, m7, m9, q2020 ; 10 11
2340    vshufi32x4           m7, m9, q3131     ; 14 15
2341    call .main
2342    mova                  m8, [o(permD)]
2343    psrlq                m12, m8, 4
2344    psrlq                 m9, m8, 8
2345    psrlq                m13, m8, 12
2346    mova                 m10, m8
2347    vpermi2q              m8, m0, m2 ;  0  1  4  5
2348    vpermt2q              m0, m12, m2
2349    mova                 m11, m9
2350    vpermi2q              m9, m1, m3 ;  2  3  6  7
2351    vpermt2q              m1, m13, m3
2352    vpermi2q             m10, m4, m6 ;  8  9 12 13
2353    vpermt2q              m4, m12, m6
2354    vpermi2q             m11, m5, m7 ; 10 11 14 15
2355    vpermt2q              m5, m13, m7
2356.end:
2357    vpbroadcastd        m12, [o(pw_2048)]
2358.end2:
2359    REPX  {pmulhrsw x, m12}, m0, m1, m4, m5
2360.end3:
2361    REPX  {pmulhrsw x, m12}, m8, m9, m10, m11
2362    lea                  r3, [strideq*3]
2363    lea                  r4, [dstq+strideq*4]
2364    lea                  r5, [dstq+strideq*8]
2365    lea                  r6, [r4  +strideq*8]
2366    mova                xm3, [dstq+strideq*0]
2367    mova                xm6, [dstq+strideq*2]
2368    vinserti32x4        ym3, [dstq+strideq*1], 1
2369    vinserti32x4        ym6, [dstq+r3       ], 1
2370    vinserti32x4         m3, [r4+strideq*0], 2
2371    vinserti32x4         m6, [r4+strideq*2], 2
2372    vinserti32x4         m3, [r4+strideq*1], 3
2373    vinserti32x4         m6, [r4+r3       ], 3
2374    mova               xm12, [r5+strideq*0]
2375    mova               xm13, [r5+strideq*2]
2376    vinserti32x4       ym12, [r5+strideq*1], 1
2377    vinserti32x4       ym13, [r5+r3       ], 1
2378    vinserti32x4        m12, [r6+strideq*0], 2
2379    vinserti32x4        m13, [r6+strideq*2], 2
2380    vinserti32x4        m12, [r6+strideq*1], 3
2381    vinserti32x4        m13, [r6+r3       ], 3
2382    pxor                 m7, m7
2383    REPX {mova [cq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
2384    punpcklbw            m2, m3, m7
2385    punpckhbw            m3, m7
2386    paddw                m0, m2
2387    paddw                m8, m3
2388    packuswb             m0, m8
2389    punpcklbw            m2, m6, m7
2390    punpckhbw            m6, m7
2391    paddw                m1, m2
2392    paddw                m9, m6
2393    packuswb             m1, m9
2394    punpcklbw            m2, m12, m7
2395    punpckhbw           m12, m7
2396    paddw                m2, m4
2397    paddw               m10, m12
2398    packuswb             m2, m10
2399    punpcklbw            m3, m13, m7
2400    punpckhbw           m13, m7
2401    paddw                m3, m5
2402    paddw               m11, m13
2403    packuswb             m3, m11
2404    mova          [dstq+strideq*0], xm0
2405    vextracti32x4 [dstq+strideq*1], ym0, 1
2406    mova          [dstq+strideq*2], xm1
2407    vextracti32x4 [dstq+r3       ], ym1, 1
2408    vextracti32x4 [r4+strideq*0], m0, 2
2409    vextracti32x4 [r4+strideq*1], m0, 3
2410    vextracti32x4 [r4+strideq*2], m1, 2
2411    vextracti32x4 [r4+r3       ], m1, 3
2412    mova          [r5+strideq*0], xm2
2413    vextracti32x4 [r5+strideq*1], ym2, 1
2414    mova          [r5+strideq*2], xm3
2415    vextracti32x4 [r5+r3       ], ym3, 1
2416    vextracti32x4 [r6+strideq*0], m2, 2
2417    vextracti32x4 [r6+strideq*1], m2, 3
2418    vextracti32x4 [r6+strideq*2], m3, 2
2419    vextracti32x4 [r6+r3       ], m3, 3
2420    RET
2421ALIGN function_align
2422.main_fast2: ; bottom three-quarters are zero
2423    vpbroadcastd        m10, [o(pd_2048)]
2424    vpbroadcastq        m13, [o(int_mshift)]
2425    vpcmpub              k7, m13, m10, 6
2426.main_fast4:
2427    vpbroadcastd         m2, [o(pw_401_4076x8)]
2428    vpbroadcastd         m4, [o(pw_m1189_3920x8)]
2429    vpbroadcastd         m3, [o(pw_799_4017x8)]
2430    pmulhrsw             m2, m8     ; t8a  t15a
2431    pmulhrsw             m4, m1     ; t11a t12a
2432    pmulhrsw             m7, m3     ; t4a  t7a
2433    pxor                 m6, m6
2434    psubsw               m0, m2, m4 ; t11a t12a
2435    paddsw               m8, m2, m4 ; t8a  t15a
2436    mova                 m1, m7
2437    jmp .main5
2438ALIGN function_align
2439.main_fast: ; bottom half is zero
2440    vpbroadcastd        m10, [o(pd_2048)]
2441.main_fast3:
2442    vpbroadcastq        m13, [o(int_mshift)]
2443    vpcmpub              k7, m13, m10, 6
2444.main_fast5:
2445    vpbroadcastd         m2, [o(pw_401_4076x8)]
2446    vpbroadcastd         m4, [o(pw_m2598_3166x8)]
2447    vpbroadcastd        m11, [o(pw_1931_3612x8)]
2448    vpbroadcastd        m12, [o(pw_m1189_3920x8)]
2449    pmulhrsw             m8, m2  ; t8a  t15a
2450    vpbroadcastd         m2, [o(pw_799_4017x8)]
2451    pmulhrsw             m0, m4  ; t9a  t14a
2452    vpbroadcastd         m4, [o(pw_m2276_3406x8)]
2453    pmulhrsw             m5, m11 ; t10a t13a
2454    pmulhrsw             m1, m12 ; t11a t12a
2455    pmulhrsw             m7, m2  ; t4a  t7a
2456    pmulhrsw             m3, m4  ; t5a  t6a
2457    jmp .main4
2458ALIGN function_align
2459.main:
2460    IDCT16_1D_PACKED
2461    ret
2462
2463INV_TXFM_16X16_FN adst, dct
2464INV_TXFM_16X16_FN adst, adst
2465INV_TXFM_16X16_FN adst, flipadst
2466
2467cglobal iadst_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
2468    call .main_pass1
2469    vpbroadcastd        m10, [o(pw_8192_m8192)]
2470    punpcklwd            m8, m0, m1 ; b0 d0 b1 d1 b2 d2 b3 d3
2471    punpckhwd            m0, m1     ; a0 c0 a1 c1 a2 c2 a3 c3
2472    punpckhwd            m1, m0, m8 ; a2 b2 c2 d2 a3 b3 c3 d3
2473    punpcklwd            m0, m8     ; a0 b0 c0 d0 a1 b1 c1 d1
2474    punpcklwd            m8, m2, m3 ; f0 h0 f1 h1 f2 h2 f3 h3
2475    punpckhwd            m2, m3     ; e0 g0 e1 g1 e2 g2 e3 g3
2476    punpckhwd            m3, m2, m8 ; e2 f2 g2 h2 e3 f3 g3 h3
2477    punpcklwd            m2, m8     ; e0 f0 g0 h0 e1 f1 g1 h1
2478    punpckhwd            m8, m4, m5 ; i0 k0 i1 k1 i2 k2 i3 k3
2479    punpcklwd            m4, m5     ; j0 l0 j1 l1 j2 l2 j3 l3
2480    punpckhwd            m5, m4, m8 ; i2 j2 k2 l2 i3 j3 k3 l3
2481    punpcklwd            m4, m8     ; i0 j0 k0 l0 i1 j1 k1 l1
2482    punpckhwd            m8, m6, m7 ; m0 o0 m1 o1 m2 o2 m3 o3
2483    punpcklwd            m6, m7     ; n0 p0 n1 p1 n2 p2 n3 p3
2484    punpckhwd            m7, m6, m8 ; m2 n2 o2 p2 m3 n3 o3 p3
2485    punpcklwd            m6, m8     ; m0 n0 o0 p0 m1 n1 o1 p1
2486.pass1_end:
2487    REPX  {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
2488    jmp                tx2q
2489.pass2:
2490    call .main_pass2
2491    mova                m10, [o(permD)]
2492    psrlq                m8, m10, 8
2493    psrlq               m12, m10, 12
2494    psrlq               m13, m10, 4
2495    mova                 m9, m8
2496    vpermi2q             m8, m0, m2 ;  0  1  4  5
2497    vpermt2q             m0, m12, m2
2498    vpermi2q             m9, m1, m3 ;  2  3  6  7
2499    vpermt2q             m1, m12, m3
2500    vpbroadcastd        m12, [o(pw_2048)]
2501    mov                 r3d, 0xff00ff00
2502    mova                m11, m10
2503    vpermi2q            m10, m4, m6 ;  8  9 12 13
2504    vpermt2q             m4, m13, m6
2505    kmovd                k1, r3d
2506    vpermi2q            m11, m5, m7 ; 10 11 14 15
2507    vpermt2q             m5, m13, m7
2508    pxor                 m7, m7
2509    vpsubw          m12{k1}, m7, m12
2510    jmp m(idct_16x16_internal_8bpc).end2
2511ALIGN function_align
2512.main_pass1:
2513    mova                 m4, [o(permB)]
2514    psrlq                m3, m4, 4
2515    vpermq               m0, m4, [cq+64*0]
2516    vpermq               m7, m3, [cq+64*7]
2517    vpermq               m6, m4, [cq+64*6]
2518    vpermq               m1, m3, [cq+64*1]
2519    vpermq               m2, m4, [cq+64*2]
2520    vpermq               m5, m3, [cq+64*5]
2521    vpermq               m4, m4, [cq+64*4]
2522    vpermq               m3, m3, [cq+64*3]
2523    call .main
2524    vpbroadcastd        m13, [o(pw_2896_2896)]
2525    vpbroadcastd        m12, [o(pw_m2896_2896)]
2526    mova                 m2, m10
2527    vpdpwssd             m2, m5, m13       ; -out5
2528    mova                 m8, m10
2529    vpdpwssd             m8, m11, m13      ;  out4
2530    mova                 m9, m10
2531    vpdpwssd             m9, m5, m12       ;  out10
2532    mova                 m5, m10
2533    vpdpwssd             m5, m11, m12      ; -out11
2534    mova                m11, m10
2535    vpdpwssd            m11, m3, m13       ; -out7
2536    mova                m14, m10
2537    vpdpwssd            m14, m4, m13       ;  out6
2538    mova                m13, m10
2539    vpdpwssd            m13, m3, m12       ;  out8
2540    vpdpwssd            m10, m4, [o(pw_2896_m2896)] {1to16} ; -out9
2541    REPX      {psrad x, 12}, m2, m8, m9, m5, m11, m14, m13, m10
2542    packssdw             m2, m8            ; -out5   out4
2543    packssdw             m5, m9, m5        ;  out10 -out11
2544    packssdw             m3, m11, m14      ; -out7   out6
2545    packssdw             m4, m13, m10      ;  out8  -out9
2546    ret
2547ALIGN function_align
2548.main_pass2:
2549    vshufi32x4           m8, m4, m6, q3232 ; i8 ic m8 mc
2550    vinserti32x8         m4, ym6, 1        ; i0 i4 m0 m4
2551    vshufi32x4           m6, m0, m2, q3232 ; a8 ac e8 ec
2552    vinserti32x8         m0, ym2, 1        ; a0 a4 e0 e4
2553    vshufi32x4           m9, m5, m7, q3232 ; ia ie ma me
2554    vinserti32x8         m5, ym7, 1        ; i2 i6 m2 m6
2555    vshufi32x4           m7, m1, m3, q3232 ; aa ae ea ee
2556    vinserti32x8         m1, ym3, 1        ; a2 a6 e2 e6
2557    vshufi32x4           m2, m0, m4, q3131 ;  4  5
2558    vshufi32x4           m0, m4, q2020     ;  0  1
2559    vshufi32x4           m4, m6, m8, q2020 ;  8  9
2560    vshufi32x4           m6, m8, q3131     ; 12 13
2561    vshufi32x4           m3, m1, m5, q3131 ;  6  7
2562    vshufi32x4           m1, m5, q2020     ;  2  3
2563    vshufi32x4           m5, m7, m9, q2020 ; 10 11
2564    vshufi32x4           m7, m9, q3131     ; 14 15
2565    REPX {pshufd x, x, q1032}, m1, m3, m5, m7
2566    call .main
2567    vpbroadcastd         m8, [o(pw_2896x8)]
2568    pshufb               m2, m11, m12
2569    pshufb               m5, m12
2570    pshufb               m3, m12
2571    pshufb               m4, m12
2572    punpcklqdq           m9, m5, m2        ;  t15a   t7
2573    punpckhqdq           m5, m2            ;  t14a   t6
2574    shufps               m2, m3, m4, q1032 ;  t2a    t10
2575    shufps               m3, m4, q3210     ;  t3a    t11
2576    psubsw               m4, m2, m3        ;  out8  -out9
2577    paddsw               m3, m2            ; -out7   out6
2578    paddsw               m2, m5, m9        ; -out5   out4
2579    psubsw               m5, m9            ;  out10 -out11
2580    REPX   {pmulhrsw x, m8}, m2, m3, m4, m5
2581    ret
2582ALIGN function_align
2583.main:
2584    vpbroadcastd        m10, [o(pd_2048)]
2585    vpbroadcastq        m13, [o(int_mshift)]
2586    punpckhwd            m8, m7, m0 ; in14 in1
2587    punpcklwd            m0, m7     ; in0  in15
2588    punpcklwd            m7, m6, m1 ; in12 in3
2589    punpckhwd            m1, m6     ; in2  in13
2590    punpckhwd            m6, m5, m2 ; in10 in5
2591    punpcklwd            m2, m5     ; in4  in11
2592    punpcklwd            m5, m4, m3 ; in8  in7
2593    punpckhwd            m3, m4     ; in6  in9
2594    vpcmpub              k7, m13, m10, 6 ; 0x33...
2595    ITX_MUL2X_PACK        0, 4, 9, 10,  201, 4091, 5 ; t0  t1
2596    ITX_MUL2X_PACK        1, 4, 9, 10,  995, 3973, 5 ; t2  t3
2597    ITX_MUL2X_PACK        2, 4, 9, 10, 1751, 3703, 5 ; t4  t5
2598    ITX_MUL2X_PACK        3, 4, 9, 10, 2440, 3290, 5 ; t6  t7
2599    ITX_MUL2X_PACK        5, 4, 9, 10, 3035, 2751, 5 ; t8  t9
2600    ITX_MUL2X_PACK        6, 4, 9, 10, 3513, 2106, 5 ; t10 t11
2601    ITX_MUL2X_PACK        7, 4, 9, 10, 3857, 1380, 5 ; t12 t13
2602    ITX_MUL2X_PACK        8, 4, 9, 10, 4052,  601, 5 ; t14 t15
2603    psubsw               m4, m0, m5 ; t9a  t8a
2604    paddsw               m0, m5     ; t1a  t0a
2605    psubsw               m5, m1, m6 ; t11a t10a
2606    paddsw               m1, m6     ; t3a  t2a
2607    psubsw               m6, m2, m7 ; t13a t12a
2608    paddsw               m2, m7     ; t5a  t4a
2609    psubsw               m7, m3, m8 ; t15a t14a
2610    paddsw               m3, m8     ; t7a  t6a
2611    ITX_MUL2X_PACK        4, 8, 9, 10, 799,       4017,        4 ; t8  t9
2612    ITX_MUL2X_PACK        6, 8, 9, 10, 799_4017,  4017_m799,  52 ; t12 t13
2613    ITX_MUL2X_PACK        5, 8, 9, 10, 3406,      2276,        4 ; t10 t11
2614    ITX_MUL2X_PACK        7, 8, 9, 10, 3406_2276, 2276_m3406, 52 ; t14 t15
2615    psubsw               m8, m1, m3 ; t7   t6
2616    paddsw               m1, m3     ; t3   t2
2617    psubsw               m3, m0, m2 ; t5   t4
2618    paddsw               m0, m2     ; t1   t0
2619    psubsw               m2, m5, m7 ; t14a t15a
2620    paddsw               m7, m5     ; t10a t11a
2621    psubsw               m5, m4, m6 ; t12a t13a
2622    paddsw               m4, m6     ; t8a  t9a
2623    ITX_MUL2X_PACK        3, 6, 9, 10, 1567,       3784,        5 ; t5a t4a
2624    ITX_MUL2X_PACK        8, 6, 9, 10, 3784_m1567, 1567_3784,  52 ; t7a t6a
2625    ITX_MUL2X_PACK        2, 6, 9, 10, 3784,       1567,        4 ; t15 t14
2626    ITX_MUL2X_PACK        5, 6, 9, 10, 3784_1567,  1567_m3784, 52 ; t13 t12
2627    vbroadcasti32x4     m12, [o(deint_shuf)]
2628    paddsw               m6, m4, m7        ; -out1  out14
2629    psubsw               m4, m7            ;  t10    t11
2630    psubsw              m11, m3, m8        ;  t7     t6
2631    paddsw               m8, m3            ;  out12 -out3
2632    psubsw               m3, m0, m1        ;  t3a    t2a
2633    paddsw               m0, m1            ; -out15  out0
2634    paddsw               m1, m2, m5        ; -out13  out2
2635    psubsw               m5, m2            ;  t15a   t14a
2636    pshufb               m0, m12
2637    pshufb               m6, m12
2638    pshufb               m8, m12
2639    pshufb               m1, m12
2640    shufps               m7, m6, m0, q1032 ;  out14 -out15
2641    shufps               m0, m6, m0, q3210 ; -out1   out0
2642    punpcklqdq           m6, m8, m1        ;  out12 -out13
2643    punpckhqdq           m1, m8, m1        ; -out3   out2
2644    ret
2645
2646INV_TXFM_16X16_FN flipadst, dct
2647INV_TXFM_16X16_FN flipadst, adst
2648INV_TXFM_16X16_FN flipadst, flipadst
2649
2650cglobal iflipadst_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
2651    call m(iadst_16x16_internal_8bpc).main_pass1
2652    vpbroadcastd        m10, [o(pw_m8192_8192)]
2653    punpcklwd            m8, m1, m0 ; m0 o0 m1 o1 m2 o2 m3 o3
2654    punpckhwd            m9, m1, m0 ; n0 p0 n1 p1 n2 p2 n3 p3
2655    punpckhwd            m1, m7, m6 ; a0 c0 a1 c1 a2 c2 a3 c3
2656    punpcklwd            m7, m6     ; b0 d0 b1 d1 b2 d2 b3 d3
2657    punpcklwd            m0, m1, m7 ; a0 b0 c0 d0 a1 b1 c1 d1
2658    punpckhwd            m1, m7     ; a2 b2 c2 d2 a3 b3 c3 d3
2659    punpcklwd            m6, m8, m9 ; m0 n0 o0 p0 m1 n1 o1 p1
2660    punpckhwd            m7, m8, m9 ; m2 n2 o2 p2 m3 n3 o3 p3
2661    punpcklwd            m8, m3, m2 ; i0 k0 i1 k1 i2 k2 i3 k3
2662    punpckhwd            m9, m3, m2 ; j0 l0 j1 l1 j2 l2 j3 l3
2663    punpckhwd            m3, m5, m4 ; e0 g0 e1 g1 e2 g2 e3 g3
2664    punpcklwd            m5, m4     ; f0 h0 f1 h1 f2 h2 f3 h3
2665    punpcklwd            m2, m3, m5 ; e0 f0 g0 h0 e1 f1 g1 h1
2666    punpckhwd            m3, m5     ; e2 f2 g2 h2 e3 f3 g3 h3
2667    punpcklwd            m4, m8, m9 ; i0 j0 k0 l0 i1 j1 k1 l1
2668    punpckhwd            m5, m8, m9 ; i2 j2 k2 l2 i3 j3 k3 l3
2669    jmp m(iadst_16x16_internal_8bpc).pass1_end
2670.pass2:
2671    call m(iadst_16x16_internal_8bpc).main_pass2
2672    mova                m10, [o(permD)]
2673    psrlq                m8, m10, 8
2674    psrlq               m12, m10, 12
2675    psrlq               m13, m10, 4
2676    mova                 m9, m8
2677    vpermi2q             m8, m7, m5 ;  0  1  4  5
2678    vpermt2q             m7, m12, m5
2679    vpermi2q             m9, m6, m4 ;  2  3  6  7
2680    vpermt2q             m6, m12, m4
2681    vpbroadcastd        m12, [o(pw_2048)]
2682    mov                 r3d, 0x00ff00ff
2683    mova                m11, m10
2684    vpermi2q            m10, m3, m1 ;  8  9 12 13
2685    vpermt2q             m3, m13, m1
2686    kmovd                k1, r3d
2687    vpermi2q            m11, m2, m0 ; 10 11 14 15
2688    vpermt2q             m2, m13, m0
2689    pxor                 m0, m0
2690    vpsubw          m12{k1}, m0, m12
2691    pmulhrsw             m0, m7, m12
2692    pmulhrsw             m1, m6, m12
2693    pmulhrsw             m4, m3, m12
2694    pmulhrsw             m5, m2, m12
2695    jmp m(idct_16x16_internal_8bpc).end3
2696
2697INV_TXFM_16X16_FN identity, dct
2698INV_TXFM_16X16_FN identity, identity
2699
2700cglobal iidentity_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
2701    mova                 m8, [o(int16_perm)]
2702    vpermb               m1, m8, [cq+64*0] ; a0 b0 a1 b1 a2 b2 a3 b3
2703    vpermb               m2, m8, [cq+64*1] ; c0 d0 c1 d1 c2 d2 c3 d3
2704    vpbroadcastd         m0, [o(pw_1697x16)]
2705    vpermb               m3, m8, [cq+64*2] ; e0 f0 e1 f1 e2 f2 e3 f3
2706    vpermb               m4, m8, [cq+64*3] ; g0 h0 g1 h1 g2 h2 g3 h3
2707    vpermb               m5, m8, [cq+64*4] ; i0 j0 i1 j1 i2 j2 i3 j3
2708    vpermb               m6, m8, [cq+64*5] ; k0 l0 k1 l1 k2 l2 k3 l3
2709    vpermb               m7, m8, [cq+64*6] ; m0 n0 m1 n1 m2 n2 m3 n3
2710    vpermb               m8, m8, [cq+64*7] ; o0 p0 o1 p1 o2 p2 o3 p3
2711    pmulhrsw             m9, m0, m1
2712    pmulhrsw            m10, m0, m2
2713    pmulhrsw            m11, m0, m3
2714    pmulhrsw            m12, m0, m4
2715    pmulhrsw            m13, m0, m5
2716    pmulhrsw            m14, m0, m6
2717    pmulhrsw            m15, m0, m7
2718    pmulhrsw             m0, m8
2719    REPX       {psraw x, 1}, m9, m10, m11, m12
2720    pavgw                m1, m9
2721    pavgw                m2, m10
2722    pavgw                m3, m11
2723    pavgw                m4, m12
2724    REPX       {psraw x, 1}, m13, m14, m15, m0
2725    pavgw                m5, m13
2726    pavgw                m6, m14
2727    pavgw                m7, m15
2728    pavgw                m8, m0
2729    punpckldq            m0, m1, m2 ; a0 b0 c0 d0 a1 b1 c1 d1
2730    punpckhdq            m1, m2     ; a2 b2 c2 d2 a3 b3 c3 d3
2731    punpckldq            m2, m3, m4 ; e0 f0 g0 h0 e1 f1 g1 h1
2732    punpckhdq            m3, m4     ; e2 f2 g2 h2 e3 f3 g3 h3
2733    punpckldq            m4, m5, m6 ; i0 j0 k0 l0 i1 j1 k1 l1
2734    punpckhdq            m5, m6     ; i2 j2 k2 l2 i3 j3 k3 l3
2735    punpckldq            m6, m7, m8 ; m0 n0 o0 p0 m1 n1 o1 p1
2736    punpckhdq            m7, m8     ; m2 n2 o2 p2 m3 n3 o3 p3
2737    jmp                tx2q
2738ALIGN function_align
2739.pass2:
2740    vpbroadcastd        m11, [o(pw_1697x16)]
2741    pmulhrsw            m12, m11, m0
2742    pmulhrsw            m13, m11, m1
2743    pmulhrsw            m14, m11, m2
2744    pmulhrsw            m15, m11, m3
2745    pmulhrsw             m8, m11, m4
2746    pmulhrsw             m9, m11, m5
2747    pmulhrsw            m10, m11, m6
2748    pmulhrsw            m11, m7
2749    REPX      {paddsw x, x}, m0, m1, m2, m3, m4, m5, m6, m7
2750    paddsw               m0, m12
2751    paddsw               m1, m13
2752    paddsw               m2, m14
2753    paddsw               m3, m15
2754    paddsw               m8, m4
2755    movu                 m4, [o(permD+2)]
2756    paddsw               m9, m5
2757    paddsw               m6, m10
2758    paddsw               m7, m11
2759    psrlq               m12, m4, 4
2760    mova                 m5, m4
2761    mova                m10, m4
2762    mova                m11, m4
2763    vpermi2q             m4, m0, m2  ;  8  9 12 13
2764    vpermt2q             m0, m12, m2 ;  0  1  4  5
2765    vpermi2q             m5, m1, m3  ; 10 11 14 15
2766    vpermt2q             m1, m12, m3 ;  2  3  6  7
2767    vpermi2q            m10, m8, m6
2768    vpermt2q             m8, m12, m6
2769    vpermi2q            m11, m9, m7
2770    vpermt2q             m9, m12, m7
2771    jmp m(idct_16x16_internal_8bpc).end
2772
2773%macro ITX_UNPACK_MULHRSW 7 ; dst1, dst2/src, tmp, coef[1-4]
2774    vpbroadcastd        m%3, [o(pw_%4_%5x8)]
2775    punpcklwd           m%1, m%2, m%2
2776    pmulhrsw            m%1, m%3
2777    vpbroadcastd        m%3, [o(pw_%6_%7x8)]
2778    punpckhwd           m%2, m%2
2779    pmulhrsw            m%2, m%3
2780%endmacro
2781
2782cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 4, 0, dst, stride, c, eob
2783%undef cmp
2784    lea                  r5, [o_base]
2785    test               eobd, eobd
2786    jz .dconly
2787    cmp                eobd, 107
2788    jb .fast
2789    mova                 m5, [cq+64*5]
2790    mova                 m3, [cq+64*3]
2791    mova                 m1, [cq+64*1]
2792    mova                 m7, [cq+64*7]
2793    mova                 m2, [cq+64*2]
2794    mova                 m6, [cq+64*6]
2795    mova                 m0, [cq+64*0]
2796    mova                 m4, [cq+64*4]
2797    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
2798    mova                 m8, [o(idct_8x32p)]
2799    vpbroadcastd         m9, [o(pw_8192)]
2800    REPX  {vpermb x, m8, x}, m0, m1, m2, m3, m4, m5, m6, m7
2801    punpckldq            m8, m0, m1 ; ab
2802    punpckhdq            m0, m1
2803    punpckldq            m1, m2, m3 ; cd
2804    punpckhdq            m2, m3
2805    punpckldq            m3, m4, m5 ; ef
2806    punpckhdq            m4, m5
2807    punpckldq            m5, m6, m7 ; gh
2808    punpckhdq            m6, m7
2809    REPX   {pmulhrsw x, m9}, m8, m0, m1, m2, m3, m4, m5, m6
2810    punpcklqdq          m18, m8, m1 ; 30  2    6 26   31  1   23  9
2811    punpckhqdq          m14, m8, m1 ; 16  0   12 20    3 29   11 21
2812    punpcklqdq          m21, m0, m2 ; 14 18   22 10   27  5   19 13
2813    punpckhqdq          m15, m0, m2 ; 18  4   24  8    7 25   15 17
2814    punpcklqdq          m20, m3, m5
2815    punpckhqdq          m16, m3, m5
2816    punpcklqdq          m19, m4, m6
2817    punpckhqdq          m17, m4, m6
2818    vinserti32x4        ym8, ym18, xm20, 1
2819    vshufi32x4          ym1, ym18, ym20, 0x03
2820    vinserti32x4        ym9, ym14, xm16, 1
2821    vshufi32x4          ym3, ym14, ym16, 0x03
2822    vinserti32x4        ym0, ym21, xm19, 1
2823    vshufi32x4          ym5, ym21, ym19, 0x03
2824    vinserti32x4        ym7, ym15, xm17, 1
2825    vshufi32x4          ym6, ym15, ym17, 0x03
2826    call m(idct_8x16_internal_8bpc).main2
2827    psrlq               m12, [o(permB)], 60
2828    vpermt2q            m14, m12, m16
2829    vpermt2q            m21, m12, m19
2830    vpermt2q            m15, m12, m17
2831    vpermi2q            m12, m18, m20
2832    vextracti32x8      ym16, m14, 1
2833    vextracti32x8      ym19, m21, 1
2834    vextracti32x8      ym17, m15, 1
2835    vextracti32x8      ym20, m12, 1
2836    call .main2
2837    jmp .end
2838.fast: ; right half is zero
2839    mova                 m0, [o(int16_perm)]
2840    mova                ym2, [cq+64*4]
2841    vinserti32x8         m2, [cq+64*0], 1
2842    mova                ym3, [cq+64*6]
2843    vinserti32x8         m3, [cq+64*2], 1
2844    mova                ym4, [cq+64*3]
2845    vinserti32x8         m4, [cq+64*5], 1
2846    mova                ym5, [cq+64*7]
2847    vinserti32x8         m5, [cq+64*1], 1
2848    REPX  {vpermb x, m0, x}, m2, m3, m4, m5
2849    call m(idct_16x8_internal_8bpc).main2
2850    vbroadcasti32x4      m4, [o(int_shuf3)]
2851    vbroadcasti32x4      m5, [o(int_shuf4)]
2852    pshufb               m2, m4     ; e0 f0 e2 f2 e1 f1 e3 f3
2853    pshufb               m3, m5     ; g0 h0 g2 h2 g1 h1 g3 h3
2854    pshufb               m0, m4     ; a0 b0 a2 b2 a1 b1 a3 b3
2855    pshufb               m1, m5     ; c0 d0 c2 d2 c1 d1 c3 d3
2856    vpbroadcastd         m4, [o(pw_8192)]
2857    psrlq                m5, [o(permB)], 60
2858    punpckldq            m6, m2, m3 ; e0 f0 g0 h0 e2 f2 g2 h2
2859    punpckhdq           m17, m2, m3 ; e1 f1 g1 h1 e3 f3 g3 h3
2860    punpckldq            m2, m0, m1 ; a0 b0 c0 d0 a2 b2 c2 d2
2861    punpckhdq           m16, m0, m1 ; a1 b1 c1 d1 a3 b3 c3 d3
2862    REPX   {pmulhrsw x, m4}, m6, m17, m2, m16
2863    vinserti32x4        ym0, ym2, xm6, 1      ;  0  2
2864    vshufi32x4          ym1, ym2, ym6, 0x03   ;  4  6
2865    vinserti32x4       ym14, ym16, xm17, 1    ;  1  3
2866    vshufi32x4         ym15, ym16, ym17, 0x03 ;  5  7
2867    pxor                ym4, ym4
2868    vpermt2q             m2, m5, m6           ;  8 10
2869    vpermt2q            m16, m5, m17          ;  9 11
2870    mova                ym5, ym4
2871    mova                ym6, ym4
2872    mova                ym7, ym4
2873    vextracti32x8       ym3, m2, 1            ; 12 14
2874    vextracti32x8      ym17, m16, 1           ; 13 15
2875    call m(idct_8x16_internal_8bpc).main
2876    call .main_fast
2877.end:
2878    vpbroadcastd       ym12, strided
2879    vpbroadcastd        m13, [o(pw_2048)]
2880    pmulld              ym7, ym12, [o(gather8d)]
2881    REPX  {pmulhrsw x, m13}, m0, m1, m2, m3, m8, m9, m10, m11
2882    lea                  r3, [dstq+strideq*4]
2883    shl             strideq, 4
2884    lea                  r4, [dstq+strideq]
2885    add                  r1, r3
2886    kxnorb               k1, k1, k1
2887    pxor                 m6, m6
2888    kmovb                k2, k1
2889    vpgatherdq      m12{k1}, [r0+ym7]
2890    kmovb                k1, k2
2891    vpgatherdq      m13{k2}, [r3+ym7]
2892    kmovb                k2, k1
2893    vpgatherdq      m14{k1}, [r4+ym7]
2894    kmovb                k1, k2
2895    vpgatherdq      m15{k2}, [r1+ym7]
2896    REPX {mova [cq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
2897    punpcklbw            m4, m12, m6
2898    punpckhbw           m12, m6
2899    paddw                m0, m4
2900    paddw                m1, m12
2901    packuswb             m0, m1
2902    kmovb                k2, k1
2903    vpscatterdq [r0+ym7]{k1}, m0
2904    punpcklbw            m4, m13, m6
2905    punpckhbw           m13, m6
2906    paddw                m2, m4
2907    paddw                m3, m13
2908    packuswb             m2, m3
2909    kmovb                k1, k2
2910    vpscatterdq [r3+ym7]{k2}, m2
2911    punpcklbw            m4, m14, m6
2912    punpckhbw           m14, m6
2913    paddw                m8, m4
2914    paddw                m9, m14
2915    packuswb             m8, m9
2916    kmovb                k2, k1
2917    vpscatterdq [r4+ym7]{k1}, m8
2918    punpcklbw            m4, m15, m6
2919    punpckhbw           m15, m6
2920    paddw               m10, m4
2921    paddw               m11, m15
2922    packuswb            m10, m11
2923    vpscatterdq [r1+ym7]{k2}, m10
2924    RET
2925.dconly:
2926    movsx               r6d, word [cq]
2927    mov                [cq], eobd
2928    mov                 r3d, 32
2929    imul                r6d, 181
2930    add                 r6d, 128+512
2931    sar                 r6d, 8+2
2932    jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly2
2933INIT_YMM avx512icl
2934ALIGN function_align
2935.main_fast: ; bottom half is zero
2936    ITX_UNPACK_MULHRSW   12, 14, 8,  201, 4091,  m601, 4052 ; t16a, t31a, t23a, t24a
2937    ITX_UNPACK_MULHRSW   21, 15, 8,  995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a
2938    ITX_UNPACK_MULHRSW   20, 16, 8, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a
2939    ITX_UNPACK_MULHRSW   19, 17, 8, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a
2940    jmp .main3
2941ALIGN function_align
2942.main:
2943    punpcklwd           m12, m21, m14 ; in31 in1
2944    punpckhwd           m14, m21      ; in3  in29
2945    punpcklwd           m21, m20, m15 ; in27 in5
2946    punpckhwd           m15, m20      ; in7  in25
2947    punpcklwd           m20, m19, m16 ; in23 in9
2948    punpckhwd           m16, m19      ; in11 in21
2949    punpcklwd           m19, m18, m17 ; in19 in13
2950    punpckhwd           m17, m18      ; in15 in17
2951.main2:
2952    ITX_MUL2X_PACK       12, 8, 9, 10,  201, 4091, 5 ; t16a, t31a
2953    ITX_MUL2X_PACK       14, 8, 9, 10, 4052,  601, 5 ; t23a, t24a
2954    ITX_MUL2X_PACK       21, 8, 9, 10,  995, 3973, 5 ; t20a, t27a
2955    ITX_MUL2X_PACK       15, 8, 9, 10, 3857, 1380, 5 ; t19a, t28a
2956    ITX_MUL2X_PACK       20, 8, 9, 10, 1751, 3703, 5 ; t18a, t29a
2957    ITX_MUL2X_PACK       16, 8, 9, 10, 3513, 2106, 5 ; t21a, t26a
2958    ITX_MUL2X_PACK       19, 8, 9, 10, 2440, 3290, 5 ; t22a, t25a
2959    ITX_MUL2X_PACK       17, 8, 9, 10, 3035, 2751, 5 ; t17a, t30a
2960.main3:
2961    psubsw              m11, m12, m17 ; t17 t30
2962    paddsw              m12, m17      ; t16 t31
2963    psubsw              m17, m15, m20 ; t18 t29
2964    paddsw              m20, m15      ; t19 t28
2965    psubsw              m15, m21, m16 ; t21 t26
2966    paddsw              m21, m16      ; t20 t27
2967    psubsw              m16, m14, m19 ; t22 t25
2968    paddsw              m14, m19      ; t23 t24
2969    ITX_MUL2X_PACK       11, 18, 19, 10,   799, 4017, 5 ; t17a t30a
2970    ITX_MUL2X_PACK       17, 18, 19, 10, m4017,  799, 5 ; t18a t29a
2971    ITX_MUL2X_PACK       15, 18, 19, 10,  3406, 2276, 5 ; t21a t26a
2972    ITX_MUL2X_PACK       16, 18, 19, 10, m2276, 3406, 5 ; t22a t25a
2973    vpbroadcastd         m8, [o(pw_m3784_1567)]
2974    psubsw              m19, m12, m20 ; t19a t28a
2975    paddsw              m20, m12      ; t16a t31a
2976    psubsw              m12, m14, m21 ; t20a t27a
2977    paddsw              m14, m21      ; t23a t24a
2978    psubsw              m21, m11, m17 ; t18  t29
2979    paddsw              m11, m17      ; t17  t30
2980    psubsw              m17, m16, m15 ; t21  t26
2981    paddsw              m16, m15      ; t22  t25
2982    ITX_MUL2X_PACK       21, 18, 15, 10, 1567_3784, 8,   20 ; t18a t29a
2983    ITX_MUL2X_PACK       19, 18, 15, 10, 1567_3784, 8,   20 ; t19  t28
2984    ITX_MUL2X_PACK       12, 18, 15, 10, 8, m1567_m3784, 36 ; t20  t27
2985    ITX_MUL2X_PACK       17, 18, 15, 10, 8, m1567_m3784, 36 ; t21a t26a
2986    vbroadcasti32x4     m18, [o(deint_shuf)]
2987    vpbroadcastd         m8, [o(pw_m2896_2896)]
2988    vpbroadcastd         m9, [o(pw_2896_2896)]
2989    psubsw              m15, m20, m14 ; t23  t24
2990    paddsw              m20, m14      ; t16  t31
2991    psubsw              m14, m11, m16 ; t22a t25a
2992    paddsw              m11, m16      ; t17a t30a
2993    psubsw              m16, m21, m17 ; t21  t26
2994    paddsw              m21, m17      ; t18  t29
2995    psubsw              m17, m19, m12 ; t20a t27a
2996    paddsw              m19, m12      ; t19a t28a
2997    REPX    {pshufb x, m18}, m20, m11, m21, m19
2998    ITX_MUL2X_PACK       15, 18, 12, 10, 8, 9, 8 ; t23a t22a
2999    ITX_MUL2X_PACK       14, 13, 15, 10, 8, 9, 8 ; t22  t25
3000    packssdw             m18, m13     ; t23a t22
3001    packssdw             m12, m15     ; t24a t25
3002    ITX_MUL2X_PACK       16, 13, 15, 10, 8, 9, 8 ; t21a t26a
3003    ITX_MUL2X_PACK       17, 16, 14, 10, 8, 9, 8 ; t20  t27
3004    packssdw            m16, m13      ; t20  t21a
3005    packssdw            m14, m15      ; t27  t26a
3006    punpcklqdq          m13, m19, m21 ; t19a t18
3007    punpckhqdq          m19, m21      ; t28a t29
3008    punpcklqdq          m21, m20, m11 ; t16  t17a
3009    punpckhqdq          m20, m11      ; t31  t30a
3010    psubsw              m15, m1, m19  ; out28 out29
3011    paddsw               m1, m19      ; out3  out2
3012    psubsw               m9, m6, m13  ; out19 out18
3013    paddsw               m6, m13      ; out12 out13
3014    psubsw              m10, m5, m16  ; out20 out21
3015    paddsw               m5, m16      ; out11 out10
3016    psubsw              m19, m3, m12  ; out24 out25
3017    paddsw               m3, m12      ; out7  out6
3018    psubsw               m8, m7, m21  ; out16 out17
3019    paddsw               m7, m21      ; out15 out14
3020    psubsw              m21, m0, m20  ; out31 out30
3021    paddsw               m0, m20      ; out0  out1
3022    psubsw              m11, m4, m18  ; out23 out22
3023    paddsw               m4, m18      ; out8  out9
3024    psubsw              m18, m2, m14  ; out27 out26
3025    paddsw               m2, m14      ; out4  out5
3026INIT_ZMM avx512icl
3027    movu                m16, [o(permD+3)]
3028    vpermt2q             m0, m16, m4  ;  0  1  8  9
3029    vpermt2q             m8, m16, m19 ; 16 17 24 25
3030    vpermt2q             m1, m16, m5  ;  3  2 11 10
3031    vpermt2q             m9, m16, m18 ; 19 18 27 26
3032    vpermt2q             m2, m16, m6  ;  4  5 12 13
3033    vpermt2q            m10, m16, m15 ; 20 21 28 29
3034    vpermt2q             m3, m16, m7  ;  7  6 15 14
3035    vpermt2q            m11, m16, m21 ; 23 22 31 30
3036    vzeroupper
3037    ret
3038
3039%macro LOAD_PACKED_16X2 3 ; dst, row[1-2]
3040    vbroadcasti32x4    ym%1, [cq+16*%2]
3041    vbroadcasti32x4     ym8, [cq+16*%3]
3042    shufpd             ym%1, ym8, 0x0c
3043%endmacro
3044
3045cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob
3046%undef cmp
3047    test               eobd, eobd
3048    jz .dconly
3049    lea                  r5, [o_base]
3050    LOAD_PACKED_16X2      0,  0,  2 ; in0  in2
3051    LOAD_PACKED_16X2      1,  4,  6 ; in4  in6
3052    LOAD_PACKED_16X2      2,  8, 10 ; in8  in10
3053    LOAD_PACKED_16X2      3, 12, 14 ; in12 in14
3054    LOAD_PACKED_16X2     14,  1,  3 ; in1  in3
3055    LOAD_PACKED_16X2     15,  5,  7 ; in5  in7
3056    LOAD_PACKED_16X2     16,  9, 11 ; in9  in11
3057    LOAD_PACKED_16X2     17, 13, 15 ; in13 in15
3058    pxor                 m4, m4
3059    REPX {mova [cq+64*x], m4}, 0, 1, 2, 3
3060    cmp                eobd, 107
3061    jb .fast
3062    LOAD_PACKED_16X2      4, 16, 18 ; in16 in18
3063    LOAD_PACKED_16X2      5, 20, 22 ; in20 in22
3064    LOAD_PACKED_16X2      6, 24, 26 ; in24 in26
3065    LOAD_PACKED_16X2      7, 28, 30 ; in28 in30
3066    call m(idct_8x16_internal_8bpc).main
3067    LOAD_PACKED_16X2     18, 19, 17 ; in19 in17
3068    LOAD_PACKED_16X2     19, 23, 21 ; in23 in21
3069    LOAD_PACKED_16X2     20, 27, 25 ; in27 in25
3070    LOAD_PACKED_16X2     21, 31, 29 ; in31 in29
3071    pxor                 m8, m8
3072    REPX {mova [cq+64*x], m8}, 4, 5, 6, 7
3073    call m(inv_txfm_add_dct_dct_8x32_8bpc).main
3074    jmp .pass2
3075.fast: ; bottom half is zero
3076    mova                ym5, ym4
3077    mova                ym6, ym4
3078    mova                ym7, ym4
3079    call m(idct_8x16_internal_8bpc).main
3080    call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast
3081.pass2:
3082    vpbroadcastd        m12, [o(pw_8192)]
3083    vshufi32x4           m7, m3, m11, q2020 ;  7 15 23 31
3084    vshufi32x4           m6, m3, m11, q3131 ;  6 14 22 30
3085    vshufi32x4           m5, m2, m10, q3131 ;  5 13 21 29
3086    vshufi32x4           m4, m2, m10, q2020 ;  4 12 20 28
3087    vshufi32x4           m3, m1, m9, q2020  ;  3 11 19 27
3088    vshufi32x4           m2, m1, m9, q3131  ;  2 10 18 26
3089    vshufi32x4           m1, m0, m8, q3131  ;  1  9 17 15
3090    vshufi32x4           m0, m8, q2020      ;  0  8 16 24
3091    REPX  {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
3092    call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_8x8
3093    call .main
3094    vpbroadcastd         m8, [o(pw_2048)]
3095    REPX   {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
3096    lea                  r2, [strideq*3]
3097    lea                  r3, [dstq+strideq*4]
3098    movshdup            m12, [o(permD)]
3099    pmovzxbw             m8, [dstq+strideq*0]
3100    pmovzxbw             m9, [dstq+strideq*1]
3101    pmovzxbw            m10, [dstq+strideq*2]
3102    pmovzxbw            m11, [dstq+r2       ]
3103    paddw                m0, m8
3104    paddw                m1, m9
3105    paddw                m2, m10
3106    paddw                m3, m11
3107    pmovzxbw             m8, [r3+strideq*0]
3108    pmovzxbw             m9, [r3+strideq*1]
3109    pmovzxbw            m10, [r3+strideq*2]
3110    pmovzxbw            m11, [r3+r2       ]
3111    paddw                m4, m8
3112    paddw                m5, m9
3113    paddw                m6, m10
3114    paddw                m7, m11
3115    packuswb             m0, m1
3116    packuswb             m2, m3
3117    vpermq               m0, m12, m0
3118    vpermq               m2, m12, m2
3119    mova          [dstq+strideq*0], ym0
3120    vextracti32x8 [dstq+strideq*1], m0, 1
3121    mova          [dstq+strideq*2], ym2
3122    vextracti32x8 [dstq+r2       ], m2, 1
3123    packuswb             m4, m5
3124    packuswb             m6, m7
3125    vpermq               m4, m12, m4
3126    vpermq               m6, m12, m6
3127    mova          [r3+strideq*0], ym4
3128    vextracti32x8 [r3+strideq*1], m4, 1
3129    mova          [r3+strideq*2], ym6
3130    vextracti32x8 [r3+r2       ], m6, 1
3131    RET
3132.dconly:
3133    movsx               r6d, word [cq]
3134    mov                [cq], eobd
3135    mov                 r3d, 8
3136.dconly2:
3137    imul                r6d, 181
3138    add                 r6d, 128+512
3139    sar                 r6d, 8+2
3140.dconly3:
3141    imul                r6d, 181
3142    add                 r6d, 128+2048
3143    sar                 r6d, 8+4
3144    pxor                 m2, m2
3145    vpbroadcastw         m3, r6d
3146.dconly_loop:
3147    mova                ym1, [dstq+strideq*0]
3148    vinserti32x8         m1, [dstq+strideq*1], 1
3149    punpcklbw            m0, m1, m2
3150    punpckhbw            m1, m2
3151    paddw                m0, m3
3152    paddw                m1, m3
3153    packuswb             m0, m1
3154    mova          [dstq+strideq*0], ym0
3155    vextracti32x8 [dstq+strideq*1], m0, 1
3156    lea                dstq, [dstq+strideq*2]
3157    sub                 r3d, 2
3158    jg .dconly_loop
3159    RET
3160ALIGN function_align
3161.main:
3162    vpbroadcastd       m10, [o(pd_2048)]
3163.main2:
3164    ITX_MULSUB_2W        5, 3, 8, 9, 10, 3406, 2276 ; t5a, t6a
3165    ITX_MULSUB_2W        1, 7, 8, 9, 10,  799, 4017 ; t4a, t7a
3166    ITX_MULSUB_2W        2, 6, 8, 9, 10, 1567, 3784 ; t2, t3
3167    vpbroadcastd       m11, [o(pw_2896_2896)]
3168    vpbroadcastd       m12, [o(pw_m2896_2896)]
3169    ITX_MULSUB_2W        0, 4, 8, 9, 10, 11, 12 ; t1, t0
3170.main3:
3171    paddsw              m8, m1, m5 ; t4
3172    psubsw              m1, m5     ; t5a
3173    paddsw              m9, m7, m3 ; t7
3174    psubsw              m7, m3     ; t6a
3175    ITX_MULSUB_2W        7, 1, 3, 5, 10, 11, 12 ; t5, t6
3176    psubsw              m5, m0, m2 ; dct4 out2
3177    paddsw              m2, m0     ; dct4 out1
3178    paddsw              m0, m4, m6 ; dct4 out0
3179    psubsw              m4, m6     ; dct4 out3
3180    psubsw              m6, m2, m1 ; out6
3181    paddsw              m1, m2     ; out1
3182    paddsw              m2, m5, m7 ; out2
3183    psubsw              m5, m7     ; out5
3184    psubsw              m7, m0, m9 ; out7
3185    paddsw              m0, m9     ; out0
3186    paddsw              m3, m4, m8 ; out3
3187    psubsw              m4, m8     ; out4
3188    ret
3189
3190cglobal inv_txfm_add_identity_identity_8x32_8bpc, 3, 5, 0, dst, stride, c
3191    vpbroadcastd         m7, [pw_5]
3192    paddsw               m0, m7, [cq+64*0]
3193    paddsw               m1, m7, [cq+64*1]
3194    vpbroadcastd        ym9, strided
3195    paddsw               m2, m7, [cq+64*2]
3196    paddsw               m3, m7, [cq+64*3]
3197    paddsw               m4, m7, [cq+64*4]
3198    paddsw               m5, m7, [cq+64*5]
3199    paddsw               m6, m7, [cq+64*6]
3200    paddsw               m7,     [cq+64*7]
3201    pmulld             ym14, ym9, [pd_0to15]
3202    lea                  r3, [dstq+strideq*1]
3203    lea                  r4, [dstq+strideq*2]
3204    kxnorb               k1, k1, k1
3205    pxor                m13, m13
3206    add                  r1, r4 ; dstq+strideq*3
3207    kmovb                k2, k1
3208    vpgatherdq       m9{k1}, [r0+ym14*4]
3209    kmovb                k1, k2
3210    vpgatherdq      m10{k2}, [r3+ym14*4]
3211    kmovb                k2, k1
3212    call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_8x8
3213    REPX       {psraw x, 3}, m0, m1, m2, m3, m4, m5, m6, m7
3214    vpgatherdq      m11{k1}, [r4+ym14*4]
3215    kmovb                k1, k2
3216    vpgatherdq      m12{k2}, [r1+ym14*4]
3217    REPX {mova [cq+64*x], m13}, 0, 1, 2, 3, 4, 5, 6, 7
3218    punpcklbw            m8, m9, m13  ;  0  8 16 24
3219    punpckhbw            m9, m13      ;  4 12 20 28
3220    paddw                m0, m8
3221    paddw                m4, m9
3222    packuswb             m0, m4
3223    kmovb                k2, k1
3224    vpscatterdq [r0+ym14*4]{k1}, m0
3225    punpcklbw            m8, m10, m13 ;  1  9 17 25
3226    punpckhbw           m10, m13      ;  5 13 21 29
3227    paddw                m1, m8
3228    paddw                m5, m10
3229    packuswb             m1, m5
3230    kmovb                k1, k2
3231    vpscatterdq [r3+ym14*4]{k2}, m1
3232    punpcklbw            m8, m11, m13 ;  2 10 18 26
3233    punpckhbw           m11, m13      ;  6 14 22 30
3234    paddw                m2, m8
3235    paddw                m6, m11
3236    packuswb             m2, m6
3237    kmovb                k2, k1
3238    vpscatterdq [r4+ym14*4]{k1}, m2
3239    punpcklbw            m8, m12, m13 ;  3 11 19 27
3240    punpckhbw           m12, m13      ;  7 15 23 31
3241    paddw                m3, m8
3242    paddw                m7, m12
3243    packuswb             m3, m7
3244    vpscatterdq [r1+ym14*4]{k2}, m3
3245    RET
3246
3247cglobal inv_txfm_add_identity_identity_32x8_8bpc, 3, 5, 0, dst, stride, c
3248    vpbroadcastd         m0, [pw_4096]
3249    pmulhrsw             m3, m0, [cq+64*0]
3250    pmulhrsw             m4, m0, [cq+64*4]
3251    pmulhrsw             m6, m0, [cq+64*1]
3252    pmulhrsw             m5, m0, [cq+64*5]
3253    pmulhrsw             m7, m0, [cq+64*2]
3254    pmulhrsw             m2, m0, [cq+64*6]
3255    pmulhrsw             m8, m0, [cq+64*3]
3256    pmulhrsw             m0,     [cq+64*7]
3257    mova                m13, [int8_permA]
3258    lea                  r3, [strideq*3]
3259    lea                  r4, [dstq+strideq*4]
3260    punpckldq            m1, m3, m4
3261    punpckhdq            m3, m4
3262    punpckldq            m4, m6, m5
3263    punpckhdq            m6, m5
3264    punpckldq            m5, m7, m2
3265    punpckhdq            m7, m2
3266    punpckldq            m2, m8, m0
3267    punpckhdq            m8, m0
3268    mova                ym9, [dstq+strideq*0]
3269    vinserti32x8         m9, [dstq+strideq*2], 1
3270    mova               ym10, [dstq+strideq*1]
3271    vinserti32x8        m10, [dstq+r3       ], 1
3272    mova               ym11, [r4+strideq*0]
3273    vinserti32x8        m11, [r4+strideq*2], 1
3274    mova               ym12, [r4+strideq*1]
3275    vinserti32x8        m12, [r4+r3       ], 1
3276    REPX {vpermb x, m13, x}, m1, m4, m5, m2, m3, m6, m7, m8
3277    pxor                m13, m13
3278    REPX {mova [cq+64*x], m13}, 0, 1, 2, 3, 4, 5, 6, 7
3279    punpcklqdq           m0, m1, m4 ; a0 a2   c0 c2
3280    punpckhqdq           m1, m4     ; b0 b2   d0 d2
3281    punpcklqdq           m4, m5, m2 ; a1 a3   c1 c3
3282    punpckhqdq           m5, m2     ; b1 b3   d1 d3
3283    punpcklqdq           m2, m3, m6 ; e0 e2   g0 g2
3284    punpckhqdq           m3, m6     ; f0 f2   h0 h2
3285    punpcklqdq           m6, m7, m8 ; e1 e3   g1 g3
3286    punpckhqdq           m7, m8     ; f1 f3   h1 h3
3287    punpcklbw            m8, m9, m13
3288    punpckhbw            m9, m13
3289    paddw                m0, m8
3290    paddw                m4, m9
3291    packuswb             m0, m4
3292    mova          [dstq+strideq*0], ym0
3293    vextracti32x8 [dstq+strideq*2], m0, 1
3294    punpcklbw            m8, m10, m13
3295    punpckhbw           m10, m13
3296    paddw                m1, m8
3297    paddw                m5, m10
3298    packuswb             m1, m5
3299    mova          [dstq+strideq*1], ym1
3300    vextracti32x8 [dstq+r3       ], m1, 1
3301    punpcklbw            m8, m11, m13
3302    punpckhbw           m11, m13
3303    paddw                m2, m8
3304    paddw                m6, m11
3305    packuswb             m2, m6
3306    mova          [r4+strideq*0], ym2
3307    vextracti32x8 [r4+strideq*2], m2, 1
3308    punpcklbw            m8, m12, m13
3309    punpckhbw           m12, m13
3310    paddw                m3, m8
3311    paddw                m7, m12
3312    packuswb             m3, m7
3313    mova          [r4+strideq*1], ym3
3314    vextracti32x8 [r4+r3       ], m3, 1
3315    RET
3316
3317%macro IDCT_16x32_END 3 ; src[1-2], row
3318    mova                xm8, [dstq+strideq*0]
3319    vinserti32x4        ym8, [dstq+strideq*1], 1
3320    mova                xm9, [dstq+r3       ]
3321    vinserti32x4        ym9, [dstq+strideq*2], 1
3322    pmulhrsw            m%1, m10
3323    pmulhrsw            m%2, m10
3324    vpermb               m8, m11, m8
3325    vpermb               m9, m11, m9
3326    mova   [cq+64*(%3*2+0)], m13
3327    mova   [cq+64*(%3*2+1)], m13
3328    paddw                m8, m%1
3329    paddw                m9, m%2
3330    packuswb             m8, m9
3331    vpermd               m8, m12, m8
3332    mova          [dstq+strideq*0], xm8
3333    vextracti32x4 [dstq+strideq*1], ym8, 1
3334    vextracti32x4 [dstq+strideq*2], m8, 2
3335    vextracti32x4 [dstq+r3       ], m8, 3
3336%if %1 != 20
3337    lea                dstq, [dstq+strideq*4]
3338%endif
3339%endmacro
3340
3341cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 4, 22, dst, stride, c, eob
3342%undef cmp
3343    lea                  r5, [o_base]
3344    test               eobd, eobd
3345    jz .dconly
3346    vpbroadcastd        m15, [o(pw_2896x8)]
3347    cmp                eobd, 151
3348    jb .fast
3349    pmulhrsw             m5, m15, [cq+64*10]
3350    pmulhrsw             m3, m15, [cq+64* 6]
3351    pmulhrsw             m1, m15, [cq+64* 2]
3352    pmulhrsw             m7, m15, [cq+64*14]
3353    pmulhrsw             m2, m15, [cq+64* 4]
3354    pmulhrsw             m6, m15, [cq+64*12]
3355    pmulhrsw             m0, m15, [cq+64* 0]
3356    pmulhrsw             m4, m15, [cq+64* 8]
3357    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
3358    pmulhrsw            m14, m15, [cq+64* 1]
3359    pmulhrsw            m21, m15, [cq+64*15]
3360    pmulhrsw            m18, m15, [cq+64* 9]
3361    pmulhrsw            m17, m15, [cq+64* 7]
3362    pmulhrsw            m16, m15, [cq+64* 5]
3363    pmulhrsw            m19, m15, [cq+64*11]
3364    pmulhrsw            m20, m15, [cq+64*13]
3365    pmulhrsw            m15,      [cq+64* 3]
3366    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
3367    mova                 m8, [o(idct_16x32p)]
3368    vpbroadcastd         m9, [o(pw_16384)]
3369    REPX {vpermb x, m8, x}, m0,  m1,  m2,  m3,  m4,  m5,  m6,  m7, \
3370                            m14, m15, m16, m17, m18, m19, m20, m21
3371    punpckldq            m8, m0, m1
3372    punpckhdq            m0, m1
3373    punpckldq            m1, m2, m3
3374    punpckhdq            m2, m3
3375    REPX   {pmulhrsw x, m9}, m8, m0, m1, m2
3376    punpckldq            m3, m4, m5
3377    punpckhdq            m4, m5
3378    punpckldq            m5, m6, m7
3379    punpckhdq            m6, m7
3380    REPX   {pmulhrsw x, m9}, m3, m4, m5, m6
3381    punpckldq            m7, m14, m15
3382    punpckhdq           m14, m15
3383    punpckldq           m15, m16, m17
3384    punpckhdq           m16, m17
3385    REPX   {pmulhrsw x, m9}, m7, m14, m15, m16
3386    punpckldq           m17, m18, m19
3387    punpckhdq           m18, m19
3388    punpckldq           m19, m20, m21
3389    punpckhdq           m20, m21
3390    REPX   {pmulhrsw x, m9}, m17, m18, m19, m20
3391    punpcklqdq          m21, m8, m1
3392    punpckhqdq           m8, m1
3393    punpcklqdq           m1, m0, m2
3394    punpckhqdq           m0, m2
3395    punpcklqdq           m2, m3, m5
3396    punpckhqdq           m3, m5
3397    punpcklqdq           m5, m4, m6
3398    punpckhqdq           m4, m6
3399    punpcklqdq           m6, m7, m15
3400    punpckhqdq           m7, m15
3401    punpcklqdq          m15, m14, m16
3402    punpckhqdq          m14, m16
3403    punpcklqdq          m16, m17, m19
3404    punpckhqdq          m17, m19
3405    punpcklqdq          m19, m18, m20
3406    punpckhqdq          m18, m20
3407    vinserti32x8        m20, m21, ym2, 1
3408    vshufi32x4          m21, m2, q3232
3409    vinserti32x8         m2, m8, ym3, 1
3410    vshufi32x4           m8, m3, q3232
3411    vinserti32x8         m3, m1, ym5, 1
3412    vshufi32x4           m1, m5, q3232
3413    vinserti32x8         m5, m0, ym4, 1
3414    vshufi32x4           m0, m4, q3232
3415    vinserti32x8         m4, m6, ym16, 1
3416    vshufi32x4           m6, m16, q3232
3417    vinserti32x8        m16, m7, ym17, 1
3418    vshufi32x4           m7, m17, q3232
3419    vinserti32x8        m17, m15, ym19, 1
3420    vshufi32x4          m15, m19, q3232
3421    vinserti32x8        m19, m14, ym18, 1
3422    vshufi32x4          m14, m18, q3232
3423    vshufi32x4          m18, m21, m6, q3131 ; 27  5
3424    vshufi32x4          m21, m6, q2020      ; 31  1
3425    vshufi32x4           m6, m8, m7, q2020  ; 24  8
3426    vshufi32x4           m8, m7, q3131      ; 30  2
3427    vshufi32x4           m7, m1, m15, q2020 ; 28  4
3428    vshufi32x4           m1, m15, q3131     ;  6 26
3429    vshufi32x4          m15, m0, m14, q2020 ;  7 25
3430    vshufi32x4           m0, m14, q3131     ; 14 18
3431    vshufi32x4          m14, m20, m4, q2020 ;  3 29
3432    vshufi32x4          m20, m4, q3131      ; 23  9
3433    vshufi32x4           m9, m3, m17, q2020 ; 16  0
3434    vshufi32x4           m3, m17, q3131     ; 12 20
3435    vshufi32x4          m17, m5, m19, q2020 ; 15 17
3436    vshufi32x4           m5, m19, q3131     ; 22 10
3437    vshufi32x4          m19, m2, m16, q2020 ; 19 13
3438    vshufi32x4          m16, m2, m16, q3131 ; 11 21
3439    call m(idct_16x16_internal_8bpc).main3
3440    call .main_oddhalf
3441    jmp .pass2
3442.fast: ; right half is zero
3443    mova                ym8, [cq+64*15]
3444    vinserti32x8         m8, [cq+64* 1], 1
3445    mova                 m2, [o(int16_perm)]
3446    mova                ym9, [cq+64* 8]
3447    vinserti32x8         m9, [cq+64* 0], 1
3448    mova                ym0, [cq+64* 7]
3449    vinserti32x8         m0, [cq+64* 9], 1
3450    mova                ym7, [cq+64*14]
3451    vinserti32x8         m7, [cq+64* 2], 1
3452    mova                ym1, [cq+64* 3]
3453    vinserti32x8         m1, [cq+64*13], 1
3454    mova                ym3, [cq+64* 6]
3455    vinserti32x8         m3, [cq+64*10], 1
3456    mova                ym5, [cq+64*11]
3457    vinserti32x8         m5, [cq+64* 5], 1
3458    mova                ym6, [cq+64*12]
3459    vinserti32x8         m6, [cq+64* 4], 1
3460    REPX  {pmulhrsw x, m15}, m8, m9, m0, m7, m1, m3, m5, m6
3461    REPX  {vpermb x, m2, x}, m8, m9, m0, m7, m1, m3, m5, m6
3462    call m(idct_16x16_internal_8bpc).main2
3463    vbroadcasti32x4      m8, [o(int_shuf3)]
3464    vbroadcasti32x4      m9, [o(int_shuf4)]
3465    vpbroadcastd        m11, [o(pw_16384)]
3466    pshufb               m0, m8
3467    pshufb               m1, m9
3468    pshufb               m2, m8
3469    pshufb               m3, m9
3470    REPX  {pmulhrsw x, m11}, m0, m1, m2, m3
3471    pshufb               m4, m8
3472    pshufb               m5, m9
3473    pshufb               m6, m8
3474    pshufb               m7, m9
3475    REPX  {pmulhrsw x, m11}, m4, m5, m6, m7
3476    punpckhdq           m17, m0, m1
3477    punpckldq            m0, m1
3478    punpckhdq           m16, m2, m3
3479    punpckldq            m2, m3
3480    punpckhdq           m18, m4, m5
3481    punpckldq            m4, m5
3482    punpckhdq            m5, m6, m7
3483    punpckldq            m6, m7
3484    vinserti32x8         m1, m0, ym2, 1
3485    vshufi32x4           m3, m0, m2, q3232
3486    vinserti32x8         m2, m4, ym6, 1
3487    vshufi32x4           m4, m6, q3232
3488    vinserti32x8        m15, m17, ym16, 1
3489    vshufi32x4          m17, m16, q3232
3490    vinserti32x8        m16, m18, ym5, 1
3491    vshufi32x4          m18, m5, q3232
3492    vshufi32x4           m0, m1, m2, q2020   ;  0  2
3493    vshufi32x4           m1, m2, q3131       ;  4  6
3494    vshufi32x4           m2, m3, m4, q2020   ;  8 10
3495    vshufi32x4           m3, m4, q3131       ; 12 14
3496    vshufi32x4          m14, m15, m16, q2020 ;  1  3
3497    vshufi32x4          m15, m16, q3131      ;  5  7
3498    vshufi32x4          m16, m17, m18, q2020 ;  9 11
3499    vshufi32x4          m17, m18, q3131      ; 13 15
3500    pxor                 m6, m6
3501    punpckhwd            m8, m0, m0
3502    punpcklwd            m9, m6, m0
3503    punpckhwd            m0, m3, m3
3504    punpckhwd            m5, m2, m2
3505    punpcklwd            m7, m1, m1
3506    punpckhwd            m1, m1
3507    punpcklwd            m3, m3
3508    punpcklwd            m6, m2
3509    call m(idct_16x16_internal_8bpc).main_fast5
3510    punpcklwd           m21, m14, m14
3511    punpckhwd           m14, m14
3512    punpcklwd           m18, m15, m15
3513    punpckhwd           m15, m15
3514    punpcklwd           m20, m16, m16
3515    punpckhwd           m16, m16
3516    punpcklwd           m19, m17, m17
3517    punpckhwd           m17, m17
3518    call .main_oddhalf_fast
3519.pass2:
3520    vpbroadcastd        m10, [o(pw_2048)]
3521    mova                m11, [o(end_16x32p)]
3522    lea                  r3, [strideq*3]
3523    pxor                m13, m13
3524    psrld               m12, m11, 8
3525    IDCT_16x32_END        0,  1,  0
3526    IDCT_16x32_END        2,  3,  1
3527    IDCT_16x32_END        4,  5,  2
3528    IDCT_16x32_END        6,  7,  3
3529    IDCT_16x32_END       14, 15,  4
3530    IDCT_16x32_END       16, 17,  5
3531    IDCT_16x32_END       18, 19,  6
3532    IDCT_16x32_END       20, 21,  7
3533    RET
3534ALIGN function_align
3535.dconly:
3536    movsx               r6d, word [cq]
3537    mov                [cq], eobd
3538    mov                 r3d, 32
3539    jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly
3540ALIGN function_align
3541.main_oddhalf_fast2: ; bottom three-quarters are zero
3542    vpbroadcastd         m8, [o(pw_201_4091x8)]
3543    vpbroadcastd        m20, [o(pw_m1380_3857x8)]
3544    vpbroadcastd         m9, [o(pw_995_3973x8)]
3545    vpbroadcastd        m16, [o(pw_m601_4052x8)]
3546    pmulhrsw            m21, m8  ; t16a, t31a
3547    pmulhrsw            m20, m15 ; t19a, t28a
3548    pmulhrsw            m18, m9  ; t20a, t27a
3549    pmulhrsw            m14, m16 ; t23a, t24a
3550    mova                 m8, m21
3551    mova                m17, m20
3552    mova                m15, m18
3553    mova                m16, m14
3554    jmp .main3
3555ALIGN function_align
3556.main_oddhalf_fast: ; bottom half is zero
3557    vpbroadcastd         m8, [o(pw_201_4091x8)]
3558    vpbroadcastd         m9, [o(pw_m2751_3035x8)]
3559    vpbroadcastd        m11, [o(pw_1751_3703x8)]
3560    vpbroadcastd        m12, [o(pw_m1380_3857x8)]
3561    pmulhrsw            m21, m8  ; t16a, t31a
3562    vpbroadcastd         m8, [o(pw_995_3973x8)]
3563    pmulhrsw            m17, m9  ; t17a, t30a
3564    vpbroadcastd         m9, [o(pw_m2106_3513x8)]
3565    pmulhrsw            m20, m11 ; t18a, t29a
3566    vpbroadcastd        m11, [o(pw_2440_3290x8)]
3567    pmulhrsw            m15, m12 ; t19a, t28a
3568    vpbroadcastd        m12, [o(pw_m601_4052x8)]
3569    pmulhrsw            m18, m8  ; t20a, t27a
3570    pmulhrsw            m16, m9  ; t21a, t26a
3571    pmulhrsw            m19, m11 ; t22a, t25a
3572    pmulhrsw            m14, m12 ; t23a, t24a
3573    jmp .main2
3574ALIGN function_align
3575.main_oddhalf:
3576    ITX_MUL2X_PACK       21, 8, 9, 10,  201, 4091, 5 ; t16a, t31a
3577    ITX_MUL2X_PACK       17, 8, 9, 10, 3035, 2751, 5 ; t17a, t30a
3578    ITX_MUL2X_PACK       20, 8, 9, 10, 1751, 3703, 5 ; t18a, t29a
3579    ITX_MUL2X_PACK       15, 8, 9, 10, 3857, 1380, 5 ; t19a, t28a
3580    ITX_MUL2X_PACK       18, 8, 9, 10,  995, 3973, 5 ; t20a, t27a
3581    ITX_MUL2X_PACK       16, 8, 9, 10, 3513, 2106, 5 ; t21a, t26a
3582    ITX_MUL2X_PACK       19, 8, 9, 10, 2440, 3290, 5 ; t22a, t25a
3583    ITX_MUL2X_PACK       14, 8, 9, 10, 4052,  601, 5 ; t23a, t24a
3584.main2:
3585    psubsw               m8, m21, m17 ; t17 t30
3586    paddsw              m21, m17      ; t16 t31
3587    psubsw              m17, m15, m20 ; t18 t29
3588    paddsw              m20, m15      ; t19 t28
3589    psubsw              m15, m18, m16 ; t21 t26
3590    paddsw              m18, m16      ; t20 t27
3591    psubsw              m16, m14, m19 ; t22 t25
3592    paddsw              m14, m19      ; t23 t24
3593.main3:
3594    ITX_MUL2X_PACK        8, 9, 19, 10,   799, 4017, 5 ; t17a t30a
3595    ITX_MUL2X_PACK       17, 9, 19, 10, m4017,  799, 5 ; t18a t29a
3596    ITX_MUL2X_PACK       15, 9, 19, 10,  3406, 2276, 5 ; t21a t26a
3597    ITX_MUL2X_PACK       16, 9, 19, 10, m2276, 3406, 5 ; t22a t25a
3598    vpbroadcastd        m11, [o(pw_m3784_1567)]
3599    psubsw              m19, m21, m20 ; t19a t28a
3600    paddsw              m21, m20      ; t16a t31a
3601    psubsw              m20, m14, m18 ; t20a t27a
3602    paddsw              m14, m18      ; t23a t24a
3603    psubsw              m18, m8, m17  ; t18  t29
3604    paddsw               m8, m17      ; t17  t30
3605    psubsw              m17, m16, m15 ; t21  t26
3606    paddsw              m15, m16      ; t22  t25
3607    ITX_MUL2X_PACK       18, 9, 16, 10, 1567_3784, 11,   20 ; t18a t29a
3608    ITX_MUL2X_PACK       19, 9, 16, 10, 1567_3784, 11,   20 ; t19  t28
3609    ITX_MUL2X_PACK       20, 9, 16, 10, 11, m1567_m3784, 36 ; t20  t27
3610    ITX_MUL2X_PACK       17, 9, 16, 10, 11, m1567_m3784, 36 ; t21a t26a
3611    vbroadcasti32x4      m9, [o(deint_shuf)]
3612    psubsw              m16, m21, m14 ; t23  t24
3613    paddsw              m14, m21      ; t16  t31
3614    psubsw              m21, m8, m15  ; t22a t25a
3615    paddsw              m15, m8       ; t17a t30a
3616    psubsw               m8, m18, m17 ; t21  t26
3617    paddsw              m18, m17      ; t18  t29
3618    paddsw              m17, m19, m20 ; t19a t28a
3619    psubsw              m19, m20      ; t20a t27a
3620    vpbroadcastd        m11, [o(pw_m2896_2896)]
3621    vpbroadcastd        m12, [o(pw_2896_2896)]
3622    REPX     {pshufb x, m9}, m14, m15, m18, m17
3623    mova                 m9, m10
3624    vpdpwssd             m9, m16, m11
3625    mova                m20, m10
3626    vpdpwssd            m20, m21, m11
3627    psrad                m9, 12
3628    psrad               m20, 12
3629    packssdw             m9, m20      ; t23a t22
3630    mova                m20, m10
3631    vpdpwssd            m20, m16, m12
3632    mova                m16, m10
3633    vpdpwssd            m16, m21, m12
3634    psrad               m20, 12
3635    psrad               m16, 12
3636    packssdw            m16, m20, m16 ; t24a t25
3637    ITX_MUL2X_PACK        8, 21, 20, 10, 11, 12, 8 ; t21a t26a
3638    ITX_MUL2X_PACK       19,  8, 11, 10, 11, 12, 8 ; t20  t27
3639    packssdw            m11, m20      ; t27  t26a
3640    packssdw             m8, m21      ; t20  t21a
3641    punpcklqdq          m20, m14, m15 ; t16  t17a
3642    punpckhqdq          m14, m15      ; t31  t30a
3643    punpckhqdq          m15, m17, m18 ; t28a t29
3644    punpcklqdq          m17, m18      ; t19a t18
3645    psubsw              m21, m0, m14  ; out31 out30
3646    paddsw               m0, m14      ; out0  out1
3647    psubsw              m14, m7, m20  ; out16 out17
3648    paddsw               m7, m20      ; out15 out14
3649    psubsw              m20, m1, m15  ; out28 out29
3650    paddsw               m1, m15      ; out3  out2
3651    psubsw              m15, m6, m17  ; out19 out18
3652    paddsw               m6, m17      ; out12 out13
3653    psubsw              m17, m4, m9   ; out23 out22
3654    paddsw               m4, m9       ; out8  out9
3655    psubsw              m18, m3, m16  ; out24 out25
3656    paddsw               m3, m16      ; out7  out6
3657    psubsw              m16, m5, m8   ; out20 out21
3658    paddsw               m5, m8       ; out11 out10
3659    psubsw              m19, m2, m11  ; out27 out26
3660    paddsw               m2, m11      ; out4  out5
3661    ret
3662
3663cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 6, 22, dst, stride, c, eob
3664%undef cmp
3665    lea                  r5, [o_base]
3666    test               eobd, eobd
3667    jz .dconly
3668    mova                m21, [o(permB)]
3669    vpermq               m1, m21, [cq+64* 0] ;  0  1
3670    vpermq              m14, m21, [cq+64* 1] ;  2  3
3671    vpermq              m20, m21, [cq+64* 2] ;  4  5
3672    vpermq              m15, m21, [cq+64* 3] ;  6  7
3673    vpbroadcastd         m8, [o(pw_2896x8)]
3674    vpermq               m2, m21, [cq+64* 4] ;  8  9
3675    vpermq              m16, m21, [cq+64* 5] ; 10 11
3676    vpermq               m3, m21, [cq+64* 6] ; 12 13
3677    vpermq              m17, m21, [cq+64* 7] ; 14 15
3678    REPX   {pmulhrsw x, m8}, m1, m14, m20, m15, m2, m16, m3, m17
3679    pxor                m12, m12
3680    REPX {mova [cq+64*x], m12}, 0, 1, 2, 3, 4, 5, 6, 7
3681    cmp                eobd, 151
3682    jb .fast
3683    vpermq               m9, m21, [cq+64* 8] ; 16 17
3684    vpermq              m19, m21, [cq+64* 9] ; 18 19
3685    vpermq               m4, m21, [cq+64*10] ; 20 21
3686    vpermq               m5, m21, [cq+64*11] ; 22 23
3687    vpermq               m6, m21, [cq+64*12] ; 24 25
3688    vpermq              m18, m21, [cq+64*13] ; 26 27
3689    vpermq               m7, m21, [cq+64*14] ; 28 29
3690    vpermq              m21, m21, [cq+64*15] ; 30 31
3691    REPX   {pmulhrsw x, m8}, m9, m19, m4, m5, m6, m18, m7, m21
3692    REPX {mova [cq+64*x], m12}, 8, 9, 10, 11, 12, 13, 14, 15
3693    punpcklwd            m8, m21, m14 ; 30  2
3694    punpckhwd           m21, m1       ; 31  1
3695    punpcklwd            m0, m17, m19 ; 14 18
3696    punpckhwd           m17, m9       ; 15 17
3697    punpcklwd            m9, m1       ; 16  0
3698    punpckhwd           m14, m7       ;  3 29
3699    punpcklwd            m1, m15, m18 ;  6 26
3700    punpckhwd           m15, m6       ;  7 25
3701    punpcklwd            m6, m2       ; 24  8
3702    punpckhwd           m19, m3       ; 19 13
3703    punpcklwd            m3, m4       ; 12 20
3704    punpckhwd           m18, m20      ; 27  5
3705    punpcklwd            m7, m20      ; 28  4
3706    punpckhwd           m20, m5, m2   ; 23  9
3707    punpcklwd            m5, m16      ; 22 10
3708    punpckhwd           m16, m4       ; 11 21
3709    call m(idct_16x16_internal_8bpc).main2
3710    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
3711    jmp .pass2
3712.fast: ; bottom half zero
3713    punpcklwd            m8, m14, m14 ;  2
3714    punpcklwd            m0, m17, m17 ; 14
3715    punpcklwd            m5, m16, m16 ; 10
3716    punpcklwd            m9, m12, m1  ; __  0
3717    punpckhwd           m21, m1, m1   ;  1
3718    punpcklwd            m1, m15, m15 ;  6
3719    punpcklwd            m7, m20, m20 ;  4
3720    punpckhwd           m19, m3, m3   ; 13
3721    punpcklwd            m3, m3       ; 12
3722    punpcklwd            m6, m12, m2  ; __  8
3723    punpckhwd           m18, m20, m20 ;  5
3724    punpckhwd           m20, m2, m2   ;  9
3725    call m(idct_16x16_internal_8bpc).main_fast
3726    punpckhwd           m15, m15      ;  7
3727    punpckhwd           m14, m14      ;  3
3728    punpckhwd           m16, m16      ; 11
3729    punpckhwd           m17, m17      ; 15
3730    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
3731.pass2:
3732    vpbroadcastd         m9, [o(pw_16384)]
3733    call .transpose_round
3734    vshufi32x4          m16, m14, m2, q3131 ;  5
3735    vshufi32x4          m14, m2, q2020      ;  1
3736    vshufi32x4           m2, m0, m3, q3131  ;  4
3737    vshufi32x4           m0, m3, q2020      ;  0
3738    vshufi32x4           m3, m1, m18, q3131 ;  6
3739    vshufi32x4           m1, m18, q2020     ;  2
3740    vshufi32x4          m18, m20, m6, q2020 ;  9
3741    vshufi32x4          m20, m6, q3131      ; 13
3742    vshufi32x4           m6, m21, m4, q3131 ; 12
3743    vshufi32x4           m4, m21, m4, q2020 ;  8
3744    vshufi32x4          m21, m19, m7, q3131 ; 15
3745    vshufi32x4          m19, m7, q2020      ; 11
3746    vshufi32x4           m7, m5, m15, q3131 ; 14
3747    vshufi32x4           m5, m15, q2020     ; 10
3748    vshufi32x4          m15, m17, m9, q2020 ;  3
3749    vshufi32x4          m17, m9, q3131      ;  7
3750    call m(inv_txfm_add_dct_dct_32x8_8bpc).main2
3751    call .main_oddhalf
3752    vpbroadcastd        m12, [o(pw_2048)]
3753    movshdup            m13, [o(permD)]
3754    lea                  r2, [strideq*3]
3755    pmovzxbw             m8, [dstq+strideq*0]
3756    pmovzxbw             m9, [dstq+strideq*1]
3757    pmovzxbw            m10, [dstq+strideq*2]
3758    pmovzxbw            m11, [dstq+r2       ]
3759    REPX  {pmulhrsw x, m12}, m0, m1, m2, m3
3760    lea                  r3, [dstq+strideq*4]
3761    paddw                m0, m8
3762    paddw                m1, m9
3763    paddw                m2, m10
3764    paddw                m3, m11
3765    pmovzxbw             m8, [r3+strideq*0]
3766    pmovzxbw             m9, [r3+strideq*1]
3767    pmovzxbw            m10, [r3+strideq*2]
3768    pmovzxbw            m11, [r3+r2       ]
3769    REPX  {pmulhrsw x, m12}, m4, m5, m6, m7
3770    lea                  r4, [dstq+strideq*8]
3771    packuswb             m0, m1
3772    paddw                m4, m8
3773    paddw                m5, m9
3774    packuswb             m2, m3
3775    paddw                m6, m10
3776    paddw                m7, m11
3777    pmovzxbw             m8, [r4+strideq*0]
3778    pmovzxbw             m9, [r4+strideq*1]
3779    pmovzxbw            m10, [r4+strideq*2]
3780    pmovzxbw            m11, [r4+r2       ]
3781    REPX  {pmulhrsw x, m12}, m14, m15, m16, m17
3782    lea                  r5, [r3+strideq*8]
3783    packuswb             m4, m5
3784    paddw               m14, m8
3785    paddw               m15, m9
3786    packuswb             m6, m7
3787    paddw               m16, m10
3788    paddw               m17, m11
3789    pmovzxbw             m8, [r5+strideq*0]
3790    pmovzxbw             m9, [r5+strideq*1]
3791    pmovzxbw            m10, [r5+strideq*2]
3792    pmovzxbw            m11, [r5+r2       ]
3793    REPX  {pmulhrsw x, m12}, m18, m19, m20, m21
3794    packuswb            m14, m15
3795    paddw               m18, m8
3796    paddw               m19, m9
3797    packuswb            m16, m17
3798    paddw               m20, m10
3799    paddw               m21, m11
3800    packuswb            m18, m19
3801    packuswb            m20, m21
3802    REPX {vpermq x, m13, x}, m0, m2, m4, m6, m14, m16, m18, m20
3803    mova          [dstq+strideq*0], ym0
3804    vextracti32x8 [dstq+strideq*1], m0, 1
3805    mova          [dstq+strideq*2], ym2
3806    vextracti32x8 [dstq+r2       ], m2, 1
3807    mova          [r3+strideq*0], ym4
3808    vextracti32x8 [r3+strideq*1], m4, 1
3809    mova          [r3+strideq*2], ym6
3810    vextracti32x8 [r3+r2       ], m6, 1
3811    mova          [r4+strideq*0], ym14
3812    vextracti32x8 [r4+strideq*1], m14, 1
3813    mova          [r4+strideq*2], ym16
3814    vextracti32x8 [r4+r2       ], m16, 1
3815    mova          [r5+strideq*0], ym18
3816    vextracti32x8 [r5+strideq*1], m18, 1
3817    mova          [r5+strideq*2], ym20
3818    vextracti32x8 [r5+r2       ], m20, 1
3819    RET
3820ALIGN function_align
3821.dconly:
3822    movsx               r6d, word [cq]
3823    mov                [cq], eobd
3824    imul                r6d, 181
3825    mov                 r3d, 16
3826    add                 r6d, 128
3827    sar                 r6d, 8
3828    imul                r6d, 181
3829    add                 r6d, 128+256
3830    sar                 r6d, 8+1
3831    jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly3
3832ALIGN function_align
3833.main_oddhalf_fast2: ; bottom three-quarters are zero
3834    vpbroadcastd         m9, [o(pw_2896x8)]
3835    vpbroadcastd         m2, [o(pw_4017x8)]
3836    vpbroadcastd         m3, [o(pw_799x8)]
3837    vpbroadcastd        m18, [o(pw_4076x8)]
3838    vpbroadcastd        m19, [o(pw_401x8)]
3839    vpbroadcastd        m20, [o(pw_m1189x8)]
3840    vpbroadcastd        m16, [o(pw_3920x8)]
3841    pmulhrsw             m9, m0  ; t0
3842    pmulhrsw             m2, m1  ; t7a
3843    pmulhrsw             m1, m3  ; t4a
3844    pmulhrsw            m18, m14 ; t15a
3845    pmulhrsw            m14, m19 ; t8a
3846    pmulhrsw            m20, m15 ; t11a
3847    pmulhrsw            m15, m16 ; t12a
3848    psubsw               m7, m9, m2 ; idct8 out7
3849    paddsw               m0, m9, m2 ; idct8 out0
3850    psubsw               m4, m9, m1 ; idct8 out4
3851    paddsw               m3, m9, m1 ; idct8 out3
3852    ITX_MULSUB_2W         2, 1, 5, 6, 10, 2896, 2896 ; t5, t6
3853    mova                m21, m18
3854    mova                m19, m14
3855    mova                m16, m15
3856    mova                 m8, m20
3857    psubsw               m6, m9, m1 ; idct8 out6
3858    paddsw               m1, m9     ; idct8 out1
3859    psubsw               m5, m9, m2 ; idct8 out5
3860    paddsw               m2, m9     ; idct8 out2
3861    jmp .main3
3862ALIGN function_align
3863.main_oddhalf_fast: ; bottom half is zero
3864    vpbroadcastd         m5, [o(pw_m2276x8)]
3865    vpbroadcastd        m11, [o(pw_3406x8)]
3866    vpbroadcastd         m7, [o(pw_4017x8)]
3867    vpbroadcastd        m12, [o(pw_799x8)]
3868    vpbroadcastd         m6, [o(pw_3784x8)]
3869    vpbroadcastd        m10, [o(pw_1567x8)]
3870    vpbroadcastd         m4, [o(pw_2896x8)]
3871    pmulhrsw             m5, m3  ; t5a
3872    pmulhrsw             m3, m11 ; t6a
3873    pmulhrsw             m7, m1  ; t7a
3874    pmulhrsw             m1, m12 ; t4a
3875    pmulhrsw             m6, m2  ; t3
3876    pmulhrsw             m2, m10 ; t2
3877    pmulhrsw             m4, m0  ; t0
3878    vpbroadcastd        m11, [o(pw_2896_2896)]
3879    vpbroadcastd        m12, [o(pw_m2896_2896)]
3880    vpbroadcastd        m10, [o(pd_2048)]
3881    mova                 m0, m4  ; t1
3882    call m(inv_txfm_add_dct_dct_32x8_8bpc).main3
3883    vpbroadcastd        m21, [o(pw_4076x8)]
3884    vpbroadcastd         m8, [o(pw_401x8)]
3885    vpbroadcastd        m18, [o(pw_m2598x8)]
3886    vpbroadcastd         m9, [o(pw_3166x8)]
3887    vpbroadcastd        m19, [o(pw_3612x8)]
3888    vpbroadcastd        m11, [o(pw_1931x8)]
3889    vpbroadcastd        m20, [o(pw_m1189x8)]
3890    vpbroadcastd        m12, [o(pw_3920x8)]
3891    pmulhrsw            m21, m14 ; t15a
3892    pmulhrsw            m14, m8  ; t8a
3893    pmulhrsw            m18, m17 ; t9a
3894    pmulhrsw            m17, m9  ; t14a
3895    pmulhrsw            m19, m16 ; t13a
3896    pmulhrsw            m16, m11 ; t10a
3897    pmulhrsw            m20, m15 ; t11a
3898    pmulhrsw            m15, m12 ; t12a
3899    jmp .main2
3900ALIGN function_align
3901.main_oddhalf:
3902    ITX_MULSUB_2W        14, 21, 8, 9, 10,  401, 4076 ; t8a,  t15a
3903    ITX_MULSUB_2W        18, 17, 8, 9, 10, 3166, 2598 ; t9a,  t14a
3904    ITX_MULSUB_2W        16, 19, 8, 9, 10, 1931, 3612 ; t10a, t13a
3905    ITX_MULSUB_2W        20, 15, 8, 9, 10, 3920, 1189 ; t11a, t12a
3906.main2:
3907    paddsw               m8, m20, m16 ; t11
3908    psubsw              m20, m16      ; t10
3909    paddsw              m16, m15, m19 ; t12
3910    psubsw              m15, m19      ; t13
3911    psubsw              m19, m14, m18 ; t9
3912    paddsw              m14, m18      ; t8
3913    psubsw              m18, m21, m17 ; t14
3914    paddsw              m21, m17      ; t15
3915.main3:
3916    vpbroadcastd        m11, [o(pw_1567_3784)]
3917    vpbroadcastd        m12, [o(pw_m3784_1567)]
3918    ITX_MULSUB_2W        18, 19, 9, 17, 10, 11, 12 ; t9a,  t14a
3919    vpbroadcastd        m11, [o(pw_m1567_m3784)]
3920    ITX_MULSUB_2W        15, 20, 9, 17, 10, 12, 11 ; t10a, t13a
3921    vpbroadcastd        m11, [o(pw_2896_2896)]
3922    vpbroadcastd        m12, [o(pw_m2896_2896)]
3923    psubsw              m17, m14, m8  ; t11a
3924    paddsw               m8, m14      ; t8a
3925    paddsw              m14, m18, m15 ; t9
3926    psubsw              m18, m15      ; t10
3927    psubsw              m15, m19, m20 ; t13
3928    paddsw              m19, m20      ; t14
3929    paddsw              m20, m21, m16 ; t15a
3930    psubsw              m16, m21, m16 ; t12a
3931    ITX_MULSUB_2W        15, 18, 9, 21, 10, 11, 12 ; t10a, t13a
3932    ITX_MULSUB_2W        16, 17, 9, 21, 10, 11, 12 ; t11,  t12
3933    psubsw              m21, m0, m20 ; out15
3934    paddsw               m0, m20     ; out0
3935    psubsw              m20, m1, m19 ; out14
3936    paddsw               m1, m19     ; out1
3937    psubsw              m19, m2, m18 ; out13
3938    paddsw               m2, m18     ; out2
3939    psubsw              m18, m3, m17 ; out12
3940    paddsw               m3, m17     ; out3
3941    psubsw              m17, m4, m16 ; out11
3942    paddsw               m4, m16     ; out4
3943    psubsw              m16, m5, m15 ; out10
3944    paddsw               m5, m15     ; out5
3945    psubsw              m15, m6, m14 ; out9
3946    paddsw               m6, m14     ; out6
3947    psubsw              m14, m7, m8  ; out8
3948    paddsw               m7, m8      ; out7
3949    ret
3950.transpose_round:
3951    punpcklwd            m8, m0, m2
3952    punpckhwd            m0, m2
3953    punpcklwd            m2, m1, m3
3954    punpckhwd            m1, m3
3955    punpcklwd            m3, m4, m6
3956    punpckhwd            m4, m6
3957    punpcklwd            m6, m5, m7
3958    punpckhwd            m5, m7
3959    punpcklwd            m7, m14, m16
3960    punpckhwd           m14, m16
3961    punpcklwd           m16, m15, m17
3962    punpckhwd           m15, m17
3963    punpcklwd           m17, m19, m21
3964    punpckhwd           m19, m21
3965    punpckhwd           m21, m18, m20
3966    punpcklwd           m18, m20
3967    punpcklwd           m20, m8, m1
3968    punpckhwd            m8, m1
3969    punpcklwd            m1, m0, m2
3970    punpckhwd            m0, m2
3971    punpcklwd            m2, m3, m5
3972    punpckhwd            m3, m5
3973    punpcklwd            m5, m4, m6
3974    punpckhwd            m4, m6
3975    REPX   {pmulhrsw x, m9}, m20, m8, m1, m0
3976    punpcklwd            m6, m7, m15
3977    punpckhwd            m7, m15
3978    punpcklwd           m15, m14, m16
3979    punpckhwd           m14, m16
3980    REPX   {pmulhrsw x, m9}, m2, m3, m5, m4
3981    punpckhwd           m16, m18, m19
3982    punpcklwd           m18, m19
3983    punpcklwd           m19, m21, m17
3984    punpckhwd           m21, m17
3985    REPX   {pmulhrsw x, m9}, m6, m7, m15, m14
3986    punpcklwd           m17, m8, m0         ; a2   a6   aa   ae
3987    punpckhwd            m8, m0             ; a3   a7   ab   af
3988    punpcklwd            m0, m20, m1        ; a0   a4   a8   ac
3989    punpckhwd           m20, m1             ; a1   a5   a9   ad
3990    REPX   {pmulhrsw x, m9}, m16, m18, m19, m21
3991    punpcklwd            m1, m2, m5         ; b0   b4   b8   bc
3992    punpckhwd            m2, m5             ; b1   b5   b9   bd
3993    punpcklwd            m5, m3, m4         ; b2   b6   ba   be
3994    punpckhwd            m3, m4             ; b3   b7   bb   bf
3995    punpcklwd            m4, m6, m15        ; c0   c4   c8   cc
3996    punpckhwd            m6, m15            ; c1   c5   c9   cd
3997    punpcklwd           m15, m7, m14        ; c2   c6   ca   ce
3998    punpckhwd            m7, m14            ; c3   c7   cb   cf
3999    punpcklwd           m14, m18, m19       ; d0   d4   d8   dc
4000    punpckhwd           m18, m19            ; d1   d5   d9   dd
4001    punpcklwd            m9, m16, m21       ; d2   d6   da   de
4002    punpckhwd           m16, m21            ; d3   d7   db   df
4003    vshufi32x4          m21, m0, m1, q3232  ; a8   ac   b8   bc
4004    vinserti32x8         m0, ym1, 1         ; a0   a4   b0   b4
4005    vinserti32x8         m1, m17, ym5, 1    ; a2   a6   b2   b6
4006    vshufi32x4           m5, m17, m5, q3232 ; aa   ae   ba   be
4007    vinserti32x8        m17, m8, ym3, 1     ; a3   a7   b3   b7
4008    vshufi32x4          m19, m8, m3, q3232  ; ab   af   bb   bf
4009    vinserti32x8         m3, m4, ym14, 1    ; c0   c4   d0   d4
4010    vshufi32x4           m4, m14, q3232     ; c8   cc   d8   dc
4011    vinserti32x8        m14, m20, ym2, 1    ; a1   a5   b1   b5
4012    vshufi32x4          m20, m2, q3232      ; a9   ad   b9   bd
4013    vinserti32x8         m2, m6, ym18, 1    ; c1   c5   d1   d5
4014    vshufi32x4           m6, m18, q3232     ; c9   cd   d9   dd
4015    vinserti32x8        m18, m15, ym9, 1    ; c2   c6   d2   d6
4016    vshufi32x4          m15, m9, q3232      ; ca   ce   da   de
4017    vinserti32x8         m9, m7, ym16, 1    ; c3   c7   d3   d7
4018    vshufi32x4           m7, m16, q3232     ; cb   cf   db   df
4019    ret
4020
4021%macro IDTX_16x32 4 ; src/dst[1-4]
4022    pmulhrsw            m%1, m15, [cq+64*%1]
4023    pmulhrsw            m%2, m15, [cq+64*%2]
4024    pmulhrsw            m%3, m15, [cq+64*%3]
4025    pmulhrsw            m%4, m15, [cq+64*%4]
4026    pmulhrsw            m18, m16, m%1
4027    pmulhrsw            m19, m16, m%2
4028    pmulhrsw            m20, m16, m%3
4029    pmulhrsw            m21, m16, m%4
4030    REPX  {pmulhrsw x, m17}, m18, m19, m20, m21
4031    paddsw              m%1, m18
4032    paddsw              m%2, m19
4033    paddsw              m%3, m20
4034    paddsw              m%4, m21
4035%endmacro
4036
4037%macro IDTX_16x32_STORE 2 ; src[1-2]
4038    mova               xm17, [dstq+r3*0]
4039    vinserti128        ym17, [dstq+r3*4], 1
4040    vinserti32x4        m17, [dstq+r3*8], 2
4041    vinserti32x4        m17, [dstq+r4*8], 3
4042    mova   [cq+64*(%1*2+0)], m18
4043    mova   [cq+64*(%1*2+1)], m18
4044    punpcklbw           m16, m17, m18
4045    punpckhbw           m17, m18
4046    paddw               m16, m%1
4047    paddw               m17, m%2
4048    packuswb            m16, m17
4049    mova          [dstq+r3*0], xm16
4050    vextracti128  [dstq+r3*4], ym16, 1
4051    vextracti32x4 [dstq+r3*8], m16, 2
4052    vextracti32x4 [dstq+r4*8], m16, 3
4053%if %1 != 7
4054    add                dstq, strideq
4055%endif
4056%endmacro
4057
4058cglobal inv_txfm_add_identity_identity_16x32_8bpc, 3, 5, 22, dst, stride, c
4059    vpbroadcastd        m15, [pw_2896x8]
4060    vpbroadcastd        m16, [pw_1697x16]
4061    vpbroadcastd        m17, [pw_16384]
4062    IDTX_16x32            0,  1,  2,  3
4063    IDTX_16x32            4,  5,  6,  7
4064    IDTX_16x32            8,  9, 10, 11
4065    IDTX_16x32           12, 13, 14, 15
4066    vpbroadcastd        m16, [pw_8192]
4067    call .transpose_2x8x8_round
4068    lea                  r3, [strideq*2]
4069    lea                  r4, [strideq*3]
4070    pxor                m18, m18
4071    IDTX_16x32_STORE      0,  8
4072    IDTX_16x32_STORE      1,  9
4073    IDTX_16x32_STORE      2, 10
4074    IDTX_16x32_STORE      3, 11
4075    IDTX_16x32_STORE      4, 12
4076    IDTX_16x32_STORE      5, 13
4077    IDTX_16x32_STORE      6, 14
4078    IDTX_16x32_STORE      7, 15
4079    RET
4080ALIGN function_align
4081.transpose_2x8x8_round:
4082    punpckhwd           m17, m4, m5
4083    punpcklwd            m4, m5
4084    punpckhwd            m5, m0, m1
4085    punpcklwd            m0, m1
4086    punpckhwd            m1, m6, m7
4087    punpcklwd            m6, m7
4088    punpckhwd            m7, m2, m3
4089    punpcklwd            m2, m3
4090    punpckhdq            m3, m0, m2
4091    punpckldq            m0, m2
4092    punpckldq            m2, m4, m6
4093    punpckhdq            m4, m6
4094    punpckhdq            m6, m5, m7
4095    punpckldq            m5, m7
4096    punpckldq            m7, m17, m1
4097    punpckhdq           m17, m1
4098    REPX  {pmulhrsw x, m16}, m0, m2, m3, m4, m5, m7, m6, m17
4099    punpckhqdq           m1, m0, m2
4100    punpcklqdq           m0, m2
4101    punpcklqdq           m2, m3, m4
4102    punpckhqdq           m3, m4
4103    punpcklqdq           m4, m5, m7
4104    punpckhqdq           m5, m7
4105    punpckhqdq           m7, m6, m17
4106    punpcklqdq           m6, m17
4107    punpckhwd           m17, m12, m13
4108    punpcklwd           m12, m13
4109    punpckhwd           m13, m8, m9
4110    punpcklwd            m8, m9
4111    punpckhwd            m9, m14, m15
4112    punpcklwd           m14, m15
4113    punpckhwd           m15, m10, m11
4114    punpcklwd           m10, m11
4115    punpckhdq           m11, m8, m10
4116    punpckldq            m8, m10
4117    punpckldq           m10, m12, m14
4118    punpckhdq           m12, m14
4119    punpckhdq           m14, m13, m15
4120    punpckldq           m13, m15
4121    punpckldq           m15, m17, m9
4122    punpckhdq           m17, m9
4123    REPX  {pmulhrsw x, m16}, m8, m10, m11, m12, m13, m15, m14, m17
4124    punpckhqdq           m9, m8, m10
4125    punpcklqdq           m8, m10
4126    punpcklqdq          m10, m11, m12
4127    punpckhqdq          m11, m12
4128    punpcklqdq          m12, m13, m15
4129    punpckhqdq          m13, m15
4130    punpckhqdq          m15, m14, m17
4131    punpcklqdq          m14, m17
4132    ret
4133
4134%macro IDTX_32x16 4 ; dst[1-4]
4135    pmulhrsw            m%2, m12, [cq+32*(%1+ 0)]
4136    pmulhrsw            m18, m12, [cq+32*(%1+16)]
4137    pmulhrsw            m%4, m12, [cq+32*(%3+ 0)]
4138    pmulhrsw            m19, m12, [cq+32*(%3+16)]
4139    REPX      {paddsw x, x}, m%2, m18, m%4, m19
4140    mova                m%1, m14
4141    vpermi2q            m%1, m%2, m18
4142    vpermt2q            m%2, m16, m18
4143%if %3 != 14
4144    mova                m%3, m14
4145%endif
4146    vpermi2q            m%3, m%4, m19
4147    vpermt2q            m%4, m16, m19
4148    pmulhrsw            m18, m17, m%1
4149    pmulhrsw            m19, m17, m%2
4150    pmulhrsw            m20, m17, m%3
4151    pmulhrsw            m21, m17, m%4
4152    REPX      {paddsw x, x}, m%1, m%2, m%3, m%4
4153    paddsw              m%1, m18
4154    paddsw              m%2, m19
4155    paddsw              m%3, m20
4156    paddsw              m%4, m21
4157%endmacro
4158
4159%macro IDTX_32x16_STORE 2-3 0 ; src[1-2], 32x32
4160    mova               ym19, [dstq+strideq*0]
4161    vinserti32x8        m19, [dstq+strideq*8], 1
4162%if %3 == 0
4163    mova   [cq+64*(%1*2+0)], m20
4164    mova   [cq+64*(%1*2+1)], m20
4165%endif
4166    punpcklbw           m18, m19, m20
4167    punpckhbw           m19, m20
4168    paddw               m18, m%1
4169    paddw               m19, m%2
4170    packuswb            m18, m19
4171    mova          [dstq+strideq*0], ym18
4172    vextracti32x8 [dstq+strideq*8], m18, 1
4173%if %3 || %1 != 7
4174    add                dstq, strideq
4175%endif
4176%endmacro
4177
4178cglobal inv_txfm_add_identity_identity_32x16_8bpc, 3, 3, 22, dst, stride, c
4179    vpbroadcastd        m12, [pw_2896x8]
4180    movu                m14, [permB+7]
4181    vpbroadcastd        m17, [pw_1697x16]
4182    psrlq               m16, m14, 4
4183    IDTX_32x16            0,  1,  2,  3
4184    IDTX_32x16            4,  5,  6,  7
4185    IDTX_32x16            8,  9, 10, 11
4186    IDTX_32x16           12, 13, 14, 15
4187    vpbroadcastd        m16, [pw_2048]
4188    call m(inv_txfm_add_identity_identity_16x32_8bpc).transpose_2x8x8_round
4189    pxor                m20, m20
4190    IDTX_32x16_STORE      0,  8
4191    IDTX_32x16_STORE      1,  9
4192    IDTX_32x16_STORE      2, 10
4193    IDTX_32x16_STORE      3, 11
4194    IDTX_32x16_STORE      4, 12
4195    IDTX_32x16_STORE      5, 13
4196    IDTX_32x16_STORE      6, 14
4197    IDTX_32x16_STORE      7, 15
4198    RET
4199
4200%macro IDCT_32x32_END 4 ; src, mem, stride[1-2]
4201    pmovzxbw            m10, [dstq+%3]
4202    pmovzxbw            m11, [r3  +%4]
4203%if %2 < 8
4204    paddsw               m8, m%2, m%1
4205    psubsw               m9, m%2, m%1
4206%else
4207    mova                 m9, [cq+64*(%2*2-16)]
4208    paddsw               m8, m9, m%1
4209    psubsw               m9, m%1
4210%endif
4211    pmulhrsw             m8, m12
4212    pmulhrsw             m9, m12
4213%if %2 >= 8
4214%if %2 == 8
4215    pxor                 m0, m0
4216%endif
4217    mova  [cq+64*(%2*2-16)], m0
4218    mova  [cq+64*(%2*2-15)], m0
4219%endif
4220    paddw                m8, m10
4221    paddw                m9, m11
4222    packuswb             m8, m9
4223    vpermq               m8, m13, m8
4224    mova          [dstq+%3], ym8
4225    vextracti32x8 [r3  +%4], m8, 1
4226%if %2 == 3 || %2 == 7 || %2 == 11
4227    add                dstq, r5
4228    sub                  r3, r5
4229%endif
4230%endmacro
4231
4232cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 6, 0, dst, stride, c, eob
4233%undef cmp
4234    lea                  r5, [o_base]
4235    test               eobd, eobd
4236    jz .dconly
4237    WIN64_SPILL_XMM      30
4238    cmp                eobd, 136
4239    jb .fast
4240    mova                 m5, [cq+64*20]
4241    mova                 m3, [cq+64*12]
4242    mova                 m1, [cq+64* 4]
4243    mova                 m7, [cq+64*28]
4244    mova                 m2, [cq+64* 8]
4245    mova                 m6, [cq+64*24]
4246    mova                 m0, [cq+64* 0]
4247    mova                 m4, [cq+64*16]
4248    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
4249    mova                m14, [cq+64* 2]
4250    mova                m21, [cq+64*30]
4251    mova                m18, [cq+64*18]
4252    mova                m17, [cq+64*14]
4253    mova                m16, [cq+64*10]
4254    mova                m19, [cq+64*22]
4255    mova                m20, [cq+64*26]
4256    mova                m15, [cq+64* 6]
4257    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
4258    mova         [cq+64* 0], m14
4259    mova         [cq+64* 2], m15
4260    mova         [cq+64* 4], m16
4261    mova         [cq+64* 6], m17
4262    mova         [cq+64* 8], m18
4263    mova         [cq+64*10], m19
4264    mova         [cq+64*12], m20
4265    mova         [cq+64*14], m21
4266    mova                m22, [cq+64* 1]
4267    mova                m21, [cq+64*31]
4268    mova                m14, [cq+64*17]
4269    mova                m29, [cq+64*15]
4270    mova                m26, [cq+64* 9]
4271    mova                m17, [cq+64*23]
4272    mova                m18, [cq+64*25]
4273    mova                m25, [cq+64* 7]
4274    mova                m24, [cq+64* 5]
4275    mova                m19, [cq+64*27]
4276    mova                m16, [cq+64*21]
4277    mova                m27, [cq+64*11]
4278    mova                m28, [cq+64*13]
4279    mova                m15, [cq+64*19]
4280    mova                m20, [cq+64*29]
4281    mova                m23, [cq+64* 3]
4282    call .main_oddhalf
4283    vpbroadcastd        m10, [o(pw_8192)]
4284    psubsw              m13, m0, m29 ; 31
4285    paddsw               m0, m29     ;  0
4286    psubsw              m29, m1, m28 ; 30
4287    paddsw               m1, m28     ;  1
4288    psubsw              m28, m2, m27 ; 29
4289    paddsw               m2, m27     ;  2
4290    psubsw              m27, m3, m26 ; 28
4291    paddsw               m3, m26     ;  3
4292    psubsw              m26, m4, m25 ; 27
4293    paddsw               m4, m25     ;  4
4294    psubsw              m25, m5, m24 ; 26
4295    paddsw               m5, m24     ;  5
4296    psubsw              m24, m6, m23 ; 25
4297    paddsw               m6, m23     ;  6
4298    psubsw              m23, m7, m22 ; 24
4299    paddsw               m7, m22     ;  7
4300    pxor                 m9, m9
4301    punpckhwd            m8, m0, m1 ; a4 b4 a5 b5 a6 b6 a7 b7
4302    punpcklwd            m0, m1     ; a0 b0 a1 b1 a2 b2 a3 b3
4303    punpckhwd            m1, m2, m3 ; c4 d4 c5 d5 c6 d6 c7 d7
4304    punpcklwd            m2, m3     ; c0 d0 c1 d1 c2 d2 c3 d3
4305    REPX {mova [cq+64*x], m9}, 16, 17, 18, 19
4306    punpckhwd           m22, m4, m5 ; e4 f4 e5 f5 e6 f6 e7 f7
4307    punpcklwd            m4, m5     ; e0 f0 e1 f1 e2 f2 e3 f3
4308    punpckhwd            m5, m6, m7 ; g4 h4 g5 h5 g6 h6 g7 h7
4309    punpcklwd            m6, m7     ; g0 h0 g1 h1 g2 h2 g3 h3
4310    REPX {mova [cq+64*x], m9}, 20, 21, 22, 23
4311    punpckhwd            m3, m23, m24
4312    punpcklwd           m23, m24
4313    punpckhwd           m24, m25, m26
4314    punpcklwd           m25, m26
4315    REPX {mova [cq+64*x], m9}, 24, 25, 26, 27
4316    punpckhwd           m26, m27, m28
4317    punpcklwd           m27, m28
4318    punpckhwd           m28, m29, m13
4319    punpcklwd           m29, m13
4320    REPX {mova [cq+64*x], m9}, 28, 29, 30, 31
4321    punpckhdq            m7, m0, m2  ; a2 b2 c2 d2 a3 b3 c3 d3
4322    punpckldq            m0, m2      ; a0 b0 c0 d0 a1 b1 c1 d1
4323    punpckhdq            m2, m4, m6  ; e2 f2 g2 h2 e3 f3 g3 h3
4324    punpckldq            m4, m6      ; e0 f0 g0 h0 e1 f1 g1 h1
4325    punpckhdq            m6, m8, m1  ; a6 b6 c6 d6 a7 b7 c7 d7
4326    punpckldq            m8, m1      ; a4 b4 c4 d4 a5 b5 c5 d5
4327    punpckhdq            m1, m22, m5 ; e6 f6 g6 h6 e7 f7 g7 h7
4328    punpckldq           m22, m5      ; e4 f4 g4 h5 e5 f5 g5 h5
4329    REPX  {pmulhrsw x, m10}, m0, m4, m8, m22
4330    punpckhdq           m13, m23, m25
4331    punpckldq           m23, m25
4332    punpckhdq           m25, m27, m29
4333    punpckldq           m27, m29
4334    REPX  {pmulhrsw x, m10}, m13, m23, m25, m27
4335    punpckhdq            m9, m3, m24
4336    punpckldq            m3, m24
4337    punpckhdq           m24, m26, m28
4338    punpckldq           m26, m28
4339    punpcklqdq           m5, m23, m27 ; d00 d08 d16 d24
4340    punpckhqdq          m23, m27      ; d01 d09 d17 d25
4341    punpckhqdq          m27, m13, m25 ; d03 d11 d19 d27
4342    punpcklqdq          m13, m25      ; d02 d10 d18 d26
4343    punpckhqdq          m25, m3, m26  ; d05 d13 d21 d29
4344    punpcklqdq           m3, m26      ; d04 d12 d20 d28
4345    punpckhqdq          m26, m9, m24  ; d07 d15 d23 d31
4346    punpcklqdq           m9, m24      ; d06 d14 d22 d30
4347    REPX  {pmulhrsw x, m10}, m25, m3, m26
4348    mova         [cq+64* 9], m23
4349    mova         [cq+64*11], m27
4350    mova         [cq+64*13], m25
4351    mova         [cq+64*15], m26
4352    punpckhqdq          m24, m8, m22  ; a05 a13 a21 a29
4353    punpcklqdq           m8, m22      ; a04 a12 a20 a28
4354    punpckhqdq          m22, m0, m4   ; a01 a09 a17 a25
4355    punpcklqdq           m0, m4       ; a00 a08 a16 a24
4356    punpckhqdq          m23, m7, m2   ; a03 a11 a19 a27
4357    punpcklqdq           m7, m2       ; a02 a10 a18 a26
4358    punpckhqdq          m25, m6, m1   ; a07 a15 a23 a31
4359    punpcklqdq           m6, m1       ; a06 a14 a22 a30
4360    mova                 m2, [cq+64* 0]
4361    mova                m11, [cq+64* 2]
4362    mova                m12, [cq+64* 4]
4363    mova                m29, [cq+64* 6]
4364    mova                m27, [cq+64* 8]
4365    mova                m26, [cq+64*10]
4366    mova                 m4, [cq+64*12]
4367    mova                m28, [cq+64*14]
4368    psubsw               m1, m2, m21  ; 23
4369    paddsw               m2, m21      ;  8
4370    psubsw              m21, m11, m20 ; 22
4371    paddsw              m11, m20      ;  9
4372    psubsw              m20, m12, m19 ; 21
4373    paddsw              m12, m19      ; 10
4374    psubsw              m19, m29, m18 ; 20
4375    paddsw              m29, m18      ; 11
4376    psubsw              m18, m27, m17 ; 19
4377    paddsw              m27, m17      ; 12
4378    psubsw              m17, m26, m16 ; 18
4379    paddsw              m26, m16      ; 13
4380    paddsw              m16, m4, m15  ; 14
4381    psubsw               m4, m15      ; 17
4382    pmulhrsw            m15, m6, m10
4383    psubsw               m6, m28, m14 ; 16
4384    paddsw              m28, m14      ; 15
4385    pmulhrsw            m14, m7, m10
4386    punpcklwd            m7, m6, m4
4387    punpckhwd            m6, m4
4388    punpckhwd            m4, m17, m18
4389    punpcklwd           m17, m18
4390    punpckhwd           m18, m19, m20
4391    punpcklwd           m19, m20
4392    punpckhwd           m20, m21, m1
4393    punpcklwd           m21, m1
4394    punpckhwd            m1, m2, m11  ; i4 j4 i5 j5 i6 j6 i7 j7
4395    punpcklwd            m2, m11      ; i0 j1 i1 j1 i2 j2 i3 j3
4396    punpckhwd           m11, m12, m29 ; k4 l4 k5 l5 k6 l6 k7 l7
4397    punpcklwd           m12, m29      ; k0 l0 k1 l1 k2 l2 k3 l3
4398    punpckhwd           m29, m27, m26 ; m4 n4 m5 n5 m6 n6 m7 n7
4399    punpcklwd           m27, m26      ; m0 n0 m1 n1 m2 n2 m3 n3
4400    punpckhwd           m26, m16, m28 ; o4 p4 o5 p5 o6 p6 o7 p7
4401    punpcklwd           m16, m28      ; o0 p0 o1 p1 o2 p2 o3 p3
4402    pmulhrsw            m23, m10
4403    pmulhrsw            m25, m10
4404    punpckhdq           m28, m2, m12  ; i2 j2 k2 l2 i3 j3 k3 l3
4405    punpckldq            m2, m12      ; i0 j0 k0 l0 i1 j1 k1 l1
4406    punpckhdq           m12, m27, m16 ; m2 n2 o2 p2 m3 n3 o3 p3
4407    punpckldq           m27, m16      ; m0 n0 o0 p0 m1 n1 o1 p1
4408    REPX  {pmulhrsw x, m10}, m28, m2, m12, m27
4409    punpckhdq           m16, m1, m11  ; i6 j6 k6 l6 i7 j7 k7 l7
4410    punpckldq            m1, m11      ; i4 j4 k4 l4 i5 j5 k5 l5
4411    punpckhdq           m11, m29, m26 ; m6 n6 o6 p6 m7 n7 o7 p7
4412    punpckldq           m29, m26      ; m4 n4 o4 p4 m5 n5 o5 p5
4413    REPX  {pmulhrsw x, m10}, m16, m1, m11, m29
4414    punpckhdq           m26, m19, m21
4415    punpckldq           m19, m21
4416    punpckhdq           m21, m6, m4
4417    punpckldq            m6, m4
4418    REPX  {pmulhrsw x, m10}, m26, m19, m21, m6
4419    punpckhdq            m4, m18, m20
4420    punpckldq           m18, m20
4421    punpckhdq           m20, m7, m17
4422    punpckldq            m7, m17
4423    REPX  {pmulhrsw x, m10}, m4, m18, m20, m7
4424    punpcklqdq          m17, m28, m12 ; b02 b10 b18 b26
4425    punpckhqdq          m28, m12      ; b03 b11 b19 b27
4426    punpckhqdq          m12, m2, m27  ; b01 b09 b17 b25
4427    punpcklqdq           m2, m27      ; b00 b08 b16 b24
4428    punpckhqdq          m27, m1, m29  ; b05 b13 b21 b29
4429    punpcklqdq           m1, m29      ; b04 b12 b20 b28
4430    punpckhqdq          m29, m16, m11 ; b07 b15 b23 b31
4431    punpcklqdq          m16, m11      ; b06 b14 b22 b30
4432    mova         [cq+64* 1], m12
4433    mova         [cq+64* 3], m28
4434    mova         [cq+64* 5], m27
4435    mova         [cq+64* 7], m29
4436    punpckhqdq          m27, m20, m26 ; c03 c11 c19 c27
4437    punpcklqdq          m20, m26      ; c02 c10 c18 c26
4438    punpckhqdq          m26, m7, m19  ; c01 c09 c17 c25
4439    punpcklqdq           m7, m19      ; c00 c08 c16 c24
4440    punpckhqdq          m28, m6, m18  ; c05 c13 c21 c29
4441    punpcklqdq           m6, m18      ; c04 c12 c20 c28
4442    punpckhqdq          m29, m21, m4  ; c07 c15 c23 c31
4443    punpcklqdq          m21, m4       ; c06 c14 c22 c30
4444    pmulhrsw            m19, m9, m10
4445    vshufi32x4           m4, m0, m2, q3232   ; a16 a24 b16 b24
4446    vinserti32x8         m0, ym2, 1          ; a00 a08 b00 b08
4447    vshufi32x4           m2, m7, m5, q3232   ; c16 c24 d16 d24
4448    vinserti32x8         m7, ym5, 1          ; c00 c08 d00 d08
4449    vshufi32x4           m5, m8, m1, q3232   ; a20 a28 b20 b28
4450    vinserti32x8         m1, m8, ym1, 1      ; a04 a12 b04 b12
4451    vshufi32x4           m8, m6, m3, q3232   ; c20 c28 d20 d28
4452    vinserti32x8         m6, ym3, 1          ; c04 c12 d04 d12
4453    vshufi32x4           m3, m1, m6, q3131   ; 12
4454    vshufi32x4           m1, m6, q2020       ;  4
4455    vshufi32x4           m6, m4, m2, q3131   ; 24
4456    vshufi32x4           m4, m2, q2020       ; 16
4457    vshufi32x4           m2, m0, m7, q3131   ;  8
4458    vshufi32x4           m0, m7, q2020       ;  0
4459    vshufi32x4           m7, m5, m8, q3131   ; 28
4460    vshufi32x4           m5, m8, q2020       ; 20
4461    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
4462    vshufi32x4          m18, m14, m17, q3232 ; a18 a26 b18 b26
4463    vinserti32x8        m14, ym17, 1         ; a02 a10 b02 b10
4464    vshufi32x4          m17, m20, m13, q3232 ; c18 c26 d18 d26
4465    vinserti32x8        m20, ym13, 1         ; c02 c10 d02 d10
4466    vshufi32x4          m13, m21, m19, q3232 ; c22 c30 d22 d30
4467    vinserti32x8        m21, ym19, 1         ; c06 c14 d06 d14
4468    vshufi32x4          m19, m15, m16, q3232 ; a22 a30 b22 b30
4469    vinserti32x8        m15, ym16, 1         ; a06 a14 b06 b14
4470    vshufi32x4          m16, m14, m20, q3131 ; 10
4471    vshufi32x4          m14, m20, q2020      ;  2
4472    vshufi32x4          m20, m18, m17, q3131 ; 26
4473    vshufi32x4          m18, m17, q2020      ; 18
4474    vshufi32x4          m17, m15, m21, q3131 ; 14
4475    vshufi32x4          m15, m21, q2020      ;  6
4476    vshufi32x4          m21, m19, m13, q3131 ; 30
4477    vshufi32x4          m19, m13, q2020      ; 22
4478    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
4479    mova         [cq+64* 0], m14
4480    mova         [cq+64* 2], m15
4481    mova         [cq+64* 4], m16
4482    mova         [cq+64* 6], m17
4483    mova         [cq+64* 8], m18
4484    mova         [cq+64*10], m19
4485    mova         [cq+64*12], m20
4486    mova         [cq+64*14], m21
4487    mova                m15, [cq+64* 1]
4488    mova                m16, [cq+64* 3]
4489    mova                m17, [cq+64* 5]
4490    mova                m19, [cq+64* 7]
4491    mova                m20, [cq+64* 9]
4492    mova                m21, [cq+64*11]
4493    mova                m13, [cq+64*13]
4494    mova                m18, [cq+64*15]
4495    vshufi32x4          m14, m22, m15, q3232 ; a17 a25 b17 b25
4496    vinserti32x8        m22, ym15, 1         ; a01 a09 b01 b09
4497    vshufi32x4          m15, m23, m16, q3232 ; a19 a27 b19 b27
4498    vinserti32x8        m23, ym16, 1         ; a03 a11 b03 b11
4499    vshufi32x4          m16, m24, m17, q3232 ; a21 a29 b21 b29
4500    vinserti32x8        m24, ym17, 1         ; a05 a13 b05 b13
4501    vshufi32x4          m17, m25, m19, q3232 ; a23 a31 b23 b31
4502    vinserti32x8        m25, ym19, 1         ; a07 a15 b07 b15
4503    vinserti32x8         m8, m26, ym20, 1    ; c01 c09 d01 d09
4504    vshufi32x4          m26, m20, q3232      ; c17 c25 d17 d25
4505    vinserti32x8         m9, m27, ym21, 1    ; c03 c11 d03 d11
4506    vshufi32x4          m27, m21, q3232      ; c19 c27 d19 d27
4507    vinserti32x8        m11, m28, ym13, 1    ; c05 c13 d05 d13
4508    vshufi32x4          m28, m13, q3232      ; c21 c29 d21 d29
4509    vinserti32x8        m12, m29, ym18, 1    ; c07 c15 d07 d15
4510    vshufi32x4          m29, m18, q3232      ; c23 c31 d23 d31
4511    vshufi32x4          m18, m14, m26, q3131 ; 25
4512    vshufi32x4          m14, m26, q2020      ; 17
4513    vshufi32x4          m19, m15, m27, q3131 ; 27
4514    vshufi32x4          m15, m27, q2020      ; 19
4515    vshufi32x4          m20, m16, m28, q3131 ; 29
4516    vshufi32x4          m16, m28, q2020      ; 21
4517    vshufi32x4          m21, m17, m29, q3131 ; 31
4518    vshufi32x4          m17, m29, q2020      ; 23
4519    vshufi32x4          m26, m22, m8, q3131  ;  9
4520    vshufi32x4          m22, m8, q2020       ;  1
4521    vshufi32x4          m27, m23, m9, q3131  ; 11
4522    vshufi32x4          m23, m9, q2020       ;  3
4523    vshufi32x4          m28, m24, m11, q3131 ; 13
4524    vshufi32x4          m24, m11, q2020      ;  5
4525    vshufi32x4          m29, m25, m12, q3131 ; 15
4526    vshufi32x4          m25, m12, q2020      ;  7
4527    call .main_oddhalf
4528    jmp .end
4529.fast: ; bottom/right halves are zero
4530    mova                m14, [o(dup16_perm)]
4531    pmovzxwd             m9,       [cq+64* 0]
4532    pmovzxwd             m6,       [cq+64* 8]
4533    vpermb               m8, m14,  [cq+64* 2]
4534    vpermb              ym0, ym14, [cq+64*14]
4535    vpermb              ym5, ym14, [cq+64*10]
4536    vpermb               m1, m14,  [cq+64* 6]
4537    vpermb               m7, m14,  [cq+64* 4]
4538    vpermb              ym3, ym14, [cq+64*12]
4539    pslld                m9, 16
4540    pslld                m6, 16
4541    call m(idct_16x16_internal_8bpc).main_fast
4542    vpermb              m21, m14,  [cq+64* 1]
4543    vpermb             ym17, ym14, [cq+64*15]
4544    vpermb             ym20, ym14, [cq+64* 9]
4545    vpermb              m15, m14,  [cq+64* 7]
4546    vpermb              m18, m14,  [cq+64* 5]
4547    vpermb             ym16, ym14, [cq+64*11]
4548    vpermb             ym19, ym14, [cq+64*13]
4549    vpermb              m14, m14,  [cq+64* 3]
4550    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
4551    vpbroadcastd         m9, [o(pw_8192)]
4552    call m(inv_txfm_add_dct_dct_32x16_8bpc).transpose_round
4553    vshufi32x4          m22, m14, m2, q2020 ;  1
4554    vshufi32x4          m24, m14, m2, q3131 ;  5
4555    vshufi32x4          m23, m17, m9, q2020 ;  3
4556    vshufi32x4          m25, m17, m9, q3131 ;  7
4557    vshufi32x4          m16, m5, m15, q2020 ; 10
4558    vshufi32x4          m17, m5, m15, q3131 ; 14
4559    vshufi32x4          m14, m1, m18, q2020 ;  2
4560    vshufi32x4          m15, m1, m18, q3131 ;  6
4561    vshufi32x4           m1, m0, m3, q3131  ;  4
4562    vshufi32x4           m0, m3, q2020      ;  0
4563    vshufi32x4           m3, m21, m4, q3131 ; 12
4564    vshufi32x4           m2, m21, m4, q2020 ;  8
4565    vshufi32x4          m26, m20, m6, q2020 ;  9
4566    vshufi32x4          m28, m20, m6, q3131 ; 13
4567    vshufi32x4          m27, m19, m7, q2020 ; 11
4568    vshufi32x4          m29, m19, m7, q3131 ; 15
4569    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
4570    mova         [cq+64* 0], m14
4571    mova         [cq+64* 2], m15
4572    mova         [cq+64* 4], m16
4573    mova         [cq+64* 6], m17
4574    mova         [cq+64* 8], m18
4575    mova         [cq+64*10], m19
4576    mova         [cq+64*12], m20
4577    mova         [cq+64*14], m21
4578    call .main_oddhalf_fast
4579.end:
4580    lea                  r4, [strideq*3]
4581    vpbroadcastd        m12, [o(pw_2048)]
4582    movshdup            m13, [o(permD)]
4583    lea                  r3, [dstq+r4*8]
4584    lea                  r5, [strideq+r4] ; stride*4
4585    add                  r3, r5           ; dst+stride*28
4586    IDCT_32x32_END       29,  0, strideq*0, r4
4587    IDCT_32x32_END       28,  1, strideq*1, strideq*2
4588    IDCT_32x32_END       27,  2, strideq*2, strideq*1
4589    IDCT_32x32_END       26,  3, r4       , strideq*0
4590    IDCT_32x32_END       25,  4, strideq*0, r4
4591    IDCT_32x32_END       24,  5, strideq*1, strideq*2
4592    IDCT_32x32_END       23,  6, strideq*2, strideq*1
4593    IDCT_32x32_END       22,  7, r4       , strideq*0
4594    IDCT_32x32_END       21,  8, strideq*0, r4
4595    IDCT_32x32_END       20,  9, strideq*1, strideq*2
4596    IDCT_32x32_END       19, 10, strideq*2, strideq*1
4597    IDCT_32x32_END       18, 11, r4       , strideq*0
4598    IDCT_32x32_END       17, 12, strideq*0, r4
4599    IDCT_32x32_END       16, 13, strideq*1, strideq*2
4600    IDCT_32x32_END       15, 14, strideq*2, strideq*1
4601    IDCT_32x32_END       14, 15, r4       , strideq*0
4602    RET
4603.dconly:
4604    movsx               r6d, word [cq]
4605    mov                [cq], eobd
4606    mov                 r3d, 32
4607    jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly2
4608ALIGN function_align
4609.main_oddhalf_fast2: ; bottom three-quarters are zero
4610    vpbroadcastd        m21, [o(pw_4091x8)]
4611    vpbroadcastd         m8, [o(pw_201x8)]
4612    vpbroadcastd        m18, [o(pw_m1380x8)]
4613    vpbroadcastd         m9, [o(pw_3857x8)]
4614    vpbroadcastd        m19, [o(pw_3973x8)]
4615    vpbroadcastd        m11, [o(pw_995x8)]
4616    vpbroadcastd        m28, [o(pw_m601x8)]
4617    vpbroadcastd        m12, [o(pw_4052x8)]
4618    pmulhrsw            m21, m22 ; t31a
4619    pmulhrsw            m22, m8  ; t16a
4620    pmulhrsw            m18, m25 ; t19a
4621    pmulhrsw            m25, m9 ; t28a
4622    pmulhrsw            m19, m24 ; t27a
4623    pmulhrsw            m24, m11 ; t20a
4624    pmulhrsw            m28, m23 ; t23a
4625    pmulhrsw            m23, m12 ; t24a
4626    mova                m15, m21
4627    mova                 m8, m22
4628    mova                m14, m18
4629    mova                m27, m25
4630    mova                m29, m19
4631    mova                m26, m24
4632    mova                m16, m28
4633    mova                m20, m23
4634    jmp .main3
4635ALIGN function_align
4636.main_oddhalf_fast: ; bottom half is zero
4637    vpbroadcastd        m21, [o(pw_4091x8)]
4638    vpbroadcastd         m8, [o(pw_201x8)]
4639    vpbroadcastd        m14, [o(pw_m2751x8)]
4640    vpbroadcastd         m9, [o(pw_3035x8)]
4641    vpbroadcastd        m17, [o(pw_3703x8)]
4642    vpbroadcastd        m11, [o(pw_1751x8)]
4643    vpbroadcastd        m18, [o(pw_m1380x8)]
4644    vpbroadcastd        m12, [o(pw_3857x8)]
4645    pmulhrsw            m21, m22 ; t31a
4646    vpbroadcastd        m19, [o(pw_3973x8)]
4647    pmulhrsw            m22, m8  ; t16a
4648    vpbroadcastd         m8, [o(pw_995x8)]
4649    pmulhrsw            m14, m29 ; t30a
4650    vpbroadcastd        m16, [o(pw_m2106x8)]
4651    pmulhrsw            m29, m9  ; t17a
4652    vpbroadcastd         m9, [o(pw_3513x8)]
4653    pmulhrsw            m17, m26 ; t29a
4654    vpbroadcastd        m15, [o(pw_3290x8)]
4655    pmulhrsw            m26, m11 ; t18a
4656    vpbroadcastd        m11, [o(pw_2440x8)]
4657    pmulhrsw            m18, m25 ; t19a
4658    vpbroadcastd        m20, [o(pw_m601x8)]
4659    pmulhrsw            m25, m12 ; t28a
4660    vpbroadcastd        m12, [o(pw_4052x8)]
4661    pmulhrsw            m19, m24 ; t27a
4662    pmulhrsw            m24, m8  ; t20a
4663    pmulhrsw            m16, m27 ; t21a
4664    pmulhrsw            m27, m9  ; t26a
4665    pmulhrsw            m15, m28 ; t25a
4666    pmulhrsw            m28, m11 ; t22a
4667    pmulhrsw            m20, m23 ; t23a
4668    pmulhrsw            m23, m12 ; t24a
4669    jmp .main2
4670ALIGN function_align
4671.main_oddhalf:
4672    ITX_MULSUB_2W        22, 21,  8,  9, 10,  201, 4091 ; t16a, t31a
4673    ITX_MULSUB_2W        14, 29,  8,  9, 10, 3035, 2751 ; t17a, t30a
4674    ITX_MULSUB_2W        26, 17,  8,  9, 10, 1751, 3703 ; t18a, t29a
4675    ITX_MULSUB_2W        18, 25,  8,  9, 10, 3857, 1380 ; t19a, t28a
4676    ITX_MULSUB_2W        24, 19,  8,  9, 10,  995, 3973 ; t20a, t27a
4677    ITX_MULSUB_2W        16, 27,  8,  9, 10, 3513, 2106 ; t21a, t26a
4678    ITX_MULSUB_2W        28, 15,  8,  9, 10, 2440, 3290 ; t22a, t25a
4679    ITX_MULSUB_2W        20, 23,  8,  9, 10, 4052,  601 ; t23a, t24a
4680.main2:
4681    psubsw               m8, m22, m14 ; t17
4682    paddsw              m22, m14      ; t16
4683    paddsw              m14, m18, m26 ; t19
4684    psubsw              m18, m26      ; t18
4685    psubsw              m26, m24, m16 ; t21
4686    paddsw              m24, m16      ; t20
4687    psubsw              m16, m20, m28 ; t22
4688    paddsw              m28, m20      ; t23
4689    psubsw              m20, m23, m15 ; t25
4690    paddsw              m23, m15      ; t24
4691    psubsw              m15, m21, m29 ; t30
4692    paddsw              m21, m29      ; t31
4693    psubsw              m29, m19, m27 ; t26
4694    paddsw              m19, m27      ; t27
4695    paddsw              m27, m25, m17 ; t28
4696    psubsw              m25, m17      ; t29
4697.main3:
4698    ITX_MULSUB_2W        15,  8,  9, 17, 10,   799, 4017 ; t17a, t30a
4699    ITX_MULSUB_2W        25, 18,  9, 17, 10, m4017,  799 ; t18a, t29a
4700    ITX_MULSUB_2W        29, 26,  9, 17, 10,  3406, 2276 ; t21a, t26a
4701    ITX_MULSUB_2W        20, 16,  9, 17, 10, m2276, 3406 ; t22a, t25a
4702    vpbroadcastd        m12, [o(pw_m3784_1567)]
4703    vpbroadcastd        m11, [o(pw_1567_3784)]
4704    psubsw              m17, m21, m27 ; t28a
4705    paddsw              m21, m27      ; t31a
4706    psubsw              m27, m15, m25 ; t18
4707    paddsw              m15, m25      ; t17
4708    psubsw              m25, m20, m29 ; t21
4709    paddsw              m20, m29      ; t22
4710    psubsw              m29, m8, m18  ; t29
4711    paddsw               m8, m18      ; t30
4712    psubsw              m18, m22, m14 ; t19a
4713    paddsw              m22, m14      ; t16a
4714    psubsw              m14, m28, m24 ; t20a
4715    paddsw              m24, m28      ; t23a
4716    paddsw              m28, m16, m26 ; t25
4717    psubsw              m16, m26      ; t26
4718    psubsw              m26, m23, m19 ; t27a
4719    paddsw              m23, m19      ; t24a
4720    ITX_MULSUB_2W        29, 27,  9, 19, 10, 11, 12 ; t18a, t29a
4721    ITX_MULSUB_2W        17, 18,  9, 19, 10, 11, 12 ; t19,  t28
4722    vpbroadcastd        m11, [o(pw_m1567_m3784)]
4723    ITX_MULSUB_2W        16, 25,  9, 19, 10, 12, 11 ; t21a, t26a
4724    ITX_MULSUB_2W        26, 14,  9, 19, 10, 12, 11 ; t20,  t27
4725    vpbroadcastd        m12, [o(pw_m2896_2896)]
4726    vpbroadcastd        m11, [o(pw_2896_2896)]
4727    psubsw              m19, m27, m25 ; t26
4728    paddsw              m27, m25      ; t29
4729    psubsw              m25, m17, m26 ; t20a
4730    paddsw              m17, m26      ; t19a
4731    paddsw              m26, m18, m14 ; t28a
4732    psubsw              m18, m14      ; t27a
4733    paddsw              m14, m22, m24 ; t16
4734    psubsw              m22, m24      ; t23
4735    psubsw              m24, m29, m16 ; t21
4736    paddsw              m16, m29      ; t18
4737    paddsw              m29, m21, m23 ; t31
4738    psubsw              m21, m23      ; t24
4739    psubsw              m23, m15, m20 ; t22a
4740    paddsw              m15, m20      ; t17a
4741    psubsw              m20, m8, m28  ; t25a
4742    paddsw              m28, m8       ; t30a
4743    ITX_MULSUB_2W        18, 25,  8,  9, 10, 11, 12 ; t20,  t27
4744    ITX_MULSUB_2W        19, 24,  8,  9, 10, 11, 12 ; t21a, t26a
4745    ITX_MULSUB_2W        21, 22,  8,  9, 10, 11, 12 ; t23a, t24a
4746    ITX_MULSUB_2W        20, 23,  8,  9, 10, 11, 12 ; t22,  t25
4747    ret
4748
4749%macro IDTX_32x32 2 ; dst[1-2]
4750    vmovdqa32           ym%1, [cq+64*(%1+ 0)] ; force EVEX encoding, which
4751    vmovdqa32           ym17, [cq+64*(%1+16)] ; reduces code size due to
4752    vmovdqa32           ym%2, [cq+64*(%2+ 0)] ; compressed displacements
4753    vmovdqa32           ym18, [cq+64*(%2+16)]
4754    vpermt2q             m%1, m21, m17
4755    vpermt2q             m%2, m21, m18
4756%endmacro
4757
4758cglobal inv_txfm_add_identity_identity_32x32_8bpc, 3, 3, 22, dst, stride, c
4759    movu                 m21, [permB+7]
4760    vpbroadcastd         m16, [pw_8192]
4761    pxor                 m20, m20
4762.loop:
4763    IDTX_32x32            0,  1
4764    IDTX_32x32            2,  3
4765    IDTX_32x32            4,  5
4766    IDTX_32x32            6,  7
4767    IDTX_32x32            8,  9
4768    IDTX_32x32           10, 11
4769    IDTX_32x32           12, 13
4770    IDTX_32x32           14, 15
4771    call m(inv_txfm_add_identity_identity_16x32_8bpc).transpose_2x8x8_round
4772    IDTX_32x16_STORE      0,  8, 1
4773    IDTX_32x16_STORE      1,  9, 1
4774    IDTX_32x16_STORE      2, 10, 1
4775    IDTX_32x16_STORE      3, 11, 1
4776    IDTX_32x16_STORE      4, 12, 1
4777    IDTX_32x16_STORE      5, 13, 1
4778    IDTX_32x16_STORE      6, 14, 1
4779    IDTX_32x16_STORE      7, 15, 1
4780    lea                dstq, [dstq+strideq*8]
4781    btc                  cq, 5
4782    jnc .loop
4783    mov                 r0d, 8
4784.zero_loop:
4785    mova          [cq+64*0], m20
4786    mova          [cq+64*1], m20
4787    mova          [cq+64*2], m20
4788    mova          [cq+64*3], m20
4789    add                  cq, 64*4
4790    dec                 r0d
4791    jg .zero_loop
4792    RET
4793
4794cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 7, 0, dst, stride, c, eob
4795%undef cmp
4796    lea                  r5, [o_base]
4797    test               eobd, eobd
4798    jz .dconly
4799    WIN64_SPILL_XMM      30
4800    cmp                eobd, 151
4801    jb .fast
4802    mova                 m5, [cq+64*10]
4803    mova                 m3, [cq+64* 6]
4804    mova                 m1, [cq+64* 2]
4805    mova                 m7, [cq+64*14]
4806    mova                 m2, [cq+64* 4]
4807    mova                 m6, [cq+64*12]
4808    mova                 m0, [cq+64* 0]
4809    mova                 m4, [cq+64* 8]
4810    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
4811    mova                m14, [cq+64* 1]
4812    mova                m21, [cq+64*15]
4813    mova                m18, [cq+64* 9]
4814    mova                m17, [cq+64* 7]
4815    mova                m16, [cq+64* 5]
4816    mova                m19, [cq+64*11]
4817    mova                m20, [cq+64*13]
4818    mova                m15, [cq+64* 3]
4819    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
4820    vpbroadcastd         m9, [o(pw_8192)]
4821%macro TRANSPOSE_8x4_ROUND 4
4822    punpckhwd            m8, m%3, m%4 ; c4 d4 c5 d5 c6 d6 c7 d7
4823    punpcklwd           m%3, m%4      ; c0 d0 c1 d1 c2 d2 c3 d3
4824    punpckhwd           m%4, m%1, m%2 ; a4 b4 a5 b5 a6 b6 a7 b7
4825    punpcklwd           m%1, m%2      ; a0 b0 a1 b1 a2 b2 a3 b3
4826    punpckhdq           m%2, m%1, m%3 ; a2 b2 c2 d2 a3 b3 c3 d3
4827    punpckldq           m%1, m%3      ; a0 b0 c0 d0 a1 b1 c1 d1
4828    punpckldq           m%3, m%4, m8  ; a4 b4 c4 d4 a5 b5 c5 d5
4829    punpckhdq           m%4, m8       ; a6 b6 c6 d6 a7 b7 c7 d7
4830    REPX   {pmulhrsw x, m9}, m%2, m%1, m%3, m%4
4831%endmacro
4832    TRANSPOSE_8x4_ROUND   0,  1,  2,  3
4833    TRANSPOSE_8x4_ROUND   4,  5,  6,  7
4834    TRANSPOSE_8x4_ROUND  14, 15, 16, 17
4835    TRANSPOSE_8x4_ROUND  18, 19, 20, 21
4836    vinserti32x8        m26, m0, ym4, 1     ; a0  a4  b0  b4
4837    vshufi32x4           m0, m4, q3232      ; a8  a12 b8  b12
4838    vinserti32x8        m27, m1, ym5, 1     ; a1  a5  b1  b5
4839    vshufi32x4           m1, m5, q3232      ; a9  a13 b9  b13
4840    vinserti32x8        m28, m2, ym6, 1     ; a2  a6  b2  b6
4841    vshufi32x4           m2, m6, q3232      ; a10 a14 b10 b14
4842    vinserti32x8        m29, m3, ym7, 1     ; a3  a7  b3  b7
4843    vshufi32x4           m8, m3, m7, q3232  ; a11 a15 b11 b15
4844    vinserti32x8         m4, m14, ym18, 1   ; c0  c4  d0  d4
4845    vshufi32x4          m14, m18, q3232     ; c8  c12 d8  d12
4846    vinserti32x8         m5, m15, ym19, 1   ; c1  c5  d1  d5
4847    vshufi32x4          m15, m19, q3232     ; c9  c13 d9  d13
4848    vinserti32x8         m6, m16, ym20, 1   ; c2  c6  d2  d6
4849    vshufi32x4          m16, m20, q3232     ; c10 c14 d10 d14
4850    vinserti32x8         m7, m17, ym21, 1   ; c3  c7  d3  d7
4851    vshufi32x4          m17, m21, q3232     ; c11 c15 d11 d15
4852    vshufi32x4          m22, m26, m4, q2020 ;  0  1
4853    vshufi32x4          m26, m4, q3131      ;  8  9
4854    vshufi32x4          m23, m27, m5, q2020 ;  2  3
4855    vshufi32x4          m27, m5, q3131      ; 10 11
4856    vshufi32x4          m24, m28, m6, q2020 ;  4  5
4857    vshufi32x4          m28, m6, q3131      ; 12 13
4858    vshufi32x4          m25, m29, m7, q2020 ;  6  7
4859    vshufi32x4          m29, m7, q3131      ; 14 15
4860    vshufi32x4           m4, m0, m14, q2020 ; 16 17
4861    vshufi32x4           m3, m0, m14, q3131 ; 24 25
4862    vshufi32x4          m20, m1, m15, q2020 ; 18 19
4863    vshufi32x4          m19, m1, m15, q3131 ; 26 27
4864    vshufi32x4           m5, m2, m16, q2020 ; 20 21
4865    vshufi32x4           m0, m2, m16, q3131 ; 28 29
4866    vshufi32x4          m16, m8, m17, q2020 ; 22 23
4867    vshufi32x4          m17, m8, m17, q3131 ; 30 31
4868    pxor                 m6, m6
4869    mova         [cq+64* 0], m4
4870    mova         [cq+64* 2], m5
4871    mova         [cq+64* 4], m3
4872    mova         [cq+64* 6], m0
4873    punpcklwd            m8, m24, m24 ;  4
4874    punpcklwd            m0, m0       ; 28
4875    punpcklwd            m5, m5       ; 20
4876    punpcklwd            m1, m28, m28 ; 12
4877    punpcklwd            m7, m26, m26 ;  8
4878    punpcklwd            m3, m3       ; 24
4879    punpcklwd            m9, m6, m22  ; __  0
4880    punpcklwd            m6, m4       ; __ 16
4881    call m(idct_16x16_internal_8bpc).main_fast3
4882    mova         [cq+64* 1], m20
4883    mova         [cq+64* 3], m16
4884    mova         [cq+64* 5], m19
4885    mova         [cq+64* 7], m17
4886    punpcklwd           m21, m23, m23 ;  2
4887    punpcklwd           m17, m17      ; 30
4888    punpcklwd           m20, m20      ; 18
4889    punpcklwd           m15, m29, m29 ; 14
4890    punpcklwd           m18, m27, m27 ; 10
4891    punpcklwd           m16, m16      ; 22
4892    punpcklwd           m19, m19      ; 26
4893    punpcklwd           m14, m25, m25 ;  6
4894    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
4895    mova         [cq+64* 8], m14
4896    mova         [cq+64* 9], m15
4897    mova         [cq+64*10], m16
4898    mova         [cq+64*11], m17
4899    mova         [cq+64*12], m18
4900    mova         [cq+64*13], m19
4901    mova         [cq+64*14], m20
4902    mova         [cq+64*15], m21
4903    mova                m21, [cq+64* 7]
4904    mova                m14, [cq+64* 0]
4905    mova                m17, [cq+64* 3]
4906    mova                m18, [cq+64* 4]
4907    mova                m19, [cq+64* 5]
4908    mova                m16, [cq+64* 2]
4909    mova                m15, [cq+64* 1]
4910    mova                m20, [cq+64* 6]
4911    REPX   {punpckhwd x, x}, m22, m21, m14, m29, m26, m17, m18, m25, \
4912                             m24, m19, m16, m27, m28, m15, m20, m23
4913    call .main_oddhalf
4914    jmp .end
4915.fast: ; right half is zero
4916    mova                ym8, [cq+64*15]
4917    vinserti32x8         m8, [cq+64* 1], 1
4918    mova                 m2, [o(int16_perm)]
4919    mova                ym9, [cq+64* 8]
4920    vinserti32x8         m9, [cq+64* 0], 1
4921    mova                ym0, [cq+64* 7]
4922    vinserti32x8         m0, [cq+64* 9], 1
4923    mova                ym7, [cq+64*14]
4924    vinserti32x8         m7, [cq+64* 2], 1
4925    mova                ym1, [cq+64* 3]
4926    vinserti32x8         m1, [cq+64*13], 1
4927    mova                ym3, [cq+64* 6]
4928    vinserti32x8         m3, [cq+64*10], 1
4929    mova                ym5, [cq+64*11]
4930    vinserti32x8         m5, [cq+64* 5], 1
4931    mova                ym6, [cq+64*12]
4932    vinserti32x8         m6, [cq+64* 4], 1
4933    REPX  {vpermb x, m2, x}, m8, m9, m0, m7, m1, m3, m5, m6
4934    call m(idct_16x16_internal_8bpc).main2
4935    vbroadcasti32x4      m8, [o(int_shuf3)]
4936    vbroadcasti32x4      m9, [o(int_shuf4)]
4937    vpbroadcastd        m11, [o(pw_8192)]
4938    pshufb               m0, m8
4939    pshufb               m1, m9
4940    pshufb               m2, m8
4941    pshufb               m3, m9
4942    REPX  {pmulhrsw x, m11}, m0, m1, m2, m3
4943    pshufb               m4, m8
4944    pshufb               m5, m9
4945    pshufb               m6, m8
4946    pshufb               m7, m9
4947    REPX  {pmulhrsw x, m11}, m4, m5, m6, m7
4948    punpckhdq           m28, m0, m1
4949    punpckldq            m0, m1
4950    punpckhdq           m27, m2, m3
4951    punpckldq            m2, m3
4952    punpckhdq           m22, m4, m5
4953    punpckldq            m4, m5
4954    punpckhdq           m23, m6, m7
4955    punpckldq            m6, m7
4956    vinserti32x8        m14, m0, ym2, 1
4957    vshufi32x4          m15, m0, m2, q3232
4958    vinserti32x8         m2, m4, ym6, 1
4959    vshufi32x4           m4, m6, q3232
4960    vshufi32x4          m21, m14, m2, q2020 ;  0  2
4961    vshufi32x4          m14, m2, q3131      ;  4  6
4962    vshufi32x4          m18, m15, m4, q2020 ;  8 10
4963    vshufi32x4          m15, m4, q3131      ; 12 14
4964    pxor                 m9, m9
4965    punpcklwd            m8, m14, m14 ;  4
4966    punpcklwd            m1, m15, m15 ; 12
4967    punpcklwd            m7, m18, m18 ;  8
4968    punpcklwd            m9, m21      ; __  0
4969    call m(idct_16x16_internal_8bpc).main_fast4
4970    punpckhwd           m21, m21      ;  2
4971    punpckhwd           m15, m15      ; 14
4972    punpckhwd           m18, m18      ; 10
4973    punpckhwd           m14, m14      ;  6
4974    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
4975    vinserti32x8        m24, m28, ym27, 1
4976    vshufi32x4          m28, m27, q3232
4977    vinserti32x8        m27, m22, ym23, 1
4978    vshufi32x4          m22, m23, q3232
4979    vshufi32x4          m23, m24, m27, q2020 ;  1  3
4980    vshufi32x4          m24, m27, q3131      ;  5  7
4981    vshufi32x4          m27, m28, m22, q2020 ;  9 11
4982    vshufi32x4          m28, m22, q3131      ; 13 15
4983    punpcklwd           m22, m23, m23 ;  1
4984    punpckhwd           m29, m28, m28 ; 15
4985    punpcklwd           m26, m27, m27 ;  9
4986    punpckhwd           m25, m24, m24 ;  7
4987    mova         [cq+64* 8], m14
4988    mova         [cq+64* 9], m15
4989    mova         [cq+64*10], m16
4990    mova         [cq+64*11], m17
4991    punpcklwd           m24, m24      ;  5
4992    punpckhwd           m27, m27      ; 11
4993    punpcklwd           m28, m28      ; 13
4994    punpckhwd           m23, m23      ;  3
4995    mova         [cq+64*12], m18
4996    mova         [cq+64*13], m19
4997    mova         [cq+64*14], m20
4998    mova         [cq+64*15], m21
4999    call .main_oddhalf_fast
5000.end:
5001    imul                 r6, strideq, 60
5002    mova                m10, [o(end_16x32p)]
5003    vpbroadcastd        m11, [o(pw_2048)]
5004    lea                  r3, [strideq*3]
5005    pxor                m12, m12
5006    add                  r6, dstq         ; dst+stride*60
5007    psrldq              m13, m10, 1
5008    lea                  r4, [strideq+r3] ; stride*4
5009%macro IDCT_16x64_END 3 ; idct32, idct64, tmp
5010%if %1 & 1
5011    %define %%s0 r3
5012    %define %%s1 strideq*2
5013    %define %%s2 strideq*1
5014    %define %%s3 strideq*0
5015%else
5016    %define %%s0 strideq*0
5017    %define %%s1 strideq*1
5018    %define %%s2 strideq*2
5019    %define %%s3 r3
5020%if %1
5021    add                dstq, r4
5022    sub                  r6, r4
5023%endif
5024%endif
5025%if %1 < 8
5026    pmulhrsw             m8, m11, m%1
5027    pmulhrsw             m9, m11, m%2
5028%else
5029    mova                 m9, [cq+64*%1]
5030    paddsw               m8, m9, m%2 ; out  0+n,  1+n
5031    psubsw               m9, m%2     ; out 63-n, 62-n
5032    pmulhrsw             m8, m11
5033    pmulhrsw             m9, m11
5034%endif
5035    mova               xm29, [dstq+%%s0]
5036    vinserti128        ym29, [dstq+%%s1], 1
5037    mova               xm%3, [r6  +%%s3]
5038    vinserti128        ym%3, [r6  +%%s2], 1
5039    vpermb              m29, m10, m29
5040    vpermb              m%3, m10, m%3
5041    mova         [cq+64*%1], m12
5042    paddw               m29, m8
5043    paddw               m%3, m9
5044    packuswb            m29, m%3
5045    vpermd              m29, m13, m29
5046    mova          [dstq+%%s0], xm29
5047    vextracti128  [dstq+%%s1], ym29, 1
5048    vextracti32x4 [r6  +%%s2], m29, 2
5049    vextracti32x4 [r6  +%%s3], m29, 3
5050%endmacro
5051    IDCT_16x64_END        0, 29,  0
5052    IDCT_16x64_END        1, 28, 28
5053    IDCT_16x64_END        2, 27, 28
5054    IDCT_16x64_END        3, 26, 28
5055    IDCT_16x64_END        4, 25, 28
5056    IDCT_16x64_END        5, 24, 28
5057    IDCT_16x64_END        6, 23, 28
5058    IDCT_16x64_END        7, 22, 28
5059    IDCT_16x64_END        8, 21, 28
5060    IDCT_16x64_END        9, 20, 28
5061    IDCT_16x64_END       10, 19, 28
5062    IDCT_16x64_END       11, 18, 28
5063    IDCT_16x64_END       12, 17, 28
5064    IDCT_16x64_END       13, 16, 28
5065    IDCT_16x64_END       14, 15, 28
5066    IDCT_16x64_END       15, 14, 28
5067    RET
5068.dconly:
5069    movsx               r6d, word [cq]
5070    mov                [cq], eobd
5071    imul                r6d, 181
5072    mov                 r3d, 64
5073    add                 r6d, 128+512
5074    sar                 r6d, 8+2
5075    jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly3
5076ALIGN function_align
5077.main_oddhalf_fast: ; bottom three-quarters are zero
5078    vpbroadcastd         m8, [o(pw_101_4095x8)]
5079    vpbroadcastd        m21, [o(pw_m1474_3822x8)]
5080    vpbroadcastd        m14, [o(pw_897_3996x8)]
5081    vpbroadcastd        m17, [o(pw_m700_4036x8)]
5082    vpbroadcastd        m18, [o(pw_501_4065x8)]
5083    vpbroadcastd        m19, [o(pw_m1092_3948x8)]
5084    vpbroadcastd        m16, [o(pw_1285_3889x8)]
5085    vpbroadcastd        m15, [o(pw_m301_4085x8)]
5086    pmulhrsw             m8, m22 ; t32a t63a
5087    pmulhrsw            m21, m29 ; t35a t60a
5088    pmulhrsw            m14, m26 ; t36a t59a
5089    pmulhrsw            m17, m25 ; t39a t56
5090    pmulhrsw            m18, m24 ; t40a t55a
5091    pmulhrsw            m19, m27 ; t43a t52a
5092    pmulhrsw            m16, m28 ; t44a t51a
5093    pmulhrsw            m15, m23 ; t47a t48a
5094    mova                m22, m8
5095    mova                m29, m21
5096    mova                m26, m14
5097    mova                m25, m17
5098    mova                m24, m18
5099    mova                m27, m19
5100    mova                m28, m16
5101    mova                m20, m15
5102    jmp .main_oddhalf2
5103ALIGN function_align
5104.main_oddhalf:
5105    vpbroadcastd         m8, [o(pw_101_4095x8)]
5106    vpbroadcastd         m9, [o(pw_m2824_2967x8)]
5107    vpbroadcastd        m11, [o(pw_1660_3745x8)]
5108    vpbroadcastd        m12, [o(pw_m1474_3822x8)]
5109    pmulhrsw            m22, m8       ; t32a t63a
5110    vpbroadcastd         m8, [o(pw_897_3996x8)]
5111    pmulhrsw            m21, m9       ; t33a t62a
5112    vpbroadcastd         m9, [o(pw_m2191_3461x8)]
5113    pmulhrsw            m14, m11      ; t34a t61a
5114    vpbroadcastd        m11, [o(pw_2359_3349x8)]
5115    pmulhrsw            m29, m12      ; t35a t60a
5116    vpbroadcastd        m12, [o(pw_m700_4036x8)]
5117    pmulhrsw            m26, m8       ; t36a t59a
5118    vpbroadcastd         m8, [o(pw_501_4065x8)]
5119    pmulhrsw            m17, m9       ; t37a t58a
5120    vpbroadcastd         m9, [o(pw_m2520_3229x8)]
5121    pmulhrsw            m18, m11      ; t38a t57a
5122    vpbroadcastd        m11, [o(pw_2019_3564x8)]
5123    pmulhrsw            m25, m12      ; t39a t56a
5124    vpbroadcastd        m12, [o(pw_m1092_3948x8)]
5125    pmulhrsw            m24, m8       ; t40a t55a
5126    vpbroadcastd         m8, [o(pw_1285_3889x8)]
5127    pmulhrsw            m19, m9       ; t41a t54a
5128    vpbroadcastd         m9, [o(pw_m1842_3659x8)]
5129    pmulhrsw            m16, m11      ; t42a t53a
5130    vpbroadcastd        m11, [o(pw_2675_3102x8)]
5131    pmulhrsw            m27, m12      ; t43a t52a
5132    vpbroadcastd        m12, [o(pw_m301_4085x8)]
5133    pmulhrsw            m28, m8       ; t44a t51a
5134    pmulhrsw            m15, m9       ; t45a t50a
5135    pmulhrsw            m20, m11      ; t46a t49a
5136    pmulhrsw            m23, m12      ; t47a t48a
5137    psubsw               m8, m22, m21 ; t33  t62
5138    paddsw              m22, m21      ; t32  t63
5139    psubsw              m21, m29, m14 ; t34  t61
5140    paddsw              m29, m14      ; t35  t60
5141    psubsw              m14, m26, m17 ; t37  t58
5142    paddsw              m26, m17      ; t36  t59
5143    psubsw              m17, m25, m18 ; t38  t57
5144    paddsw              m25, m18      ; t39  t56
5145    psubsw              m18, m24, m19 ; t41  t54
5146    paddsw              m24, m19      ; t40  t55
5147    psubsw              m19, m27, m16 ; t42  t53
5148    paddsw              m27, m16      ; t43  t52
5149    psubsw              m16, m28, m15 ; t45  t50
5150    paddsw              m28, m15      ; t44  t51
5151    psubsw              m15, m23, m20 ; t46  t49
5152    paddsw              m20, m23      ; t47  t48
5153.main_oddhalf2:
5154    ITX_MUL2X_PACK        8, 9, 23, 10,   401, 4076, 5 ; t33a t62a
5155    ITX_MUL2X_PACK       21, 9, 23, 10, m4076,  401, 5 ; t34a t61a
5156    ITX_MUL2X_PACK       14, 9, 23, 10,  3166, 2598, 5 ; t37a t58a
5157    ITX_MUL2X_PACK       17, 9, 23, 10, m2598, 3166, 5 ; t38a t57a
5158    ITX_MUL2X_PACK       18, 9, 23, 10,  1931, 3612, 5 ; t41a t54a
5159    ITX_MUL2X_PACK       19, 9, 23, 10, m3612, 1931, 5 ; t42a t53a
5160    ITX_MUL2X_PACK       16, 9, 23, 10,  3920, 1189, 5 ; t45a t50a
5161    ITX_MUL2X_PACK       15, 9, 23, 10, m1189, 3920, 5 ; t46a t49a
5162    vpbroadcastd        m11, [o(pw_m4017_799)]
5163    psubsw              m23, m25, m26 ; t36a t59a
5164    paddsw              m25, m26      ; t39a t56a
5165    psubsw              m26, m24, m27 ; t43a t52a
5166    paddsw              m27, m24      ; t40a t55a
5167    psubsw              m24, m20, m28 ; t44a t51a
5168    paddsw              m20, m28      ; t47a t48a
5169    psubsw              m28, m8, m21  ; t34  t61
5170    paddsw               m8, m21      ; t33  t62
5171    psubsw              m21, m17, m14 ; t37  t58
5172    paddsw              m17, m14      ; t38  t57
5173    psubsw              m14, m18, m19 ; t42  t53
5174    paddsw              m18, m19      ; t41  t54
5175    psubsw              m19, m15, m16 ; t45  t50
5176    paddsw              m15, m16      ; t46  t49
5177    psubsw              m16, m22, m29 ; t35a t60a
5178    paddsw              m22, m29      ; t32a t63a
5179    ITX_MUL2X_PACK       16, 9, 29, 10, 799_4017, 11,    20 ; t35  t60
5180    ITX_MUL2X_PACK       28, 9, 29, 10, 799_4017, 11,    20 ; t34a t61a
5181    ITX_MUL2X_PACK       23, 9, 29, 10, 11, m799_m4017,  36 ; t36  t59
5182    ITX_MUL2X_PACK       21, 9, 29, 10, 11, m799_m4017,  36 ; t37a t58a
5183    vpbroadcastd        m11, [o(pw_m2276_3406)]
5184    ITX_MUL2X_PACK       26, 9, 29, 10, 3406_2276, 11,   20 ; t43  t52
5185    ITX_MUL2X_PACK       14, 9, 29, 10, 3406_2276, 11,   20 ; t42a t53a
5186    ITX_MUL2X_PACK       24, 9, 29, 10, 11, m3406_m2276, 36 ; t44  t51
5187    ITX_MUL2X_PACK       19, 9, 29, 10, 11, m3406_m2276, 36 ; t45a t50a
5188    vpbroadcastd        m11, [o(pw_1567_3784)]
5189    vpbroadcastd        m12, [o(pw_m3784_1567)]
5190    psubsw              m29, m22, m25 ; t39  t56
5191    paddsw              m22, m25      ; t32  t63
5192    psubsw              m25, m20, m27 ; t40  t55
5193    paddsw              m20, m27      ; t47  t48
5194    psubsw              m27, m8, m17  ; t38a t57a
5195    paddsw               m8, m17      ; t33a t62a
5196    psubsw              m17, m15, m18 ; t41a t54a
5197    paddsw              m15, m18      ; t46a t49a
5198    paddsw              m18, m16, m23 ; t35a t60a
5199    psubsw              m16, m23      ; t36a t59a
5200    psubsw              m23, m24, m26 ; t43a t52a
5201    paddsw              m24, m26      ; t44a t51a
5202    paddsw              m26, m28, m21 ; t34  t61
5203    psubsw              m28, m21      ; t37  t58
5204    psubsw              m21, m19, m14 ; t42  t53
5205    paddsw              m19, m14      ; t45  t50
5206    ITX_MUL2X_PACK       29, 9, 14, 10, 11, 12, 4 ; t39a t56a
5207    ITX_MUL2X_PACK       27, 9, 14, 10, 11, 12, 4 ; t38  t57
5208    ITX_MUL2X_PACK       16, 9, 14, 10, 11, 12, 4 ; t36  t59
5209    ITX_MUL2X_PACK       28, 9, 14, 10, 11, 12, 4 ; t37a t58a
5210    vpbroadcastd        m11, [o(pw_m1567_m3784)]
5211    ITX_MUL2X_PACK       25, 9, 14, 10, 12, 11, 4 ; t40a t55a
5212    ITX_MUL2X_PACK       17, 9, 14, 10, 12, 11, 4 ; t41  t54
5213    ITX_MUL2X_PACK       23, 9, 14, 10, 12, 11, 4 ; t43  t52
5214    ITX_MUL2X_PACK       21, 9, 14, 10, 12, 11, 4 ; t42a t53a
5215    vbroadcasti32x4     m13, [o(deint_shuf)]
5216    vpbroadcastd        m11, [o(pw_2896_2896)]
5217    vpbroadcastd        m12, [o(pw_m2896_2896)]
5218    paddsw              m14, m22, m20 ; t32a t63a
5219    psubsw              m22, m20      ; t47a t48a
5220    psubsw              m20, m8, m15  ; t46  t49
5221    paddsw               m8, m15      ; t33  t62
5222    paddsw              m15, m18, m24 ; t35  t60
5223    psubsw              m18, m24      ; t44  t51
5224    psubsw              m24, m26, m19 ; t45a t50a
5225    paddsw              m26, m19      ; t34a t61a
5226    REPX    {pshufb x, m13}, m14, m8, m15, m26
5227    psubsw              m19, m29, m25 ; t40  t55
5228    paddsw              m25, m29      ; t39  t56
5229    psubsw              m29, m27, m17 ; t41a t54a
5230    paddsw              m27, m17      ; t38a t57a
5231    psubsw              m17, m16, m23 ; t43a t52a
5232    paddsw              m16, m23      ; t36a t59a
5233    psubsw               m9, m28, m21 ; t42  t53
5234    paddsw              m28, m21      ; t37  t58
5235    REPX    {pshufb x, m13}, m25, m27, m16, m28
5236    ITX_MUL2X_PACK       22, 13, 21, 10, 11, 12, 8 ; t47  t48
5237    ITX_MUL2X_PACK       20, 23, 22, 10, 11, 12, 8 ; t46a t49a
5238    packssdw            m21, m22      ; t47  t46a
5239    packssdw            m13, m23      ; t48  t49a
5240    ITX_MUL2X_PACK       18, 22, 20, 10, 11, 12, 8 ; t44a t51a
5241    ITX_MUL2X_PACK       24, 23, 18, 10, 11, 12, 8 ; t45  t50
5242    packssdw            m20, m18      ; t44a t45
5243    packssdw            m22, m23      ; t51a t50
5244    ITX_MUL2X_PACK       19, 24, 18, 10, 11, 12, 8 ; t40a t55a
5245    ITX_MUL2X_PACK       29, 23, 19, 10, 11, 12, 8 ; t41  t54
5246    packssdw            m18, m19      ; t40a t41
5247    packssdw            m24, m23      ; t55a t54
5248    ITX_MUL2X_PACK       17, 23, 19, 10, 11, 12, 8 ; t43  t52
5249    ITX_MUL2X_PACK        9, 29, 17, 10, 11, 12, 8 ; t42a t53a
5250    packssdw            m19, m17      ; t43  t42a
5251    packssdw            m23, m29      ; t52  t53a
5252    punpcklqdq          m17, m25, m27 ; t39  t38a
5253    punpckhqdq          m25, m27      ; t56  t57a
5254    punpckhqdq          m27, m15, m26 ; t60  t61a
5255    punpcklqdq          m15, m26      ; t35  t34a
5256    punpckhqdq          m26, m16, m28 ; t59a t58
5257    punpcklqdq          m16, m28      ; t36a t37
5258    punpckhqdq          m28, m14, m8  ; t63a t62
5259    punpcklqdq          m14, m8       ; t32a t33
5260    psubsw              m29, m0, m28  ; out63 out62
5261    paddsw               m0, m28      ; out0  out1
5262    psubsw              m28, m1, m27  ; out60 out61
5263    paddsw               m1, m27      ; out3  out2
5264    psubsw              m27, m2, m26  ; out59 out58
5265    paddsw               m2, m26      ; out4  out5
5266    psubsw              m26, m3, m25  ; out56 out57
5267    paddsw               m3, m25      ; out7  out6
5268    psubsw              m25, m4, m24  ; out55 out54
5269    paddsw               m4, m24      ; out8  out9
5270    psubsw              m24, m5, m23  ; out52 out53
5271    paddsw               m5, m23      ; out11 out10
5272    psubsw              m23, m6, m22  ; out51 out50
5273    paddsw               m6, m22      ; out12 out13
5274    psubsw              m22, m7, m13  ; out48 out49
5275    paddsw               m7, m13      ; out15 out14
5276    ret
5277
5278cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 7, 0, dst, stride, c, eob
5279%undef cmp
5280    lea                  r5, [o_base]
5281    test               eobd, eobd
5282    jnz .normal
5283    movsx               r6d, word [cq]
5284    mov                [cq], eobd
5285    mov                 r3d, 16
5286.dconly:
5287    imul                r6d, 181
5288    add                 r6d, 128+512
5289    sar                 r6d, 8+2
5290.dconly2:
5291    imul                r6d, 181
5292    add                 r6d, 128+2048
5293    sar                 r6d, 8+4
5294    pxor                 m2, m2
5295    vpbroadcastw         m3, r6d
5296.dconly_loop:
5297    mova                 m1, [dstq]
5298    punpcklbw            m0, m1, m2
5299    punpckhbw            m1, m2
5300    paddw                m0, m3
5301    paddw                m1, m3
5302    packuswb             m0, m1
5303    mova             [dstq], m0
5304    add                dstq, strideq
5305    dec                 r3d
5306    jg .dconly_loop
5307    RET
5308.normal:
5309    WIN64_SPILL_XMM      31
5310    mova                m19, [o(dup16_perm)]
5311    mova                m24, [cq+64* 2]
5312    mova                m28, [cq+64* 6]
5313    mova                m26, [cq+64* 4]
5314    mova                m22, [cq+64* 0]
5315    mova                m23, [cq+64* 1]
5316    mova                m29, [cq+64* 7]
5317    mova                m27, [cq+64* 5]
5318    mova                m25, [cq+64* 3]
5319    vpermb               m8, m19, m24        ;  4
5320    vpermb               m1, m19, m28        ; 12
5321    vpermb               m7, m19, m26        ;  8
5322    vpermb               m9, m19, m22        ; __  0
5323    vpermb              m21, m19, m23        ;  2
5324    vpermb              m15, m19, m29        ; 14
5325    vpermb              m18, m19, m27        ; 10
5326    vpermb              m14, m19, m25        ;  6
5327    pslld                m9, 16
5328    vpord               m30, m19, [o(pb_32)] {1to16}
5329    REPX {vpermb x, m30, x}, m22, m29, m26, m25, m24, m27, m28, m23
5330    cmp                eobd, 151
5331    jb .fast
5332    vpermb               m0, m19, [cq+64*14] ; 28
5333    vpermb               m5, m19, [cq+64*10] ; 20
5334    vpermb               m3, m19, [cq+64*12] ; 24
5335    vpermb               m6, m19, [cq+64* 8] ; __ 16
5336    pslld                m6, 16
5337    call m(idct_16x16_internal_8bpc).main_fast
5338    vpermb              m17, m19, [cq+64*15] ; 30
5339    vpermb              m20, m19, [cq+64* 9] ; 18
5340    vpermb              m16, m19, [cq+64*11] ; 22
5341    vpermb              m19, m19, [cq+64*13] ; 26
5342    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
5343    mova         [cq+64* 0], m14
5344    mova         [cq+64* 1], m15
5345    mova         [cq+64* 2], m16
5346    mova         [cq+64* 3], m17
5347    mova         [cq+64* 4], m18
5348    mova         [cq+64* 5], m19
5349    mova         [cq+64* 6], m20
5350    mova         [cq+64* 7], m21
5351    vpermb              m21, m30, [cq+64*15]
5352    vpermb              m14, m30, [cq+64* 8]
5353    vpermb              m17, m30, [cq+64*11]
5354    vpermb              m18, m30, [cq+64*12]
5355    vpermb              m19, m30, [cq+64*13]
5356    vpermb              m16, m30, [cq+64*10]
5357    vpermb              m15, m30, [cq+64* 9]
5358    vpermb              m20, m30, [cq+64*14]
5359    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf
5360    jmp .end
5361.fast: ; bottom half is zero
5362    call m(idct_16x16_internal_8bpc).main_fast2
5363    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
5364    mova         [cq+64* 0], m14
5365    mova         [cq+64* 1], m15
5366    mova         [cq+64* 2], m16
5367    mova         [cq+64* 3], m17
5368    mova         [cq+64* 4], m18
5369    mova         [cq+64* 5], m19
5370    mova         [cq+64* 6], m20
5371    mova         [cq+64* 7], m21
5372    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast
5373.end:
5374    mova         [cq+64* 8], m4
5375    mova         [cq+64* 9], m5
5376    mova         [cq+64*10], m6
5377    mova         [cq+64*11], m7
5378    mova         [cq+64*12], m26
5379    mova         [cq+64*13], m27
5380    mova         [cq+64*14], m28
5381    mova         [cq+64*15], m29
5382    vpbroadcastd        m13, [o(pw_8192)]
5383    call .pass1_end
5384    call .pass2
5385    mova         [cq+64* 0], m0
5386    mova         [cq+64* 1], m1
5387    mova         [cq+64* 2], m2
5388    mova         [cq+64* 3], m3
5389    mova         [cq+64* 4], m4
5390    mova         [cq+64* 5], m5
5391    mova         [cq+64* 6], m6
5392    mova         [cq+64* 7], m7
5393    pmulhrsw             m0, m13, [cq+64* 8]
5394    pmulhrsw             m1, m13, [cq+64* 9]
5395    pmulhrsw             m2, m13, [cq+64*10]
5396    pmulhrsw             m3, m13, [cq+64*11]
5397    vpbroadcastd        m30, [o(pw_2048)]
5398    pmulhrsw             m4, m13, m22
5399    pmulhrsw             m5, m13, m23
5400    pmulhrsw             m6, m13, m24
5401    pmulhrsw             m7, m13, m25
5402    pmulhrsw            m22, m30, m14
5403    pmulhrsw            m14, m13, m26
5404    pmulhrsw            m23, m30, m15
5405    pmulhrsw            m15, m13, m27
5406    pmulhrsw            m24, m30, m16
5407    pmulhrsw            m16, m13, m28
5408    pmulhrsw            m25, m30, m17
5409    pmulhrsw            m17, m13, m29
5410    pmulhrsw            m26, m30, m18
5411    pmulhrsw            m18, m13, [cq+64*12]
5412    pmulhrsw            m27, m30, m19
5413    pmulhrsw            m19, m13, [cq+64*13]
5414    pmulhrsw            m28, m30, m20
5415    pmulhrsw            m20, m13, [cq+64*14]
5416    pmulhrsw            m29, m30, m21
5417    pmulhrsw            m21, m13, [cq+64*15]
5418    call .transpose_round
5419    call .pass2
5420    pxor                m10, m10
5421    lea                  r3, [strideq*3]
5422%macro IDCT_64x16_END 4
5423    mova                 m9, [dstq+%4]
5424%if %1 < 8
5425    pmulhrsw            m%3, m30, [cq+64*%1]
5426%endif
5427    pmulhrsw            m%2, m30
5428    mova         [cq+64*%1], m10
5429    punpcklbw            m8, m9, m10
5430    punpckhbw            m9, m10
5431    paddw                m8, m%3
5432    paddw                m9, m%2
5433    packuswb             m8, m9
5434    mova          [dstq+%4], m8
5435%if %1 == 3 || %1 == 7 || %1 == 11
5436    lea                dstq, [dstq+strideq*4]
5437%endif
5438%endmacro
5439    IDCT_64x16_END        0,  0, 11, strideq*0
5440    IDCT_64x16_END        1,  1, 11, strideq*1
5441    IDCT_64x16_END        2,  2, 11, strideq*2
5442    IDCT_64x16_END        3,  3, 11, r3
5443    IDCT_64x16_END        4,  4, 11, strideq*0
5444    IDCT_64x16_END        5,  5, 11, strideq*1
5445    IDCT_64x16_END        6,  6, 11, strideq*2
5446    IDCT_64x16_END        7,  7, 11, r3
5447    IDCT_64x16_END        8, 14, 22, strideq*0
5448    IDCT_64x16_END        9, 15, 23, strideq*1
5449    IDCT_64x16_END       10, 16, 24, strideq*2
5450    IDCT_64x16_END       11, 17, 25, r3
5451    IDCT_64x16_END       12, 18, 26, strideq*0
5452    IDCT_64x16_END       13, 19, 27, strideq*1
5453    IDCT_64x16_END       14, 20, 28, strideq*2
5454    IDCT_64x16_END       15, 21, 29, r3
5455    RET
5456ALIGN function_align
5457.pass1_end:
5458    mova                 m4, [cq+64* 0]
5459    mova                 m5, [cq+64* 1]
5460    mova                 m6, [cq+64* 2]
5461    mova                 m7, [cq+64* 3]
5462    mova                 m8, [cq+64* 4]
5463    mova                 m9, [cq+64* 5]
5464    mova                m11, [cq+64* 6]
5465    mova                m12, [cq+64* 7]
5466    psubsw              m29, m4, m21  ; out47 out46
5467    paddsw               m4, m21      ; out16 out17
5468    psubsw              m28, m5, m20  ; out44 out45
5469    paddsw               m5, m20      ; out19 out18
5470    REPX  {pmulhrsw x, m13}, m0, m1, m2, m3
5471    psubsw              m27, m6, m19  ; out43 out42
5472    paddsw               m6, m19      ; out20 out21
5473    psubsw              m26, m7, m18  ; out40 out41
5474    paddsw               m7, m18      ; out23 out22
5475    pmulhrsw            m18, m13, m22
5476    pmulhrsw            m19, m13, m23
5477    pmulhrsw            m20, m13, m24
5478    pmulhrsw            m21, m13, m25
5479    paddsw              m25, m12, m14 ; out31 out30
5480    psubsw              m14, m12, m14 ; out32 out33
5481    paddsw              m24, m11, m15 ; out28 out29
5482    psubsw              m15, m11, m15 ; out35 out34
5483    REPX  {pmulhrsw x, m13}, m4, m5, m6, m7
5484    paddsw              m23, m9, m16  ; out27 out26
5485    psubsw              m16, m9, m16  ; out36 out37
5486    paddsw              m22, m8, m17  ; out24 out25
5487    psubsw              m17, m8, m17  ; out39 out38
5488    REPX  {pmulhrsw x, m13}, m14, m15, m16, m17
5489.transpose_round:
5490%macro TRANSPOSE_8x4_PACKED 4
5491    punpckhwd            m8, m%1, m%3 ; b0 f0 b1 f1 b2 f2 b3 f3
5492    punpcklwd           m%1, m%3      ; a0 e0 a1 e1 a2 e2 a3 e3
5493    punpcklwd           m%3, m%2, m%4 ; d0 h0 d1 h1 d2 h2 d3 h3
5494    punpckhwd           m%2, m%4      ; c0 g0 c1 g1 c2 g2 c3 g3
5495    punpckhwd           m%4, m%1, m%2 ; a2 c2 e2 g2 a3 c3 e3 g3
5496    punpcklwd           m%1, m%2      ; a0 c0 e0 g0 a1 c1 e1 g1
5497    punpckhwd           m%2, m8, m%3  ; b2 d2 f2 h2 b3 d3 f3 h3
5498    punpcklwd            m8, m%3      ; b0 d0 f0 h0 b1 d1 f1 h1
5499    punpcklwd           m%3, m%4, m%2 ; 2
5500    punpckhwd           m%4, m%2      ; 3
5501    punpckhwd           m%2, m%1, m8  ; 1
5502    punpcklwd           m%1, m8       ; 0
5503%endmacro
5504    TRANSPOSE_8x4_PACKED  0,  1,  2,  3
5505    TRANSPOSE_8x4_PACKED 18, 19, 20, 21
5506    TRANSPOSE_8x4_PACKED  4,  5,  6,  7
5507    TRANSPOSE_8x4_PACKED 14, 15, 16, 17
5508    vshufi32x4           m8, m0, m4, q3232   ; a02 a03 b02 b03
5509    vinserti32x8         m0, ym4, 1          ; a00 a01 b00 b01
5510    vshufi32x4           m4, m1, m5, q3232   ; a12 a13 b12 b13
5511    vinserti32x8         m9, m1, ym5, 1      ; a10 a11 b10 b11
5512    vshufi32x4           m5, m2, m6, q3232   ; a22 a23 b22 b23
5513    vinserti32x8         m1, m2, ym6, 1      ; a20 a21 b20 b21
5514    vshufi32x4           m6, m3, m7, q3232   ; a32 a33 b32 b33
5515    vinserti32x8        m11, m3, ym7, 1      ; a30 a31 b30 b31
5516    vshufi32x4           m2, m14, m18, q3232 ; c02 c03 d02 d03
5517    vinserti32x8         m3, m14, ym18, 1    ; c00 c01 d00 d01
5518    vshufi32x4          m18, m15, m19, q3232 ; c12 c13 d12 d13
5519    vinserti32x8        m15, ym19, 1         ; c10 c11 d10 d11
5520    vshufi32x4          m19, m16, m20, q3232 ; c22 c23 d22 d23
5521    vinserti32x8        m16, ym20, 1         ; c20 c21 d20 d21
5522    vshufi32x4          m20, m17, m21, q3232 ; c32 c33 d32 d33
5523    vinserti32x8        m17, ym21, 1         ; c30 c31 d30 d31
5524    ret
5525.pass2:
5526    vshufi32x4           m7, m5, m19, q3131  ; 14
5527    vshufi32x4           m5, m19, q2020      ; 10
5528    vshufi32x4          m21, m6, m20, q3131  ; 15
5529    vshufi32x4          m19, m6, m20, q2020  ; 11
5530    vshufi32x4          m20, m4, m18, q3131  ; 13
5531    vshufi32x4          m18, m4, m18, q2020  ;  9
5532    vshufi32x4           m6, m8, m2, q3131   ; 12
5533    vshufi32x4           m4, m8, m2, q2020   ;  8
5534    vshufi32x4           m2, m0, m3, q3131   ;  4
5535    vshufi32x4           m0, m3, q2020       ;  0
5536    vshufi32x4           m3, m1, m16, q3131  ;  6
5537    vshufi32x4           m1, m16, q2020      ;  2
5538    vshufi32x4          m16, m9, m15, q3131  ;  5
5539    vshufi32x4          m14, m9, m15, q2020  ;  1
5540    vshufi32x4          m15, m11, m17, q2020 ;  3
5541    vshufi32x4          m17, m11, m17, q3131 ;  7
5542    call m(inv_txfm_add_dct_dct_32x8_8bpc).main2
5543    jmp m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
5544
5545cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 7, 0, dst, stride, c, eob
5546    lea                  r5, [o_base]
5547    test               eobd, eobd
5548    jz .dconly
5549    PROLOGUE              0, 9, 30, 64*32, dst, stride, c, eob
5550    vpbroadcastd        m23, [o(pw_2896x8)]
5551%undef cmp
5552    cmp                eobd, 136
5553    jb .fast
5554    pmulhrsw             m5, m23, [cq+64*20]
5555    pmulhrsw             m3, m23, [cq+64*12]
5556    pmulhrsw             m1, m23, [cq+64* 4]
5557    pmulhrsw             m7, m23, [cq+64*28]
5558    pmulhrsw             m2, m23, [cq+64* 8]
5559    pmulhrsw             m6, m23, [cq+64*24]
5560    pmulhrsw             m0, m23, [cq+64* 0]
5561    pmulhrsw             m4, m23, [cq+64*16]
5562    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
5563    pmulhrsw            m14, m23, [cq+64* 2]
5564    pmulhrsw            m21, m23, [cq+64*30]
5565    pmulhrsw            m18, m23, [cq+64*18]
5566    pmulhrsw            m17, m23, [cq+64*14]
5567    pmulhrsw            m16, m23, [cq+64*10]
5568    pmulhrsw            m19, m23, [cq+64*22]
5569    pmulhrsw            m20, m23, [cq+64*26]
5570    pmulhrsw            m15, m23, [cq+64* 6]
5571    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
5572    mova         [cq+64* 0], m14
5573    mova         [cq+64* 2], m15
5574    mova         [cq+64* 4], m16
5575    mova         [cq+64* 6], m17
5576    mova         [cq+64* 8], m18
5577    mova         [cq+64*10], m19
5578    mova         [cq+64*12], m20
5579    mova         [cq+64*14], m21
5580    pmulhrsw            m22, m23, [cq+64* 1]
5581    pmulhrsw            m21, m23, [cq+64*31]
5582    pmulhrsw            m14, m23, [cq+64*17]
5583    pmulhrsw            m29, m23, [cq+64*15]
5584    pmulhrsw            m26, m23, [cq+64* 9]
5585    pmulhrsw            m17, m23, [cq+64*23]
5586    pmulhrsw            m18, m23, [cq+64*25]
5587    pmulhrsw            m25, m23, [cq+64* 7]
5588    pmulhrsw            m24, m23, [cq+64* 5]
5589    pmulhrsw            m19, m23, [cq+64*27]
5590    pmulhrsw            m16, m23, [cq+64*21]
5591    pmulhrsw            m27, m23, [cq+64*11]
5592    pmulhrsw            m28, m23, [cq+64*13]
5593    pmulhrsw            m15, m23, [cq+64*19]
5594    pmulhrsw            m20, m23, [cq+64*29]
5595    pmulhrsw            m23,      [cq+64* 3]
5596    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf
5597    vpbroadcastd        m12, [o(pw_16384)]
5598    psubsw              m13, m0, m29 ; 31
5599    paddsw               m0, m29     ;  0
5600    psubsw              m29, m1, m28 ; 30
5601    paddsw               m1, m28     ;  1
5602    psubsw              m28, m2, m27 ; 29
5603    paddsw               m2, m27     ;  2
5604    psubsw              m27, m3, m26 ; 28
5605    paddsw               m3, m26     ;  3
5606    psubsw              m26, m4, m25 ; 27
5607    paddsw               m4, m25     ;  4
5608    psubsw              m25, m5, m24 ; 26
5609    paddsw               m5, m24     ;  5
5610    psubsw              m24, m6, m23 ; 25
5611    paddsw               m6, m23     ;  6
5612    psubsw              m23, m7, m22 ; 24
5613    paddsw               m7, m22     ;  7
5614    pxor                 m9, m9
5615    punpckhwd            m8, m0, m1 ; a4 b4 a5 b5 a6 b6 a7 b7
5616    punpcklwd            m0, m1     ; a0 b0 a1 b1 a2 b2 a3 b3
5617    punpckhwd            m1, m2, m3 ; c4 d4 c5 d5 c6 d6 c7 d7
5618    punpcklwd            m2, m3     ; c0 d0 c1 d1 c2 d2 c3 d3
5619    REPX {mova [cq+64*x], m9}, 16, 17, 18, 19
5620    punpckhwd           m22, m4, m5 ; e4 f4 e5 f5 e6 f6 e7 f7
5621    punpcklwd            m4, m5     ; e0 f0 e1 f1 e2 f2 e3 f3
5622    punpckhwd            m5, m6, m7 ; g4 h4 g5 h5 g6 h6 g7 h7
5623    punpcklwd            m6, m7     ; g0 h0 g1 h1 g2 h2 g3 h3
5624    REPX {mova [cq+64*x], m9}, 20, 21, 22, 23
5625    punpckhwd            m3, m23, m24
5626    punpcklwd           m23, m24
5627    punpckhwd           m24, m25, m26
5628    punpcklwd           m25, m26
5629    REPX {mova [cq+64*x], m9}, 24, 25, 26, 27
5630    punpckhwd           m26, m27, m28
5631    punpcklwd           m27, m28
5632    punpckhwd           m28, m29, m13
5633    punpcklwd           m29, m13
5634    REPX {mova [cq+64*x], m9}, 28, 29, 30, 31
5635    punpckhdq            m7, m0, m2  ; a2 b2 c2 d2 a3 b3 c3 d3
5636    punpckldq            m0, m2      ; a0 b0 c0 d0 a1 b1 c1 d1
5637    punpckhdq            m2, m4, m6  ; e2 f2 g2 h2 e3 f3 g3 h3
5638    punpckldq            m4, m6      ; e0 f0 g0 h0 e1 f1 g1 h1
5639    REPX  {pmulhrsw x, m12}, m7, m0, m2, m4
5640    punpckhdq            m6, m8, m1  ; a6 b6 c6 d6 a7 b7 c7 d7
5641    punpckldq            m8, m1      ; a4 b4 c4 d4 a5 b5 c5 d5
5642    punpckhdq            m1, m22, m5 ; e6 f6 g6 h6 e7 f7 g7 h7
5643    punpckldq           m22, m5      ; e4 f4 g4 h5 e5 f5 g5 h5
5644    REPX  {pmulhrsw x, m12}, m6, m8, m1, m22
5645    punpckhdq           m13, m23, m25
5646    punpckldq           m23, m25
5647    punpckhdq           m25, m27, m29
5648    punpckldq           m27, m29
5649    REPX  {pmulhrsw x, m12}, m13, m23, m25, m27
5650    punpckhdq            m9, m3, m24
5651    punpckldq            m3, m24
5652    punpckhdq           m24, m26, m28
5653    punpckldq           m26, m28
5654    REPX  {pmulhrsw x, m12}, m9, m3, m24, m26
5655    punpckhqdq           m5, m23, m27 ; d01 d09 d17 d25
5656    punpcklqdq          m23, m27      ; d00 d08 d16 d24
5657    punpcklqdq          m27, m13, m25 ; d02 d10 d18 d26
5658    punpckhqdq          m13, m25      ; d03 d11 d19 d27
5659    punpcklqdq          m25, m3, m26  ; d04 d12 d20 d28
5660    punpckhqdq           m3, m26      ; d05 d13 d21 d29
5661    punpcklqdq          m26, m9, m24  ; d06 d14 d22 d30
5662    punpckhqdq           m9, m24      ; d07 d15 d23 d31
5663    mova         [cq+64* 3], m23
5664    mova         [cq+64*13], m27
5665    mova         [cq+64* 7], m25
5666    mova         [cq+64*15], m26
5667    punpckhqdq          m24, m8, m22  ; a05 a13 a21 a29
5668    punpcklqdq           m8, m22      ; a04 a12 a20 a28
5669    punpckhqdq          m22, m0, m4   ; a01 a09 a17 a25
5670    punpcklqdq           m0, m4       ; a00 a08 a16 a24
5671    punpckhqdq          m23, m7, m2   ; a03 a11 a19 a27
5672    punpcklqdq           m7, m2       ; a02 a10 a18 a26
5673    punpckhqdq          m25, m6, m1   ; a07 a15 a23 a31
5674    punpcklqdq           m6, m1       ; a06 a14 a22 a30
5675    mova         [cq+64* 1], m0
5676    mova         [cq+64* 9], m7
5677    mova         [cq+64* 5], m8
5678    mova         [cq+64*11], m6
5679    mova                 m2, [cq+64* 0]
5680    mova                m11, [cq+64* 2]
5681    mova                 m8, [cq+64* 4]
5682    mova                m29, [cq+64* 6]
5683    mova                m27, [cq+64* 8]
5684    mova                m26, [cq+64*10]
5685    mova                 m4, [cq+64*12]
5686    mova                m28, [cq+64*14]
5687    psubsw               m1, m2, m21  ; 23
5688    paddsw               m2, m21      ;  8
5689    psubsw              m21, m11, m20 ; 22
5690    paddsw              m11, m20      ;  9
5691    psubsw              m20, m8, m19  ; 21
5692    paddsw               m8, m19      ; 10
5693    psubsw              m19, m29, m18 ; 20
5694    paddsw              m29, m18      ; 11
5695    psubsw              m18, m27, m17 ; 19
5696    paddsw              m27, m17      ; 12
5697    psubsw              m17, m26, m16 ; 18
5698    paddsw              m26, m16      ; 13
5699    psubsw              m16, m4, m15  ; 17
5700    paddsw               m4, m15      ; 14
5701    psubsw              m15, m28, m14 ; 16
5702    paddsw              m28, m14      ; 15
5703    punpcklwd           m14, m15, m16
5704    punpckhwd           m15, m16
5705    punpckhwd           m16, m17, m18
5706    punpcklwd           m17, m18
5707    punpckhwd           m18, m19, m20
5708    punpcklwd           m19, m20
5709    punpckhwd           m20, m21, m1
5710    punpcklwd           m21, m1
5711    punpckhwd            m1, m2, m11  ; i4 j4 i5 j5 i6 j6 i7 j7
5712    punpcklwd            m2, m11      ; i0 j1 i1 j1 i2 j2 i3 j3
5713    punpckhwd           m11, m8, m29  ; k4 l4 k5 l5 k6 l6 k7 l7
5714    punpcklwd            m8, m29      ; k0 l0 k1 l1 k2 l2 k3 l3
5715    punpckhwd           m29, m27, m26 ; m4 n4 m5 n5 m6 n6 m7 n7
5716    punpcklwd           m27, m26      ; m0 n0 m1 n1 m2 n2 m3 n3
5717    punpckhwd           m26, m4, m28  ; o4 p4 o5 p5 o6 p6 o7 p7
5718    punpcklwd            m4, m28      ; o0 p0 o1 p1 o2 p2 o3 p3
5719    punpckhdq           m28, m2, m8   ; i2 j2 k2 l2 i3 j3 k3 l3
5720    punpckldq            m2, m8       ; i0 j0 k0 l0 i1 j1 k1 l1
5721    punpckhdq            m8, m27, m4  ; m2 n2 o2 p2 m3 n3 o3 p3
5722    punpckldq           m27, m4       ; m0 n0 o0 p0 m1 n1 o1 p1
5723    REPX  {pmulhrsw x, m12}, m28, m2, m8, m27
5724    punpckhdq            m4, m1, m11  ; i6 j6 k6 l6 i7 j7 k7 l7
5725    punpckldq            m1, m11      ; i4 j4 k4 l4 i5 j5 k5 l5
5726    punpckhdq           m11, m29, m26 ; m6 n6 o6 p6 m7 n7 o7 p7
5727    punpckldq           m29, m26      ; m4 n4 o4 p4 m5 n5 o5 p5
5728    REPX  {pmulhrsw x, m12}, m4, m1, m11, m29
5729    punpckhdq           m26, m19, m21
5730    punpckldq           m19, m21
5731    punpckhdq           m21, m15, m16
5732    punpckldq           m15, m16
5733    REPX  {pmulhrsw x, m12}, m26, m19, m21, m15
5734    punpckhdq           m16, m18, m20
5735    punpckldq           m18, m20
5736    punpckhdq           m20, m14, m17
5737    punpckldq           m14, m17
5738    REPX  {pmulhrsw x, m12}, m16, m18, m20, m14
5739    punpckhqdq          m17, m28, m8  ; b03 b11 b19 b27
5740    punpcklqdq          m28, m8       ; b02 b10 b18 b26
5741    punpckhqdq           m8, m2, m27  ; b01 b09 b17 b25
5742    punpcklqdq           m2, m27      ; b00 b08 b16 b24
5743    punpcklqdq          m27, m1, m29  ; b04 b12 b20 b28
5744    punpckhqdq           m1, m29      ; b05 b13 b21 b29
5745    punpcklqdq          m29, m4, m11  ; b06 b14 b22 b30
5746    punpckhqdq           m4, m11      ; b07 b15 b23 b31
5747    mova         [cq+64* 0], m2
5748    mova         [cq+64* 8], m28
5749    mova         [cq+64* 4], m27
5750    mova         [cq+64*10], m29
5751    punpckhqdq          m27, m20, m26 ; c03 c11 c19 c27
5752    punpcklqdq          m20, m26      ; c02 c10 c18 c26
5753    punpckhqdq          m26, m14, m19 ; c01 c09 c17 c25
5754    punpcklqdq          m14, m19      ; c00 c08 c16 c24
5755    punpckhqdq          m28, m15, m18 ; c05 c13 c21 c29
5756    punpcklqdq          m15, m18      ; c04 c12 c20 c28
5757    punpckhqdq          m29, m21, m16 ; c07 c15 c23 c31
5758    punpcklqdq          m21, m16      ; c06 c14 c22 c30
5759    mova         [cq+64* 2], m14
5760    mova         [cq+64*12], m20
5761    mova         [cq+64* 6], m15
5762    mova         [cq+64*14], m21
5763    vshufi32x4          m14, m22, m8, q3232  ; a17 a25 b17 b25
5764    vinserti32x8        m22, ym8, 1          ; a01 a09 b01 b09
5765    vshufi32x4          m15, m23, m17, q3232 ; a19 a27 b19 b27
5766    vinserti32x8        m23, ym17, 1         ; a03 a11 b03 b11
5767    vshufi32x4          m16, m24, m1, q3232  ; a21 a29 b21 b29
5768    vinserti32x8        m24, ym1, 1          ; a05 a13 b05 b13
5769    vshufi32x4          m17, m25, m4, q3232  ; a23 a31 b23 b31
5770    vinserti32x8        m25, ym4, 1          ; a07 a15 b07 b15
5771    vinserti32x8        m19, m26, ym5, 1     ; c01 c09 d01 d09
5772    vshufi32x4          m26, m5, q3232       ; c17 c25 d17 d25
5773    vinserti32x8        m20, m27, ym13, 1    ; c03 c11 d03 d11
5774    vshufi32x4          m27, m13, q3232      ; c19 c27 d19 d27
5775    vinserti32x8        m21, m28, ym3, 1     ; c05 c13 d05 d13
5776    vshufi32x4          m28, m3, q3232       ; c21 c29 d21 d29
5777    vinserti32x8        m18, m29, ym9, 1     ; c07 c15 d07 d15
5778    vshufi32x4          m29, m9, q3232       ; c23 c31 d23 d31
5779    mov                  r4, rsp
5780    vshufi32x4           m0, m22, m19, q2020 ;  1
5781    vshufi32x4           m1, m17, m29, q3131 ; 31
5782    vshufi32x4           m2, m14, m26, q2020 ; 17
5783    vshufi32x4           m3, m25, m18, q3131 ; 15
5784    call .main_part1
5785    vshufi32x4           m0, m25, m18, q2020 ;  7
5786    vshufi32x4           m1, m14, m26, q3131 ; 25
5787    vshufi32x4           m2, m17, m29, q2020 ; 23
5788    vshufi32x4           m3, m22, m19, q3131 ;  9
5789    call .main_part1
5790    vshufi32x4           m0, m24, m21, q2020 ;  5
5791    vshufi32x4           m1, m15, m27, q3131 ; 27
5792    vshufi32x4           m2, m16, m28, q2020 ; 21
5793    vshufi32x4           m3, m23, m20, q3131 ; 11
5794    call .main_part1
5795    vshufi32x4           m0, m23, m20, q2020 ;  3
5796    vshufi32x4           m1, m16, m28, q3131 ; 29
5797    vshufi32x4           m2, m15, m27, q2020 ; 19
5798    vshufi32x4           m3, m24, m21, q3131 ; 13
5799    call .main_part1
5800    call .main_part2
5801    mova                 m0, [cq+64* 1] ; a0
5802    mova                m15, [cq+64* 0] ; b0
5803    mova                 m3, [cq+64* 2] ; c0
5804    mova                m16, [cq+64* 3] ; d0
5805    mova                m14, [cq+64* 5] ; a4
5806    mova                 m8, [cq+64* 4] ; b4
5807    mova                m17, [cq+64* 6] ; c4
5808    mova                 m1, [cq+64* 7] ; d4
5809    vshufi32x4           m2, m0, m15, q3232  ; a16 a24 b16 b24
5810    vinserti32x8         m0, ym15, 1         ; a00 a08 b00 b08
5811    vshufi32x4          m15, m3, m16, q3232  ; c16 c24 d16 d24
5812    vinserti32x8         m3, ym16, 1         ; c00 c08 d00 d08
5813    vshufi32x4          m16, m14, m8, q3232  ; a20 a28 b20 b28
5814    vinserti32x8        m14, ym8, 1          ; a04 a12 b04 b12
5815    vshufi32x4           m8, m17, m1, q3232  ; c20 c28 d20 d28
5816    vinserti32x8        m17, ym1, 1          ; c04 c12 d04 d12
5817    vshufi32x4           m1, m0, m3, q3131   ;  8
5818    vshufi32x4           m0, m3, q2020       ;  0
5819    vshufi32x4           m3, m2, m15, q3131  ; 24
5820    vshufi32x4           m2, m15, q2020      ; 16
5821    vshufi32x4          m15, m14, m17, q3131 ; 12
5822    vshufi32x4          m14, m17, q2020      ;  4
5823    vshufi32x4          m17, m16, m8, q3131  ; 28
5824    vshufi32x4          m16, m8, q2020       ; 20
5825    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
5826    mova                 m8, [cq+64* 8]
5827    mova                 m9, [cq+64*12]
5828    mova                m11, [cq+64*10]
5829    mova                m12, [cq+64*14]
5830    mova         [cq+64* 0], m14
5831    mova         [cq+64* 2], m15
5832    mova         [cq+64* 4], m16
5833    mova         [cq+64* 6], m17
5834    mova         [cq+64* 8], m18
5835    mova         [cq+64*10], m19
5836    mova         [cq+64*12], m20
5837    mova         [cq+64*14], m21
5838    mova                m22, [cq+64* 9]
5839    mova                m27, [cq+64*13]
5840    mova                m23, [cq+64*11]
5841    mova                m24, [cq+64*15]
5842    vshufi32x4          m26, m22, m8, q3232  ; a18 a26 b18 b26
5843    vinserti32x8        m22, ym8, 1          ; a02 a10 b02 b10
5844    vshufi32x4           m8, m9, m27, q3232  ; c18 c26 d18 d26
5845    vinserti32x8         m9, ym27, 1         ; c02 c10 d02 d10
5846    vshufi32x4          m27, m23, m11, q3232 ; a22 a30 b22 b30
5847    vinserti32x8        m23, ym11, 1         ; a06 a14 b06 b14
5848    vshufi32x4          m11, m12, m24, q3232 ; c22 c30 d22 d30
5849    vinserti32x8        m12, ym24, 1         ; c06 c14 d06 d14
5850    vshufi32x4          m28, m26, m8, q3131  ; 26
5851    vshufi32x4          m26, m8, q2020       ; 18
5852    vshufi32x4          m24, m22, m9, q3131  ; 10
5853    vshufi32x4          m22, m9, q2020       ;  2
5854    vshufi32x4          m29, m27, m11, q3131 ; 30
5855    vshufi32x4          m27, m11, q2020      ; 22
5856    vshufi32x4          m25, m23, m12, q3131 ; 14
5857    vshufi32x4          m23, m12, q2020      ;  6
5858    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
5859    jmp .end
5860.fast: ; bottom/right halves are zero
5861    pmulhrsw            ym9, ym23, [cq+64* 0]
5862    pmulhrsw            ym6, ym23, [cq+64* 8]
5863    mova                m14, [o(dup16_perm)]
5864    pmulhrsw            ym8, ym23, [cq+64* 2]
5865    pmulhrsw            xm0, xm23, [cq+64*14]
5866    pmulhrsw            xm5, xm23, [cq+64*10]
5867    pmulhrsw            ym1, ym23, [cq+64* 6]
5868    pmulhrsw            ym7, ym23, [cq+64* 4]
5869    pmulhrsw            xm3, xm23, [cq+64*12]
5870    pmovzxwd             m9, ym9
5871    pmovzxwd             m6, ym6
5872    vpermb               m8, m14, m8
5873    punpcklwd           xm0, xm0
5874    vpermb              ym5, ym14, ym5
5875    vpermb               m1, m14, m1
5876    vpermb               m7, m14, m7
5877    punpcklwd           xm3, xm3
5878    pslld                m9, 16
5879    pslld                m6, 16
5880    call m(idct_16x16_internal_8bpc).main_fast
5881          vpmulhrsw    ym21, ym23, [cq+64* 1]
5882    {evex}vpmulhrsw    xm17, xm23, [cq+64*15] ; force EVEX encoding, which
5883    {evex}vpmulhrsw    xm20, xm23, [cq+64* 9] ; reduces code size due to
5884    {evex}vpmulhrsw    ym15, ym23, [cq+64* 7] ; compressed displacements
5885    {evex}vpmulhrsw    ym18, ym23, [cq+64* 5]
5886    {evex}vpmulhrsw    xm16, xm23, [cq+64*11]
5887    {evex}vpmulhrsw    xm19, xm23, [cq+64*13]
5888    {evex}vpmulhrsw    ym23,       [cq+64* 3]
5889    vpermb              m21, m14, m21
5890    punpcklwd          xm17, xm17
5891    vpermb             ym20, ym14, ym20
5892    vpermb              m15, m14, m15
5893    vpermb              m18, m14, m18
5894    vpermb             ym16, ym14, ym16
5895    punpcklwd          xm19, xm19
5896    vpermb              m14, m14, m23
5897    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
5898    vpbroadcastd         m9, [o(pw_16384)]
5899    call m(inv_txfm_add_dct_dct_32x16_8bpc).transpose_round
5900    vshufi32x4          m16, m0, m3, q2020  ;  0
5901    vshufi32x4          m26, m0, m3, q3131  ;  4
5902    vshufi32x4           m0, m14, m2, q2020 ;  1
5903    vshufi32x4          m14, m2, q3131      ;  5
5904    vshufi32x4           m3, m19, m7, q3131 ; 15
5905    vshufi32x4          m19, m7, q2020      ; 11
5906    vshufi32x4          m27, m17, m9, q2020 ;  3
5907    vshufi32x4          m17, m9, q3131      ;  7
5908    vshufi32x4          m28, m20, m6, q2020 ;  9
5909    vshufi32x4          m20, m6, q3131      ; 13
5910    vshufi32x4          m22, m1, m18, q2020 ;  2
5911    vshufi32x4          m23, m1, m18, q3131 ;  6
5912    vshufi32x4          m24, m5, m15, q2020 ; 10
5913    vshufi32x4          m25, m5, m15, q3131 ; 14
5914    vshufi32x4          m15, m21, m4, q3131 ; 12
5915    vshufi32x4          m21, m21, m4, q2020 ;  8
5916    mov                  r4, rsp
5917    call .main_part1_fast
5918    mova                 m0, m17
5919    mova                 m3, m28
5920    call .main_part1_fast
5921    mova                 m0, m14
5922    mova                 m3, m19
5923    call .main_part1_fast
5924    mova                 m0, m27
5925    mova                 m3, m20
5926    call .main_part1_fast
5927    call .main_part2
5928    mova                 m0, m16
5929    mova                 m1, m21
5930    mova                m14, m26
5931    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2
5932    mova         [cq+64*14], m21
5933    mova         [cq+64* 0], m14
5934    mova         [cq+64* 6], m17
5935    mova         [cq+64* 8], m18
5936    mova         [cq+64*10], m19
5937    mova         [cq+64* 4], m16
5938    mova         [cq+64* 2], m15
5939    mova         [cq+64*12], m20
5940    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2
5941.end:
5942    lea                  r4, [strideq*3]
5943    vpbroadcastd        m12, [o(pw_2048)]
5944    movshdup            m13, [o(permD)]
5945    lea                  r5, [r4+strideq]   ; stride*4
5946    lea                  r3, [dstq+r4*8]
5947    lea                  r6, [strideq+r5*8] ; stride*33
5948    lea                  r8, [r4+r5*8]      ; stride*35
5949    add                  r3, r5             ; dst+stride*28
5950    lea                  r7, [r6+strideq]   ; stride*34
5951%macro IDCT_32x64_END 6 ; src, mem, stride[1-4]
5952%if %2 < 8
5953    paddsw              m10, m%2, m%1
5954    psubsw              m11, m%2, m%1
5955%else
5956    mova                m11, [cq+64*(%2*2-16)]
5957    paddsw              m10, m11, m%1
5958    psubsw              m11, m%1
5959%endif
5960    mova                 m9, [rsp+64*(31-%2)]
5961    mova                m%1, [rsp+64*%2]
5962    paddsw               m8, m10, m9
5963    psubsw              m10, m9
5964    paddsw               m9, m11, m%1
5965    pmovzxbw             m0, [dstq+%3]
5966    psubsw              m11, m%1
5967    pmovzxbw            m%1, [r3  +%4]
5968    REPX  {pmulhrsw x, m12}, m8, m10, m9, m11
5969    paddw                m8, m0
5970    pmovzxbw             m0, [r3  +%5]
5971    paddw               m10, m%1
5972    pmovzxbw            m%1, [dstq+%6]
5973    paddw                m9, m0
5974    paddw               m11, m%1
5975%if %2 >= 8
5976%if %2 == 8
5977    pxor                 m1, m1
5978%endif
5979    mova  [cq+64*(%2*2-16)], m1
5980    mova  [cq+64*(%2*2-15)], m1
5981%endif
5982    packuswb             m8, m10
5983    packuswb             m9, m11
5984    vpermq               m8, m13, m8
5985    vpermq               m9, m13, m9
5986    mova          [dstq+%3], ym8
5987    vextracti32x8 [r3  +%4], m8, 1
5988    mova          [r3  +%5], ym9
5989    vextracti32x8 [dstq+%6], m9, 1
5990%if %2 == 3 || %2 == 7 || %2 == 11
5991    add                dstq, r5
5992    sub                  r3, r5
5993%endif
5994%endmacro
5995    IDCT_32x64_END       29,  0, strideq*0, r8,   r4       , r5*8
5996    IDCT_32x64_END       28,  1, strideq*1, r7,   strideq*2, r6
5997    IDCT_32x64_END       27,  2, strideq*2, r6,   strideq*1, r7
5998    IDCT_32x64_END       26,  3, r4       , r5*8, strideq*0, r8
5999    IDCT_32x64_END       25,  4, strideq*0, r8,   r4       , r5*8
6000    IDCT_32x64_END       24,  5, strideq*1, r7,   strideq*2, r6
6001    IDCT_32x64_END       23,  6, strideq*2, r6,   strideq*1, r7
6002    IDCT_32x64_END       22,  7, r4       , r5*8, strideq*0, r8
6003    IDCT_32x64_END       21,  8, strideq*0, r8,   r4       , r5*8
6004    IDCT_32x64_END       20,  9, strideq*1, r7,   strideq*2, r6
6005    IDCT_32x64_END       19, 10, strideq*2, r6,   strideq*1, r7
6006    IDCT_32x64_END       18, 11, r4       , r5*8, strideq*0, r8
6007    IDCT_32x64_END       17, 12, strideq*0, r8,   r4       , r5*8
6008    IDCT_32x64_END       16, 13, strideq*1, r7,   strideq*2, r6
6009    IDCT_32x64_END       15, 14, strideq*2, r6,   strideq*1, r7
6010    IDCT_32x64_END       14, 15, r4       , r5*8, strideq*0, r8
6011    RET
6012.dconly:
6013    movsx               r6d, word [cq]
6014    mov                [cq], eobd
6015    imul                r6d, 181
6016    mov                 r3d, 64
6017    add                 r6d, 128
6018    sar                 r6d, 8
6019    imul                r6d, 181
6020    add                 r6d, 128+256
6021    sar                 r6d, 8+1
6022    jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly3
6023ALIGN function_align ; bottom three-quarters are zero
6024.main_part1_fast:
6025    vpbroadcastd         m1, [o(idct64_mul+4*0)]
6026    vpbroadcastd         m8, [o(idct64_mul+4*1)]
6027    vpbroadcastd         m2, [o(idct64_mul+4*6)]
6028    vpbroadcastd         m9, [o(idct64_mul+4*7)]
6029    pmulhrsw             m1, m0     ; t63a
6030    pmulhrsw             m0, m8     ; t32a
6031    pmulhrsw             m2, m3     ; t60a
6032    pmulhrsw             m3, m9     ; t35a
6033    mova                 m8, m0
6034    mova                 m7, m1
6035    mova                 m6, m3
6036    mova                 m5, m2
6037    jmp .main_part1b
6038.main_part1:
6039    ; idct64 steps 1-5:
6040    ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
6041    ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
6042    ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
6043    ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
6044    vpbroadcastd         m7, [o(idct64_mul+4*0)]
6045    vpbroadcastd         m8, [o(idct64_mul+4*1)]
6046    vpbroadcastd         m6, [o(idct64_mul+4*2)]
6047    vpbroadcastd         m9, [o(idct64_mul+4*3)]
6048    pmulhrsw             m7, m0     ; t63a
6049    vpbroadcastd         m5, [o(idct64_mul+4*4)]
6050    pmulhrsw             m0, m8     ; t32a
6051    vpbroadcastd         m8, [o(idct64_mul+4*5)]
6052    pmulhrsw             m6, m1     ; t62a
6053    vpbroadcastd         m4, [o(idct64_mul+4*6)]
6054    pmulhrsw             m1, m9     ; t33a
6055    vpbroadcastd         m9, [o(idct64_mul+4*7)]
6056    pmulhrsw             m5, m2     ; t61a
6057    pmulhrsw             m2, m8     ; t34a
6058    pmulhrsw             m4, m3     ; t60a
6059    pmulhrsw             m3, m9     ; t35a
6060    psubsw               m8, m0, m1 ; t33
6061    paddsw               m0, m1     ; t32
6062    psubsw               m1, m7, m6 ; t62
6063    paddsw               m7, m6     ; t63
6064    psubsw               m6, m3, m2 ; t34
6065    paddsw               m3, m2     ; t35
6066    psubsw               m2, m4, m5 ; t61
6067    paddsw               m5, m4     ; t60
6068.main_part1b:
6069    vpbroadcastd        m11, [o(idct64_mul+4*8)]
6070    vpbroadcastd        m12, [o(idct64_mul+4*9)]
6071    ITX_MULSUB_2W         1, 8, 4, 9, 10, 11, 12 ; t33a, t62a
6072    vpbroadcastd        m11, [o(idct64_mul+4*10)]
6073    ITX_MULSUB_2W         2, 6, 4, 9, 10, 12, 11 ; t34a, t61a
6074    vpbroadcastd        m11, [o(idct64_mul+4*11)]
6075    vpbroadcastd        m12, [o(idct64_mul+4*12)]
6076    psubsw               m4, m0, m3 ; t35a
6077    paddsw               m0, m3     ; t32a
6078    psubsw               m3, m7, m5 ; t60a
6079    paddsw               m7, m5     ; t63a
6080    psubsw               m5, m1, m2 ; t34
6081    paddsw               m1, m2     ; t33
6082    psubsw               m2, m8, m6 ; t61
6083    paddsw               m6, m8     ; t62
6084    add                  r5, 4*13
6085    ITX_MULSUB_2W         3, 4, 8, 9, 10, 11, 12 ; t35,  t60
6086    ITX_MULSUB_2W         2, 5, 8, 9, 10, 11, 12 ; t34a, t61a
6087    mova          [r4+64*0], m0
6088    mova          [r4+64*7], m7
6089    mova          [r4+64*1], m1
6090    mova          [r4+64*6], m6
6091    mova          [r4+64*3], m3
6092    mova          [r4+64*4], m4
6093    mova          [r4+64*2], m2
6094    mova          [r4+64*5], m5
6095    add                  r4, 64*8
6096    ret
6097.main_part2:
6098    vpbroadcastd        m11, [o(pw_1567_3784  -16*13)]
6099    vpbroadcastd        m12, [o(pw_m3784_1567 -16*13)]
6100    lea                  r6, [r4+64*7]
6101    vpbroadcastd        m17, [o(pw_m1567_m3784-16*13)]
6102    vpbroadcastd        m18, [o(pw_2896_2896  -16*13)]
6103    vpbroadcastd        m19, [o(pw_m2896_2896 -16*13)]
6104    sub                  r5, 16*13
6105.main_part2_loop:
6106    mova                 m0, [r4-64*32] ; t32a
6107    mova                 m1, [r6-64*24] ; t39a
6108    mova                 m2, [r6-64*32] ; t63a
6109    mova                 m3, [r4-64*24] ; t56a
6110    mova                 m4, [r4-64*16] ; t40a
6111    mova                 m5, [r6-64* 8] ; t47a
6112    mova                 m6, [r6-64*16] ; t55a
6113    mova                 m7, [r4-64* 8] ; t48a
6114    psubsw               m8, m0, m1 ; t39
6115    paddsw               m0, m1     ; t32
6116    psubsw               m1, m2, m3 ; t56
6117    paddsw               m2, m3     ; t63
6118    psubsw               m3, m5, m4 ; t40
6119    paddsw               m5, m4     ; t47
6120    psubsw               m4, m7, m6 ; t55
6121    paddsw               m7, m6     ; t48
6122    ITX_MULSUB_2W         1, 8, 6, 9, 10, 11, 12 ; t39a, t56a
6123    ITX_MULSUB_2W         4, 3, 6, 9, 10, 12, 17 ; t40a, t55a
6124    psubsw               m6, m2, m7 ; t48a
6125    paddsw               m2, m7     ; t63a
6126    psubsw               m7, m0, m5 ; t47a
6127    paddsw               m0, m5     ; t32a
6128    psubsw               m5, m8, m3 ; t55
6129    paddsw               m8, m3     ; t56
6130    psubsw               m3, m1, m4 ; t40
6131    paddsw               m1, m4     ; t39
6132    ITX_MULSUB_2W         6, 7, 4, 9, 10, 18, 19 ; t47,  t48
6133    ITX_MULSUB_2W         5, 3, 4, 9, 10, 18, 19 ; t40a, t55a
6134    mova         [r6-64* 8], m2
6135    mova         [r4-64*32], m0
6136    mova         [r4-64* 8], m8
6137    mova         [r6-64*32], m1
6138    mova         [r6-64*24], m6
6139    mova         [r4-64*16], m7
6140    mova         [r4-64*24], m5
6141    mova         [r6-64*16], m3
6142    add                  r4, 64
6143    sub                  r6, 64
6144    cmp                  r4, r6
6145    jb .main_part2_loop
6146    ret
6147
6148cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 7, 0, dst, stride, c, eob
6149    lea                  r5, [o_base]
6150    test               eobd, eobd
6151    jz .dconly
6152    PROLOGUE              0, 7, 30, 64*32, dst, stride, c, eob
6153    vpbroadcastd        m23, [o(pw_2896x8)]
6154%undef cmp
6155    cmp                eobd, 136
6156    jb .fast
6157    pmulhrsw             m0, m23, [cq+64* 1]
6158    pmulhrsw             m1, m23, [cq+64*31]
6159    pmulhrsw             m2, m23, [cq+64*17]
6160    pmulhrsw             m3, m23, [cq+64*15]
6161    vpbroadcastd        m10, [o(pd_2048)]
6162    mov                  r4, rsp
6163    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
6164    pmulhrsw             m0, m23, [cq+64* 7]
6165    pmulhrsw             m1, m23, [cq+64*25]
6166    pmulhrsw             m2, m23, [cq+64*23]
6167    pmulhrsw             m3, m23, [cq+64* 9]
6168    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
6169    pmulhrsw             m0, m23, [cq+64* 5]
6170    pmulhrsw             m1, m23, [cq+64*27]
6171    pmulhrsw             m2, m23, [cq+64*21]
6172    pmulhrsw             m3, m23, [cq+64*11]
6173    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
6174    pmulhrsw             m0, m23, [cq+64* 3]
6175    pmulhrsw             m1, m23, [cq+64*29]
6176    pmulhrsw             m2, m23, [cq+64*19]
6177    pmulhrsw             m3, m23, [cq+64*13]
6178    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
6179    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
6180    pmulhrsw             m3, m23, [cq+64*24]
6181    pmulhrsw             m1, m23, [cq+64* 8]
6182    pmulhrsw             m2, m23, [cq+64*16]
6183    pmulhrsw             m0, m23, [cq+64* 0]
6184    pmulhrsw            m14, m23, [cq+64* 4]
6185    pmulhrsw            m17, m23, [cq+64*28]
6186    pmulhrsw            m16, m23, [cq+64*20]
6187    pmulhrsw            m15, m23, [cq+64*12]
6188    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
6189    pmulhrsw            m22, m23, [cq+64* 2]
6190    pmulhrsw            m29, m23, [cq+64*30]
6191    pmulhrsw            m26, m23, [cq+64*18]
6192    pmulhrsw            m25, m23, [cq+64*14]
6193    pmulhrsw            m24, m23, [cq+64*10]
6194    pmulhrsw            m27, m23, [cq+64*22]
6195    pmulhrsw            m28, m23, [cq+64*26]
6196    pmulhrsw            m23,      [cq+64* 6]
6197    mova         [cq+64* 0], m14
6198    mova         [cq+64* 1], m15
6199    mova         [cq+64* 2], m16
6200    mova         [cq+64* 3], m17
6201    mova         [cq+64* 4], m18
6202    mova         [cq+64* 5], m19
6203    mova         [cq+64* 6], m20
6204    mova         [cq+64* 7], m21
6205    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
6206    vpbroadcastd        m13, [o(pw_16384)]
6207    call .pass1_end_part1
6208    mova         [cq+64*16], m1
6209    mova         [cq+64*17], m3
6210    mova         [cq+64*18], m5
6211    mova         [cq+64*19], m7
6212    mova         [cq+64*24], m23
6213    mova         [cq+64*25], m25
6214    mova         [cq+64*26], m27
6215    mova         [cq+64*27], m29
6216    pmulhrsw            m23, m13, m0 ; a0
6217    pmulhrsw            m25, m13, m2 ; a2
6218    pmulhrsw            m27, m13, m4 ; a4
6219    pmulhrsw            m29, m13, m6 ; a6
6220    REPX {pmulhrsw x, m13}, m22, m24, m26, m28 ; e0 e2 e4 e6
6221    call .pass1_end_part2
6222    mova         [cq+64*20], m15
6223    mova         [cq+64*21], m17
6224    mova         [cq+64*22], m19
6225    mova         [cq+64*23], m21
6226    mova         [cq+64*28], m1
6227    mova         [cq+64*29], m3
6228    mova         [cq+64*30], m5
6229    mova         [cq+64*31], m7
6230    REPX {pmulhrsw x, m13}, m14, m16, m18, m20 ; c0 c2 c4 c6
6231    REPX {pmulhrsw x, m13}, m0, m2, m4, m6     ; g0 g2 g4 g6
6232    vinserti32x8        m3, m23, ym14, 1 ; a00 a01 c00 c01
6233    vshufi32x4         m23, m14, q3232   ; a02 a03 c02 c03
6234    vinserti32x8       m15, m22, ym0, 1  ; e00 e01 g00 g01
6235    vshufi32x4         m22, m0, q3232    ; e02 e03 g02 g03
6236    vinserti32x8        m1, m27, ym18, 1 ; a40 a41 c40 c41
6237    vshufi32x4         m27, m18, q3232   ; a42 a43 c42 c43
6238    vinserti32x8       m18, m26, ym4, 1  ; e40 e41 g40 g41
6239    vshufi32x4         m26, m4, q3232    ; e42 e43 g42 g43
6240    vinserti32x8       m14, m25, ym16, 1 ; a20 a21 c20 c21
6241    vshufi32x4         m25, m16, q3232   ; a22 a23 c22 c23
6242    vinserti32x8       m17, m24, ym2, 1  ; e20 e21 g20 g21
6243    vshufi32x4         m24, m2, q3232    ; e22 e23 g22 g23
6244    vinserti32x8       m19, m29, ym20, 1 ; a60 a61 c60 c61
6245    vshufi32x4         m29, m20, q3232   ; a62 a63 c62 c63
6246    vinserti32x8       m20, m28, ym6, 1  ; e60 e61 g60 g61
6247    vshufi32x4         m28, m6, q3232    ; e62 e63 g62 g63
6248    vshufi32x4          m2, m3, m15, q3131  ;  8
6249    vshufi32x4          m0, m3, m15, q2020  ;  0
6250    vshufi32x4          m6, m23, m22, q3131 ; 24
6251    vshufi32x4          m4, m23, m22, q2020 ; 16
6252    vshufi32x4          m3, m1, m18, q3131  ; 12
6253    vshufi32x4          m1, m18, q2020      ;  4
6254    vshufi32x4          m7, m27, m26, q3131 ; 28
6255    vshufi32x4          m5, m27, m26, q2020 ; 20
6256    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
6257    vshufi32x4         m16, m14, m17, q3131 ; 10
6258    vshufi32x4         m14, m17, q2020      ;  2
6259    vshufi32x4         m17, m19, m20, q3131 ; 14
6260    vshufi32x4         m15, m19, m20, q2020 ;  6
6261    vshufi32x4         m20, m25, m24, q3131 ; 26
6262    vshufi32x4         m18, m25, m24, q2020 ; 18
6263    vshufi32x4         m21, m29, m28, q3131 ; 30
6264    vshufi32x4         m19, m29, m28, q2020 ; 22
6265    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
6266    pmulhrsw           m22, m13, [cq+64*16] ; a1
6267    pmulhrsw           m23, m13, [cq+64*20] ; c1
6268    pmulhrsw           m24, m13, [cq+64*24] ; e1
6269    pmulhrsw           m25, m13, [cq+64*28] ; g1
6270    pmulhrsw           m26, m13, [cq+64*17] ; a3
6271    pmulhrsw           m27, m13, [cq+64*21] ; c3
6272    pmulhrsw           m28, m13, [cq+64*25] ; e3
6273    pmulhrsw           m29, m13, [cq+64*29] ; g3
6274    mova        [cq+64* 8], m14
6275    mova        [cq+64* 9], m15
6276    mova        [cq+64*10], m16
6277    mova        [cq+64*11], m17
6278    mova        [cq+64*12], m18
6279    mova        [cq+64*13], m19
6280    mova        [cq+64*14], m20
6281    mova        [cq+64*15], m21
6282    pmulhrsw           m14, m13, [cq+64*18] ; a5
6283    pmulhrsw           m15, m13, [cq+64*22] ; c5
6284    pmulhrsw           m16, m13, [cq+64*26] ; e5
6285    pmulhrsw           m17, m13, [cq+64*30] ; g5
6286    pmulhrsw           m18, m13, [cq+64*19] ; a7
6287    pmulhrsw           m19, m13, [cq+64*23] ; c7
6288    pmulhrsw           m20, m13, [cq+64*27] ; e7
6289    pmulhrsw           m21, m13, [cq+64*31] ; g7
6290    vinserti32x8        m8, m22, ym23, 1 ; a10 a11 c10 c11
6291    vshufi32x4         m22, m23, q3232   ; a12 a13 c12 c13
6292    vinserti32x8        m9, m24, ym25, 1 ; e10 e11 g10 g11
6293    vshufi32x4         m24, m25, q3232   ; e12 e13 g12 g13
6294    vinserti32x8       m23, m26, ym27, 1 ; a30 a31 c30 c31
6295    vshufi32x4         m26, m27, q3232   ; a32 a33 c32 c33
6296    vinserti32x8       m11, m28, ym29, 1 ; e30 e31 g30 g31
6297    vshufi32x4         m28, m29, q3232   ; e32 e33 g32 g33
6298    mova        [cq+64* 0], m0
6299    mova        [cq+64* 1], m1
6300    mova        [cq+64* 2], m2
6301    mova        [cq+64* 3], m3
6302    mova        [cq+64* 4], m4
6303    mova        [cq+64* 5], m5
6304    mova        [cq+64* 6], m6
6305    mova        [cq+64* 7], m7
6306    vinserti32x8       m12, m14, ym15, 1 ; a50 a51 c50 c51
6307    vshufi32x4         m14, m15, q3232   ; a52 a53 c52 c53
6308    vinserti32x8       m13, m16, ym17, 1 ; e50 e51 g50 g51
6309    vshufi32x4         m16, m17, q3232   ; e52 e53 g52 g53
6310    vinserti32x8       m25, m18, ym19, 1 ; a70 a71 c70 c71
6311    vshufi32x4         m18, m19, q3232   ; a72 a73 c72 c73
6312    vinserti32x8       m17, m20, ym21, 1 ; e70 e71 g70 g71
6313    vshufi32x4         m20, m21, q3232   ; e72 e73 g72 g73
6314    vshufi32x4         m27, m23, m11, q3131 ; 11 m27
6315    vshufi32x4         m23, m11, q2020      ;  3 m23
6316    vshufi32x4         m19, m26, m28, q3131 ; 27 m19
6317    vshufi32x4         m15, m26, m28, q2020 ; 19 m15
6318    vshufi32x4         m29, m25, m17, q3131 ; 15 m29
6319    vshufi32x4         m25, m17, q2020      ;  7 m25
6320    vshufi32x4         m21, m18, m20, q3131 ; 31 m21
6321    vshufi32x4         m17, m18, m20, q2020 ; 23 m17
6322    vshufi32x4         m20, m14, m16, q3131 ; 29 m20
6323    vshufi32x4         m16, m14, m16, q2020 ; 21 m16
6324    vshufi32x4         m18, m22, m24, q3131 ; 25 m18
6325    vshufi32x4         m14, m22, m24, q2020 ; 17 m14
6326    vshufi32x4         m26, m8, m9, q3131   ;  9 m26
6327    vshufi32x4         m22, m8, m9, q2020   ;  1 m22
6328    vshufi32x4         m28, m12, m13, q3131 ; 13 m28
6329    vshufi32x4         m24, m12, m13, q2020 ;  5 m24
6330    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf
6331    vpbroadcastd       m13, [o(pw_16384)]
6332    pmulhrsw            m0, m13, [r4-64*21]
6333    pmulhrsw            m1, m13, [r4-64*22]
6334    pmulhrsw            m2, m13, [r4-64*23]
6335    pmulhrsw            m3, m13, [r4-64*24]
6336    pmulhrsw            m4, m13, [r4-64*25]
6337    pmulhrsw            m5, m13, [r4-64*26]
6338    pmulhrsw            m6, m13, [r4-64*27]
6339    pmulhrsw            m7, m13, [r4-64*28]
6340    mova        [cq+64*16], m14
6341    mova        [cq+64*17], m15
6342    mova        [cq+64*18], m16
6343    mova        [cq+64*19], m17
6344    mova        [cq+64*20], m18
6345    mova        [cq+64*21], m19
6346    mova        [cq+64*22], m20
6347    mova        [cq+64*23], m21
6348    pmulhrsw           m14, m13, [r4-64*12]
6349    pmulhrsw           m15, m13, [r4-64*11]
6350    pmulhrsw           m16, m13, [r4-64*10]
6351    pmulhrsw           m17, m13, [r4-64* 9]
6352    pmulhrsw           m18, m13, [r4-64* 8]
6353    pmulhrsw           m19, m13, [r4-64* 7]
6354    pmulhrsw           m20, m13, [r4-64* 6]
6355    pmulhrsw           m21, m13, [r4-64* 5]
6356    mova        [cq+64*24], m22
6357    mova        [cq+64*25], m23
6358    mova        [cq+64*26], m24
6359    mova        [cq+64*27], m25
6360    mova        [cq+64*28], m26
6361    mova        [cq+64*29], m27
6362    mova        [cq+64*30], m28
6363    mova        [cq+64*31], m29
6364    call .transpose_2x8x8_lo
6365    mova        [r4-64*12], m1
6366    mova        [r4-64*11], m3
6367    mova        [r4-64*10], m5
6368    mova        [r4-64* 9], m7
6369    mova        [r4-64* 8], m15
6370    mova        [r4-64* 7], m17
6371    mova        [r4-64* 6], m19
6372    mova        [r4-64* 5], m21
6373    vinserti32x8       m22, m0, ym14, 1     ; f00 f01 h00 h01
6374    vshufi32x4         m23, m0, m14, q3232  ; f02 f03 h02 h03
6375    vinserti32x8       m24, m2, ym16, 1     ; f20 f21 h20 h21
6376    vshufi32x4         m25, m2, m16, q3232  ; f22 f23 h22 h23
6377    vinserti32x8       m26, m4, ym18, 1     ; f40 f41 h40 h41
6378    vshufi32x4         m27, m4, m18, q3232  ; f42 f43 h42 h43
6379    vinserti32x8       m28, m6, ym20, 1     ; f60 f61 h60 h61
6380    vshufi32x4         m29, m6, m20, q3232  ; f62 f63 h62 h63
6381    pmulhrsw            m0, m13, [r4-64*20]
6382    pmulhrsw            m1, m13, [r4-64*19]
6383    pmulhrsw            m2, m13, [r4-64*18]
6384    pmulhrsw            m3, m13, [r4-64*17]
6385    pmulhrsw            m4, m13, [r4-64*16]
6386    pmulhrsw            m5, m13, [r4-64*15]
6387    pmulhrsw            m6, m13, [r4-64*14]
6388    pmulhrsw            m7, m13, [r4-64*13]
6389    pmulhrsw           m14, m13, [r4-64*29]
6390    pmulhrsw           m15, m13, [r4-64*30]
6391    pmulhrsw           m16, m13, [r4-64*31]
6392    pmulhrsw           m17, m13, [r4-64*32]
6393    pmulhrsw           m18, m13, [r4-64*33]
6394    pmulhrsw           m19, m13, [r4-64*34]
6395    pmulhrsw           m20, m13, [r4-64*35]
6396    pmulhrsw           m21, m13, [r4-64*36]
6397    call .transpose_2x8x8_lo
6398    mova       [r4-64*20], m1
6399    mova       [r4-64*19], m3
6400    mova       [r4-64*18], m5
6401    mova       [r4-64*17], m7
6402    mova       [r4-64*16], m15
6403    mova       [r4-64*15], m17
6404    mova       [r4-64*14], m19
6405    mova       [r4-64*13], m21
6406    vinserti32x8        m1, m4, ym18, 1     ; b40 b41 d40 d41
6407    vshufi32x4          m5, m4, m18, q3232  ; b42 b43 d42 d43
6408    vshufi32x4          m4, m0, m14, q3232  ; b02 b03 d02 d03
6409    vinserti32x8        m0, ym14, 1         ; b00 b01 d00 d01
6410    vinserti32x8       m14, m2, ym16, 1     ; b20 b21 d20 d21
6411    vshufi32x4         m18, m2, m16, q3232  ; b22 b23 d22 d23
6412    vinserti32x8       m15, m6, ym20, 1     ; b60 b61 d60 d61
6413    vshufi32x4         m19, m6, m20, q3232  ; b62 b63 d62 d63
6414    vshufi32x4          m2, m0, m22, q3131  ;  8
6415    vshufi32x4          m0, m22, q2020      ;  0
6416    vshufi32x4          m3, m1, m26, q3131  ; 12
6417    vshufi32x4          m1, m26, q2020      ;  4
6418    vshufi32x4          m6, m4, m23, q3131  ; 24
6419    vshufi32x4          m4, m23, q2020      ; 16
6420    vshufi32x4          m7, m5, m27, q3131  ; 28
6421    vshufi32x4          m5, m27, q2020      ; 20
6422    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
6423    vshufi32x4         m16, m14, m24, q3131 ; 10
6424    vshufi32x4         m14, m24, q2020      ;  2
6425    vshufi32x4         m17, m15, m28, q3131 ; 14
6426    vshufi32x4         m15, m28, q2020      ;  6
6427    vshufi32x4         m20, m18, m25, q3131 ; 26
6428    vshufi32x4         m18, m25, q2020      ; 18
6429    vshufi32x4         m21, m19, m29, q3131 ; 30
6430    vshufi32x4         m19, m29, q2020      ; 22
6431    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
6432    mova               m22, [r4-64*20]
6433    mova               m26, [r4-64*16]
6434    mova               m23, [r4-64*19]
6435    mova               m27, [r4-64*15]
6436    mova               m24, [r4-64*18]
6437    mova               m28, [r4-64*14]
6438    mova               m25, [r4-64*17]
6439    mova               m29, [r4-64*13]
6440    mova        [r4-64*20], m14
6441    mova        [r4-64*19], m15
6442    mova        [r4-64*18], m16
6443    mova        [r4-64*17], m17
6444    mova        [r4-64*16], m18
6445    mova        [r4-64*15], m19
6446    mova        [r4-64*14], m20
6447    mova        [r4-64*13], m21
6448    mova               m19, [r4-64*12]
6449    mova               m11, [r4-64* 8]
6450    mova               m20, [r4-64*11]
6451    mova               m12, [r4-64* 7]
6452    mova               m21, [r4-64*10]
6453    mova                m8, [r4-64* 6]
6454    mova                m9, [r4-64* 9]
6455    mova               m18, [r4-64* 5]
6456    vshufi32x4         m14, m22, m26, q3232 ; b12 b13 d12 d13
6457    vinserti32x8       m22, ym26, 1         ; b10 b11 d10 d11
6458    vshufi32x4         m15, m23, m27, q3232 ; b32 b33 d32 d33
6459    vinserti32x8       m23, ym27, 1         ; b30 b31 d30 d31
6460    vshufi32x4         m16, m24, m28, q3232 ; b52 b53 d52 d53
6461    vinserti32x8       m24, ym28, 1         ; b50 b51 d50 d51
6462    vshufi32x4         m17, m25, m29, q3232 ; b72 b73 d72 d73
6463    vinserti32x8       m25, ym29, 1         ; b70 b71 d70 d71
6464    vinserti32x8       m27, m19, ym11, 1    ; f10 f11 h10 h11
6465    vshufi32x4         m19, m11, q3232      ; f12 f13 h12 h13
6466    vinserti32x8       m28, m20, ym12, 1    ; f30 f31 h30 h31
6467    vshufi32x4         m20, m12, q3232      ; f32 f33 h32 h33
6468    vinserti32x8       m29, m21, ym8, 1     ; f50 f51 h50 h51
6469    vshufi32x4         m21, m8, q3232       ; f52 f53 h52 h53
6470    vinserti32x8        m8, m9, ym18, 1     ; f70 f71 h70 h71
6471    vshufi32x4          m9, m18, q3232      ; f72 f73 h72 h73
6472    vshufi32x4         m26, m22, m27, q3131 ;  9
6473    vshufi32x4         m22, m27, q2020      ;  1
6474    vshufi32x4         m27, m23, m28, q3131 ; 11
6475    vshufi32x4         m23, m28, q2020      ;  3
6476    vshufi32x4         m28, m24, m29, q3131 ; 13
6477    vshufi32x4         m24, m29, q2020      ;  5
6478    vshufi32x4         m29, m25, m8, q3131  ; 15
6479    vshufi32x4         m25, m8, q2020       ;  7
6480    vshufi32x4         m18, m14, m19, q3131 ; 25
6481    vshufi32x4         m14, m19, q2020      ; 17
6482    vshufi32x4         m19, m15, m20, q3131 ; 27
6483    vshufi32x4         m15, m20, q2020      ; 19
6484    vshufi32x4         m20, m16, m21, q3131 ; 29
6485    vshufi32x4         m16, m21, q2020      ; 21
6486    vshufi32x4         m21, m17, m9, q3131  ; 31
6487    vshufi32x4         m17, m9, q2020       ; 23
6488    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf
6489    jmp .end
6490.fast: ; bottom/right halves are zero
6491    {evex}vpmulhrsw     ym8, ym23, [cq+64* 4]
6492    {evex}vpmulhrsw     xm1, xm23, [cq+64*12]
6493    mova                m28, [o(dup16_perm)]
6494    {evex}vpmulhrsw     ym7, ym23, [cq+64* 8]
6495          vpmulhrsw    ym22, ym23, [cq+64* 0]
6496    vpermb               m8, m28, m8
6497    vpermb              ym1, ym28, ym1
6498    vpermb               m7, m28, m7
6499    pmovzxwd             m9, ym22
6500    pslld                m9, 16
6501    call m(idct_16x16_internal_8bpc).main_fast2
6502    {evex}vpmulhrsw    ym21, ym23, [cq+64* 2]
6503    {evex}vpmulhrsw    xm15, xm23, [cq+64*14]
6504    {evex}vpmulhrsw    xm18, xm23, [cq+64*10]
6505    {evex}vpmulhrsw    ym14, ym23, [cq+64* 6]
6506    vpermb              m21, m28, m21
6507    punpcklwd          xm15, xm15
6508    vpermb             ym18, ym28, ym18
6509    vpermb              m14, m28, m14
6510    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
6511          vpmulhrsw    ym22, ym23, [cq+64* 1]
6512    {evex}vpmulhrsw    xm29, xm23, [cq+64*15]
6513    {evex}vpmulhrsw    xm26, xm23, [cq+64* 9]
6514    {evex}vpmulhrsw    ym25, ym23, [cq+64* 7]
6515    {evex}vpmulhrsw    ym24, ym23, [cq+64* 5]
6516    {evex}vpmulhrsw    xm27, xm23, [cq+64*11]
6517    {evex}vpmulhrsw     xm8, xm23, [cq+64*13]
6518    {evex}vpmulhrsw    ym23,       [cq+64* 3]
6519    vpermb              m22, m28, m22
6520    punpcklwd          xm29, xm29
6521    vpermb             ym26, ym28, ym26
6522    vpermb              m25, m28, m25
6523    mova         [cq+64* 0], m14
6524    mova         [cq+64* 1], m15
6525    mova         [cq+64* 2], m16
6526    mova         [cq+64* 3], m17
6527    REPX {vpermb x, m28, x}, m24, m27, m23
6528    punpcklwd          xm28, xm8, xm8
6529    mova         [cq+64* 4], m18
6530    mova         [cq+64* 5], m19
6531    mova         [cq+64* 6], m20
6532    mova         [cq+64* 7], m21
6533    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast
6534    mov                  r4, rsp
6535    vpbroadcastd        m13, [o(pw_16384)]
6536    mova         [r4+64*16], m4
6537    mova         [r4+64*17], m5
6538    mova         [r4+64*18], m6
6539    mova         [r4+64*19], m7
6540    mova         [r4+64*28], m26
6541    mova         [r4+64*29], m27
6542    mova         [r4+64*30], m28
6543    mova         [r4+64*31], m29
6544    call m(inv_txfm_add_dct_dct_64x16_8bpc).pass1_end
6545    mova         [r4+64*20], m22
6546    mova         [r4+64*21], m23
6547    mova         [r4+64*22], m24
6548    mova         [r4+64*23], m25
6549    mova         [r4+64*24], m26
6550    mova         [r4+64*25], m27
6551    mova         [r4+64*26], m28
6552    mova         [r4+64*27], m29
6553    call .pass2_fast
6554    mova         [cq+64* 8], m14
6555    mova         [cq+64* 9], m15
6556    mova         [cq+64*10], m16
6557    mova         [cq+64*11], m17
6558    mova         [cq+64*12], m18
6559    mova         [cq+64*13], m19
6560    mova         [cq+64*14], m20
6561    mova         [cq+64*15], m21
6562    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
6563    mova         [cq+64* 0], m0
6564    mova         [cq+64* 1], m1
6565    mova         [cq+64* 2], m2
6566    mova         [cq+64* 3], m3
6567    mova         [cq+64* 4], m4
6568    mova         [cq+64* 5], m5
6569    mova         [cq+64* 6], m6
6570    mova         [cq+64* 7], m7
6571    pmulhrsw             m0, m13, [r4+64*16]
6572    pmulhrsw             m1, m13, [r4+64*17]
6573    pmulhrsw             m2, m13, [r4+64*18]
6574    pmulhrsw             m3, m13, [r4+64*19]
6575    pmulhrsw             m4, m13, [r4+64*20]
6576    pmulhrsw             m5, m13, [r4+64*21]
6577    pmulhrsw             m6, m13, [r4+64*22]
6578    pmulhrsw             m7, m13, [r4+64*23]
6579    mova         [cq+64*16], m14
6580    mova         [cq+64*17], m15
6581    mova         [cq+64*18], m16
6582    mova         [cq+64*19], m17
6583    mova         [cq+64*20], m18
6584    mova         [cq+64*21], m19
6585    mova         [cq+64*22], m20
6586    mova         [cq+64*23], m21
6587    pmulhrsw            m14, m13, [r4+64*24]
6588    pmulhrsw            m15, m13, [r4+64*25]
6589    pmulhrsw            m16, m13, [r4+64*26]
6590    pmulhrsw            m17, m13, [r4+64*27]
6591    pmulhrsw            m18, m13, [r4+64*28]
6592    pmulhrsw            m19, m13, [r4+64*29]
6593    pmulhrsw            m20, m13, [r4+64*30]
6594    pmulhrsw            m21, m13, [r4+64*31]
6595    mova         [cq+64*24], m22
6596    mova         [cq+64*25], m23
6597    mova         [cq+64*26], m24
6598    mova         [cq+64*27], m25
6599    mova         [cq+64*28], m26
6600    mova         [cq+64*29], m27
6601    mova         [cq+64*30], m28
6602    mova         [cq+64*31], m29
6603    call m(inv_txfm_add_dct_dct_64x16_8bpc).transpose_round
6604    call .pass2_fast
6605    mova         [r4+64*16], m14
6606    mova         [r4+64*17], m15
6607    mova         [r4+64*18], m16
6608    mova         [r4+64*19], m17
6609    mova         [r4+64*20], m18
6610    mova         [r4+64*21], m19
6611    mova         [r4+64*22], m20
6612    mova         [r4+64*23], m21
6613    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
6614.end:
6615    vpbroadcastd        m13, [o(pw_2048)]
6616    lea                  r5, [strideq*3]
6617    pxor                m12, m12
6618    lea                  r3, [dstq+r5*8]
6619    lea                  r6, [strideq+r5] ; stride*4
6620    add                  r3, r6           ; dst+stride*28
6621%macro IDCT_64x32_END 5 ; src16, src32, mem, off_lo, off_hi
6622    mova                m11, [cq+64*(   %3)] ;  0
6623    mova                 m9, [cq+64*(31-%3)] ; 31
6624%if %3 >= 8
6625    mova                m%1, [rsp+64*(%1+16)]
6626%endif
6627    mova                m10, [dstq+%4]
6628    paddsw               m8, m11, m9
6629    psubsw              m11, m9
6630    paddsw               m9, m%1, m%2
6631    psubsw              m%1, m%2
6632    punpcklbw           m%2, m10, m12
6633    punpckhbw           m10, m12
6634    pmulhrsw             m8, m13
6635    pmulhrsw             m9, m13
6636    paddw                m8, m%2
6637    paddw                m9, m10
6638    mova                m10, [r3+%5]
6639    pmulhrsw            m11, m13
6640    pmulhrsw            m%1, m13
6641    mova    [cq+64*(   %3)], m12
6642    mova    [cq+64*(31-%3)], m12
6643    punpcklbw           m%2, m10, m12
6644    punpckhbw           m10, m12
6645    packuswb             m8, m9
6646    paddw               m11, m%2
6647    paddw               m%1, m10
6648    packuswb            m11, m%1
6649    mova          [dstq+%4], m8
6650    mova          [r3  +%5], m11
6651%if %3 == 3 || %3 == 7 || %3 == 11
6652    add                dstq, r6
6653    sub                  r3, r6
6654%endif
6655%endmacro
6656    IDCT_64x32_END        0, 29,  0, strideq*0, r5
6657    IDCT_64x32_END        1, 28,  1, strideq*1, strideq*2
6658    IDCT_64x32_END        2, 27,  2, strideq*2, strideq*1
6659    IDCT_64x32_END        3, 26,  3, r5       , strideq*0
6660    IDCT_64x32_END        4, 25,  4, strideq*0, r5
6661    IDCT_64x32_END        5, 24,  5, strideq*1, strideq*2
6662    IDCT_64x32_END        6, 23,  6, strideq*2, strideq*1
6663    IDCT_64x32_END        7, 22,  7, r5       , strideq*0
6664    IDCT_64x32_END        0, 21,  8, strideq*0, r5
6665    IDCT_64x32_END        1, 20,  9, strideq*1, strideq*2
6666    IDCT_64x32_END        2, 19, 10, strideq*2, strideq*1
6667    IDCT_64x32_END        3, 18, 11, r5       , strideq*0
6668    IDCT_64x32_END        4, 17, 12, strideq*0, r5
6669    IDCT_64x32_END        5, 16, 13, strideq*1, strideq*2
6670    IDCT_64x32_END        6, 15, 14, strideq*2, strideq*1
6671    IDCT_64x32_END        7, 14, 15, r5       , strideq*0
6672    RET
6673ALIGN function_align
6674.dconly:
6675    movsx               r6d, word [cq]
6676    mov                [cq], eobd
6677    imul                r6d, 181
6678    mov                 r3d, 32
6679    add                 r6d, 128
6680    sar                 r6d, 8
6681    imul                r6d, 181
6682    add                 r6d, 128+256
6683    sar                 r6d, 8+1
6684    jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly2
6685ALIGN function_align
6686.pass1_end_part1:
6687%macro IDCT_64x32_PASS1_END 3 ; src16, src32, src64
6688%if %1 != %3
6689    mova                m%1, [cq+64*%1]
6690%endif
6691    mova                 m9, [r4+64*(%3-36)] ; idct64 32+n
6692    mova                m11, [r4+64*(-5-%3)] ; idct64 63-n
6693    psubsw               m8, m%1, m%2        ; idct32 31-n
6694    paddsw              m%1, m%2             ; idct32  0+n
6695%if %1 == %3
6696    psubsw              m%2, m8, m9   ; out 32+n e
6697    paddsw               m8, m9       ; out 31-n d
6698    psubsw               m9, m%1, m11 ; out 63-n h
6699    paddsw              m%1, m11      ; out  0+n a
6700%else
6701    paddsw              m%2, m8, m9   ; out 23-n c
6702    psubsw               m8, m9       ; out 40+n f
6703    paddsw               m9, m%1, m11 ; out  8+n b
6704    psubsw              m%1, m11      ; out 55-n g
6705%endif
6706    mova   [r4+64*(%3-36)], m8
6707    mova   [r4+64*(-5-%3)], m9
6708%endmacro
6709    IDCT_64x32_PASS1_END  0, 29,  0
6710    IDCT_64x32_PASS1_END  1, 28,  1
6711    IDCT_64x32_PASS1_END  2, 27,  2
6712    IDCT_64x32_PASS1_END  3, 26,  3
6713    IDCT_64x32_PASS1_END  4, 25,  4
6714    IDCT_64x32_PASS1_END  5, 24,  5
6715    IDCT_64x32_PASS1_END  6, 23,  6
6716    IDCT_64x32_PASS1_END  7, 22,  7
6717.transpose_2x8x8_hi: ; m0-m7 + m22-m29 (inverted)
6718    punpcklwd            m8, m25, m24 ; e0 f0 e1 f1 e2 f2 e3 f3
6719    punpckhwd           m25, m24      ; e4 f4 e5 f5 e6 f6 e7 f7
6720    punpcklwd           m24, m23, m22 ; g0 h0 g1 h1 g2 h2 g3 h3
6721    punpckhwd           m23, m22      ; g4 h4 g5 h5 g6 h6 g7 h7
6722    punpcklwd           m22, m29, m28 ; a0 b0 a1 b1 a2 b2 a3 b3
6723    punpckhwd           m29, m28      ; a4 b4 a5 b5 a6 b6 a7 b7
6724    punpcklwd           m28, m27, m26 ; c0 d0 c1 d1 c2 d2 c3 d3
6725    punpckhwd           m27, m26      ; c4 d4 c5 d5 c6 d6 c7 d7
6726    punpckldq           m26, m29, m27 ; a4 b4 c4 d4 a5 b5 c5 d5
6727    punpckhdq           m29, m27      ; a6 b6 c6 d6 a7 b7 c7 d7
6728    punpckldq           m27, m8, m24  ; e0 f0 g0 h0 e1 f1 g1 h1
6729    punpckhdq            m8, m24      ; e2 f2 g2 h2 e3 f3 g3 h3
6730    punpckhdq           m24, m22, m28 ; a2 b2 c2 d2 a3 b3 c3 d3
6731    punpckldq           m22, m28      ; a0 b0 c0 d0 a1 b1 c1 d1
6732    punpckldq           m28, m25, m23 ; e4 f4 g4 h4 e5 f5 g5 h5
6733    punpckhdq           m25, m23      ; e6 f6 g6 h6 e7 f7 g7 h7
6734    punpckhqdq          m23, m22, m27 ;  1 23
6735    punpcklqdq          m22, m27      ;  0 22
6736    punpckhqdq          m27, m26, m28 ;  5 27
6737    punpcklqdq          m26, m28      ;  4 26
6738    punpcklqdq          m28, m29, m25 ;  6 28
6739    punpckhqdq          m29, m25      ;  7 29
6740    punpckhqdq          m25, m24, m8  ;  3 25
6741    punpcklqdq          m24, m8       ;  2 24
6742.transpose_8x8:
6743    punpckhwd            m8, m4, m5
6744    punpcklwd            m4, m5
6745    punpckhwd            m5, m0, m1
6746    punpcklwd            m0, m1
6747    punpckhwd            m1, m6, m7
6748    punpcklwd            m6, m7
6749    punpckhwd            m7, m2, m3
6750    punpcklwd            m2, m3
6751    punpckhdq            m3, m0, m2
6752    punpckldq            m0, m2
6753    punpckldq            m2, m4, m6
6754    punpckhdq            m4, m6
6755    punpckhdq            m6, m5, m7
6756    punpckldq            m5, m7
6757    punpckldq            m7, m8, m1
6758    punpckhdq            m8, m1
6759    punpckhqdq           m1, m0, m2
6760    punpcklqdq           m0, m2
6761    punpcklqdq           m2, m3, m4
6762    punpckhqdq           m3, m4
6763    punpcklqdq           m4, m5, m7
6764    punpckhqdq           m5, m7
6765    punpckhqdq           m7, m6, m8
6766    punpcklqdq           m6, m8
6767    ret
6768.pass1_end_part2:
6769    IDCT_64x32_PASS1_END  0, 21,  8
6770    IDCT_64x32_PASS1_END  1, 20,  9
6771    IDCT_64x32_PASS1_END  2, 19, 10
6772    IDCT_64x32_PASS1_END  3, 18, 11
6773    IDCT_64x32_PASS1_END  4, 17, 12
6774    IDCT_64x32_PASS1_END  5, 16, 13
6775    IDCT_64x32_PASS1_END  6, 15, 14
6776    IDCT_64x32_PASS1_END  7, 14, 15
6777.transpose_2x8x8_lo: ; m0-m7 (inverted) + m14-m21
6778    punpcklwd            m8, m3, m2
6779    punpckhwd            m3, m2
6780    punpcklwd            m2, m1, m0
6781    punpckhwd            m1, m0
6782    punpcklwd            m0, m7, m6
6783    punpckhwd            m7, m6
6784    punpcklwd            m6, m5, m4
6785    punpckhwd            m5, m4
6786    punpckldq            m4, m7, m5
6787    punpckhdq            m7, m5
6788    punpckldq            m5, m8, m2
6789    punpckhdq            m8, m2
6790    punpckhdq            m2, m0, m6
6791    punpckldq            m0, m6
6792    punpckldq            m6, m3, m1
6793    punpckhdq            m3, m1
6794    punpckhqdq           m1, m0, m5
6795    punpcklqdq           m0, m5
6796    punpckhqdq           m5, m4, m6
6797    punpcklqdq           m4, m6
6798    punpcklqdq           m6, m7, m3
6799    punpckhqdq           m7, m3
6800    punpckhqdq           m3, m2, m8
6801    punpcklqdq           m2, m8
6802    punpckhwd            m8, m18, m19
6803    punpcklwd           m18, m19
6804    punpckhwd           m19, m14, m15
6805    punpcklwd           m14, m15
6806    punpckhwd           m15, m20, m21
6807    punpcklwd           m20, m21
6808    punpckhwd           m21, m16, m17
6809    punpcklwd           m16, m17
6810    punpckhdq           m17, m14, m16
6811    punpckldq           m14, m16
6812    punpckldq           m16, m18, m20
6813    punpckhdq           m18, m20
6814    punpckhdq           m20, m19, m21
6815    punpckldq           m19, m21
6816    punpckldq           m21, m8, m15
6817    punpckhdq            m8, m15
6818    punpckhqdq          m15, m14, m16
6819    punpcklqdq          m14, m16
6820    punpcklqdq          m16, m17, m18
6821    punpckhqdq          m17, m18
6822    punpcklqdq          m18, m19, m21
6823    punpckhqdq          m19, m21
6824    punpckhqdq          m21, m20, m8
6825    punpcklqdq          m20, m8
6826    ret
6827.pass2_fast:
6828    vshufi32x4          m24, m9, m15, q3131  ;  5
6829    vshufi32x4          m22, m9, m15, q2020  ;  1
6830    vshufi32x4          m15, m1, m16, q3131  ;  6
6831    vshufi32x4          m14, m1, m16, q2020  ;  2
6832    vshufi32x4           m1, m0, m3, q3131   ;  4
6833    vshufi32x4           m0, m3, q2020       ;  0
6834    vshufi32x4           m3, m8, m2, q3131   ; 12
6835    vshufi32x4           m2, m8, m2, q2020   ;  8
6836    vshufi32x4          m25, m11, m17, q3131 ;  7
6837    vshufi32x4          m23, m11, m17, q2020 ;  3
6838    vshufi32x4          m17, m5, m19, q3131  ; 14
6839    vshufi32x4          m16, m5, m19, q2020  ; 10
6840    vshufi32x4          m29, m6, m20, q3131  ; 15
6841    vshufi32x4          m27, m6, m20, q2020  ; 11
6842    vshufi32x4          m28, m4, m18, q3131  ; 13
6843    vshufi32x4          m26, m4, m18, q2020  ;  9
6844    jmp m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
6845
6846cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 7, 0, dst, stride, c, eob
6847    lea                  r5, [o_base]
6848    test               eobd, eobd
6849    jz .dconly
6850    PROLOGUE              0, 7, 30, 64*96, dst, stride, c, eob
6851%undef cmp
6852    cmp                eobd, 136
6853    jb .fast
6854    mova                 m0, [cq+64* 1]
6855    mova                 m1, [cq+64*31]
6856    mova                 m2, [cq+64*17]
6857    mova                 m3, [cq+64*15]
6858    vpbroadcastd        m10, [o(pd_2048)]
6859    mov                  r4, rsp
6860    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
6861    mova                 m0, [cq+64* 7]
6862    mova                 m1, [cq+64*25]
6863    mova                 m2, [cq+64*23]
6864    mova                 m3, [cq+64* 9]
6865    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
6866    mova                 m0, [cq+64* 5]
6867    mova                 m1, [cq+64*27]
6868    mova                 m2, [cq+64*21]
6869    mova                 m3, [cq+64*11]
6870    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
6871    mova                 m0, [cq+64* 3]
6872    mova                 m1, [cq+64*29]
6873    mova                 m2, [cq+64*19]
6874    mova                 m3, [cq+64*13]
6875    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
6876    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
6877    mova                 m0, [cq+64* 0]
6878    mova                 m1, [cq+64* 8]
6879    mova                 m2, [cq+64*16]
6880    mova                 m3, [cq+64*24]
6881    mova                m14, [cq+64* 4]
6882    mova                m15, [cq+64*12]
6883    mova                m16, [cq+64*20]
6884    mova                m17, [cq+64*28]
6885    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
6886    mova                m22, [cq+64* 2]
6887    mova                m29, [cq+64*30]
6888    mova                m26, [cq+64*18]
6889    mova                m25, [cq+64*14]
6890    mova                m24, [cq+64*10]
6891    mova                m27, [cq+64*22]
6892    mova                m28, [cq+64*26]
6893    mova                m23, [cq+64* 6]
6894    mova         [cq+64* 0], m14
6895    mova         [cq+64* 1], m15
6896    mova         [cq+64* 2], m16
6897    mova         [cq+64* 3], m17
6898    mova         [cq+64* 4], m18
6899    mova         [cq+64* 5], m19
6900    mova         [cq+64* 6], m20
6901    mova         [cq+64* 7], m21
6902    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
6903    vpbroadcastd        m13, [o(pw_8192)]
6904    call m(inv_txfm_add_dct_dct_64x32_8bpc).pass1_end_part1
6905    mova         [r4+64*36], m1
6906    mova         [r4+64*37], m3
6907    mova         [r4+64*38], m5
6908    mova         [r4+64*39], m7
6909    mova         [r4+64*44], m23
6910    mova         [r4+64*45], m25
6911    mova         [r4+64*46], m27
6912    mova         [r4+64*47], m29
6913    pmulhrsw            m23, m13, m0 ; a0
6914    pmulhrsw            m25, m13, m2 ; a2
6915    pmulhrsw            m27, m13, m4 ; a4
6916    pmulhrsw            m29, m13, m6 ; a6
6917    call m(inv_txfm_add_dct_dct_64x32_8bpc).pass1_end_part2
6918    lea                  r6, [r4-64*4]
6919    add                  r4, 64*28
6920    call .pass2_end
6921    mov                  r4, rsp
6922    mova                 m0, [r4+64*23]
6923    mova                 m1, [r4+64*22]
6924    mova                 m2, [r4+64*21]
6925    mova                 m3, [r4+64*20]
6926    mova                 m4, [r4+64*19]
6927    mova                 m5, [r4+64*18]
6928    mova                 m6, [r4+64*17]
6929    mova                 m7, [r4+64*16]
6930    mova                m22, [r4+64*15]
6931    mova                m23, [r4+64*14]
6932    mova                m24, [r4+64*13]
6933    mova                m25, [r4+64*12]
6934    mova                m26, [r4+64*11]
6935    mova                m27, [r4+64*10]
6936    mova                m28, [r4+64* 9]
6937    mova                m29, [r4+64* 8]
6938    call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_2x8x8_hi
6939    vpbroadcastd        m13, [o(pw_8192)]
6940    mova         [r4+64* 8], m1
6941    mova         [r4+64* 9], m3
6942    mova         [r4+64*10], m5
6943    mova         [r4+64*11], m7
6944    mova         [r4+64*16], m23
6945    mova         [r4+64*17], m25
6946    mova         [r4+64*18], m27
6947    mova         [r4+64*19], m29
6948    pmulhrsw            m23, m13, m0 ; b0
6949    pmulhrsw            m25, m13, m2 ; b2
6950    pmulhrsw            m27, m13, m4 ; b4
6951    pmulhrsw            m29, m13, m6 ; b6
6952    mova                 m0, [r4+64*31]
6953    mova                 m1, [r4+64*30]
6954    mova                 m2, [r4+64*29]
6955    mova                 m3, [r4+64*28]
6956    mova                 m4, [r4+64*27]
6957    mova                 m5, [r4+64*26]
6958    mova                 m6, [r4+64*25]
6959    mova                 m7, [r4+64*24]
6960    mova                m14, [r4+64* 7]
6961    mova                m15, [r4+64* 6]
6962    mova                m16, [r4+64* 5]
6963    mova                m17, [r4+64* 4]
6964    mova                m18, [r4+64* 3]
6965    mova                m19, [r4+64* 2]
6966    mova                m20, [r4+64* 1]
6967    mova                m21, [r4+64* 0]
6968    call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_2x8x8_lo
6969    mov                  r6, cq
6970    call .pass2_end
6971    jmp .end
6972.fast: ; bottom/right halves are zero
6973    mova                m28, [o(dup16_perm)]
6974    pmovzxwd             m9,       [cq+64* 0]
6975    vpermb               m8, m28,  [cq+64* 4]
6976    vpermb              ym1, ym28, [cq+64*12]
6977    vpermb               m7, m28,  [cq+64* 8]
6978    pslld                m9, 16
6979    call m(idct_16x16_internal_8bpc).main_fast2
6980    vpermb              m21, m28,  [cq+64* 2]
6981    vpermb             ym15, ym28, [cq+64*14]
6982    vpermb             ym18, ym28, [cq+64*10]
6983    vpermb              m14, m28,  [cq+64* 6]
6984    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
6985    vpermb              m22, m28,  [cq+64* 1]
6986    vpermb             ym29, ym28, [cq+64*15]
6987    vpermb             ym26, ym28, [cq+64* 9]
6988    vpermb              m25, m28,  [cq+64* 7]
6989    vpermb              m24, m28,  [cq+64* 5]
6990    vpermb             ym27, ym28, [cq+64*11]
6991    vpermb              m23, m28,  [cq+64* 3]
6992    vpermb             ym28, ym28, [cq+64*13]
6993    mova         [cq+64* 0], m14
6994    mova         [cq+64* 1], m15
6995    mova         [cq+64* 2], m16
6996    mova         [cq+64* 3], m17
6997    mova         [cq+64* 4], m18
6998    mova         [cq+64* 5], m19
6999    mova         [cq+64* 6], m20
7000    mova         [cq+64* 7], m21
7001    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast
7002    vpbroadcastd        m13, [o(pw_8192)]
7003    mova         [cq+64*16], m4
7004    mova         [cq+64*17], m5
7005    mova         [cq+64*18], m6
7006    mova         [cq+64*19], m7
7007    mova         [cq+64*28], m26
7008    mova         [cq+64*29], m27
7009    mova         [cq+64*30], m28
7010    mova         [cq+64*31], m29
7011    call m(inv_txfm_add_dct_dct_64x16_8bpc).pass1_end
7012    mova         [cq+64*20], m22
7013    mova         [cq+64*21], m23
7014    mova         [cq+64*22], m24
7015    mova         [cq+64*23], m25
7016    mova         [cq+64*24], m26
7017    mova         [cq+64*25], m27
7018    mova         [cq+64*26], m28
7019    mova         [cq+64*27], m29
7020    lea                  r4, [rsp+64*64]
7021    lea                  r3, [rsp+64*32]
7022    call .pass2_fast
7023    pmulhrsw             m0, m13, [cq+64*16]
7024    pmulhrsw             m1, m13, [cq+64*17]
7025    pmulhrsw             m2, m13, [cq+64*18]
7026    pmulhrsw             m3, m13, [cq+64*19]
7027    pmulhrsw             m4, m13, [cq+64*20]
7028    pmulhrsw             m5, m13, [cq+64*21]
7029    pmulhrsw             m6, m13, [cq+64*22]
7030    pmulhrsw             m7, m13, [cq+64*23]
7031    pmulhrsw            m14, m13, [cq+64*24]
7032    pmulhrsw            m15, m13, [cq+64*25]
7033    pmulhrsw            m16, m13, [cq+64*26]
7034    pmulhrsw            m17, m13, [cq+64*27]
7035    pmulhrsw            m18, m13, [cq+64*28]
7036    pmulhrsw            m19, m13, [cq+64*29]
7037    pmulhrsw            m20, m13, [cq+64*30]
7038    pmulhrsw            m21, m13, [cq+64*31]
7039    call m(inv_txfm_add_dct_dct_64x16_8bpc).transpose_round
7040    mov                  r4, rsp
7041    mov                  r3, cq
7042    call .pass2_fast
7043.end:
7044    vpbroadcastd        m17, [o(pw_2048)]
7045    lea                  r5, [strideq*8]
7046    mov                  r3, dstq
7047    pxor                m16, m16
7048    sub                  r4, 64*5 ; rsp+64*31
7049    mov                  r6, rsp
7050.end_loop:
7051    mova                 m2, [r6+64*32] ; idct16 0+n  lo
7052    mova                 m7, [r6+64*48] ; idct32 31-n lo
7053    mova                 m6, [cq+64* 0] ; idct16 0+n  hi
7054    mova                 m0, [cq+64*16] ; idct32 31-n hi
7055    mova                 m4, [r4+64*64] ; idct64 63-n lo
7056    mova                 m1, [r4+64* 0] ; idct64 63-n hi
7057    mova                 m5, [r6+64*64] ; idct64 32+n lo
7058    mova                 m8, [r6+64* 0] ; idct64 32+n hi
7059    sub                  r3, strideq
7060    paddsw               m3, m2, m7     ; idct32  0+n lo
7061    mova                m12, [dstq+r5*0]
7062    psubsw               m2, m7         ; idct32 31-n lo
7063    mova                m15, [r3  +r5*8]
7064    paddsw               m7, m6, m0     ; idct32  0+n hi
7065    mova                m13, [r3  +r5*4]
7066    psubsw               m6, m0         ; idct32 31-n hi
7067    mova                m14, [dstq+r5*4]
7068    paddsw               m0, m3, m4     ; out  0+n lo
7069    add                  r6, 64
7070    psubsw               m3, m4         ; out 63-n lo
7071    sub                  r4, 64
7072    paddsw               m4, m7, m1     ; out  0+n hi
7073    mova         [cq+64* 0], m16
7074    psubsw               m7, m1         ; out 63-n hi
7075    mova         [cq+64*16], m16
7076    paddsw               m1, m2, m5     ; out 31-n lo
7077    add                  cq, 64
7078    psubsw               m2, m5         ; out 32+n lo
7079    paddsw               m5, m6, m8     ; out 31-n hi
7080    psubsw               m6, m8         ; out 32+n hi
7081    pmulhrsw             m0, m17
7082    punpcklbw            m8, m12, m16
7083    pmulhrsw             m4, m17
7084    punpckhbw           m12, m16
7085    pmulhrsw             m3, m17
7086    punpcklbw           m11, m15, m16
7087    pmulhrsw             m7, m17
7088    punpckhbw           m15, m16
7089    pmulhrsw             m1, m17
7090    punpcklbw            m9, m13, m16
7091    pmulhrsw             m5, m17
7092    punpckhbw           m13, m16
7093    pmulhrsw             m2, m17
7094    punpcklbw           m10, m14, m16
7095    pmulhrsw             m6, m17
7096    punpckhbw           m14, m16
7097    paddw                m0, m8
7098    paddw                m4, m12
7099    packuswb             m0, m4
7100    paddw                m3, m11
7101    paddw                m7, m15
7102    packuswb             m3, m7
7103    paddw                m1, m9
7104    paddw                m5, m13
7105    packuswb             m1, m5
7106    paddw                m2, m10
7107    paddw                m6, m14
7108    packuswb             m2, m6
7109    mova        [dstq+r5*0], m0
7110    mova        [r3  +r5*8], m3
7111    mova        [r3  +r5*4], m1
7112    mova        [dstq+r5*4], m2
7113    add                dstq, strideq
7114    cmp                  r6, r4
7115    jb .end_loop
7116    RET
7117.dconly:
7118    movsx               r6d, word [cq]
7119    mov                [cq], eobd
7120    mov                 r3d, 64
7121    jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly
7122ALIGN function_align
7123.pass2_end:
7124    REPX  {pmulhrsw x, m13}, m22, m24, m26, m28, m14, m16, m18, m20, m0, m2, m4, m6
7125    mova         [r4+64*20], m1
7126    mova         [r4+64*21], m3
7127    mova         [r4+64*22], m5
7128    mova         [r4+64*23], m7
7129    vinserti32x8         m1, m23, ym14, 1    ; a00 a01 c00 c01
7130    vshufi32x4           m3, m23, m14, q3232 ; a02 a03 c02 c03
7131    vinserti32x8         m5, m22, ym0, 1     ; e00 e01 g00 g01
7132    vshufi32x4          m14, m22, m0, q3232  ; e02 e03 g02 g03
7133    mova         [r4+64*12], m15
7134    mova         [r4+64*13], m17
7135    mova         [r4+64*14], m19
7136    mova         [r4+64*15], m21
7137    vinserti32x8        m15, m27, ym18, 1    ; a40 a41 c40 c41
7138    vshufi32x4          m17, m27, m18, q3232 ; a42 a43 c42 c43
7139    vinserti32x8        m18, m26, ym4, 1     ; e40 e41 g40 g41
7140    vshufi32x4          m19, m26, m4, q3232  ; e42 e43 g42 g43
7141    vinserti32x8        m22, m25, ym16, 1    ; a20 a21 c20 c21
7142    vshufi32x4          m26, m25, m16, q3232 ; a22 a23 c22 c23
7143    vinserti32x8        m25, m24, ym2, 1     ; e20 e21 g20 g21
7144    vshufi32x4          m27, m24, m2, q3232  ; e22 e23 g22 g23
7145    vinserti32x8        m23, m29, ym20, 1    ; a60 a61 c60 c61
7146    vshufi32x4          m29, m20, q3232      ; a62 a63 c62 c63
7147    vshufi32x4          m13, m28, m6, q3232  ; e62 e63 g62 g63
7148    vinserti32x8        m28, ym6, 1          ; e60 e61 g60 g61
7149    vshufi32x4           m0, m1, m5, q2020   ;  0
7150    vshufi32x4           m1, m5, q3131       ;  8
7151    vshufi32x4           m2, m3, m14, q2020  ; 16
7152    vshufi32x4           m3, m14, q3131      ; 24
7153    vshufi32x4          m14, m15, m18, q2020 ;  4
7154    vshufi32x4          m15, m18, q3131      ; 12
7155    vshufi32x4          m16, m17, m19, q2020 ; 20
7156    vshufi32x4          m17, m19, q3131      ; 28
7157    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
7158    vshufi32x4          m24, m22, m25, q3131 ; 10
7159    vshufi32x4          m22, m25, q2020      ;  2
7160    vshufi32x4          m25, m23, m28, q3131 ; 14
7161    vshufi32x4          m23, m28, q2020      ;  6
7162    vshufi32x4          m28, m26, m27, q3131 ; 26
7163    vshufi32x4          m26, m27, q2020      ; 18
7164    vshufi32x4          m27, m29, m13, q2020 ; 22
7165    vshufi32x4          m29, m13, q3131      ; 30
7166    mova         [r6+64* 0], m0
7167    mova         [r6+64* 1], m1
7168    mova         [r6+64* 2], m2
7169    mova         [r6+64* 3], m3
7170    mova         [r6+64* 4], m4
7171    mova         [r6+64* 5], m5
7172    mova         [r6+64* 6], m6
7173    mova         [r6+64* 7], m7
7174    mova         [r6+64* 8], m14
7175    mova         [r6+64* 9], m15
7176    mova         [r6+64*10], m16
7177    mova         [r6+64*11], m17
7178    mova         [r6+64*12], m18
7179    mova         [r6+64*13], m19
7180    mova         [r6+64*14], m20
7181    mova         [r6+64*15], m21
7182    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
7183    vpbroadcastd        m13, [o(pw_8192)]
7184    mova         [r6+64*16], m29
7185    mova         [r6+64*17], m28
7186    mova         [r6+64*18], m27
7187    mova         [r6+64*19], m26
7188    mova         [r6+64*20], m25
7189    mova         [r6+64*21], m24
7190    mova         [r6+64*22], m23
7191    mova         [r6+64*23], m22
7192    mova         [r6+64*24], m21
7193    mova         [r6+64*25], m20
7194    mova         [r6+64*26], m19
7195    mova         [r6+64*27], m18
7196    mova         [r6+64*28], m17
7197    mova         [r6+64*29], m16
7198    mova         [r6+64*30], m15
7199    mova         [r6+64*31], m14
7200    pmulhrsw            m15, m13, [r4+64* 8] ;  1  9 17 25
7201    pmulhrsw            m16, m13, [r4+64*12]
7202    pmulhrsw            m17, m13, [r4+64*16]
7203    pmulhrsw            m18, m13, [r4+64*20]
7204    pmulhrsw            m19, m13, [r4+64*11] ;  7 15 23 31
7205    pmulhrsw            m20, m13, [r4+64*15]
7206    pmulhrsw            m21, m13, [r4+64*19]
7207    pmulhrsw            m22, m13, [r4+64*23]
7208    vinserti32x8        m14, m15, ym16, 1 ; a1  a9  c1  c9
7209    vshufi32x4          m15, m16, q3232   ; a17 a25 c17 c25
7210    vinserti32x8        m16, m17, ym18, 1 ; e1  e9  g1  g9
7211    vshufi32x4          m17, m18, q3232   ; e17 e25 g17 g25
7212    pmulhrsw            m23, m13, [r4+64*10] ;  5 13 21 29
7213    pmulhrsw            m24, m13, [r4+64*14]
7214    pmulhrsw            m25, m13, [r4+64*18]
7215    pmulhrsw            m26, m13, [r4+64*22]
7216    vinserti32x8        m18, m19, ym20, 1 ; a7  a15 c7  c15
7217    vshufi32x4          m19, m20, q3232   ; a23 a31 c23 c31
7218    vinserti32x8        m20, m21, ym22, 1 ; e7  e15 g7  g15
7219    vshufi32x4          m21, m22, q3232   ; e23 e31 g23 g31
7220    pmulhrsw            m27, m13, [r4+64* 9] ;  3 11 19 27
7221    pmulhrsw            m28, m13, [r4+64*13]
7222    pmulhrsw            m29, m13, [r4+64*17]
7223    pmulhrsw            m13,      [r4+64*21]
7224    vshufi32x4           m0, m14, m16, q2020 ;  1
7225    vshufi32x4           m1, m19, m21, q3131 ; 31
7226    vshufi32x4           m2, m15, m17, q2020 ; 17
7227    vshufi32x4           m3, m18, m20, q3131 ; 15
7228    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
7229    vshufi32x4           m0, m18, m20, q2020 ;  7
7230    vshufi32x4           m1, m15, m17, q3131 ; 25
7231    vshufi32x4           m2, m19, m21, q2020 ; 23
7232    vshufi32x4           m3, m14, m16, q3131 ;  9
7233    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
7234    vinserti32x8        m22, m23, ym24, 1 ; a5  a13 c5  c13
7235    vshufi32x4          m23, m24, q3232   ; a21 a29 c21 c29
7236    vinserti32x8        m24, m25, ym26, 1 ; e5  e13 g5  g13
7237    vshufi32x4          m25, m26, q3232   ; e21 e29 g21 g29
7238    vinserti32x8        m26, m27, ym28, 1 ; a3  a11 c3  c11
7239    vshufi32x4          m27, m28, q3232   ; a19 a27 c19 c27
7240    vinserti32x8        m28, m29, ym13, 1 ; e3  e11 g3  g11
7241    vshufi32x4          m29, m13, q3232   ; e19 e17 g19 g27
7242    vshufi32x4           m0, m22, m24, q2020 ;  5
7243    vshufi32x4           m1, m27, m29, q3131 ; 27
7244    vshufi32x4           m2, m23, m25, q2020 ; 21
7245    vshufi32x4           m3, m26, m28, q3131 ; 11
7246    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
7247    vshufi32x4           m0, m26, m28, q2020 ;  3
7248    vshufi32x4           m1, m23, m25, q3131 ; 29
7249    vshufi32x4           m2, m27, m29, q2020 ; 19
7250    vshufi32x4           m3, m22, m24, q3131 ; 13
7251    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
7252    jmp m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
7253ALIGN function_align
7254.pass2_fast:
7255    vshufi32x4          m23, m1, m16, q3131  ;  6
7256    vshufi32x4          m22, m1, m16, q2020  ;  2
7257    vshufi32x4          m14, m0, m3, q3131   ;  4
7258    vshufi32x4          m26, m0, m3, q2020   ;  0
7259    vshufi32x4          m28, m9, m15, q3131  ;  5
7260    vshufi32x4           m0, m9, m15, q2020  ;  1
7261    vshufi32x4          m16, m11, m17, q3131 ;  7
7262    vshufi32x4          m29, m11, m17, q2020 ;  3
7263    vshufi32x4          m15, m8, m2, q3131   ; 12
7264    vshufi32x4          m27, m8, m2, q2020   ;  8
7265    vshufi32x4          m25, m5, m19, q3131  ; 14
7266    vshufi32x4          m24, m5, m19, q2020  ; 10
7267    vshufi32x4           m3, m6, m20, q3131  ; 15
7268    vshufi32x4          m19, m6, m20, q2020  ; 11
7269    vshufi32x4          m17, m4, m18, q3131  ; 13
7270    vshufi32x4          m18, m4, m18, q2020  ;  9
7271    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
7272    mova                 m0, m16
7273    mova                 m3, m18
7274    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
7275    mova                 m0, m28
7276    mova                 m3, m19
7277    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
7278    mova                 m0, m29
7279    mova                 m3, m17
7280    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
7281    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
7282    mova                 m0, m26
7283    mova                 m1, m27
7284    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2
7285    mova         [r3+64* 0], m0
7286    mova         [r3+64* 1], m1
7287    mova         [r3+64* 2], m2
7288    mova         [r3+64* 3], m3
7289    mova         [r3+64* 4], m4
7290    mova         [r3+64* 5], m5
7291    mova         [r3+64* 6], m6
7292    mova         [r3+64* 7], m7
7293    mova         [r3+64* 8], m14
7294    mova         [r3+64* 9], m15
7295    mova         [r3+64*10], m16
7296    mova         [r3+64*11], m17
7297    mova         [r3+64*12], m18
7298    mova         [r3+64*13], m19
7299    mova         [r3+64*14], m20
7300    mova         [r3+64*15], m21
7301    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2
7302    mova         [r3+64*16], m29
7303    mova         [r3+64*17], m28
7304    mova         [r3+64*18], m27
7305    mova         [r3+64*19], m26
7306    mova         [r3+64*20], m25
7307    mova         [r3+64*21], m24
7308    mova         [r3+64*22], m23
7309    mova         [r3+64*23], m22
7310    mova         [r3+64*24], m21
7311    mova         [r3+64*25], m20
7312    mova         [r3+64*26], m19
7313    mova         [r3+64*27], m18
7314    mova         [r3+64*28], m17
7315    mova         [r3+64*29], m16
7316    mova         [r3+64*30], m15
7317    mova         [r3+64*31], m14
7318    ret
7319
7320%endif ; ARCH_X86_64
7321