1;******************************************************************************
2;* VP9 IDCT SIMD optimizations
3;*
4;* Copyright (C) 2013 Clément Bœsch <u pkh me>
5;* Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
6;*
7;* This file is part of FFmpeg.
8;*
9;* FFmpeg is free software; you can redistribute it and/or
10;* modify it under the terms of the GNU Lesser General Public
11;* License as published by the Free Software Foundation; either
12;* version 2.1 of the License, or (at your option) any later version.
13;*
14;* FFmpeg is distributed in the hope that it will be useful,
15;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17;* Lesser General Public License for more details.
18;*
19;* You should have received a copy of the GNU Lesser General Public
20;* License along with FFmpeg; if not, write to the Free Software
21;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22;******************************************************************************
23
24%include "libavutil/x86/x86util.asm"
25%include "vp9itxfm_template.asm"
26
27SECTION_RODATA 32
28
29%macro VP9_IDCT_COEFFS 2-3 0
30const pw_m%1_%2
31times 8 dw -%1,  %2
32const pw_%2_%1
33times 8 dw  %2,  %1
34
35%if %3 == 1
36const pw_m%2_m%1
37times 8 dw -%2, -%1
38%if %1 != %2
39const pw_m%2_%1
40times 8 dw -%2,  %1
41const pw_%1_%2
42times 8 dw  %1,  %2
43%endif
44%endif
45
46%if %1 < 11585
47pw_m%1x2:   times 16 dw -%1*2
48%elif %1 > 11585
49pw_%1x2:    times 16 dw  %1*2
50%else
51const pw_%1x2
52times 16 dw %1*2
53%endif
54
55%if %2 != %1
56pw_%2x2:    times 16 dw  %2*2
57%endif
58%endmacro
59
60VP9_IDCT_COEFFS 16364,   804
61VP9_IDCT_COEFFS 16305,  1606
62VP9_IDCT_COEFFS 16069,  3196, 1
63VP9_IDCT_COEFFS 15893,  3981
64VP9_IDCT_COEFFS 15137,  6270, 1
65VP9_IDCT_COEFFS 14811,  7005
66VP9_IDCT_COEFFS 14449,  7723
67VP9_IDCT_COEFFS 13160,  9760
68VP9_IDCT_COEFFS 11585, 11585, 1
69VP9_IDCT_COEFFS 11003, 12140
70VP9_IDCT_COEFFS 10394, 12665
71VP9_IDCT_COEFFS  9102, 13623, 1
72VP9_IDCT_COEFFS  8423, 14053
73VP9_IDCT_COEFFS  5520, 15426
74VP9_IDCT_COEFFS  4756, 15679
75VP9_IDCT_COEFFS  2404, 16207
76
77const pw_5283_13377
78times 4 dw 5283, 13377
79const pw_9929_13377
80times 4 dw 9929, 13377
81const pw_15212_m13377
82times 4 dw 15212, -13377
83const pw_15212_9929
84times 4 dw 15212, 9929
85const pw_m5283_m15212
86times 4 dw -5283, -15212
87const pw_13377x2
88times 8 dw 13377*2
89const pw_m13377_13377
90times 4 dw -13377, 13377
91const pw_13377_0
92times 4 dw 13377, 0
93
94cextern pw_8
95cextern pw_16
96cextern pw_32
97cextern pw_512
98cextern pw_1024
99cextern pw_2048
100cextern pw_m1
101cextern pd_8192
102
103SECTION .text
104
105%macro VP9_UNPACK_MULSUB_2D_4X 6 ; dst1 [src1], dst2 [src2], dst3, dst4, mul1, mul2
106    punpckhwd          m%4, m%2, m%1
107    punpcklwd          m%2, m%1
108    pmaddwd            m%3, m%4, [pw_m%5_%6]
109    pmaddwd            m%4, [pw_%6_%5]
110    pmaddwd            m%1, m%2, [pw_m%5_%6]
111    pmaddwd            m%2, [pw_%6_%5]
112%endmacro
113
114%macro VP9_RND_SH_SUMSUB_BA 6 ; dst1 [src1], dst2 [src2], src3, src4, tmp, round
115    SUMSUB_BA            d, %1, %2, %5
116    SUMSUB_BA            d, %3, %4, %5
117    paddd              m%1, %6
118    paddd              m%2, %6
119    paddd              m%3, %6
120    paddd              m%4, %6
121    psrad              m%1, 14
122    psrad              m%2, 14
123    psrad              m%3, 14
124    psrad              m%4, 14
125    packssdw           m%1, m%3
126    packssdw           m%2, m%4
127%endmacro
128
129%macro VP9_STORE_2X 5-6 dstq ; reg1, reg2, tmp1, tmp2, zero, dst
130%if mmsize == 32
131    pmovzxbw           m%3, [%6]
132    pmovzxbw           m%4, [%6+strideq]
133%else
134    movh               m%3, [%6]
135    movh               m%4, [%6+strideq]
136    punpcklbw          m%3, m%5
137    punpcklbw          m%4, m%5
138%endif
139    paddw              m%3, m%1
140    paddw              m%4, m%2
141%if mmsize == 32
142    packuswb           m%3, m%4
143    ; Intel...
144    vpermq             m%3, m%3, q3120
145    mova              [%6], xm%3
146    vextracti128 [%6+strideq], m%3, 1
147%elif mmsize == 16
148    packuswb           m%3, m%4
149    movh              [%6], m%3
150    movhps    [%6+strideq], m%3
151%else
152    packuswb           m%3, m%5
153    packuswb           m%4, m%5
154    movh              [%6], m%3
155    movh      [%6+strideq], m%4
156%endif
157%endmacro
158
159%macro ZERO_BLOCK 4 ; mem, stride, nnzcpl, zero_reg
160%assign %%y 0
161%rep %3
162%assign %%x 0
163%rep %3*2/mmsize
164    mova      [%1+%%y+%%x], %4
165%assign %%x (%%x+mmsize)
166%endrep
167%assign %%y (%%y+%2)
168%endrep
169%endmacro
170
171;-------------------------------------------------------------------------------------------
172; void vp9_iwht_iwht_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
173;-------------------------------------------------------------------------------------------
174
175INIT_MMX mmx
176cglobal vp9_iwht_iwht_4x4_add, 3, 3, 0, dst, stride, block, eob
177    mova                m0, [blockq+0*8]
178    mova                m1, [blockq+1*8]
179    mova                m2, [blockq+2*8]
180    mova                m3, [blockq+3*8]
181    psraw               m0, 2
182    psraw               m1, 2
183    psraw               m2, 2
184    psraw               m3, 2
185
186    VP9_IWHT4_1D
187    TRANSPOSE4x4W        0, 1, 2, 3, 4
188    VP9_IWHT4_1D
189
190    pxor                m4, m4
191    VP9_STORE_2X         0, 1, 5, 6, 4
192    lea               dstq, [dstq+strideq*2]
193    VP9_STORE_2X         2, 3, 5, 6, 4
194    ZERO_BLOCK      blockq, 8, 4, m4
195    RET
196
197;-------------------------------------------------------------------------------------------
198; void vp9_idct_idct_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
199;-------------------------------------------------------------------------------------------
200
201; 2x2 top left corner
202%macro VP9_IDCT4_2x2_1D 0
203    pmulhrsw            m0, m5                              ; m0=t1
204    mova                m2, m0                              ; m2=t0
205    mova                m3, m1
206    pmulhrsw            m1, m6                              ; m1=t2
207    pmulhrsw            m3, m7                              ; m3=t3
208    VP9_IDCT4_1D_FINALIZE
209%endmacro
210
211%macro VP9_IDCT4_WRITEOUT 0
212%if cpuflag(ssse3)
213    mova                m5, [pw_2048]
214    pmulhrsw            m0, m5              ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4
215    pmulhrsw            m1, m5
216%else
217    mova                m5, [pw_8]
218    paddw               m0, m5
219    paddw               m1, m5
220    psraw               m0, 4
221    psraw               m1, 4
222%endif
223    VP9_STORE_2X         0,  1,  6,  7,  4
224    lea               dstq, [dstq+2*strideq]
225%if cpuflag(ssse3)
226    pmulhrsw            m2, m5
227    pmulhrsw            m3, m5
228%else
229    paddw               m2, m5
230    paddw               m3, m5
231    psraw               m2, 4
232    psraw               m3, 4
233%endif
234    VP9_STORE_2X         2,  3,  6,  7,  4
235%endmacro
236
237%macro IDCT_4x4_FN 1
238INIT_MMX %1
239cglobal vp9_idct_idct_4x4_add, 4, 4, 0, dst, stride, block, eob
240
241%if cpuflag(ssse3)
242    cmp eobd, 4 ; 2x2 or smaller
243    jg .idctfull
244
245    cmp eobd, 1 ; faster path for when only DC is set
246    jne .idct2x2
247%else
248    cmp eobd, 1
249    jg .idctfull
250%endif
251
252%if cpuflag(ssse3)
253    movd                m0, [blockq]
254    mova                m5, [pw_11585x2]
255    pmulhrsw            m0, m5
256    pmulhrsw            m0, m5
257%else
258    DEFINE_ARGS dst, stride, block, coef
259    movsx            coefd, word [blockq]
260    imul             coefd, 11585
261    add              coefd, 8192
262    sar              coefd, 14
263    imul             coefd, 11585
264    add              coefd, (8 << 14) + 8192
265    sar              coefd, 14 + 4
266    movd                m0, coefd
267%endif
268    pshufw              m0, m0, 0
269    pxor                m4, m4
270    movh          [blockq], m4
271%if cpuflag(ssse3)
272    pmulhrsw            m0, [pw_2048]       ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4
273%endif
274    VP9_STORE_2X         0,  0,  6,  7,  4
275    lea               dstq, [dstq+2*strideq]
276    VP9_STORE_2X         0,  0,  6,  7,  4
277    RET
278
279%if cpuflag(ssse3)
280; faster path for when only top left 2x2 block is set
281.idct2x2:
282    movd                m0, [blockq+0]
283    movd                m1, [blockq+8]
284    mova                m5, [pw_11585x2]
285    mova                m6, [pw_6270x2]
286    mova                m7, [pw_15137x2]
287    VP9_IDCT4_2x2_1D
288    ; partial 2x4 transpose
289    punpcklwd           m0, m1
290    punpcklwd           m2, m3
291    SBUTTERFLY          dq, 0, 2, 1
292    SWAP                1, 2
293    VP9_IDCT4_2x2_1D
294    pxor                m4, m4  ; used for the block reset, and VP9_STORE_2X
295    movh       [blockq+ 0], m4
296    movh       [blockq+ 8], m4
297    VP9_IDCT4_WRITEOUT
298    RET
299%endif
300
301.idctfull: ; generic full 4x4 idct/idct
302    mova                m0, [blockq+ 0]
303    mova                m1, [blockq+ 8]
304    mova                m2, [blockq+16]
305    mova                m3, [blockq+24]
306%if cpuflag(ssse3)
307    mova                m6, [pw_11585x2]
308%endif
309    mova                m7, [pd_8192]       ; rounding
310    VP9_IDCT4_1D
311    TRANSPOSE4x4W  0, 1, 2, 3, 4
312    VP9_IDCT4_1D
313    pxor                m4, m4  ; used for the block reset, and VP9_STORE_2X
314    mova       [blockq+ 0], m4
315    mova       [blockq+ 8], m4
316    mova       [blockq+16], m4
317    mova       [blockq+24], m4
318    VP9_IDCT4_WRITEOUT
319    RET
320%endmacro
321
322IDCT_4x4_FN mmxext
323IDCT_4x4_FN ssse3
324
325;-------------------------------------------------------------------------------------------
326; void vp9_iadst_iadst_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
327;-------------------------------------------------------------------------------------------
328
329%macro IADST4_FN 5
330INIT_MMX %5
331cglobal vp9_%1_%3_4x4_add, 3, 3, 0, dst, stride, block, eob
332%if WIN64 && notcpuflag(ssse3)
333    WIN64_SPILL_XMM 8
334%endif
335    movdqa            xmm5, [pd_8192]
336    mova                m0, [blockq+ 0]
337    mova                m1, [blockq+ 8]
338    mova                m2, [blockq+16]
339    mova                m3, [blockq+24]
340%if cpuflag(ssse3)
341    mova                m6, [pw_11585x2]
342%endif
343%ifnidn %1%3, iadstiadst
344    movdq2q             m7, xmm5
345%endif
346    VP9_%2_1D
347    TRANSPOSE4x4W  0, 1, 2, 3, 4
348    VP9_%4_1D
349    pxor                m4, m4  ; used for the block reset, and VP9_STORE_2X
350    mova       [blockq+ 0], m4
351    mova       [blockq+ 8], m4
352    mova       [blockq+16], m4
353    mova       [blockq+24], m4
354    VP9_IDCT4_WRITEOUT
355    RET
356%endmacro
357
358IADST4_FN idct,  IDCT4,  iadst, IADST4, sse2
359IADST4_FN iadst, IADST4, idct,  IDCT4,  sse2
360IADST4_FN iadst, IADST4, iadst, IADST4, sse2
361
362IADST4_FN idct,  IDCT4,  iadst, IADST4, ssse3
363IADST4_FN iadst, IADST4, idct,  IDCT4,  ssse3
364IADST4_FN iadst, IADST4, iadst, IADST4, ssse3
365
366%macro SCRATCH 3
367%if ARCH_X86_64
368    SWAP                %1, %2
369%else
370    mova              [%3], m%1
371%endif
372%endmacro
373
374%macro UNSCRATCH 3
375%if ARCH_X86_64
376    SWAP                %1, %2
377%else
378    mova               m%1, [%3]
379%endif
380%endmacro
381
382;-------------------------------------------------------------------------------------------
383; void vp9_idct_idct_8x8_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
384;-------------------------------------------------------------------------------------------
385
386%macro VP9_IDCT8_1D_FINALIZE 0
387    SUMSUB_BA            w,  3,  6, 5                       ; m3=t0+t7, m6=t0-t7
388    SUMSUB_BA            w,  1,  2, 5                       ; m1=t1+t6, m2=t1-t6
389    SUMSUB_BA            w,  7,  0, 5                       ; m7=t2+t5, m0=t2-t5
390
391    UNSCRATCH            5, 8, blockq+ 0
392    SCRATCH              2, 8, blockq+ 0
393
394    SUMSUB_BA            w,  5,  4, 2                       ; m5=t3+t4, m4=t3-t4
395    SWAP                 7,  6,  2
396    SWAP                 3,  5,  0
397
398%if ARCH_X86_64
399    SWAP                 6, 8
400%endif
401%endmacro
402
403; x86-32
404; - in: m0/m4 is in mem
405; - out: m6 is in mem
406; x86-64:
407; - everything is in registers (m0-7)
408%macro VP9_IDCT8_1D 0
409%if ARCH_X86_64
410    SWAP                 0, 8
411    SWAP                 4, 9
412%endif
413
414    VP9_UNPACK_MULSUB_2W_4X 5,  3,  9102, 13623, D_8192_REG, 0, 4  ; m5=t5a, m3=t6a
415    VP9_UNPACK_MULSUB_2W_4X 1,  7, 16069,  3196, D_8192_REG, 0, 4  ; m1=t4a, m7=t7a
416    SUMSUB_BA            w,  5,  1, 0                       ; m5=t4a+t5a (t4), m1=t4a-t5a (t5a)
417    SUMSUB_BA            w,  3,  7, 0                       ; m3=t7a+t6a (t7), m7=t7a-t6a (t6a)
418%if cpuflag(ssse3)
419    SUMSUB_BA            w,  1,  7, 0                       ; m1=t6a+t5a (t6), m7=t6a-t5a (t5)
420    pmulhrsw            m1, W_11585x2_REG                   ; m1=t6
421    pmulhrsw            m7, W_11585x2_REG                   ; m7=t5
422%else
423    VP9_UNPACK_MULSUB_2W_4X 7,  1, 11585, 11585, D_8192_REG, 0, 4
424%endif
425    VP9_UNPACK_MULSUB_2W_4X 2,  6, 15137,  6270, D_8192_REG, 0, 4  ; m2=t2a, m6=t3a
426
427    UNSCRATCH            0, 8, blockq+ 0    ; IN(0)
428    UNSCRATCH            4, 9, blockq+64    ; IN(4)
429    SCRATCH              5, 8, blockq+ 0
430
431%if cpuflag(ssse3)
432    SUMSUB_BA            w, 4, 0, 5                         ; m4=IN(0)+IN(4) m0=IN(0)-IN(4)
433    pmulhrsw            m4, W_11585x2_REG                   ; m4=t0a
434    pmulhrsw            m0, W_11585x2_REG                   ; m0=t1a
435%else
436    SCRATCH              7, 9, blockq+64
437    VP9_UNPACK_MULSUB_2W_4X 0,  4, 11585, 11585, D_8192_REG, 5, 7
438    UNSCRATCH            7, 9, blockq+64
439%endif
440    SUMSUB_BA            w,  6,  4, 5                       ; m6=t0a+t3a (t0), m4=t0a-t3a (t3)
441    SUMSUB_BA            w,  2,  0, 5                       ; m2=t1a+t2a (t1), m0=t1a-t2a (t2)
442
443    VP9_IDCT8_1D_FINALIZE
444%endmacro
445
446%macro VP9_IDCT8_4x4_1D 0
447    pmulhrsw            m0, W_11585x2_REG                   ; m0=t1a/t0a
448    pmulhrsw            m6, m2, [pw_15137x2]                ; m6=t3a
449    pmulhrsw            m2, [pw_6270x2]                     ; m2=t2a
450    pmulhrsw            m7, m1, [pw_16069x2]                ; m7=t7a
451    pmulhrsw            m1, [pw_3196x2]                     ; m1=t4a
452    pmulhrsw            m5, m3, [pw_m9102x2]                ; m5=t5a
453    pmulhrsw            m3, [pw_13623x2]                    ; m3=t6a
454    SUMSUB_BA            w,  5,  1, 4                       ; m1=t4a+t5a (t4), m5=t4a-t5a (t5a)
455    SUMSUB_BA            w,  3,  7, 4                       ; m3=t7a+t6a (t7), m7=t7a-t6a (t6a)
456    SUMSUB_BA            w,  1,  7, 4                       ; m1=t6a+t5a (t6), m7=t6a-t5a (t5)
457    pmulhrsw            m1, W_11585x2_REG                   ; m1=t6
458    pmulhrsw            m7, W_11585x2_REG                   ; m7=t5
459    psubw               m4, m0, m6                          ; m4=t0a-t3a (t3)
460    paddw               m6, m0                              ; m6=t0a+t3a (t0)
461    SCRATCH              5,  8, blockq+ 0
462    SUMSUB_BA            w,  2,  0, 5                       ; m2=t1a+t2a (t1), m0=t1a-t2a (t2)
463    VP9_IDCT8_1D_FINALIZE
464%endmacro
465
466%macro VP9_IDCT8_2x2_1D 1
467    pmulhrsw            m0, W_11585x2_REG                   ; m0=t0
468    pmulhrsw            m3, m1, W_16069x2_REG               ; m3=t7
469    pmulhrsw            m1, W_3196x2_REG                    ; m1=t4
470    psubw               m7, m3, m1                          ; t5 = t7a - t4a
471    paddw               m5, m3, m1                          ; t6 = t7a + t4a
472    pmulhrsw            m7, W_11585x2_REG                   ; m7=t5
473    pmulhrsw            m5, W_11585x2_REG                   ; m5=t6
474    SWAP                 5,  1
475    ; merged VP9_IDCT8_1D_FINALIZE to make register-sharing w/ avx easier
476    psubw               m6, m0, m3                          ; m6=t0-t7
477    paddw               m3, m0                              ; m3=t0+t7
478    psubw               m2, m0, m1                          ; m2=t1-t6
479    paddw               m1, m0                              ; m1=t1+t6
480%if %1 == 1
481    punpcklwd           m3, m1
482%define SCRATCH_REG 1
483%elif ARCH_X86_32
484    mova       [blockq+ 0], m2
485%define SCRATCH_REG 2
486%else
487%define SCRATCH_REG 8
488%endif
489    psubw               m4, m0, m5                          ; m4=t3-t4
490    paddw               m5, m0                              ; m5=t3+t4
491    SUMSUB_BA            w,  7,  0, SCRATCH_REG             ; m7=t2+t5, m0=t2-t5
492    SWAP                 7,  6,  2
493    SWAP                 3,  5,  0
494%undef SCRATCH_REG
495%endmacro
496
497%macro VP9_IDCT8_WRITEx2 6-8 5 ; line1, line2, tmp1, tmp2, zero, pw_1024/pw_16, shift
498%if cpuflag(ssse3)
499    pmulhrsw           m%1, %6              ; (x*1024 + (1<<14))>>15 <=> (x+16)>>5
500    pmulhrsw           m%2, %6
501%else
502    paddw              m%1, %6
503    paddw              m%2, %6
504    psraw              m%1, %7
505    psraw              m%2, %7
506%endif
507%if %0 <= 7
508    VP9_STORE_2X        %1, %2, %3, %4, %5
509%else
510    VP9_STORE_2X        %1, %2, %3, %4, %5, %8
511%endif
512%endmacro
513
514; x86-32:
515; - m6 is in mem
516; x86-64:
517; - m8 holds m6 (SWAP)
518; m6 holds zero
519%macro VP9_IDCT8_WRITEOUT 0
520%if ARCH_X86_64
521%if cpuflag(ssse3)
522    mova                m9, [pw_1024]
523%else
524    mova                m9, [pw_16]
525%endif
526%define ROUND_REG m9
527%else
528%if cpuflag(ssse3)
529%define ROUND_REG [pw_1024]
530%else
531%define ROUND_REG [pw_16]
532%endif
533%endif
534    SCRATCH              5, 10, blockq+16
535    SCRATCH              7, 11, blockq+32
536    VP9_IDCT8_WRITEx2    0,  1, 5, 7, 6, ROUND_REG
537    lea               dstq, [dstq+2*strideq]
538    VP9_IDCT8_WRITEx2    2,  3, 5, 7, 6, ROUND_REG
539    lea               dstq, [dstq+2*strideq]
540    UNSCRATCH            5, 10, blockq+16
541    UNSCRATCH            7, 11, blockq+32
542    VP9_IDCT8_WRITEx2    4,  5, 0, 1, 6, ROUND_REG
543    lea               dstq, [dstq+2*strideq]
544    UNSCRATCH            5, 8, blockq+ 0
545    VP9_IDCT8_WRITEx2    5,  7, 0, 1, 6, ROUND_REG
546
547%undef ROUND_REG
548%endmacro
549
550%macro VP9_IDCT_IDCT_8x8_ADD_XMM 2
551INIT_XMM %1
552cglobal vp9_idct_idct_8x8_add, 4, 4, %2, dst, stride, block, eob
553
554%if cpuflag(ssse3)
555%if ARCH_X86_64
556    mova               m12, [pw_11585x2]    ; often used
557%define W_11585x2_REG m12
558%else
559%define W_11585x2_REG [pw_11585x2]
560%endif
561
562    cmp eobd, 12 ; top left half or less
563    jg .idctfull
564
565    cmp eobd, 3  ; top left corner or less
566    jg .idcthalf
567
568    cmp eobd, 1 ; faster path for when only DC is set
569    jne .idcttopleftcorner
570%else
571    cmp eobd, 1
572    jg .idctfull
573%endif
574
575%if cpuflag(ssse3)
576    movd                m0, [blockq]
577    pmulhrsw            m0, W_11585x2_REG
578    pmulhrsw            m0, W_11585x2_REG
579%else
580    DEFINE_ARGS dst, stride, block, coef
581    movsx            coefd, word [blockq]
582    imul             coefd, 11585
583    add              coefd, 8192
584    sar              coefd, 14
585    imul             coefd, 11585
586    add              coefd, (16 << 14) + 8192
587    sar              coefd, 14 + 5
588    movd                m0, coefd
589%endif
590    SPLATW              m0, m0, 0
591    pxor                m4, m4
592    movd          [blockq], m4
593%if cpuflag(ssse3)
594    pmulhrsw            m0, [pw_1024]       ; (x*1024 + (1<<14))>>15 <=> (x+16)>>5
595%endif
596%rep 3
597    VP9_STORE_2X         0,  0,  6,  7,  4
598    lea               dstq, [dstq+2*strideq]
599%endrep
600    VP9_STORE_2X         0,  0,  6,  7,  4
601    RET
602
603%if cpuflag(ssse3)
604; faster path for when only left corner is set (3 input: DC, right to DC, below
605; to DC). Note: also working with a 2x2 block
606.idcttopleftcorner:
607    movd                m0, [blockq+0]
608    movd                m1, [blockq+16]
609%if ARCH_X86_64
610    mova               m10, [pw_3196x2]
611    mova               m11, [pw_16069x2]
612%define W_3196x2_REG m10
613%define W_16069x2_REG m11
614%else
615%define W_3196x2_REG [pw_3196x2]
616%define W_16069x2_REG [pw_16069x2]
617%endif
618    VP9_IDCT8_2x2_1D 1
619    ; partial 2x8 transpose
620    ; punpcklwd m0, m1 already done inside idct
621    punpcklwd           m2, m3
622    punpcklwd           m4, m5
623    punpcklwd           m6, m7
624    punpckldq           m0, m2
625    punpckldq           m4, m6
626    SBUTTERFLY         qdq, 0, 4, 1
627    SWAP                 1, 4
628    VP9_IDCT8_2x2_1D 2
629%if ARCH_X86_64
630    SWAP                 6, 8
631%endif
632    pxor                m6, m6  ; used for the block reset, and VP9_STORE_2X
633    VP9_IDCT8_WRITEOUT
634%if ARCH_X86_64
635    movd       [blockq+ 0], m6
636    movd       [blockq+16], m6
637%else
638    mova       [blockq+ 0], m6
639    mova       [blockq+16], m6
640    mova       [blockq+32], m6
641%endif
642    RET
643
644.idcthalf:
645    movh                m0, [blockq + 0]
646    movh                m1, [blockq +16]
647    movh                m2, [blockq +32]
648    movh                m3, [blockq +48]
649    VP9_IDCT8_4x4_1D
650    ; partial 4x8 transpose
651%if ARCH_X86_32
652    mova                m6, [blockq+ 0]
653%endif
654    punpcklwd           m0, m1
655    punpcklwd           m2, m3
656    punpcklwd           m4, m5
657    punpcklwd           m6, m7
658    SBUTTERFLY          dq, 0, 2, 1
659    SBUTTERFLY          dq, 4, 6, 5
660    SBUTTERFLY         qdq, 0, 4, 1
661    SBUTTERFLY         qdq, 2, 6, 5
662    SWAP                 1, 4
663    SWAP                 3, 6
664    VP9_IDCT8_4x4_1D
665%if ARCH_X86_64
666    SWAP                 6, 8
667%endif
668    pxor                m6, m6
669    VP9_IDCT8_WRITEOUT
670%if ARCH_X86_64
671    movh       [blockq+ 0], m6
672    movh       [blockq+16], m6
673    movh       [blockq+32], m6
674%else
675    mova       [blockq+ 0], m6
676    mova       [blockq+16], m6
677    mova       [blockq+32], m6
678%endif
679    movh       [blockq+48], m6
680    RET
681%endif
682
683.idctfull: ; generic full 8x8 idct/idct
684%if ARCH_X86_64
685    mova                m0, [blockq+  0]    ; IN(0)
686%endif
687    mova                m1, [blockq+ 16]    ; IN(1)
688    mova                m2, [blockq+ 32]    ; IN(2)
689    mova                m3, [blockq+ 48]    ; IN(3)
690%if ARCH_X86_64
691    mova                m4, [blockq+ 64]    ; IN(4)
692%endif
693    mova                m5, [blockq+ 80]    ; IN(5)
694    mova                m6, [blockq+ 96]    ; IN(6)
695    mova                m7, [blockq+112]    ; IN(7)
696%if ARCH_X86_64
697    mova               m11, [pd_8192]       ; rounding
698%define D_8192_REG m11
699%else
700%define D_8192_REG [pd_8192]
701%endif
702    VP9_IDCT8_1D
703%if ARCH_X86_64
704    TRANSPOSE8x8W  0, 1, 2, 3, 4, 5, 6, 7, 8
705%else
706    TRANSPOSE8x8W  0, 1, 2, 3, 4, 5, 6, 7, [blockq+0], [blockq+64], 1
707    mova        [blockq+0], m0
708%endif
709    VP9_IDCT8_1D
710
711%if ARCH_X86_64
712    SWAP                 6, 8
713%endif
714    pxor                m6, m6  ; used for the block reset, and VP9_STORE_2X
715    VP9_IDCT8_WRITEOUT
716    ZERO_BLOCK      blockq, 16, 8, m6
717    RET
718%undef W_11585x2_REG
719%endmacro
720
721VP9_IDCT_IDCT_8x8_ADD_XMM sse2, 12
722VP9_IDCT_IDCT_8x8_ADD_XMM ssse3, 13
723VP9_IDCT_IDCT_8x8_ADD_XMM avx, 13
724
725;---------------------------------------------------------------------------------------------
726; void vp9_iadst_iadst_8x8_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
727;---------------------------------------------------------------------------------------------
728
729; x86-32:
730; - in: m0/3/4/7 are in mem [blockq+N*16]
731; - out: m6 is in mem [blockq+0]
732; x86-64:
733; - everything is in registers
734%macro VP9_IADST8_1D 0 ; input/output=m0/1/2/3/4/5/6/7
735%if ARCH_X86_64
736    SWAP                     0, 8
737    SWAP                     3, 9
738    SWAP                     4, 10
739    SWAP                     7, 11
740%endif
741
742    VP9_UNPACK_MULSUB_2D_4X  5,  2,  0,  3, 14449,  7723    ; m5/2=t3[d], m2/4=t2[d]
743    VP9_UNPACK_MULSUB_2D_4X  1,  6,  4,  7,  4756, 15679    ; m1/4=t7[d], m6/7=t6[d]
744    SCRATCH                  4, 12, blockq+1*16
745    VP9_RND_SH_SUMSUB_BA     6,  2,  7,  3, 4, D_8192_REG  ; m6=t2[w], m2=t6[w]
746    UNSCRATCH                4, 12, blockq+1*16
747    VP9_RND_SH_SUMSUB_BA     1,  5,  4,  0, 3, D_8192_REG  ; m1=t3[w], m5=t7[w]
748
749    UNSCRATCH                0,  8, blockq+16*0
750    UNSCRATCH                3,  9, blockq+16*3
751    UNSCRATCH                4, 10, blockq+16*4
752    UNSCRATCH                7, 11, blockq+16*7
753    SCRATCH                  1,  8, blockq+16*1
754    SCRATCH                  2,  9, blockq+16*2
755    SCRATCH                  5, 10, blockq+16*5
756    SCRATCH                  6, 11, blockq+16*6
757
758    VP9_UNPACK_MULSUB_2D_4X  7,  0,  1,  2, 16305,  1606    ; m7/1=t1[d], m0/2=t0[d]
759    VP9_UNPACK_MULSUB_2D_4X  3,  4,  5,  6, 10394, 12665    ; m3/5=t5[d], m4/6=t4[d]
760    SCRATCH                  1, 12, blockq+ 0*16
761    VP9_RND_SH_SUMSUB_BA     4,  0,  6,  2, 1, D_8192_REG  ; m4=t0[w], m0=t4[w]
762    UNSCRATCH                1, 12, blockq+ 0*16
763    VP9_RND_SH_SUMSUB_BA     3,  7,  5,  1, 2, D_8192_REG  ; m3=t1[w], m7=t5[w]
764
765    UNSCRATCH                2,  9, blockq+16*2
766    UNSCRATCH                5, 10, blockq+16*5
767    SCRATCH                  3,  9, blockq+16*3
768    SCRATCH                  4, 10, blockq+16*4
769
770    ; m4=t0, m3=t1, m6=t2, m1=t3, m0=t4, m7=t5, m2=t6, m5=t7
771
772    VP9_UNPACK_MULSUB_2D_4X  0,  7,  1,  3, 15137,  6270    ; m0/1=t5[d], m7/3=t4[d]
773    VP9_UNPACK_MULSUB_2D_4X  5,  2,  4,  6,  6270, 15137    ; m5/4=t6[d], m2/6=t7[d]
774    SCRATCH                  1, 12, blockq+ 0*16
775    VP9_RND_SH_SUMSUB_BA     5,  7,  4,  3, 1, D_8192_REG
776    UNSCRATCH                1, 12, blockq+ 0*16
777    PSIGNW                  m5, W_M1_REG                    ; m5=out1[w], m7=t6[w]
778    VP9_RND_SH_SUMSUB_BA     2,  0,  6,  1, 3, D_8192_REG   ; m2=out6[w], m0=t7[w]
779
780    UNSCRATCH                1,  8, blockq+16*1
781    UNSCRATCH                3,  9, blockq+16*3
782    UNSCRATCH                4, 10, blockq+16*4
783    UNSCRATCH                6, 11, blockq+16*6
784    SCRATCH                  2,  8, blockq+16*0
785
786    SUMSUB_BA                w,  6,  4, 2                   ; m6=out0[w], m4=t2[w]
787    SUMSUB_BA                w,  1,  3, 2
788    PSIGNW                  m1, W_M1_REG                    ; m1=out7[w], m3=t3[w]
789
790    ; m6=out0, m5=out1, m4=t2, m3=t3, m7=t6, m0=t7, m2=out6, m1=out7
791
792    ; unfortunately, the code below overflows in some cases
793%if 0; cpuflag(ssse3)
794    SUMSUB_BA                w,  3,  4,  2
795    SUMSUB_BA                w,  0,  7,  2
796    pmulhrsw                m3, W_11585x2_REG
797    pmulhrsw                m7, W_11585x2_REG
798    pmulhrsw                m4, W_11585x2_REG               ; out4
799    pmulhrsw                m0, W_11585x2_REG               ; out2
800%else
801    SCRATCH                  5,  9, blockq+16*1
802    VP9_UNPACK_MULSUB_2W_4X  4, 3, 11585, 11585, D_8192_REG, 2, 5
803    VP9_UNPACK_MULSUB_2W_4X  7, 0, 11585, 11585, D_8192_REG, 2, 5
804    UNSCRATCH                5,  9, blockq+16*1
805%endif
806    PSIGNW                  m3, W_M1_REG                    ; out3
807    PSIGNW                  m7, W_M1_REG                    ; out5
808
809    ; m6=out0, m5=out1, m0=out2, m3=out3, m4=out4, m7=out5, m2=out6, m1=out7
810
811%if ARCH_X86_64
812    SWAP                     2, 8
813%endif
814    SWAP                     0, 6, 2
815    SWAP                     7, 1, 5
816%endmacro
817
818%macro IADST8_FN 6
819INIT_XMM %5
820cglobal vp9_%1_%3_8x8_add, 3, 3, %6, dst, stride, block, eob
821
822%ifidn %1, idct
823%define first_is_idct 1
824%else
825%define first_is_idct 0
826%endif
827
828%ifidn %3, idct
829%define second_is_idct 1
830%else
831%define second_is_idct 0
832%endif
833
834%if ARCH_X86_64
835    mova                m0, [blockq+  0]    ; IN(0)
836%endif
837    mova                m1, [blockq+ 16]    ; IN(1)
838    mova                m2, [blockq+ 32]    ; IN(2)
839%if ARCH_X86_64 || first_is_idct
840    mova                m3, [blockq+ 48]    ; IN(3)
841%endif
842%if ARCH_X86_64
843    mova                m4, [blockq+ 64]    ; IN(4)
844%endif
845    mova                m5, [blockq+ 80]    ; IN(5)
846    mova                m6, [blockq+ 96]    ; IN(6)
847%if ARCH_X86_64 || first_is_idct
848    mova                m7, [blockq+112]    ; IN(7)
849%endif
850%if ARCH_X86_64
851%if cpuflag(ssse3)
852    mova               m15, [pw_11585x2]    ; often used
853%endif
854    mova               m13, [pd_8192]       ; rounding
855    mova               m14, [pw_m1]
856%define W_11585x2_REG m15
857%define D_8192_REG m13
858%define W_M1_REG m14
859%else
860%define W_11585x2_REG [pw_11585x2]
861%define D_8192_REG [pd_8192]
862%define W_M1_REG [pw_m1]
863%endif
864
865    ; note different calling conventions for idct8 vs. iadst8 on x86-32
866    VP9_%2_1D
867%if ARCH_X86_64
868    TRANSPOSE8x8W  0, 1, 2, 3, 4, 5, 6, 7, 8
869%else
870    TRANSPOSE8x8W  0, 1, 2, 3, 4, 5, 6, 7, [blockq+0], [blockq+64], 1
871    mova      [blockq+  0], m0
872%if second_is_idct == 0
873    mova      [blockq+ 48], m3
874    mova      [blockq+112], m7
875%endif
876%endif
877    VP9_%4_1D
878
879%if ARCH_X86_64
880    SWAP                 6, 8
881%endif
882    pxor                m6, m6  ; used for the block reset, and VP9_STORE_2X
883    VP9_IDCT8_WRITEOUT
884    ZERO_BLOCK      blockq, 16, 8, m6
885    RET
886
887%undef W_11585x2_REG
888%undef first_is_idct
889%undef second_is_idct
890
891%endmacro
892
893IADST8_FN idct,  IDCT8,  iadst, IADST8, sse2, 15
894IADST8_FN iadst, IADST8, idct,  IDCT8,  sse2, 15
895IADST8_FN iadst, IADST8, iadst, IADST8, sse2, 15
896IADST8_FN idct,  IDCT8,  iadst, IADST8, ssse3, 16
897IADST8_FN idct,  IDCT8,  iadst, IADST8, avx, 16
898IADST8_FN iadst, IADST8, idct,  IDCT8,  ssse3, 16
899IADST8_FN iadst, IADST8, idct,  IDCT8,  avx, 16
900IADST8_FN iadst, IADST8, iadst, IADST8, ssse3, 16
901IADST8_FN iadst, IADST8, iadst, IADST8, avx, 16
902
903;---------------------------------------------------------------------------------------------
904; void vp9_idct_idct_16x16_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
905;---------------------------------------------------------------------------------------------
906
907; x86-64:
908; at the end of this macro, m7 is stored in [%4+15*%5]
909; everything else (t0-6 and t8-15) is stored in m0-6 and m8-15
910; the following sumsubs have not been done yet:
911;    SUMSUB_BA            w,  6,  9, 15      ; t6, t9
912;    SUMSUB_BA            w,  7,  8, 15      ; t7, t8
913; or (x86-32) t0-t5 are in m0-m5, t10-t15 are in x11/9/7/5/3/1,
914; and the following simsubs have not been done yet:
915;    SUMSUB_BA            w, x13, x14, 7       ; t6, t9
916;    SUMSUB_BA            w, x15, x12, 7       ; t7, t8
917
918%macro VP9_IDCT16_1D_START 6 ; src, nnzc, stride, scratch, scratch_stride, is_iadst
919%if %2 <= 4
920    mova                m3, [%1+ 1*%3]      ; IN(1)
921    mova                m0, [%1+ 3*%3]      ; IN(3)
922
923    pmulhrsw            m4, m3,  [pw_16305x2]       ; t14-15
924    pmulhrsw            m3, [pw_1606x2]             ; t8-9
925    pmulhrsw            m7, m0,  [pw_m4756x2]       ; t10-11
926    pmulhrsw            m0, [pw_15679x2]            ; t12-13
927
928    ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m14=t5, m13=t6, m15=t7
929    ; m3=t8, m5=t9, m1=t10, m7=t11, m0=t12, m6=t13, m2=t14, m4=t15
930
931    VP9_UNPACK_MULSUB_2W_4X 2, 5, 4, 3, 15137,  6270, [pd_8192], 1, 6 ; t9,  t14
932    SCRATCH              4, 10, %4+ 1*%5
933    SCRATCH              5, 11, %4+ 7*%5
934    VP9_UNPACK_MULSUB_2W_4X 6, 1, 0, 7, 6270, m15137, [pd_8192], 4, 5 ; t10, t13
935    UNSCRATCH            5, 11, %4+ 7*%5
936
937    ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7
938    ; m7=t8, m6=t9, m2=t10, m3=t11, m4=t12, m5=t13, m1=t14, m0=t15
939%else
940    mova                m5, [%1+ 1*%3]      ; IN(1)
941    mova                m4, [%1+ 7*%3]      ; IN(7)
942%if %2 <= 8
943    pmulhrsw            m2, m5,  [pw_16305x2]       ; t15
944    pmulhrsw            m5, [pw_1606x2]             ; t8
945    pmulhrsw            m3, m4,  [pw_m10394x2]      ; t9
946    pmulhrsw            m4, [pw_12665x2]            ; t14
947%else
948    mova                m3, [%1+ 9*%3]      ; IN(9)
949    mova                m2, [%1+15*%3]      ; IN(15)
950
951    ; m10=in0, m5=in1, m14=in2, m6=in3, m9=in4, m7=in5, m15=in6, m4=in7
952    ; m11=in8, m3=in9, m12=in10 m0=in11, m8=in12, m1=in13, m13=in14, m2=in15
953
954    VP9_UNPACK_MULSUB_2W_4X   5,   2, 16305,  1606, [pd_8192], 0, 1 ; t8,  t15
955    VP9_UNPACK_MULSUB_2W_4X   3,   4, 10394, 12665, [pd_8192], 0, 1 ; t9,  t14
956%endif
957
958    SUMSUB_BA            w,  3,  5, 0       ; t8,  t9
959    SUMSUB_BA            w,  4,  2, 0       ; t15, t14
960
961    VP9_UNPACK_MULSUB_2W_4X   2,   5, 15137,  6270, [pd_8192], 0, 1 ; t9,  t14
962
963    SCRATCH              4, 10, %4+ 1*%5
964    SCRATCH              5, 11, %4+ 7*%5
965
966    mova                m6, [%1+ 3*%3]      ; IN(3)
967    mova                m7, [%1+ 5*%3]      ; IN(5)
968%if %2 <= 8
969    pmulhrsw            m0, m7,  [pw_14449x2]       ; t13
970    pmulhrsw            m7, [pw_7723x2]             ; t10
971    pmulhrsw            m1, m6,  [pw_m4756x2]       ; t11
972    pmulhrsw            m6, [pw_15679x2]            ; t12
973%else
974    mova                m0, [%1+11*%3]      ; IN(11)
975    mova                m1, [%1+13*%3]      ; IN(13)
976
977    VP9_UNPACK_MULSUB_2W_4X   7,   0, 14449,  7723, [pd_8192], 4, 5 ; t10, t13
978    VP9_UNPACK_MULSUB_2W_4X   1,   6,  4756, 15679, [pd_8192], 4, 5 ; t11, t12
979%endif
980
981    ; m11=t0, m10=t1, m9=t2, m8=t3, m14=t4, m12=t5, m15=t6, m13=t7
982    ; m5=t8, m3=t9, m7=t10, m1=t11, m6=t12, m0=t13, m4=t14, m2=t15
983
984    SUMSUB_BA            w,  7,  1, 4       ; t11, t10
985    SUMSUB_BA            w,  0,  6, 4       ; t12, t13
986
987    ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m14=t5, m13=t6, m15=t7
988    ; m3=t8, m5=t9, m1=t10, m7=t11, m0=t12, m6=t13, m2=t14, m4=t15
989
990    VP9_UNPACK_MULSUB_2W_4X   6,   1, 6270, m15137, [pd_8192], 4, 5 ; t10, t13
991
992    UNSCRATCH            5, 11, %4+ 7*%5
993%endif
994
995    ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m13=t5, m14=t6, m15=t7
996    ; m3=t8, m2=t9, m6=t10, m7=t11, m0=t12, m1=t13, m5=t14, m4=t15
997
998    SUMSUB_BA            w,  7,  3, 4       ; t8,  t11
999
1000    ; backup first register
1001    mova        [%4+15*%5], m7
1002
1003    SUMSUB_BA            w,  6,  2, 7       ; t9,  t10
1004    UNSCRATCH            4, 10, %4+ 1*%5
1005    SUMSUB_BA            w,  0,  4, 7       ; t15, t12
1006    SUMSUB_BA            w,  1,  5, 7       ; t14. t13
1007
1008    ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7
1009    ; m7=t8, m6=t9, m2=t10, m3=t11, m4=t12, m5=t13, m1=t14, m0=t15
1010
1011%if cpuflag(ssse3) && %6 == 0
1012    SUMSUB_BA            w,  2,  5, 7
1013    SUMSUB_BA            w,  3,  4, 7
1014    pmulhrsw            m5, [pw_11585x2]    ; t10
1015    pmulhrsw            m4, [pw_11585x2]    ; t11
1016    pmulhrsw            m3, [pw_11585x2]    ; t12
1017    pmulhrsw            m2, [pw_11585x2]    ; t13
1018%else
1019    SCRATCH              6, 10, %4+ 1*%5
1020    VP9_UNPACK_MULSUB_2W_4X   5,   2, 11585, 11585, [pd_8192], 6, 7 ; t10, t13
1021    VP9_UNPACK_MULSUB_2W_4X   4,   3, 11585, 11585, [pd_8192], 6, 7 ; t11, t12
1022    UNSCRATCH            6, 10, %4+ 1*%5
1023%endif
1024
1025    ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7
1026    ; m7=t8, m6=t9, m5=t10, m4=t11, m3=t12, m2=t13, m1=t14, m0=t15
1027
1028    SCRATCH              0,  8, %4+ 1*%5
1029    SCRATCH              1,  9, %4+ 3*%5
1030    SCRATCH              2, 10, %4+ 5*%5
1031    SCRATCH              3, 11, %4+ 7*%5
1032    SCRATCH              4, 12, %4+ 9*%5
1033    SCRATCH              5, 13, %4+11*%5
1034    SCRATCH              6, 14, %4+13*%5
1035
1036    ; even (tx8x8)
1037%if %2 <= 4
1038    mova                m3, [%1+ 0*%3]      ; IN(0)
1039    mova                m4, [%1+ 2*%3]      ; IN(2)
1040
1041    pmulhrsw            m3, [pw_11585x2]    ; t0-t3
1042    pmulhrsw            m7, m4, [pw_16069x2]        ; t6-7
1043    pmulhrsw            m4, [pw_3196x2]             ; t4-5
1044
1045%if 0 ; overflows :(
1046    paddw               m6, m7, m4
1047    psubw               m5, m7, m4
1048    pmulhrsw            m5, [pw_11585x2]            ; t5
1049    pmulhrsw            m6, [pw_11585x2]            ; t6
1050%else
1051    VP9_UNPACK_MULSUB_2W_4X  5, 6, 7, 4, 11585, 11585, [pd_8192], 0, 1 ; t5,  t6
1052%endif
1053
1054    psubw               m0, m3, m7
1055    paddw               m7, m3
1056    psubw               m1, m3, m6
1057    paddw               m6, m3
1058    psubw               m2, m3, m5
1059    paddw               m5, m3
1060
1061%if ARCH_X86_32
1062    SWAP                 0, 7
1063%endif
1064    SCRATCH              7, 15, %4+12*%5
1065%else
1066    mova                m6, [%1+ 2*%3]      ; IN(2)
1067    mova                m1, [%1+ 4*%3]      ; IN(4)
1068    mova                m7, [%1+ 6*%3]      ; IN(6)
1069%if %2 <= 8
1070    pmulhrsw            m0, m1,  [pw_15137x2]       ; t3
1071    pmulhrsw            m1, [pw_6270x2]             ; t2
1072    pmulhrsw            m5, m6, [pw_16069x2]        ; t7
1073    pmulhrsw            m6, [pw_3196x2]             ; t4
1074    pmulhrsw            m4, m7, [pw_m9102x2]        ; t5
1075    pmulhrsw            m7, [pw_13623x2]            ; t6
1076%else
1077    mova                m4, [%1+10*%3]      ; IN(10)
1078    mova                m0, [%1+12*%3]      ; IN(12)
1079    mova                m5, [%1+14*%3]      ; IN(14)
1080
1081    VP9_UNPACK_MULSUB_2W_4X   1,   0, 15137,  6270, [pd_8192], 2, 3 ; t2,  t3
1082    VP9_UNPACK_MULSUB_2W_4X   6,   5, 16069,  3196, [pd_8192], 2, 3 ; t4,  t7
1083    VP9_UNPACK_MULSUB_2W_4X   4,   7,  9102, 13623, [pd_8192], 2, 3 ; t5,  t6
1084%endif
1085
1086    SUMSUB_BA            w,  4,  6, 2       ; t4,  t5
1087    SUMSUB_BA            w,  7,  5, 2       ; t7,  t6
1088
1089%if cpuflag(ssse3) && %6 == 0
1090    SUMSUB_BA            w,  6,  5, 2
1091    pmulhrsw            m5, [pw_11585x2]                              ; t5
1092    pmulhrsw            m6, [pw_11585x2]                              ; t6
1093%else
1094    VP9_UNPACK_MULSUB_2W_4X  5,  6, 11585, 11585, [pd_8192], 2, 3 ; t5,  t6
1095%endif
1096
1097    SCRATCH              5, 15, %4+10*%5
1098    mova                m2, [%1+ 0*%3]      ; IN(0)
1099%if %2 <= 8
1100    pmulhrsw            m2, [pw_11585x2]    ; t0 and t1
1101    psubw               m3, m2, m0
1102    paddw               m0, m2
1103
1104    SUMSUB_BA            w,  7,  0, 5       ; t0,  t7
1105%else
1106    mova                m3, [%1+ 8*%3]      ; IN(8)
1107
1108    ; from 3 stages back
1109%if cpuflag(ssse3) && %6 == 0
1110    SUMSUB_BA            w,  3,  2, 5
1111    pmulhrsw            m3, [pw_11585x2]    ; t0
1112    pmulhrsw            m2, [pw_11585x2]    ; t1
1113%else
1114    mova        [%1+ 0*%3], m0
1115    VP9_UNPACK_MULSUB_2W_4X  2,  3, 11585,  11585, [pd_8192], 5, 0 ; t0, t1
1116    mova                m0, [%1+ 0*%3]
1117%endif
1118
1119    ; from 2 stages back
1120    SUMSUB_BA            w,  0,  3, 5      ; t0,  t3
1121
1122    SUMSUB_BA            w,  7,  0, 5      ; t0,  t7
1123%endif
1124    UNSCRATCH            5, 15, %4+10*%5
1125%if ARCH_X86_32
1126    SWAP                 0, 7
1127%endif
1128    SCRATCH              7, 15, %4+12*%5
1129    SUMSUB_BA            w,  1,  2, 7       ; t1,  t2
1130
1131    ; from 1 stage back
1132    SUMSUB_BA            w,  6,  1, 7       ; t1,  t6
1133    SUMSUB_BA            w,  5,  2, 7       ; t2,  t5
1134%endif
1135    SUMSUB_BA            w,  4,  3, 7       ; t3,  t4
1136
1137%if ARCH_X86_64
1138    SWAP                 0, 8
1139    SWAP                 1, 9
1140    SWAP                 2, 10
1141    SWAP                 3, 11
1142    SWAP                 4, 12
1143    SWAP                 5, 13
1144    SWAP                 6, 14
1145
1146    SUMSUB_BA            w,  0, 15, 7       ; t0, t15
1147    SUMSUB_BA            w,  1, 14, 7       ; t1, t14
1148    SUMSUB_BA            w,  2, 13, 7       ; t2, t13
1149    SUMSUB_BA            w,  3, 12, 7       ; t3, t12
1150    SUMSUB_BA            w,  4, 11, 7       ; t4, t11
1151    SUMSUB_BA            w,  5, 10, 7       ; t5, t10
1152%else
1153    SWAP                 1, 6
1154    SWAP                 2, 5
1155    SWAP                 3, 4
1156    mova        [%4+14*%5], m6
1157
1158%macro %%SUMSUB_BA_STORE 5 ; reg, from_mem, to_mem, scratch, scratch_stride
1159    mova                m6, [%4+%2*%5]
1160    SUMSUB_BA            w,  6, %1, 7
1161    SWAP                %1, 6
1162    mova        [%4+%3*%5], m6
1163%endmacro
1164
1165    %%SUMSUB_BA_STORE    0,  1,  1, %4, %5  ; t0, t15
1166    %%SUMSUB_BA_STORE    1,  3,  3, %4, %5  ; t1, t14
1167    %%SUMSUB_BA_STORE    2,  5,  5, %4, %5  ; t2, t13
1168    %%SUMSUB_BA_STORE    3,  7,  7, %4, %5  ; t3, t12
1169    %%SUMSUB_BA_STORE    4,  9,  9, %4, %5  ; t4, t11
1170    %%SUMSUB_BA_STORE    5, 11, 11, %4, %5  ; t5, t10
1171%endif
1172%endmacro
1173
1174%macro VP9_IDCT16_1D 2-4 16, 1 ; src, pass, nnzc, is_iadst
1175%if %2 == 1
1176    VP9_IDCT16_1D_START %1, %3, 32, tmpq, 16, %4
1177
1178%if ARCH_X86_64
1179    ; backup a different register
1180    mova                m7, [tmpq+15*16]
1181    mova      [tmpq+ 1*16], m15
1182
1183    SUMSUB_BA            w,  6,  9, 15      ; t6, t9
1184    SUMSUB_BA            w,  7,  8, 15      ; t7, t8
1185
1186    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, 15
1187    mova        [tmpq+  0], m0
1188    mova        [tmpq+ 32], m1
1189    mova        [tmpq+ 64], m2
1190    mova        [tmpq+ 96], m3
1191    mova        [tmpq+128], m4
1192    mova        [tmpq+160], m5
1193    mova        [tmpq+192], m6
1194    mova        [tmpq+224], m7
1195
1196    mova               m15, [tmpq+ 1*16]
1197    TRANSPOSE8x8W        8, 9, 10, 11, 12, 13, 14, 15, 0
1198    mova        [tmpq+ 16], m8
1199    mova        [tmpq+ 48], m9
1200    mova        [tmpq+ 80], m10
1201    mova        [tmpq+112], m11
1202    mova        [tmpq+144], m12
1203    mova        [tmpq+176], m13
1204    mova        [tmpq+208], m14
1205    mova        [tmpq+240], m15
1206%else
1207    mova                m6, [tmpq+13*16]
1208    mova                m7, [tmpq+14*16]
1209    SUMSUB_BA            w, 6, 7                ; t6, t9
1210    mova      [tmpq+14*16], m6
1211    mova      [tmpq+13*16], m7
1212    mova                m7, [tmpq+15*16]
1213    mova                m6, [tmpq+12*16]
1214    SUMSUB_BA            w, 7, 6                ; t7, t8
1215    mova      [tmpq+15*16], m6
1216
1217    TRANSPOSE8x8W       0, 1, 2, 3, 4, 5, 6, 7, [tmpq+14*16], [tmpq+ 8*16], 1
1218    mova     [tmpq+ 0*16], m0
1219    mova     [tmpq+ 2*16], m1
1220    mova     [tmpq+ 4*16], m2
1221    mova     [tmpq+ 6*16], m3
1222    mova     [tmpq+10*16], m5
1223    mova     [tmpq+12*16], m6
1224    mova     [tmpq+14*16], m7
1225
1226    mova                m0, [tmpq+15*16]
1227    mova                m1, [tmpq+13*16]
1228    mova                m2, [tmpq+11*16]
1229    mova                m3, [tmpq+ 9*16]
1230    mova                m4, [tmpq+ 7*16]
1231    mova                m5, [tmpq+ 5*16]
1232    mova                m7, [tmpq+ 1*16]
1233    TRANSPOSE8x8W       0, 1, 2, 3, 4, 5, 6, 7, [tmpq+ 3*16], [tmpq+ 9*16], 1
1234    mova     [tmpq+ 1*16], m0
1235    mova     [tmpq+ 3*16], m1
1236    mova     [tmpq+ 5*16], m2
1237    mova     [tmpq+ 7*16], m3
1238    mova     [tmpq+11*16], m5
1239    mova     [tmpq+13*16], m6
1240    mova     [tmpq+15*16], m7
1241%endif
1242%else ; %2 == 2
1243    VP9_IDCT16_1D_START %1, %3, 32, %1, 32, %4
1244
1245%if cpuflag(ssse3)
1246%define ROUND_REG [pw_512]
1247%else
1248%define ROUND_REG [pw_32]
1249%endif
1250
1251    pxor                m7, m7
1252%if ARCH_X86_64
1253    ; backup more registers
1254    mova        [%1+ 2*32], m8
1255    mova        [%1+ 3*32], m9
1256
1257    VP9_IDCT8_WRITEx2    0,  1, 8, 9, 7, ROUND_REG, 6
1258    lea               dstq, [dstq+strideq*2]
1259    VP9_IDCT8_WRITEx2    2,  3, 8, 9, 7, ROUND_REG, 6
1260    lea               dstq, [dstq+strideq*2]
1261    VP9_IDCT8_WRITEx2    4,  5, 8, 9, 7, ROUND_REG, 6
1262    lea               dstq, [dstq+strideq*2]
1263
1264    ; restore from cache
1265    SWAP                 0, 7               ; move zero from m7 to m0
1266    mova                m7, [%1+15*32]
1267    mova                m8, [%1+ 2*32]
1268    mova                m9, [%1+ 3*32]
1269
1270    SUMSUB_BA            w,  6,  9, 3       ; t6, t9
1271    SUMSUB_BA            w,  7,  8, 3       ; t7, t8
1272
1273    VP9_IDCT8_WRITEx2    6,  7, 3, 4, 0, ROUND_REG, 6
1274    lea               dstq, [dstq+strideq*2]
1275    VP9_IDCT8_WRITEx2    8,  9, 3, 4, 0, ROUND_REG, 6
1276    lea               dstq, [dstq+strideq*2]
1277    VP9_IDCT8_WRITEx2   10, 11, 1, 2, 0, ROUND_REG, 6
1278    lea               dstq, [dstq+strideq*2]
1279    VP9_IDCT8_WRITEx2   12, 13, 1, 2, 0, ROUND_REG, 6
1280    lea               dstq, [dstq+strideq*2]
1281    VP9_IDCT8_WRITEx2   14, 15, 1, 2, 0, ROUND_REG, 6
1282%else
1283    mova      [tmpq+ 0*32], m5
1284
1285    VP9_IDCT8_WRITEx2    0,  1, 5, 6, 7, ROUND_REG, 6
1286    lea               dstq, [dstq+strideq*2]
1287    VP9_IDCT8_WRITEx2    2,  3, 5, 6, 7, ROUND_REG, 6
1288    lea               dstq, [dstq+strideq*2]
1289
1290    SWAP                 0, 7               ; move zero from m7 to m0
1291    mova                m5, [tmpq+ 0*32]
1292
1293    VP9_IDCT8_WRITEx2    4,  5, 1, 2, 0, ROUND_REG, 6
1294    lea               dstq, [dstq+strideq*2]
1295
1296    mova                m4, [tmpq+13*32]
1297    mova                m7, [tmpq+14*32]
1298    mova                m5, [tmpq+15*32]
1299    mova                m6, [tmpq+12*32]
1300    SUMSUB_BADC w, 4, 7, 5, 6, 1
1301
1302    VP9_IDCT8_WRITEx2    4,  5, 1, 2, 0, ROUND_REG, 6
1303    lea               dstq, [dstq+strideq*2]
1304    VP9_IDCT8_WRITEx2    6,  7, 1, 2, 0, ROUND_REG, 6
1305    lea               dstq, [dstq+strideq*2]
1306
1307    mova                m4, [tmpq+11*32]
1308    mova                m5, [tmpq+ 9*32]
1309    mova                m6, [tmpq+ 7*32]
1310    mova                m7, [tmpq+ 5*32]
1311
1312    VP9_IDCT8_WRITEx2    4,  5, 1, 2, 0, ROUND_REG, 6
1313    lea               dstq, [dstq+strideq*2]
1314    VP9_IDCT8_WRITEx2    6,  7, 1, 2, 0, ROUND_REG, 6
1315    lea               dstq, [dstq+strideq*2]
1316
1317    mova                m4, [tmpq+ 3*32]
1318    mova                m5, [tmpq+ 1*32]
1319
1320    VP9_IDCT8_WRITEx2    4,  5, 1, 2, 0, ROUND_REG, 6
1321    lea               dstq, [dstq+strideq*2]
1322%endif
1323
1324%undef ROUND_REG
1325%endif ; %2 == 1/2
1326%endmacro
1327
1328%macro VP9_STORE_2XFULL 6-7 strideq; dc, tmp1, tmp2, tmp3, tmp4, zero, stride
1329    mova               m%3, [dstq]
1330    mova               m%5, [dstq+%7]
1331    punpcklbw          m%2, m%3, m%6
1332    punpckhbw          m%3, m%6
1333    punpcklbw          m%4, m%5, m%6
1334    punpckhbw          m%5, m%6
1335    paddw              m%2, m%1
1336    paddw              m%3, m%1
1337    paddw              m%4, m%1
1338    paddw              m%5, m%1
1339    packuswb           m%2, m%3
1340    packuswb           m%4, m%5
1341    mova            [dstq], m%2
1342    mova         [dstq+%7], m%4
1343%endmacro
1344
1345%macro VP9_IDCT_IDCT_16x16_ADD_XMM 1
1346INIT_XMM %1
1347cglobal vp9_idct_idct_16x16_add, 4, 6, 16, 512, dst, stride, block, eob
1348%if cpuflag(ssse3)
1349    ; 2x2=eob=3, 4x4=eob=10
1350    cmp eobd, 38
1351    jg .idctfull
1352    cmp eobd, 1 ; faster path for when only DC is set
1353    jne .idct8x8
1354%else
1355    cmp eobd, 1 ; faster path for when only DC is set
1356    jg .idctfull
1357%endif
1358
1359    ; dc-only
1360%if cpuflag(ssse3)
1361    movd                m0, [blockq]
1362    mova                m1, [pw_11585x2]
1363    pmulhrsw            m0, m1
1364    pmulhrsw            m0, m1
1365%else
1366    DEFINE_ARGS dst, stride, block, coef
1367    movsx            coefd, word [blockq]
1368    imul             coefd, 11585
1369    add              coefd, 8192
1370    sar              coefd, 14
1371    imul             coefd, 11585
1372    add              coefd, (32 << 14) + 8192
1373    sar              coefd, 14 + 6
1374    movd                m0, coefd
1375%endif
1376    SPLATW              m0, m0, q0000
1377%if cpuflag(ssse3)
1378    pmulhrsw            m0, [pw_512]
1379%endif
1380    pxor                m5, m5
1381    movd          [blockq], m5
1382%rep 7
1383    VP9_STORE_2XFULL    0, 1, 2, 3, 4, 5
1384    lea               dstq, [dstq+2*strideq]
1385%endrep
1386    VP9_STORE_2XFULL    0, 1, 2, 3, 4, 5
1387    RET
1388
1389    DEFINE_ARGS dst, stride, block, cnt, dst_bak, tmp
1390%if cpuflag(ssse3)
1391.idct8x8:
1392    mov               tmpq, rsp
1393    VP9_IDCT16_1D   blockq, 1, 8, 0
1394
1395    mov               cntd, 2
1396    mov           dst_bakq, dstq
1397.loop2_8x8:
1398    VP9_IDCT16_1D     tmpq, 2, 8, 0
1399    lea               dstq, [dst_bakq+8]
1400    add               tmpq, 16
1401    dec               cntd
1402    jg .loop2_8x8
1403
1404    ; at the end of the loop, m0 should still be zero
1405    ; use that to zero out block coefficients
1406    ZERO_BLOCK      blockq, 32, 8, m0
1407    RET
1408%endif
1409
1410.idctfull:
1411    mov               cntd, 2
1412    mov               tmpq, rsp
1413.loop1_full:
1414    VP9_IDCT16_1D   blockq, 1, 16, 0
1415    add             blockq, 16
1416    add               tmpq, 256
1417    dec               cntd
1418    jg .loop1_full
1419    sub             blockq, 32
1420
1421    mov               cntd, 2
1422    mov               tmpq, rsp
1423    mov           dst_bakq, dstq
1424.loop2_full:
1425    VP9_IDCT16_1D     tmpq, 2, 16, 0
1426    lea               dstq, [dst_bakq+8]
1427    add               tmpq, 16
1428    dec               cntd
1429    jg .loop2_full
1430
1431    ; at the end of the loop, m0 should still be zero
1432    ; use that to zero out block coefficients
1433    ZERO_BLOCK      blockq, 32, 16, m0
1434    RET
1435%endmacro
1436
1437VP9_IDCT_IDCT_16x16_ADD_XMM sse2
1438VP9_IDCT_IDCT_16x16_ADD_XMM ssse3
1439VP9_IDCT_IDCT_16x16_ADD_XMM avx
1440
1441%macro VP9_IDCT16_YMM_1D 0
1442    VP9_UNPACK_MULSUB_2W_4X  1,  15, 16305,  1606, [pd_8192], 0, 4 ; t8,  t15
1443    VP9_UNPACK_MULSUB_2W_4X  9,   7, 10394, 12665, [pd_8192], 0, 4 ; t9,  t14
1444
1445    SUMSUB_BA            w,  9,   1, 0      ; t8,  t9
1446    SUMSUB_BA            w,  7,  15, 0      ; t15, t14
1447
1448    VP9_UNPACK_MULSUB_2W_4X 15,   1, 15137,  6270, [pd_8192], 0, 4 ; t9,  t14
1449
1450    VP9_UNPACK_MULSUB_2W_4X  5,  11, 14449,  7723, [pd_8192], 0, 4 ; t10, t13
1451    VP9_UNPACK_MULSUB_2W_4X 13,   3,  4756, 15679, [pd_8192], 0, 4 ; t11, t12
1452
1453    SUMSUB_BA            w,  5,  13, 0      ; t11, t10
1454    SUMSUB_BA            w, 11,   3, 0      ; t12, t13
1455
1456    VP9_UNPACK_MULSUB_2W_4X  3,  13, 6270, m15137, [pd_8192], 0, 4 ; t10, t13
1457
1458    SUMSUB_BA            w,  5,   9, 0      ; t8,  t11
1459    SUMSUB_BA            w,  3,  15, 0      ; t9,  t10
1460    SUMSUB_BA            w, 11,   7, 0      ; t15, t12
1461    SUMSUB_BA            w, 13,   1, 0      ; t14, t13
1462
1463    SUMSUB_BA            w, 15,   1, 0
1464    SUMSUB_BA            w,  9,   7, 0
1465    pmulhrsw            m1, [pw_11585x2]    ; t10
1466    pmulhrsw            m7, [pw_11585x2]    ; t11
1467    pmulhrsw            m9, [pw_11585x2]    ; t12
1468    pmulhrsw           m15, [pw_11585x2]    ; t13
1469
1470    ; even (tx8x8)
1471    mova                m4, [blockq+128]
1472    mova      [blockq+128], m5
1473    VP9_UNPACK_MULSUB_2W_4X   4,  12, 15137,  6270, [pd_8192], 0, 5 ; t2,  t3
1474    VP9_UNPACK_MULSUB_2W_4X   2,  14, 16069,  3196, [pd_8192], 0, 5 ; t4,  t7
1475    VP9_UNPACK_MULSUB_2W_4X  10,   6,  9102, 13623, [pd_8192], 0, 5 ; t5,  t6
1476    mova                m0, [blockq+  0]
1477    SUMSUB_BA            w,   8,   0, 5
1478    pmulhrsw            m8, [pw_11585x2]    ; t0
1479    pmulhrsw            m0, [pw_11585x2]    ; t1
1480
1481    SUMSUB_BA            w,  10,   2, 5     ; t4,  t5
1482    SUMSUB_BA            w,   6,  14, 5     ; t7,  t6
1483    SUMSUB_BA            w,  12,   8, 5     ; t0,  t3
1484    SUMSUB_BA            w,   4,   0, 5     ; t1,  t2
1485
1486    SUMSUB_BA            w,   2,  14, 5
1487    pmulhrsw           m14, [pw_11585x2]    ; t5
1488    pmulhrsw            m2, [pw_11585x2]    ; t6
1489
1490    SUMSUB_BA            w,   6,  12, 5     ; t0,  t7
1491    SUMSUB_BA            w,   2,   4, 5     ; t1,  t6
1492    SUMSUB_BA            w,  14,   0, 5     ; t2,  t5
1493    SUMSUB_BA            w,  10,   8, 5     ; t3,  t4
1494
1495    ; final stage
1496    SUMSUB_BA            w, 11,  6,  5      ; out0, out15
1497    SUMSUB_BA            w, 13,  2,  5      ; out1, out14
1498    SUMSUB_BA            w, 15, 14,  5      ; out2, out13
1499    SUMSUB_BA            w,  9, 10,  5      ; out3, out12
1500    SUMSUB_BA            w,  7,  8,  5      ; out4, out11
1501    SUMSUB_BA            w,  1,  0,  5      ; out5, out10
1502    SUMSUB_BA            w,  3,  4,  5      ; out6, out9
1503    mova                m5, [blockq+128]
1504    mova      [blockq+192], m3
1505    SUMSUB_BA            w,  5, 12,  3      ; out7, out8
1506
1507    SWAP  0, 11,  8, 12, 10
1508    SWAP  1, 13, 14,  2, 15,  6,  3,  9,  4,  7,  5
1509%endmacro
1510
1511; this is almost identical to VP9_STORE_2X, but it does two rows
1512; for slightly improved interleaving, and it omits vpermq since the
1513; input is DC so all values are identical
1514%macro VP9_STORE_YMM_DC_4X 6 ; reg, tmp1, tmp2, tmp3, tmp4, zero
1515    mova              xm%2, [dstq]
1516    mova              xm%4, [dstq+strideq*2]
1517    vinserti128        m%2, m%2, [dstq+strideq], 1
1518    vinserti128        m%4, m%4, [dstq+stride3q], 1
1519    punpckhbw          m%3, m%2, m%6
1520    punpcklbw          m%2, m%6
1521    punpckhbw          m%5, m%4, m%6
1522    punpcklbw          m%4, m%6
1523    paddw              m%3, m%1
1524    paddw              m%2, m%1
1525    paddw              m%5, m%1
1526    paddw              m%4, m%1
1527    packuswb           m%2, m%3
1528    packuswb           m%4, m%5
1529    mova            [dstq], xm%2
1530    mova        [dstq+strideq*2], xm%4
1531    vextracti128  [dstq+strideq], m%2, 1
1532    vextracti128 [dstq+stride3q], m%4, 1
1533%endmacro
1534
1535%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
1536INIT_YMM avx2
1537cglobal vp9_idct_idct_16x16_add, 4, 4, 16, dst, stride, block, eob
1538    cmp eobd, 1 ; faster path for when only DC is set
1539    jg .idctfull
1540
1541    ; dc-only
1542    mova                m1, [pw_11585x2]
1543    vpbroadcastw        m0, [blockq]
1544    pmulhrsw            m0, m1
1545    pmulhrsw            m0, m1
1546    pxor                m5, m5
1547    pmulhrsw            m0, [pw_512]
1548    movd          [blockq], xm5
1549
1550    DEFINE_ARGS dst, stride, stride3, cnt
1551    mov               cntd, 4
1552    lea           stride3q, [strideq*3]
1553.loop_dc:
1554    VP9_STORE_YMM_DC_4X  0, 1, 2, 3, 4, 5
1555    lea               dstq, [dstq+4*strideq]
1556    dec               cntd
1557    jg .loop_dc
1558    RET
1559
1560    DEFINE_ARGS dst, stride, block, eob
1561.idctfull:
1562    mova                m1, [blockq+ 32]
1563    mova                m2, [blockq+ 64]
1564    mova                m3, [blockq+ 96]
1565    mova                m5, [blockq+160]
1566    mova                m6, [blockq+192]
1567    mova                m7, [blockq+224]
1568    mova                m8, [blockq+256]
1569    mova                m9, [blockq+288]
1570    mova               m10, [blockq+320]
1571    mova               m11, [blockq+352]
1572    mova               m12, [blockq+384]
1573    mova               m13, [blockq+416]
1574    mova               m14, [blockq+448]
1575    mova               m15, [blockq+480]
1576
1577    VP9_IDCT16_YMM_1D
1578    TRANSPOSE16x16W      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
1579                         [blockq+192], [blockq+128], 1
1580    mova      [blockq+  0], m0
1581    VP9_IDCT16_YMM_1D
1582
1583    mova      [blockq+224], m7
1584
1585    ; store
1586    VP9_IDCT8_WRITEx2    0,  1, 6, 7, unused, [pw_512], 6
1587    lea               dstq, [dstq+2*strideq]
1588    VP9_IDCT8_WRITEx2    2,  3, 6, 7, unused, [pw_512], 6
1589    lea               dstq, [dstq+2*strideq]
1590    VP9_IDCT8_WRITEx2    4,  5, 6, 7, unused, [pw_512], 6
1591    lea               dstq, [dstq+2*strideq]
1592    mova                m6, [blockq+192]
1593    mova                m7, [blockq+224]
1594    VP9_IDCT8_WRITEx2    6,  7, 1, 2, unused, [pw_512], 6
1595    lea               dstq, [dstq+2*strideq]
1596    VP9_IDCT8_WRITEx2    8,  9, 1, 2, unused, [pw_512], 6
1597    lea               dstq, [dstq+2*strideq]
1598    VP9_IDCT8_WRITEx2   10, 11, 1, 2, unused, [pw_512], 6
1599    lea               dstq, [dstq+2*strideq]
1600    VP9_IDCT8_WRITEx2   12, 13, 1, 2, unused, [pw_512], 6
1601    lea               dstq, [dstq+2*strideq]
1602    VP9_IDCT8_WRITEx2   14, 15, 1, 2, unused, [pw_512], 6
1603    lea               dstq, [dstq+2*strideq]
1604
1605    ; at the end of the loop, m0 should still be zero
1606    ; use that to zero out block coefficients
1607    pxor                m0, m0
1608    ZERO_BLOCK      blockq, 32, 16, m0
1609    RET
1610%endif
1611
1612;---------------------------------------------------------------------------------------------
1613; void vp9_iadst_iadst_16x16_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
1614;---------------------------------------------------------------------------------------------
1615
1616%macro VP9_IADST16_1D 2 ; src, pass
1617%assign %%str 16*%2
1618    mova                m0, [%1+ 0*32]  ; in0
1619    mova                m1, [%1+15*32]  ; in15
1620    mova                m2, [%1+ 7*32]  ; in7
1621    mova                m3, [%1+ 8*32]  ; in8
1622
1623    VP9_UNPACK_MULSUB_2D_4X  1,  0,  4,  5, 16364,   804    ; m1/4=t1[d], m0/5=t0[d]
1624    VP9_UNPACK_MULSUB_2D_4X  2,  3,  7,  6, 11003, 12140    ; m2/7=t9[d], m3/6=t8[d]
1625    SCRATCH              4, 8, tmpq+ 0*%%str
1626    VP9_RND_SH_SUMSUB_BA     3,  0,  6,  5,  4, [pd_8192]   ; m3=t0[w], m0=t8[w]
1627    UNSCRATCH            4, 8, tmpq+ 0*%%str
1628    VP9_RND_SH_SUMSUB_BA     2,  1,  7,  4,  5, [pd_8192]   ; m2=t1[w], m1=t9[w]
1629
1630    SCRATCH              0, 10, tmpq+ 0*%%str
1631    SCRATCH              1, 11, tmpq+15*%%str
1632    mova   [tmpq+ 7*%%str], m2
1633    mova   [tmpq+ 8*%%str], m3
1634
1635    mova                m1, [%1+ 2*32]  ; in2
1636    mova                m0, [%1+13*32]  ; in13
1637    mova                m3, [%1+ 5*32]  ; in5
1638    mova                m2, [%1+10*32]  ; in10
1639
1640    VP9_UNPACK_MULSUB_2D_4X  0,  1,  6,  7, 15893,  3981    ; m0/6=t3[d], m1/7=t2[d]
1641    VP9_UNPACK_MULSUB_2D_4X  3,  2,  4,  5,  8423, 14053    ; m3/4=t11[d], m2/5=t10[d]
1642    SCRATCH              4, 12, tmpq+ 2*%%str
1643    VP9_RND_SH_SUMSUB_BA     2,  1,  5,  7,  4, [pd_8192]   ; m2=t2[w], m1=t10[w]
1644    UNSCRATCH            4, 12, tmpq+ 2*%%str
1645    VP9_RND_SH_SUMSUB_BA     3,  0,  4,  6,  5, [pd_8192]   ; m3=t3[w], m0=t11[w]
1646
1647    SCRATCH              0, 12, tmpq+ 2*%%str
1648    SCRATCH              1, 13, tmpq+13*%%str
1649    mova   [tmpq+ 5*%%str], m2
1650    mova   [tmpq+10*%%str], m3
1651
1652    mova                m2, [%1+ 4*32]  ; in4
1653    mova                m3, [%1+11*32]  ; in11
1654    mova                m0, [%1+ 3*32]  ; in3
1655    mova                m1, [%1+12*32]  ; in12
1656
1657    VP9_UNPACK_MULSUB_2D_4X  3,  2,  7,  6, 14811,  7005    ; m3/7=t5[d], m2/6=t4[d]
1658    VP9_UNPACK_MULSUB_2D_4X  0,  1,  4,  5,  5520, 15426    ; m0/4=t13[d], m1/5=t12[d]
1659    SCRATCH              4, 9, tmpq+ 4*%%str
1660    VP9_RND_SH_SUMSUB_BA     1,  2,  5,  6,  4, [pd_8192]   ; m1=t4[w], m2=t12[w]
1661    UNSCRATCH            4, 9, tmpq+ 4*%%str
1662    VP9_RND_SH_SUMSUB_BA     0,  3,  4,  7,  6, [pd_8192]   ; m0=t5[w], m3=t13[w]
1663
1664    SCRATCH              0,  8, tmpq+ 4*%%str
1665    mova   [tmpq+11*%%str], m1          ; t4:m1->r11
1666    UNSCRATCH            0, 10, tmpq+ 0*%%str
1667    UNSCRATCH            1, 11, tmpq+15*%%str
1668
1669    ; round 2 interleaved part 1
1670    VP9_UNPACK_MULSUB_2D_4X  0,  1,  6,  7, 16069,  3196    ; m1/7=t8[d], m0/6=t9[d]
1671    VP9_UNPACK_MULSUB_2D_4X  3,  2,  5,  4,  3196, 16069    ; m3/5=t12[d], m2/4=t13[d]
1672    SCRATCH              4, 9, tmpq+ 3*%%str
1673    VP9_RND_SH_SUMSUB_BA     3,  1,  5,  7,  4, [pd_8192]   ; m3=t8[w], m1=t12[w]
1674    UNSCRATCH            4, 9, tmpq+ 3*%%str
1675    VP9_RND_SH_SUMSUB_BA     2,  0,  4,  6,  5, [pd_8192]   ; m2=t9[w], m0=t13[w]
1676
1677    SCRATCH              0, 10, tmpq+ 0*%%str
1678    SCRATCH              1, 11, tmpq+15*%%str
1679    SCRATCH              2, 14, tmpq+ 3*%%str
1680    SCRATCH              3, 15, tmpq+12*%%str
1681
1682    mova                m2, [%1+ 6*32]  ; in6
1683    mova                m3, [%1+ 9*32]  ; in9
1684    mova                m0, [%1+ 1*32]  ; in1
1685    mova                m1, [%1+14*32]  ; in14
1686
1687    VP9_UNPACK_MULSUB_2D_4X  3,  2,  7,  6, 13160,  9760    ; m3/7=t7[d], m2/6=t6[d]
1688    VP9_UNPACK_MULSUB_2D_4X  0,  1,  4,  5,  2404, 16207    ; m0/4=t15[d], m1/5=t14[d]
1689    SCRATCH              4, 9, tmpq+ 6*%%str
1690    VP9_RND_SH_SUMSUB_BA     1,  2,  5,  6,  4, [pd_8192]   ; m1=t6[w], m2=t14[w]
1691    UNSCRATCH            4, 9, tmpq+ 6*%%str
1692    VP9_RND_SH_SUMSUB_BA     0,  3,  4,  7,  6, [pd_8192]   ; m0=t7[w], m3=t15[w]
1693
1694    ; r8=t0, r7=t1, r5=t2, r10=t3, r11=t4, m8|r4=t5, m1=t6, m0=t7
1695    ; m10|r0=t8, m11|r15=t9, m13|r13=t10, m12|r2=t11, m14|r3=t12, m15|r12=t13, m2=t14, m3=t15
1696
1697    UNSCRATCH            4, 12, tmpq+ 2*%%str
1698    UNSCRATCH            5, 13, tmpq+13*%%str
1699    SCRATCH              0, 12, tmpq+ 1*%%str
1700    SCRATCH              1, 13, tmpq+14*%%str
1701
1702    ; remainder of round 2 (rest of t8-15)
1703    VP9_UNPACK_MULSUB_2D_4X  5,  4,  6,  7,  9102, 13623    ; m5/6=t11[d], m4/7=t10[d]
1704    VP9_UNPACK_MULSUB_2D_4X  3,  2,  1,  0, 13623,  9102    ; m3/1=t14[d], m2/0=t15[d]
1705    SCRATCH              0, 9, tmpq+ 6*%%str
1706    VP9_RND_SH_SUMSUB_BA     3,  4,  1,  7,  0, [pd_8192]   ; m3=t10[w], m4=t14[w]
1707    UNSCRATCH            0, 9, tmpq+ 6*%%str
1708    VP9_RND_SH_SUMSUB_BA     2,  5,  0,  6,  1, [pd_8192]   ; m2=t11[w], m5=t15[w]
1709
1710    ; m15|r12=t8, m14|r3=t9, m3=t10, m2=t11, m11|r15=t12, m10|r0=t13, m4=t14, m5=t15
1711
1712    UNSCRATCH            6, 14, tmpq+ 3*%%str
1713    UNSCRATCH            7, 15, tmpq+12*%%str
1714
1715    SUMSUB_BA                w,  3,  7,  1
1716    PSIGNW                  m3, [pw_m1]                     ; m3=out1[w], m7=t10[w]
1717    SUMSUB_BA                w,  2,  6,  1                  ; m2=out14[w], m6=t11[w]
1718
1719    ; unfortunately, the code below overflows in some cases, e.g.
1720    ; http://downloads.webmproject.org/test_data/libvpx/vp90-2-14-resize-fp-tiles-16-8.webm
1721%if 0; cpuflag(ssse3)
1722    SUMSUB_BA                w,  7,  6,  1
1723    pmulhrsw                m7, [pw_11585x2]                ; m7=out6[w]
1724    pmulhrsw                m6, [pw_11585x2]                ; m6=out9[w]
1725%else
1726    VP9_UNPACK_MULSUB_2W_4X  6,  7, 11585, 11585, [pd_8192], 1, 0
1727%endif
1728
1729    mova       [tmpq+ 3*%%str], m6
1730    mova       [tmpq+ 6*%%str], m7
1731    UNSCRATCH                6, 10, tmpq+ 0*%%str
1732    UNSCRATCH                7, 11, tmpq+15*%%str
1733    mova       [tmpq+13*%%str], m2
1734    SCRATCH                  3, 11, tmpq+ 9*%%str
1735
1736    VP9_UNPACK_MULSUB_2D_4X  7,  6,  2,  3, 15137,  6270    ; m6/3=t13[d], m7/2=t12[d]
1737    VP9_UNPACK_MULSUB_2D_4X  5,  4,  1,  0,  6270, 15137    ; m5/1=t14[d], m4/0=t15[d]
1738    SCRATCH              0, 9, tmpq+ 2*%%str
1739    VP9_RND_SH_SUMSUB_BA     5,  6,  1,  3,  0, [pd_8192]   ; m5=out2[w], m6=t14[w]
1740    UNSCRATCH            0, 9, tmpq+ 2*%%str
1741    VP9_RND_SH_SUMSUB_BA     4,  7,  0,  2,  1, [pd_8192]
1742    PSIGNW                  m4, [pw_m1]                     ; m4=out13[w], m7=t15[w]
1743
1744    ; unfortunately, the code below overflows in some cases
1745%if 0; cpuflag(ssse3)
1746    SUMSUB_BA                w,  7,  6,  1
1747    pmulhrsw                m7, [pw_m11585x2]               ; m7=out5[w]
1748    pmulhrsw                m6, [pw_11585x2]                ; m6=out10[w]
1749%else
1750    PSIGNW                  m7, [pw_m1]
1751    VP9_UNPACK_MULSUB_2W_4X  7,  6, 11585, 11585, [pd_8192], 1, 0
1752%endif
1753
1754    ; m11|r13=out1, m5=out2, m7=out5, r15=out6, r3=out9, m6=out10, m4=out13, r2=out14
1755
1756    mova                    m2, [tmpq+ 8*%%str]
1757    mova                    m3, [tmpq+ 7*%%str]
1758    mova                    m1, [tmpq+11*%%str]
1759    mova       [tmpq+ 7*%%str], m6
1760    mova       [tmpq+11*%%str], m4
1761    mova                    m4, [tmpq+ 5*%%str]
1762    SCRATCH                  5, 14, tmpq+ 5*%%str
1763    SCRATCH                  7, 15, tmpq+ 8*%%str
1764    UNSCRATCH                6,  8, tmpq+ 4*%%str
1765    UNSCRATCH                5, 12, tmpq+ 1*%%str
1766    UNSCRATCH                7, 13, tmpq+14*%%str
1767
1768    ; m2=t0, m3=t1, m9=t2, m0=t3, m1=t4, m8=t5, m13=t6, m12=t7
1769    ; m11|r13=out1, m5=out2, m7=out5, r15=out6, r3=out9, r10=out10, r11=out13, r2=out14
1770
1771    SUMSUB_BA                w,  1,  2, 0                   ; m1=t0[w], m2=t4[w]
1772    mova                    m0, [tmpq+10*%%str]
1773    SCRATCH                  1, 12, tmpq+ 1*%%str
1774    SUMSUB_BA                w,  6,  3, 1                   ; m8=t1[w], m3=t5[w]
1775    SCRATCH                  6, 13, tmpq+ 4*%%str
1776    SUMSUB_BA                w,  7,  4, 1                   ; m13=t2[w], m9=t6[w]
1777    SCRATCH                  7,  8, tmpq+10*%%str
1778    SUMSUB_BA                w,  5,  0, 1                   ; m12=t3[w], m0=t7[w]
1779    SCRATCH                  5,  9, tmpq+14*%%str
1780
1781    VP9_UNPACK_MULSUB_2D_4X  2,  3,  7,  5, 15137,  6270    ; m2/6=t5[d], m3/10=t4[d]
1782    VP9_UNPACK_MULSUB_2D_4X  0,  4,  1,  6,  6270, 15137    ; m0/14=t6[d], m9/15=t7[d]
1783    SCRATCH                  6, 10, tmpq+ 0*%%str
1784    VP9_RND_SH_SUMSUB_BA     0,  3,  1,  5,  6, [pd_8192]
1785    UNSCRATCH                6, 10, tmpq+ 0*%%str
1786    PSIGNW                  m0, [pw_m1]                     ; m0=out3[w], m3=t6[w]
1787    VP9_RND_SH_SUMSUB_BA     4,  2,  6,  7,  5, [pd_8192]   ; m9=out12[w], m2=t7[w]
1788
1789    UNSCRATCH                1,  8, tmpq+10*%%str
1790    UNSCRATCH                5,  9, tmpq+14*%%str
1791    UNSCRATCH                6, 12, tmpq+ 1*%%str
1792    UNSCRATCH                7, 13, tmpq+ 4*%%str
1793    SCRATCH                  4,  9, tmpq+14*%%str
1794
1795    SUMSUB_BA                w,  1,  6,  4                  ; m13=out0[w], m1=t2[w]
1796    SUMSUB_BA                w,  5,  7,  4
1797    PSIGNW                  m5, [pw_m1]                     ; m12=out15[w], m8=t3[w]
1798
1799    ; unfortunately, the code below overflows in some cases, e.g.
1800    ; http://downloads.webmproject.org/test_data/libvpx/vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm
1801%if 0 ; cpuflag(ssse3)
1802    SUMSUB_BA               w,   7,  6,  4
1803    pmulhrsw                m7, [pw_m11585x2]               ; m8=out7[w]
1804    pmulhrsw                m6, [pw_11585x2]                ; m1=out8[w]
1805    SWAP                     6,  7
1806    SUMSUB_BA                w,  3,  2,  4
1807    pmulhrsw                m3, [pw_11585x2]                ; m3=out4[w]
1808    pmulhrsw                m2, [pw_11585x2]                ; m2=out11[w]
1809%else
1810    SCRATCH                  5,  8, tmpq+10*%%str
1811    VP9_UNPACK_MULSUB_2W_4X  6,  7, 11585, m11585, [pd_8192],  5,  4
1812    VP9_UNPACK_MULSUB_2W_4X  2,  3, 11585, 11585, [pd_8192],  5,  4
1813    UNSCRATCH                5,  8, tmpq+10*%%str
1814%endif
1815
1816    ; m13=out0, m0=out3, m3=out4, m8=out7, m1=out8, m2=out11, m9=out12, m12=out15
1817    ; m11|r13=out1, m5=out2, m7=out5, r15=out6, r3=out9, r10=out10, r11=out13, r2=out14
1818
1819%if %2 == 1
1820%if ARCH_X86_64
1821    mova                   m13, [tmpq+ 6*%%str]
1822    TRANSPOSE8x8W            1, 11, 14, 0, 3, 15, 13, 6, 10
1823    mova          [tmpq+ 0*16], m1
1824    mova          [tmpq+ 2*16], m11
1825    mova          [tmpq+ 4*16], m14
1826    mova          [tmpq+ 6*16], m0
1827    mova                    m1, [tmpq+ 3*%%str]
1828    mova                   m11, [tmpq+ 7*%%str]
1829    mova                   m14, [tmpq+11*%%str]
1830    mova                    m0, [tmpq+13*%%str]
1831    mova          [tmpq+ 8*16], m3
1832    mova          [tmpq+10*16], m15
1833    mova          [tmpq+12*16], m13
1834    mova          [tmpq+14*16], m6
1835
1836    TRANSPOSE8x8W            7, 1, 11, 2, 9, 14, 0, 5, 10
1837    mova          [tmpq+ 1*16], m7
1838    mova          [tmpq+ 3*16], m1
1839    mova          [tmpq+ 5*16], m11
1840    mova          [tmpq+ 7*16], m2
1841    mova          [tmpq+ 9*16], m9
1842    mova          [tmpq+11*16], m14
1843    mova          [tmpq+13*16], m0
1844    mova          [tmpq+15*16], m5
1845%else
1846    mova       [tmpq+12*%%str], m2
1847    mova       [tmpq+ 1*%%str], m5
1848    mova       [tmpq+15*%%str], m7
1849    mova                    m2, [tmpq+ 9*%%str]
1850    mova                    m5, [tmpq+ 5*%%str]
1851    mova                    m7, [tmpq+ 8*%%str]
1852    TRANSPOSE8x8W            1, 2, 5, 0, 3, 7, 4, 6, [tmpq+ 6*%%str], [tmpq+ 8*%%str], 1
1853    mova          [tmpq+ 0*16], m1
1854    mova          [tmpq+ 2*16], m2
1855    mova          [tmpq+ 4*16], m5
1856    mova          [tmpq+ 6*16], m0
1857    mova          [tmpq+10*16], m7
1858    mova                    m3, [tmpq+12*%%str]
1859    mova          [tmpq+12*16], m4
1860    mova                    m4, [tmpq+14*%%str]
1861    mova          [tmpq+14*16], m6
1862
1863    mova                    m0, [tmpq+15*%%str]
1864    mova                    m1, [tmpq+ 3*%%str]
1865    mova                    m2, [tmpq+ 7*%%str]
1866    mova                    m5, [tmpq+11*%%str]
1867    mova                    m7, [tmpq+ 1*%%str]
1868    TRANSPOSE8x8W            0, 1, 2, 3, 4, 5, 6, 7, [tmpq+13*%%str], [tmpq+ 9*%%str], 1
1869    mova          [tmpq+ 1*16], m0
1870    mova          [tmpq+ 3*16], m1
1871    mova          [tmpq+ 5*16], m2
1872    mova          [tmpq+ 7*16], m3
1873    mova          [tmpq+11*16], m5
1874    mova          [tmpq+13*16], m6
1875    mova          [tmpq+15*16], m7
1876%endif
1877%else
1878    pxor                    m4, m4
1879
1880%if cpuflag(ssse3)
1881%define ROUND_REG [pw_512]
1882%else
1883%define ROUND_REG [pw_32]
1884%endif
1885
1886%if ARCH_X86_64
1887    mova                   m12, [tmpq+ 6*%%str]
1888    VP9_IDCT8_WRITEx2        1, 11, 10,  8,  4, ROUND_REG, 6
1889    lea                   dstq, [dstq+strideq*2]
1890    VP9_IDCT8_WRITEx2       14,  0, 10,  8,  4, ROUND_REG, 6
1891    lea                   dstq, [dstq+strideq*2]
1892    VP9_IDCT8_WRITEx2        3, 15, 10,  8,  4, ROUND_REG, 6
1893    lea                   dstq, [dstq+strideq*2]
1894    VP9_IDCT8_WRITEx2       12,  6, 10,  8,  4, ROUND_REG, 6
1895    lea                   dstq, [dstq+strideq*2]
1896
1897    mova                    m1, [tmpq+ 3*%%str]
1898    mova                   m11, [tmpq+ 7*%%str]
1899    mova                   m14, [tmpq+11*%%str]
1900    mova                    m0, [tmpq+13*%%str]
1901
1902    VP9_IDCT8_WRITEx2        7,  1, 10,  8,  4, ROUND_REG, 6
1903    lea                   dstq, [dstq+strideq*2]
1904    VP9_IDCT8_WRITEx2       11,  2, 10,  8,  4, ROUND_REG, 6
1905    lea                   dstq, [dstq+strideq*2]
1906    VP9_IDCT8_WRITEx2        9, 14, 10,  8,  4, ROUND_REG, 6
1907    lea                   dstq, [dstq+strideq*2]
1908    VP9_IDCT8_WRITEx2        0,  5, 10,  8,  4, ROUND_REG, 6
1909%else
1910    mova       [tmpq+ 0*%%str], m2
1911    mova       [tmpq+ 1*%%str], m5
1912    mova       [tmpq+ 2*%%str], m7
1913    mova                    m2, [tmpq+ 9*%%str]
1914    VP9_IDCT8_WRITEx2        1,  2,  5,  7,  4, ROUND_REG, 6
1915    lea                   dstq, [dstq+strideq*2]
1916    mova                    m5, [tmpq+ 5*%%str]
1917    VP9_IDCT8_WRITEx2        5,  0,  1,  2,  4, ROUND_REG, 6
1918    lea                   dstq, [dstq+strideq*2]
1919    mova                    m5, [tmpq+ 8*%%str]
1920    VP9_IDCT8_WRITEx2        3,  5,  1,  2,  4, ROUND_REG, 6
1921    lea                   dstq, [dstq+strideq*2]
1922    mova                    m5, [tmpq+ 6*%%str]
1923    VP9_IDCT8_WRITEx2        5,  6,  1,  2,  4, ROUND_REG, 6
1924    lea                   dstq, [dstq+strideq*2]
1925
1926    mova                    m0, [tmpq+ 2*%%str]
1927    mova                    m3, [tmpq+ 3*%%str]
1928    VP9_IDCT8_WRITEx2        0,  3,  1,  2,  4, ROUND_REG, 6
1929    lea                   dstq, [dstq+strideq*2]
1930    mova                    m0, [tmpq+ 7*%%str]
1931    mova                    m3, [tmpq+ 0*%%str]
1932    VP9_IDCT8_WRITEx2        0,  3,  1,  2,  4, ROUND_REG, 6
1933    lea                   dstq, [dstq+strideq*2]
1934    mova                    m0, [tmpq+14*%%str]
1935    mova                    m3, [tmpq+11*%%str]
1936    VP9_IDCT8_WRITEx2        0,  3,  1,  2,  4, ROUND_REG, 6
1937    lea                   dstq, [dstq+strideq*2]
1938    mova                    m0, [tmpq+13*%%str]
1939    mova                    m3, [tmpq+ 1*%%str]
1940    VP9_IDCT8_WRITEx2        0,  3,  1,  2,  4, ROUND_REG, 6
1941%endif
1942
1943    SWAP                     0,  4 ; zero
1944%undef ROUND_REG
1945%endif
1946%endmacro
1947
1948%macro IADST16_FN 5
1949INIT_XMM %5
1950cglobal vp9_%1_%3_16x16_add, 3, 6, 16, 512, dst, stride, block, cnt, dst_bak, tmp
1951    mov               cntd, 2
1952    mov               tmpq, rsp
1953.loop1_full:
1954    VP9_%2_1D       blockq, 1
1955    add             blockq, 16
1956    add               tmpq, 256
1957    dec               cntd
1958    jg .loop1_full
1959    sub             blockq, 32
1960
1961    mov               cntd, 2
1962    mov               tmpq, rsp
1963    mov           dst_bakq, dstq
1964.loop2_full:
1965    VP9_%4_1D         tmpq, 2
1966    lea               dstq, [dst_bakq+8]
1967    add               tmpq, 16
1968    dec               cntd
1969    jg .loop2_full
1970
1971    ; at the end of the loop, m0 should still be zero
1972    ; use that to zero out block coefficients
1973    ZERO_BLOCK      blockq, 32, 16, m0
1974    RET
1975%endmacro
1976
1977IADST16_FN idct,  IDCT16,  iadst, IADST16, sse2
1978IADST16_FN iadst, IADST16, idct,  IDCT16,  sse2
1979IADST16_FN iadst, IADST16, iadst, IADST16, sse2
1980IADST16_FN idct,  IDCT16,  iadst, IADST16, ssse3
1981IADST16_FN iadst, IADST16, idct,  IDCT16,  ssse3
1982IADST16_FN iadst, IADST16, iadst, IADST16, ssse3
1983IADST16_FN idct,  IDCT16,  iadst, IADST16, avx
1984IADST16_FN iadst, IADST16, idct,  IDCT16,  avx
1985IADST16_FN iadst, IADST16, iadst, IADST16, avx
1986
1987; in: data in m[0-15] except m0/m4, which are in [blockq+0] and [blockq+128]
1988; out: m[0-15] except m6, which is in [blockq+192]
1989; uses blockq as scratch space
1990%macro VP9_IADST16_YMM_1D 0
1991    mova          [blockq+ 32], m3
1992    mova          [blockq+ 64], m7
1993    mova          [blockq+ 96], m8
1994
1995    ; first half of round 1
1996    VP9_UNPACK_MULSUB_2D_4X  9,  6,  0,  3, 13160,  9760    ; m9/x=t7[d], m6/x=t6[d]
1997    VP9_UNPACK_MULSUB_2D_4X  1, 14,  4,  7,  2404, 16207    ; m1/x=t15[d], m14/x=t14[d]
1998    VP9_RND_SH_SUMSUB_BA    14,  6,  7,  3,  8, [pd_8192]   ; m14=t6[w], m6=t14[w]
1999    VP9_RND_SH_SUMSUB_BA     1,  9,  4,  0,  8, [pd_8192]   ; m1=t7[w], m9=t15[w]
2000
2001    VP9_UNPACK_MULSUB_2D_4X 13,  2,  4,  7, 15893,  3981    ; m13/x=t3[d], m2/x=t2[d]
2002    VP9_UNPACK_MULSUB_2D_4X  5, 10,  0,  3,  8423, 14053    ; m5/x=t11[d], m10/x=t10[d]
2003    VP9_RND_SH_SUMSUB_BA    10,  2,  3,  7,  8, [pd_8192]   ; m10=t2[w], m2=t10[w]
2004    VP9_RND_SH_SUMSUB_BA     5, 13,  0,  4,  8, [pd_8192]   ; m5=t3[w], m13=t11[w]
2005
2006    ; half of round 2 t8-15
2007    VP9_UNPACK_MULSUB_2D_4X  2, 13,  4,  7,  9102, 13623    ; m2/x=t11[d], m13/x=t10[d]
2008    VP9_UNPACK_MULSUB_2D_4X  9,  6,  3,  0, 13623,  9102    ; m9/x=t14[d], m6/x=t15[d]
2009    VP9_RND_SH_SUMSUB_BA     9, 13,  3,  7,  8, [pd_8192]   ; m9=t10[w], m13=t14[w]
2010    VP9_RND_SH_SUMSUB_BA     6,  2,  0,  4,  8, [pd_8192]   ; m6=t11[w], m2=t15[w]
2011
2012    SUMSUB_BA            w, 14, 10,  8                      ; m14=t2, m10=t6
2013    SUMSUB_BA            w,  1,  5,  8                      ; m1=t3, m5=t7
2014
2015    mova                    m0, [blockq+  0]
2016    mova                    m4, [blockq+128]
2017    mova                    m3, [blockq+ 32]
2018    mova                    m7, [blockq+ 64]
2019    mova                    m8, [blockq+ 96]
2020    mova          [blockq+  0], m1
2021    mova          [blockq+128], m14
2022    mova          [blockq+ 32], m6
2023    mova          [blockq+ 64], m9
2024    mova          [blockq+ 96], m10
2025
2026    ; second half of round 1
2027    VP9_UNPACK_MULSUB_2D_4X 15,  0,  1,  9, 16364,   804    ; m15/x=t1[d], m0/x=t0[d]
2028    VP9_UNPACK_MULSUB_2D_4X  7,  8, 10,  6, 11003, 12140    ; m7/x=t9[d], m8/x=t8[d]
2029    VP9_RND_SH_SUMSUB_BA     8,  0,  6,  9, 14, [pd_8192]   ; m8=t0[w], m0=t8[w]
2030    VP9_RND_SH_SUMSUB_BA     7, 15, 10,  1, 14, [pd_8192]   ; m7=t1[w], m15=t9[w]
2031
2032    VP9_UNPACK_MULSUB_2D_4X 11,  4, 10,  6, 14811,  7005    ; m11/x=t5[d], m4/x=t4[d]
2033    VP9_UNPACK_MULSUB_2D_4X  3, 12,  1,  9,  5520, 15426    ; m3/x=t13[d], m12/x=t12[d]
2034    VP9_RND_SH_SUMSUB_BA    12,  4,  9,  6, 14, [pd_8192]   ; m12=t4[w], m4=t12[w]
2035    VP9_RND_SH_SUMSUB_BA     3, 11,  1, 10, 14, [pd_8192]   ; m3=t5[w], m11=t13[w]
2036
2037    ; second half of round 2 t8-15
2038    VP9_UNPACK_MULSUB_2D_4X  0, 15,  6, 10, 16069,  3196    ; m15/x=t8[d], m0/x=t9[d]
2039    VP9_UNPACK_MULSUB_2D_4X 11,  4,  9,  1,  3196, 16069    ; m11/x=t12[d], m4/x=t13[d]
2040    VP9_RND_SH_SUMSUB_BA    11, 15,  9, 10, 14, [pd_8192]   ; m11=t8[w], m15=t12[w]
2041    VP9_RND_SH_SUMSUB_BA     4,  0,  1,  6, 14, [pd_8192]   ; m4=t9[w], m0=t13[w]
2042
2043    SUMSUB_BA            w, 12,  8, 14                      ; m12=t0, m8=t4
2044    SUMSUB_BA            w,  3,  7, 14                      ; m3=t1, m7=t5
2045
2046    mova                   m10, [blockq+ 96]
2047    mova          [blockq+ 96], m12
2048
2049    ; round 3
2050    VP9_UNPACK_MULSUB_2D_4X 15,  0,  9, 12, 15137,  6270    ; m15/x=t13[d], m0/x=t12[d]
2051    VP9_UNPACK_MULSUB_2D_4X  2, 13,  1,  6,  6270, 15137    ; m2/x=t14[d], m13/x=t15[d]
2052    VP9_RND_SH_SUMSUB_BA     2,  0,  1, 12, 14, [pd_8192]   ; m2=out2[w], m0=t14a[w]
2053    VP9_RND_SH_SUMSUB_BA    13, 15,  6,  9, 14, [pd_8192]
2054    PSIGNW                 m13, [pw_m1]                     ; m13=out13[w], m15=t15a[w]
2055
2056    VP9_UNPACK_MULSUB_2D_4X  8,  7, 12,  9, 15137,  6270    ; m8/x=t5[d], m7/x=t4[d]
2057    VP9_UNPACK_MULSUB_2D_4X  5, 10,  1,  6,  6270, 15137    ; m5/x=t6[d], m10/x=t7[d]
2058    VP9_RND_SH_SUMSUB_BA     5,  7,  1,  9, 14, [pd_8192]
2059    PSIGNW                  m5, [pw_m1]                     ; m5=out3[w], m7=t6[w]
2060    VP9_RND_SH_SUMSUB_BA    10,  8,  6, 12, 14, [pd_8192]   ; m10=out12[w], m8=t7[w]
2061
2062    mova                    m1, [blockq+  0]
2063    mova                   m14, [blockq+128]
2064    mova                    m6, [blockq+ 32]
2065    mova                    m9, [blockq+ 64]
2066    mova                   m12, [blockq+ 96]
2067    mova          [blockq+  0], m10
2068    mova          [blockq+128], m5
2069
2070    SUMSUB_BA            w, 14, 12,  5                      ; m14=out0, m12=t2a
2071    SUMSUB_BA            w,  1,  3,  5
2072    PSIGNW                  m1, [pw_m1]                     ; m1=out15, m3=t3a
2073
2074    SUMSUB_BA            w,  9, 11,  5
2075    PSIGNW                  m9, [pw_m1]                     ; m9=out1, m11=t10
2076    SUMSUB_BA            w,  6,  4,  5                      ; m6=out14, m4=t11
2077
2078    VP9_UNPACK_MULSUB_2W_4X  4, 11, 11585, 11585, [pd_8192],  5, 10 ; m4=out9, m11=out6
2079    mova                    m5, [blockq+128]
2080    mova          [blockq+192], m11
2081    PSIGNW                 m15, [pw_m1]
2082    VP9_UNPACK_MULSUB_2W_4X 15,  0, 11585, 11585, [pd_8192], 10, 11 ; m15=out5, m0=out10
2083
2084    PSIGNW                  m3, [pw_m1]
2085    VP9_UNPACK_MULSUB_2W_4X  3, 12, 11585, 11585, [pd_8192], 10, 11 ; m3=out7,m12=out8
2086    VP9_UNPACK_MULSUB_2W_4X  8,  7, 11585, 11585, [pd_8192], 10, 11 ; m8=out11,m7=out4
2087
2088    mova                   m10, [blockq+  0]
2089
2090    SWAP                     0, 14,  6, 11,  8, 12, 10
2091    SWAP                     1,  9, 15,  4,  7,  3,  5
2092    SWAP                     5,  9, 15
2093%endmacro
2094
2095%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
2096%macro IADST16_YMM_FN 4
2097INIT_YMM avx2
2098cglobal vp9_%1_%3_16x16_add, 4, 4, 16, dst, stride, block, eob
2099    mova                m1, [blockq+ 32]
2100    mova                m2, [blockq+ 64]
2101    mova                m3, [blockq+ 96]
2102    mova                m5, [blockq+160]
2103    mova                m6, [blockq+192]
2104    mova                m7, [blockq+224]
2105    mova                m8, [blockq+256]
2106    mova                m9, [blockq+288]
2107    mova               m10, [blockq+320]
2108    mova               m11, [blockq+352]
2109    mova               m12, [blockq+384]
2110    mova               m13, [blockq+416]
2111    mova               m14, [blockq+448]
2112    mova               m15, [blockq+480]
2113
2114    VP9_%2_YMM_1D
2115    TRANSPOSE16x16W      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
2116                         [blockq+192], [blockq+128], 1
2117    mova      [blockq+  0], m0
2118    VP9_%4_YMM_1D
2119
2120    mova      [blockq+224], m7
2121
2122    ; store
2123    VP9_IDCT8_WRITEx2    0,  1, 6, 7, unused, [pw_512], 6
2124    lea               dstq, [dstq+2*strideq]
2125    VP9_IDCT8_WRITEx2    2,  3, 6, 7, unused, [pw_512], 6
2126    lea               dstq, [dstq+2*strideq]
2127    VP9_IDCT8_WRITEx2    4,  5, 6, 7, unused, [pw_512], 6
2128    lea               dstq, [dstq+2*strideq]
2129    mova                m6, [blockq+192]
2130    mova                m7, [blockq+224]
2131    VP9_IDCT8_WRITEx2    6,  7, 1, 2, unused, [pw_512], 6
2132    lea               dstq, [dstq+2*strideq]
2133    VP9_IDCT8_WRITEx2    8,  9, 1, 2, unused, [pw_512], 6
2134    lea               dstq, [dstq+2*strideq]
2135    VP9_IDCT8_WRITEx2   10, 11, 1, 2, unused, [pw_512], 6
2136    lea               dstq, [dstq+2*strideq]
2137    VP9_IDCT8_WRITEx2   12, 13, 1, 2, unused, [pw_512], 6
2138    lea               dstq, [dstq+2*strideq]
2139    VP9_IDCT8_WRITEx2   14, 15, 1, 2, unused, [pw_512], 6
2140    lea               dstq, [dstq+2*strideq]
2141
2142    ; at the end of the loop, m0 should still be zero
2143    ; use that to zero out block coefficients
2144    pxor                m0, m0
2145    ZERO_BLOCK      blockq, 32, 16, m0
2146    RET
2147%endmacro
2148
2149IADST16_YMM_FN idct,  IDCT16,  iadst, IADST16
2150IADST16_YMM_FN iadst, IADST16, idct,  IDCT16
2151IADST16_YMM_FN iadst, IADST16, iadst, IADST16
2152%endif
2153
2154;---------------------------------------------------------------------------------------------
2155; void vp9_idct_idct_32x32_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
2156;---------------------------------------------------------------------------------------------
2157
2158%macro VP9_IDCT32_1D 2-3 32 ; src, pass, nnzc
2159%if %2 == 1
2160%assign %%str mmsize
2161%else
2162%assign %%str 64
2163%endif
2164
2165    ; first do t0-15, this can be done identical to idct16x16
2166    VP9_IDCT16_1D_START %1, %3/2, 64*2, tmpq, 2*%%str, 1
2167
2168    ; store everything on stack to make space available for t16-31
2169    ; we store interleaved with the output of the second half (t16-31)
2170    ; so we don't need to allocate extra stack space
2171    mova    [tmpq+ 0*%%str], m0     ; t0
2172    mova    [tmpq+ 4*%%str], m1     ; t1
2173    mova    [tmpq+ 8*%%str], m2     ; t2
2174    mova    [tmpq+12*%%str], m3     ; t3
2175    mova    [tmpq+16*%%str], m4     ; t4
2176    mova    [tmpq+20*%%str], m5     ; t5
2177%if ARCH_X86_64
2178    mova    [tmpq+22*%%str], m10    ; t10
2179    mova    [tmpq+18*%%str], m11    ; t11
2180    mova    [tmpq+14*%%str], m12    ; t12
2181    mova    [tmpq+10*%%str], m13    ; t13
2182    mova    [tmpq+ 6*%%str], m14    ; t14
2183    mova    [tmpq+ 2*%%str], m15    ; t15
2184%endif
2185
2186    mova                m0, [tmpq+ 30*%%str]
2187    UNSCRATCH            1,  6, tmpq+26*%%str
2188    UNSCRATCH            2,  8, tmpq+24*%%str
2189    UNSCRATCH            3,  9, tmpq+28*%%str
2190    SUMSUB_BA            w,  1,  3, 4       ; t6, t9
2191    SUMSUB_BA            w,  0,  2, 4       ; t7, t8
2192
2193    mova    [tmpq+24*%%str], m1     ; t6
2194    mova    [tmpq+28*%%str], m0     ; t7
2195    mova    [tmpq+30*%%str], m2     ; t8
2196    mova    [tmpq+26*%%str], m3     ; t9
2197
2198    ; then, secondly, do t16-31
2199%if %3 <= 8
2200    mova                 m4, [%1+ 1*64]
2201    mova                 m7, [%1+ 7*64]
2202
2203    pmulhrsw             m1,  m4, [pw_16364x2] ;t31
2204    pmulhrsw             m4, [pw_804x2] ;t16
2205
2206    VP9_UNPACK_MULSUB_2W_4X   5,  0,  1,  4, 16069,  3196, [pd_8192], 6,  2 ; t17, t30
2207
2208    pmulhrsw             m3,  m7, [pw_m5520x2] ;t19
2209    pmulhrsw             m7, [pw_15426x2] ;t28
2210
2211    SCRATCH               4, 13, tmpq+ 1*%%str
2212    SCRATCH               5, 12, tmpq+15*%%str
2213
2214    VP9_UNPACK_MULSUB_2W_4X   2,  6,  7,  3, 3196, m16069, [pd_8192], 4,  5 ; t18, t29
2215%else
2216    mova                 m0, [%1+ 1*64]
2217    mova                 m1, [%1+15*64]
2218%if %3 <= 16
2219    pmulhrsw             m5, m0, [pw_16364x2]
2220    pmulhrsw             m0, [pw_804x2]
2221    pmulhrsw             m4, m1, [pw_m11003x2]
2222    pmulhrsw             m1, [pw_12140x2]
2223%else
2224    mova                 m4, [%1+17*64]
2225    mova                 m5, [%1+31*64]
2226
2227    VP9_UNPACK_MULSUB_2W_4X   0,  5, 16364,   804, [pd_8192], 2, 3 ; t16, t31
2228    VP9_UNPACK_MULSUB_2W_4X   4,  1, 11003, 12140, [pd_8192], 2, 3 ; t17, t30
2229%endif
2230    SUMSUB_BA             w,  4,  0,  2
2231    SUMSUB_BA             w,  1,  5,  2
2232
2233    VP9_UNPACK_MULSUB_2W_4X   5,  0, 16069,  3196, [pd_8192], 2, 3 ; t17, t30
2234
2235    SCRATCH               4, 13, tmpq+ 1*%%str
2236    SCRATCH               5, 12, tmpq+15*%%str
2237
2238    mova                 m2, [%1+ 7*64]
2239    mova                 m3, [%1+ 9*64]
2240%if %3 <= 16
2241    pmulhrsw             m7,  m3, [pw_14811x2]
2242    pmulhrsw             m3, [pw_7005x2]
2243    pmulhrsw             m6,  m2, [pw_m5520x2]
2244    pmulhrsw             m2, [pw_15426x2]
2245%else
2246    mova                 m7, [%1+23*64]
2247    mova                 m6, [%1+25*64]
2248
2249    VP9_UNPACK_MULSUB_2W_4X   3,  7, 14811,  7005, [pd_8192], 4, 5 ; t18, t29
2250    VP9_UNPACK_MULSUB_2W_4X   6,  2,  5520, 15426, [pd_8192], 4, 5 ; t19, t28
2251%endif
2252    SUMSUB_BA             w,  3,  6,  4
2253    SUMSUB_BA             w,  7,  2,  4
2254
2255    VP9_UNPACK_MULSUB_2W_4X   2,  6, 3196, m16069, [pd_8192], 4, 5 ; t18, t29
2256%endif
2257
2258    UNSCRATCH             5, 12, tmpq+15*%%str
2259    SUMSUB_BA             w,  6,  0,  4
2260    mova    [tmpq+25*%%str], m6             ; t19
2261    UNSCRATCH             4, 13, tmpq+ 1*%%str
2262    SUMSUB_BA             w,  7,  1,  6
2263    SUMSUB_BA             w,  3,  4,  6
2264    mova    [tmpq+23*%%str], m3             ; t16
2265    SUMSUB_BA             w,  2,  5,  6
2266
2267    VP9_UNPACK_MULSUB_2W_4X   0,  5, 15137,  6270, [pd_8192], 6, 3 ; t18, t29
2268    VP9_UNPACK_MULSUB_2W_4X   1,  4, 15137,  6270, [pd_8192], 6, 3 ; t19, t28
2269
2270    SCRATCH               0, 10, tmpq+ 1*%%str
2271    SCRATCH               1, 11, tmpq+ 7*%%str
2272    SCRATCH               2,  9, tmpq+ 9*%%str
2273    SCRATCH               4, 14, tmpq+15*%%str
2274    SCRATCH               5, 15, tmpq+17*%%str
2275    SCRATCH               7, 13, tmpq+31*%%str
2276
2277%if %3 <= 8
2278    mova                 m0, [%1+ 5*64]
2279    mova                 m3, [%1+ 3*64]
2280
2281    pmulhrsw             m5,  m0, [pw_15893x2] ;t27
2282    pmulhrsw             m0, [pw_3981x2] ;t20
2283
2284    VP9_UNPACK_MULSUB_2W_4X   1,  4,  5,  0,  9102, 13623, [pd_8192], 7,  2 ; t21, t26
2285
2286    pmulhrsw             m6,  m3, [pw_m2404x2] ;t23
2287    pmulhrsw             m3, [pw_16207x2] ;t24
2288
2289    SCRATCH               5,  8, tmpq+ 5*%%str
2290    SCRATCH               4, 12, tmpq+11*%%str
2291
2292    VP9_UNPACK_MULSUB_2W_4X   7,  2,  3,  6, 13623, m9102, [pd_8192], 4, 5 ; t22, t25
2293%else
2294    mova                 m4, [%1+ 5*64]
2295    mova                 m5, [%1+11*64]
2296%if %3 <= 16
2297    pmulhrsw             m1, m4, [pw_15893x2]
2298    pmulhrsw             m4, [pw_3981x2]
2299    pmulhrsw             m0, m5, [pw_m8423x2]
2300    pmulhrsw             m5, [pw_14053x2]
2301%else
2302    mova                 m0, [%1+21*64]
2303    mova                 m1, [%1+27*64]
2304
2305    VP9_UNPACK_MULSUB_2W_4X   4,  1, 15893,  3981, [pd_8192], 2, 3 ; t20, t27
2306    VP9_UNPACK_MULSUB_2W_4X   0,  5,  8423, 14053, [pd_8192], 2, 3 ; t21, t26
2307%endif
2308    SUMSUB_BA             w,  0,  4,  2
2309    SUMSUB_BA             w,  5,  1,  2
2310
2311    VP9_UNPACK_MULSUB_2W_4X   1,  4,  9102, 13623, [pd_8192], 2, 3 ; t21, t26
2312
2313    SCRATCH               5,  8, tmpq+ 5*%%str
2314    SCRATCH               4, 12, tmpq+11*%%str
2315
2316    mova                 m7, [%1+ 3*64]
2317    mova                 m6, [%1+13*64]
2318%if %3 <= 16
2319    pmulhrsw             m3, m6, [pw_13160x2]
2320    pmulhrsw             m6, [pw_9760x2]
2321    pmulhrsw             m2, m7, [pw_m2404x2]
2322    pmulhrsw             m7, [pw_16207x2]
2323%else
2324    mova                 m2, [%1+29*64]
2325    mova                 m3, [%1+19*64]
2326    VP9_UNPACK_MULSUB_2W_4X   6,  3, 13160,  9760, [pd_8192], 4, 5 ; t22, t25
2327    VP9_UNPACK_MULSUB_2W_4X   2,  7,  2404, 16207, [pd_8192], 4, 5 ; t23, t24
2328%endif
2329    SUMSUB_BA             w,  6,  2,  4
2330    SUMSUB_BA             w,  3,  7,  4
2331
2332    VP9_UNPACK_MULSUB_2W_4X   7,  2, 13623, m9102, [pd_8192], 4, 5 ; t22, t25
2333%endif
2334
2335    ; m4=t16, m5=t17, m9=t18, m8=t19, m0=t20, m1=t21, m13=t22, m12=t23,
2336    ; m3=t24, m2=t25, m14=t26, m15=t27, m7=t28, m6=t29, m10=t30, m11=t31
2337
2338    UNSCRATCH             4, 12, tmpq+11*%%str
2339    SUMSUB_BA             w,  0,  6, 5
2340    SUMSUB_BA             w,  4,  2, 5
2341    UNSCRATCH             5,  8, tmpq+ 5*%%str
2342    SCRATCH               4,  8, tmpq+11*%%str
2343    SUMSUB_BA             w,  1,  7, 4
2344    SUMSUB_BA             w,  5,  3, 4
2345    SCRATCH               5, 12, tmpq+ 5*%%str
2346
2347    VP9_UNPACK_MULSUB_2W_4X   3,  6, 6270, m15137, [pd_8192], 4, 5 ; t20, t27
2348    VP9_UNPACK_MULSUB_2W_4X   2,  7, 6270, m15137, [pd_8192], 4, 5 ; t21, t26
2349
2350    ; m8[s]=t16, m9=t17, m5=t18, m4[s]=t19, m12=t20, m13=t21, m1=t22, m0=t23,
2351    ; m15=t24, m14=t25, m2=t26, m3=t27, m11=t28, m10=t29, m6=t30, m7=t31
2352
2353    UNSCRATCH             5,  9, tmpq+ 9*%%str
2354    mova                 m4, [tmpq+23*%%str] ; t16
2355%if ARCH_X86_64
2356    SUMSUB_BA             w,  1,  5,  9
2357    SUMSUB_BA             w,  0,  4,  9
2358%else
2359    SUMSUB_BADC           w,  1,  5,  0,  4
2360%endif
2361    mova    [tmpq+29*%%str], m1     ; t17
2362    mova    [tmpq+21*%%str], m0     ; t16
2363    UNSCRATCH             0, 10, tmpq+ 1*%%str
2364    UNSCRATCH             1, 11, tmpq+ 7*%%str
2365%if ARCH_X86_64
2366    SUMSUB_BA             w,  2,  0,  9
2367    SUMSUB_BA             w,  3,  1,  9
2368%else
2369    SUMSUB_BADC           w,  2,  0,  3,  1
2370%endif
2371    mova    [tmpq+ 9*%%str], m2     ; t18
2372    mova    [tmpq+13*%%str], m3     ; t19
2373    SCRATCH               0, 10, tmpq+23*%%str
2374    SCRATCH               1, 11, tmpq+27*%%str
2375
2376    UNSCRATCH             2, 14, tmpq+15*%%str
2377    UNSCRATCH             3, 15, tmpq+17*%%str
2378    SUMSUB_BA             w,  6,  2, 0
2379    SUMSUB_BA             w,  7,  3, 0
2380    SCRATCH               6, 14, tmpq+ 3*%%str
2381    SCRATCH               7, 15, tmpq+ 7*%%str
2382
2383    UNSCRATCH             0,  8, tmpq+11*%%str
2384    mova                 m1, [tmpq+25*%%str] ; t19
2385    UNSCRATCH             6, 12, tmpq+ 5*%%str
2386    UNSCRATCH             7, 13, tmpq+31*%%str
2387%if ARCH_X86_64
2388    SUMSUB_BA             w,  0,  1,  9
2389    SUMSUB_BA             w,  6,  7,  9
2390%else
2391    SUMSUB_BADC           w,  0,  1,  6,  7
2392%endif
2393
2394    ; m0=t16, m1=t17, m2=t18, m3=t19, m11=t20, m10=t21, m9=t22, m8=t23,
2395    ; m7=t24, m6=t25, m5=t26, m4=t27, m12=t28, m13=t29, m14=t30, m15=t31
2396
2397%if 0; cpuflag(ssse3)
2398%if ARCH_X86_64
2399    SUMSUB_BA             w,  4,  7,  8
2400    SUMSUB_BA             w,  5,  1,  8
2401%else
2402    SUMSUB_BADC           w,  4,  7,  5,  1
2403%endif
2404
2405    pmulhrsw             m7, [pw_11585x2]
2406    pmulhrsw             m4, [pw_11585x2]
2407    pmulhrsw             m1, [pw_11585x2]
2408    pmulhrsw             m5, [pw_11585x2]
2409
2410    mova    [tmpq+ 5*%%str], m7     ; t23
2411    SCRATCH               1, 13, tmpq+25*%%str
2412    UNSCRATCH             7, 10, tmpq+23*%%str
2413    UNSCRATCH             1, 11, tmpq+27*%%str
2414
2415%if ARCH_X86_64
2416    SUMSUB_BA             w,  7,  3, 10
2417    SUMSUB_BA             w,  1,  2, 10
2418%else
2419    SUMSUB_BADC           w,  7,  3,  1,  2
2420%endif
2421
2422    pmulhrsw             m3, [pw_11585x2]
2423    pmulhrsw             m7, [pw_11585x2]
2424    pmulhrsw             m2, [pw_11585x2]
2425    pmulhrsw             m1, [pw_11585x2]
2426%else
2427    SCRATCH               0,  8, tmpq+15*%%str
2428    SCRATCH               6,  9, tmpq+17*%%str
2429    VP9_UNPACK_MULSUB_2W_4X  7,  4, 11585, 11585, [pd_8192], 0, 6
2430    mova    [tmpq+ 5*%%str], m7     ; t23
2431    UNSCRATCH             7, 10, tmpq+23*%%str
2432    VP9_UNPACK_MULSUB_2W_4X  1,  5, 11585, 11585, [pd_8192], 0, 6
2433    SCRATCH               1, 13, tmpq+25*%%str
2434    UNSCRATCH             1, 11, tmpq+27*%%str
2435    VP9_UNPACK_MULSUB_2W_4X  3,  7, 11585, 11585, [pd_8192], 0, 6
2436    VP9_UNPACK_MULSUB_2W_4X  2,  1, 11585, 11585, [pd_8192], 0, 6
2437    UNSCRATCH             0,  8, tmpq+15*%%str
2438    UNSCRATCH             6,  9, tmpq+17*%%str
2439%endif
2440
2441    ; m0=t16, m1=t17, m2=t18, m3=t19, m4=t20, m5=t21, m6=t22, m7=t23,
2442    ; m8=t24, m9=t25, m10=t26, m11=t27, m12=t28, m13=t29, m14=t30, m15=t31
2443
2444    ; then do final pass to sumsub+store the two halves
2445%if %2 == 1
2446    mova    [tmpq+17*%%str], m2     ; t20
2447    mova    [tmpq+ 1*%%str], m3     ; t21
2448%if ARCH_X86_64
2449    mova    [tmpq+25*%%str], m13    ; t22
2450
2451    mova                 m8, [tmpq+ 0*%%str] ; t0
2452    mova                 m9, [tmpq+ 4*%%str] ; t1
2453    mova                m12, [tmpq+ 8*%%str] ; t2
2454    mova                m11, [tmpq+12*%%str] ; t3
2455    mova                 m2, [tmpq+16*%%str] ; t4
2456    mova                 m3, [tmpq+20*%%str] ; t5
2457    mova                m13, [tmpq+24*%%str] ; t6
2458
2459    SUMSUB_BA             w,  6,  8, 10
2460    mova    [tmpq+ 3*%%str], m8              ; t15
2461    SUMSUB_BA             w,  0,  9,  8
2462    SUMSUB_BA             w, 15, 12,  8
2463    SUMSUB_BA             w, 14, 11,  8
2464    SUMSUB_BA             w,  1,  2,  8
2465    SUMSUB_BA             w,  7,  3,  8
2466    SUMSUB_BA             w,  5, 13,  8
2467    mova                m10, [tmpq+28*%%str] ; t7
2468    SUMSUB_BA             w,  4, 10,  8
2469%if cpuflag(avx2)
2470    ; the "shitty" about this idct is that the final pass does the outermost
2471    ; interleave sumsubs (t0/31, t1/30, etc) but the tN for the 16x16 need
2472    ; to be sequential, which means I need to load/store half of the sumsub
2473    ; intermediates back to/from memory to get a 16x16 transpose going...
2474    ; This would be easier if we had more (e.g. 32) YMM regs here.
2475    mova    [tmpq+ 7*%%str], m9
2476    mova    [tmpq+11*%%str], m12
2477    mova    [tmpq+15*%%str], m11
2478    mova    [tmpq+19*%%str], m2
2479    mova    [tmpq+23*%%str], m3
2480    mova    [tmpq+27*%%str], m13
2481    mova    [tmpq+31*%%str], m10
2482    mova    [tmpq+12*%%str], m5
2483
2484    mova                m13, [tmpq+30*%%str] ; t8
2485    mova                m12, [tmpq+26*%%str] ; t9
2486    mova                m11, [tmpq+22*%%str] ; t10
2487    mova                m10, [tmpq+18*%%str] ; t11
2488    mova                 m9, [tmpq+17*%%str] ; t20
2489    mova                 m8, [tmpq+ 1*%%str] ; t21
2490    mova                 m3, [tmpq+25*%%str] ; t22
2491    mova                 m2, [tmpq+ 5*%%str] ; t23
2492
2493    SUMSUB_BA             w,  9, 10, 5
2494    SUMSUB_BA             w,  8, 11, 5
2495    SUMSUB_BA             w,  3, 12, 5
2496    SUMSUB_BA             w,  2, 13, 5
2497    mova    [tmpq+ 1*%%str], m10
2498    mova    [tmpq+ 5*%%str], m11
2499    mova    [tmpq+17*%%str], m12
2500    mova    [tmpq+25*%%str], m13
2501
2502    mova                m13, [tmpq+14*%%str] ; t12
2503    mova                m12, [tmpq+10*%%str] ; t13
2504    mova                m11, [tmpq+ 9*%%str] ; t18
2505    mova                m10, [tmpq+13*%%str] ; t19
2506
2507    SUMSUB_BA             w, 11, 12, 5
2508    SUMSUB_BA             w, 10, 13, 5
2509    mova    [tmpq+ 9*%%str], m13
2510    mova    [tmpq+13*%%str], m12
2511    mova    [tmpq+10*%%str], m10
2512    mova    [tmpq+14*%%str], m11
2513
2514    mova                m13, [tmpq+ 6*%%str] ; t14
2515    mova                m12, [tmpq+ 2*%%str] ; t15
2516    mova                m11, [tmpq+21*%%str] ; t16
2517    mova                m10, [tmpq+29*%%str] ; t17
2518    SUMSUB_BA             w, 11, 12, 5
2519    SUMSUB_BA             w, 10, 13, 5
2520    mova    [tmpq+21*%%str], m12
2521    mova    [tmpq+29*%%str], m13
2522    mova                m12, [tmpq+10*%%str]
2523    mova                m13, [tmpq+14*%%str]
2524
2525    TRANSPOSE16x16W       6,  0, 15, 14,  1,  7,  5,  4, \
2526                          2,  3,  8,  9, 12, 13, 10, 11, \
2527            [tmpq+12*%%str], [tmpq+ 8*%%str], 1
2528    mova    [tmpq+ 0*%%str], m6
2529    mova    [tmpq+ 2*%%str], m0
2530    mova    [tmpq+ 4*%%str], m15
2531    mova    [tmpq+ 6*%%str], m14
2532    mova    [tmpq+10*%%str], m7
2533    mova    [tmpq+12*%%str], m5
2534    mova    [tmpq+14*%%str], m4
2535    mova    [tmpq+16*%%str], m2
2536    mova    [tmpq+18*%%str], m3
2537    mova    [tmpq+20*%%str], m8
2538    mova    [tmpq+22*%%str], m9
2539    mova    [tmpq+24*%%str], m12
2540    mova    [tmpq+26*%%str], m13
2541    mova    [tmpq+28*%%str], m10
2542    mova    [tmpq+30*%%str], m11
2543
2544    mova                 m0, [tmpq+21*%%str]
2545    mova                 m1, [tmpq+29*%%str]
2546    mova                 m2, [tmpq+13*%%str]
2547    mova                 m3, [tmpq+ 9*%%str]
2548    mova                 m4, [tmpq+ 1*%%str]
2549    mova                 m5, [tmpq+ 5*%%str]
2550    mova                 m7, [tmpq+25*%%str]
2551    mova                 m8, [tmpq+31*%%str]
2552    mova                 m9, [tmpq+27*%%str]
2553    mova                m10, [tmpq+23*%%str]
2554    mova                m11, [tmpq+19*%%str]
2555    mova                m12, [tmpq+15*%%str]
2556    mova                m13, [tmpq+11*%%str]
2557    mova                m14, [tmpq+ 7*%%str]
2558    mova                m15, [tmpq+ 3*%%str]
2559    TRANSPOSE16x16W       0,  1,  2,  3,  4,  5,  6,  7, \
2560                          8,  9, 10, 11, 12, 13, 14, 15, \
2561            [tmpq+17*%%str], [tmpq+ 9*%%str], 1
2562    mova    [tmpq+ 1*%%str], m0
2563    mova    [tmpq+ 3*%%str], m1
2564    mova    [tmpq+ 5*%%str], m2
2565    mova    [tmpq+ 7*%%str], m3
2566    mova    [tmpq+11*%%str], m5
2567    mova    [tmpq+13*%%str], m6
2568    mova    [tmpq+15*%%str], m7
2569    mova    [tmpq+17*%%str], m8
2570    mova    [tmpq+19*%%str], m9
2571    mova    [tmpq+21*%%str], m10
2572    mova    [tmpq+23*%%str], m11
2573    mova    [tmpq+25*%%str], m12
2574    mova    [tmpq+27*%%str], m13
2575    mova    [tmpq+29*%%str], m14
2576    mova    [tmpq+31*%%str], m15
2577%else ; !avx2
2578    TRANSPOSE8x8W         6, 0, 15, 14, 1, 7, 5, 4, 8
2579    mova    [tmpq+ 0*%%str], m6
2580    mova    [tmpq+ 4*%%str], m0
2581    mova    [tmpq+ 8*%%str], m15
2582    mova    [tmpq+12*%%str], m14
2583    mova    [tmpq+16*%%str], m1
2584    mova    [tmpq+20*%%str], m7
2585    mova    [tmpq+24*%%str], m5
2586    mova    [tmpq+28*%%str], m4
2587
2588    mova                  m8, [tmpq+ 3*%%str] ; t15
2589    TRANSPOSE8x8W         10, 13, 3, 2, 11, 12, 9, 8, 0
2590    mova    [tmpq+ 3*%%str], m10
2591    mova    [tmpq+ 7*%%str], m13
2592    mova    [tmpq+11*%%str], m3
2593    mova    [tmpq+15*%%str], m2
2594    mova    [tmpq+19*%%str], m11
2595    mova    [tmpq+23*%%str], m12
2596    mova    [tmpq+27*%%str], m9
2597    mova    [tmpq+31*%%str], m8
2598
2599    mova                m15, [tmpq+30*%%str] ; t8
2600    mova                m14, [tmpq+26*%%str] ; t9
2601    mova                m13, [tmpq+22*%%str] ; t10
2602    mova                m12, [tmpq+18*%%str] ; t11
2603    mova                m11, [tmpq+14*%%str] ; t12
2604    mova                m10, [tmpq+10*%%str] ; t13
2605    mova                 m9, [tmpq+ 6*%%str] ; t14
2606    mova                 m8, [tmpq+ 2*%%str] ; t15
2607    mova                 m7, [tmpq+21*%%str] ; t16
2608    mova                 m6, [tmpq+29*%%str] ; t17
2609    mova                 m5, [tmpq+ 9*%%str] ; t18
2610    mova                 m4, [tmpq+13*%%str] ; t19
2611    mova                 m3, [tmpq+17*%%str] ; t20
2612    mova                 m2, [tmpq+ 1*%%str] ; t21
2613    mova                 m1, [tmpq+25*%%str] ; t22
2614
2615    SUMSUB_BA             w,  7,  8, 0
2616    mova    [tmpq+ 2*%%str], m8
2617    mova                 m0, [tmpq+ 5*%%str] ; t23
2618    SUMSUB_BA             w,  6,  9, 8
2619    SUMSUB_BA             w,  5, 10, 8
2620    SUMSUB_BA             w,  4, 11, 8
2621    SUMSUB_BA             w,  3, 12, 8
2622    SUMSUB_BA             w,  2, 13, 8
2623    SUMSUB_BA             w,  1, 14, 8
2624    SUMSUB_BA             w,  0, 15, 8
2625
2626    TRANSPOSE8x8W         0, 1, 2, 3, 4, 5, 6, 7, 8
2627    mova    [tmpq+ 1*%%str], m0
2628    mova    [tmpq+ 5*%%str], m1
2629    mova    [tmpq+ 9*%%str], m2
2630    mova    [tmpq+13*%%str], m3
2631    mova    [tmpq+17*%%str], m4
2632    mova    [tmpq+21*%%str], m5
2633    mova    [tmpq+25*%%str], m6
2634    mova    [tmpq+29*%%str], m7
2635
2636    mova                 m8, [tmpq+ 2*%%str]
2637    TRANSPOSE8x8W         8, 9, 10, 11, 12, 13, 14, 15, 0
2638    mova    [tmpq+ 2*%%str], m8
2639    mova    [tmpq+ 6*%%str], m9
2640    mova    [tmpq+10*%%str], m10
2641    mova    [tmpq+14*%%str], m11
2642    mova    [tmpq+18*%%str], m12
2643    mova    [tmpq+22*%%str], m13
2644    mova    [tmpq+26*%%str], m14
2645    mova    [tmpq+30*%%str], m15
2646%endif ; avx2
2647%else
2648    mova                 m2, [tmpq+24*%%str] ; t6
2649    mova                 m3, [tmpq+28*%%str] ; t7
2650    SUMSUB_BADC           w,  5,  2,  4,  3
2651    mova    [tmpq+24*%%str], m5
2652    mova    [tmpq+23*%%str], m2
2653    mova    [tmpq+28*%%str], m4
2654    mova    [tmpq+19*%%str], m3
2655
2656    mova                 m2, [tmpq+16*%%str] ; t4
2657    mova                 m3, [tmpq+20*%%str] ; t5
2658    SUMSUB_BA             w,  1,  2,  5
2659    SUMSUB_BA             w,  7,  3,  5
2660    mova    [tmpq+15*%%str], m2
2661    mova    [tmpq+11*%%str], m3
2662
2663    mova                 m2, [tmpq+ 0*%%str] ; t0
2664    mova                 m3, [tmpq+ 4*%%str] ; t1
2665    SUMSUB_BA             w,  6,  2,  5
2666    SUMSUB_BA             w,  0,  3,  5
2667    mova    [tmpq+31*%%str], m2
2668    mova    [tmpq+27*%%str], m3
2669
2670    mova                 m2, [tmpq+ 8*%%str] ; t2
2671    mova                 m3, [tmpq+12*%%str] ; t3
2672    mova                 m5, [tmpq+ 7*%%str]
2673    mova                 m4, [tmpq+ 3*%%str]
2674    SUMSUB_BADC           w,  5,  2,  4,  3
2675    mova    [tmpq+ 7*%%str], m2
2676    mova    [tmpq+ 3*%%str], m3
2677
2678    mova                 m3, [tmpq+28*%%str]
2679    TRANSPOSE8x8W         6, 0, 5, 4, 1, 7, 2, 3, [tmpq+24*%%str], [tmpq+16*%%str], 1
2680    mova    [tmpq+ 0*%%str], m6
2681    mova    [tmpq+ 4*%%str], m0
2682    mova    [tmpq+ 8*%%str], m5
2683    mova    [tmpq+12*%%str], m4
2684    mova    [tmpq+20*%%str], m7
2685    mova    [tmpq+24*%%str], m2
2686    mova    [tmpq+28*%%str], m3
2687
2688    mova                 m6, [tmpq+19*%%str]
2689    mova                 m0, [tmpq+23*%%str]
2690    mova                 m5, [tmpq+11*%%str]
2691    mova                 m4, [tmpq+15*%%str]
2692    mova                 m1, [tmpq+ 3*%%str]
2693    mova                 m7, [tmpq+ 7*%%str]
2694    mova                 m3, [tmpq+31*%%str]
2695    TRANSPOSE8x8W         6, 0, 5, 4, 1, 7, 2, 3, [tmpq+27*%%str], [tmpq+19*%%str], 1
2696    mova    [tmpq+ 3*%%str], m6
2697    mova    [tmpq+ 7*%%str], m0
2698    mova    [tmpq+11*%%str], m5
2699    mova    [tmpq+15*%%str], m4
2700    mova    [tmpq+23*%%str], m7
2701    mova    [tmpq+27*%%str], m2
2702    mova    [tmpq+31*%%str], m3
2703
2704    mova                 m1, [tmpq+ 6*%%str] ; t14
2705    mova                 m0, [tmpq+ 2*%%str] ; t15
2706    mova                 m7, [tmpq+21*%%str] ; t16
2707    mova                 m6, [tmpq+29*%%str] ; t17
2708    SUMSUB_BA             w,  7,  0,  2
2709    SUMSUB_BA             w,  6,  1,  2
2710    mova    [tmpq+29*%%str], m7
2711    mova    [tmpq+ 2*%%str], m0
2712    mova    [tmpq+21*%%str], m6
2713    mova    [tmpq+ 6*%%str], m1
2714
2715    mova                 m1, [tmpq+14*%%str] ; t12
2716    mova                 m0, [tmpq+10*%%str] ; t13
2717    mova                 m5, [tmpq+ 9*%%str] ; t18
2718    mova                 m4, [tmpq+13*%%str] ; t19
2719    SUMSUB_BA             w,  5,  0,  2
2720    SUMSUB_BA             w,  4,  1,  2
2721    mova     [tmpq+10*%%str], m0
2722    mova     [tmpq+14*%%str], m1
2723
2724    mova                 m1, [tmpq+22*%%str] ; t10
2725    mova                 m0, [tmpq+18*%%str] ; t11
2726    mova                 m3, [tmpq+17*%%str] ; t20
2727    mova                 m2, [tmpq+ 1*%%str] ; t21
2728    SUMSUB_BA             w,  3,  0,  6
2729    SUMSUB_BA             w,  2,  1,  6
2730    mova     [tmpq+18*%%str], m0
2731    mova     [tmpq+22*%%str], m1
2732
2733    mova                 m7, [tmpq+30*%%str] ; t8
2734    mova                 m6, [tmpq+26*%%str] ; t9
2735    mova                 m1, [tmpq+25*%%str] ; t22
2736    mova                 m0, [tmpq+ 5*%%str] ; t23
2737    SUMSUB_BADC           w,  1,  6,  0,  7
2738    mova     [tmpq+26*%%str], m6
2739    mova     [tmpq+30*%%str], m7
2740
2741    mova                 m7, [tmpq+29*%%str]
2742    TRANSPOSE8x8W         0, 1, 2, 3, 4, 5, 6, 7, [tmpq+21*%%str], [tmpq+17*%%str], 1
2743    mova    [tmpq+ 1*%%str], m0
2744    mova    [tmpq+ 5*%%str], m1
2745    mova    [tmpq+ 9*%%str], m2
2746    mova    [tmpq+13*%%str], m3
2747    mova    [tmpq+21*%%str], m5
2748    mova    [tmpq+25*%%str], m6
2749    mova    [tmpq+29*%%str], m7
2750
2751    mova                 m0, [tmpq+ 2*%%str]
2752    mova                 m1, [tmpq+ 6*%%str]
2753    mova                 m2, [tmpq+10*%%str]
2754    mova                 m3, [tmpq+14*%%str]
2755    mova                 m4, [tmpq+18*%%str]
2756    mova                 m5, [tmpq+22*%%str]
2757    mova                 m7, [tmpq+30*%%str]
2758    TRANSPOSE8x8W         0, 1, 2, 3, 4, 5, 6, 7, [tmpq+26*%%str], [tmpq+18*%%str], 1
2759    mova    [tmpq+ 2*%%str], m0
2760    mova    [tmpq+ 6*%%str], m1
2761    mova    [tmpq+10*%%str], m2
2762    mova    [tmpq+14*%%str], m3
2763    mova    [tmpq+22*%%str], m5
2764    mova    [tmpq+26*%%str], m6
2765    mova    [tmpq+30*%%str], m7
2766%endif
2767%else
2768    ; t0-7 is in [tmpq+{0,4,8,12,16,20,24,28}*%%str]
2769    ; t8-15 is in [tmpq+{2,6,10,14,18,22,26,30}*%%str]
2770    ; t16-19 and t23 is in [tmpq+{1,5,9,13,29}*%%str]
2771    ; t20-22 is in m4-6
2772    ; t24-31 is in m8-15
2773
2774%if cpuflag(ssse3)
2775%define ROUND_REG [pw_512]
2776%else
2777%define ROUND_REG [pw_32]
2778%endif
2779
2780%macro %%STORE_2X2 7-8 1 ; src[1-4], tmp[1-2], zero, inc_dst_ptrs
2781    SUMSUB_BA            w, %4, %1, %5
2782    SUMSUB_BA            w, %3, %2, %5
2783    VP9_IDCT8_WRITEx2   %4, %3, %5, %6, %7, ROUND_REG, 6
2784%if %8 == 1
2785    add               dstq, stride2q
2786%endif
2787    VP9_IDCT8_WRITEx2   %2, %1, %5, %6, %7, ROUND_REG, 6, dst_endq
2788%if %8 == 1
2789    sub           dst_endq, stride2q
2790%endif
2791%endmacro
2792
2793%if ARCH_X86_64
2794    pxor               m10, m10
2795
2796    ; store t0-1 and t30-31
2797    mova                m8, [tmpq+ 0*%%str]
2798    mova                m9, [tmpq+ 4*%%str]
2799    %%STORE_2X2          8,  9,  0,  6, 12, 11, 10
2800
2801    ; store t2-3 and t28-29
2802    mova                m8, [tmpq+ 8*%%str]
2803    mova                m9, [tmpq+12*%%str]
2804    %%STORE_2X2          8,  9, 14, 15, 12, 11, 10
2805
2806    ; store t4-5 and t26-27
2807    mova                m8, [tmpq+16*%%str]
2808    mova                m9, [tmpq+20*%%str]
2809    %%STORE_2X2          8,  9,  7,  1, 12, 11, 10
2810
2811    ; store t6-7 and t24-25
2812    mova                m8, [tmpq+24*%%str]
2813    mova                m9, [tmpq+28*%%str]
2814    %%STORE_2X2          8,  9,  4,  5, 12, 11, 10
2815
2816    ; store t8-9 and t22-23
2817    mova                m8, [tmpq+30*%%str]
2818    mova                m9, [tmpq+26*%%str]
2819    mova                m0, [tmpq+ 5*%%str]
2820    %%STORE_2X2          8,  9, 13,  0, 12, 11, 10
2821
2822    ; store t10-11 and t20-21
2823    mova                m8, [tmpq+22*%%str]
2824    mova                m9, [tmpq+18*%%str]
2825    %%STORE_2X2          8,  9,  2,  3, 12, 11, 10
2826
2827    ; store t12-13 and t18-19
2828    mova                m8, [tmpq+14*%%str]
2829    mova                m9, [tmpq+10*%%str]
2830    mova                m5, [tmpq+13*%%str]
2831    mova                m4, [tmpq+ 9*%%str]
2832    %%STORE_2X2          8,  9,  4,  5, 12, 11, 10
2833
2834    ; store t14-17
2835    mova                m8, [tmpq+ 6*%%str]
2836    mova                m9, [tmpq+ 2*%%str]
2837    mova                m5, [tmpq+29*%%str]
2838    mova                m4, [tmpq+21*%%str]
2839    %%STORE_2X2          8,  9,  4,  5, 12, 11, 10, 0
2840
2841    SWAP                 1, 10 ; zero
2842%else
2843    mova   [tmpq+ 1*%%str], m1
2844    mova   [tmpq+11*%%str], m2
2845    mova   [tmpq+15*%%str], m3
2846    mova   [tmpq+17*%%str], m4
2847    mova   [tmpq+19*%%str], m5
2848    pxor                m1, m1
2849
2850    ; store t0-1 and t30-31
2851    mova                m2, [tmpq+ 0*%%str]
2852    mova                m3, [tmpq+ 4*%%str]
2853    %%STORE_2X2          2,  3,  0,  6, 4, 5, 1
2854
2855    ; store t2-3 and t28-29
2856    mova                m2, [tmpq+ 8*%%str]
2857    mova                m3, [tmpq+12*%%str]
2858    mova                m0, [tmpq+ 3*%%str]
2859    mova                m6, [tmpq+ 7*%%str]
2860    %%STORE_2X2          2,  3,  0,  6, 4, 5, 1
2861
2862    ; store t4-5 and t26-27
2863    mova                m2, [tmpq+16*%%str]
2864    mova                m3, [tmpq+20*%%str]
2865    mova                m0, [tmpq+ 1*%%str]
2866    %%STORE_2X2          2,  3,  7,  0, 4, 5, 1
2867
2868    ; store t6-7 and t24-25
2869    mova                m2, [tmpq+24*%%str]
2870    mova                m3, [tmpq+28*%%str]
2871    mova                m0, [tmpq+17*%%str]
2872    mova                m6, [tmpq+19*%%str]
2873    %%STORE_2X2          2,  3,  0,  6, 4, 5, 1
2874
2875    ; store t8-9 and t22-23
2876    mova                m2, [tmpq+30*%%str]
2877    mova                m3, [tmpq+26*%%str]
2878    mova                m0, [tmpq+25*%%str]
2879    mova                m6, [tmpq+ 5*%%str]
2880    %%STORE_2X2          2,  3,  0,  6, 4, 5, 1
2881
2882    ; store t10-11 and t20-21
2883    mova                m2, [tmpq+22*%%str]
2884    mova                m3, [tmpq+18*%%str]
2885    mova                m0, [tmpq+11*%%str]
2886    mova                m6, [tmpq+15*%%str]
2887    %%STORE_2X2          2,  3,  0,  6, 4, 5, 1
2888
2889    ; store t12-13 and t18-19
2890    mova                m2, [tmpq+14*%%str]
2891    mova                m3, [tmpq+10*%%str]
2892    mova                m6, [tmpq+13*%%str]
2893    mova                m0, [tmpq+ 9*%%str]
2894    %%STORE_2X2          2,  3,  0,  6, 4, 5, 1
2895
2896    ; store t14-17
2897    mova                m2, [tmpq+ 6*%%str]
2898    mova                m3, [tmpq+ 2*%%str]
2899    mova                m6, [tmpq+29*%%str]
2900    mova                m0, [tmpq+21*%%str]
2901    %%STORE_2X2          2,  3,  0,  6, 4, 5, 1, 0
2902%endif
2903%undef ROUND_REG
2904%endif
2905%endmacro
2906
2907%macro VP9_IDCT_IDCT_32x32_ADD_XMM 1
2908INIT_XMM %1
2909cglobal vp9_idct_idct_32x32_add, 0, 6 + ARCH_X86_64 * 3, 16, 2048, dst, stride, block, eob
2910    movifnidn         eobd, dword eobm
2911%if cpuflag(ssse3)
2912    cmp eobd, 135
2913    jg .idctfull
2914    cmp eobd, 34
2915    jg .idct16x16
2916    cmp eobd, 1
2917    jg .idct8x8
2918%else
2919    cmp eobd, 1
2920    jg .idctfull
2921%endif
2922
2923    ; dc-only case
2924    movifnidn       blockq, blockmp
2925    movifnidn         dstq, dstmp
2926    movifnidn      strideq, stridemp
2927%if cpuflag(ssse3)
2928    movd                m0, [blockq]
2929    mova                m1, [pw_11585x2]
2930    pmulhrsw            m0, m1
2931    pmulhrsw            m0, m1
2932%else
2933    DEFINE_ARGS dst, stride, block, coef
2934    movsx            coefd, word [blockq]
2935    imul             coefd, 11585
2936    add              coefd, 8192
2937    sar              coefd, 14
2938    imul             coefd, 11585
2939    add              coefd, (32 << 14) + 8192
2940    sar              coefd, 14 + 6
2941    movd                m0, coefd
2942%endif
2943    SPLATW              m0, m0, q0000
2944%if cpuflag(ssse3)
2945    pmulhrsw            m0, [pw_512]
2946%endif
2947    pxor                m5, m5
2948    movd          [blockq], m5
2949%rep 31
2950    VP9_STORE_2XFULL    0, 1, 2, 3, 4, 5, mmsize
2951    add               dstq, strideq
2952%endrep
2953    VP9_STORE_2XFULL    0, 1, 2, 3, 4, 5, mmsize
2954    RET
2955
2956%if ARCH_X86_64
2957    DEFINE_ARGS dst_bak, stride, block, cnt, dst, stride30, dst_end, stride2, tmp
2958%else
2959%define dst_bakq r0mp
2960%endif
2961%if cpuflag(ssse3)
2962.idct8x8:
2963%if ARCH_X86_32
2964    DEFINE_ARGS block, u1, u2, u3, u4, tmp
2965    mov             blockq, r2mp
2966%endif
2967    mov               tmpq, rsp
2968    VP9_IDCT32_1D   blockq, 1, 8
2969
2970%if ARCH_X86_32
2971    DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp
2972    mov            strideq, r1mp
2973%define cntd dword r3m
2974%endif
2975    mov          stride30q, strideq         ; stride
2976    lea           stride2q, [strideq*2]     ; stride*2
2977    shl          stride30q, 5               ; stride*32
2978    mov               cntd, 4
2979    sub          stride30q, stride2q        ; stride*30
2980.loop2_8x8:
2981    mov               dstq, dst_bakq
2982    lea           dst_endq, [dstq+stride30q]
2983    VP9_IDCT32_1D     tmpq, 2, 8
2984    add           dst_bakq, 8
2985    add               tmpq, 16
2986    dec               cntd
2987    jg .loop2_8x8
2988
2989    ; at the end of the loop, m7 should still be zero
2990    ; use that to zero out block coefficients
2991%if ARCH_X86_32
2992    DEFINE_ARGS block
2993    mov             blockq, r2mp
2994%endif
2995    ZERO_BLOCK      blockq, 64,  8, m1
2996    RET
2997
2998.idct16x16:
2999%if ARCH_X86_32
3000    DEFINE_ARGS block, tmp, cnt
3001    mov             blockq, r2mp
3002%endif
3003    mov               cntd, 2
3004    mov               tmpq, rsp
3005.loop1_16x16:
3006    VP9_IDCT32_1D   blockq, 1, 16
3007    add             blockq, 16
3008    add               tmpq, 512
3009    dec               cntd
3010    jg .loop1_16x16
3011
3012%if ARCH_X86_64
3013    sub             blockq, 32
3014%else
3015    DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp
3016    mov            strideq, r1mp
3017%define cntd dword r3m
3018%endif
3019
3020    mov          stride30q, strideq         ; stride
3021    lea           stride2q, [strideq*2]     ; stride*2
3022    shl          stride30q, 5               ; stride*32
3023    mov               cntd, 4
3024    mov               tmpq, rsp
3025    sub          stride30q, stride2q        ; stride*30
3026.loop2_16x16:
3027    mov               dstq, dst_bakq
3028    lea           dst_endq, [dstq+stride30q]
3029    VP9_IDCT32_1D     tmpq, 2, 16
3030    add           dst_bakq, 8
3031    add               tmpq, 16
3032    dec               cntd
3033    jg .loop2_16x16
3034
3035    ; at the end of the loop, m7 should still be zero
3036    ; use that to zero out block coefficients
3037%if ARCH_X86_32
3038    DEFINE_ARGS block
3039    mov             blockq, r2mp
3040%endif
3041    ZERO_BLOCK      blockq, 64, 16, m1
3042    RET
3043%endif
3044
3045.idctfull:
3046%if ARCH_X86_32
3047    DEFINE_ARGS block, tmp, cnt
3048    mov             blockq, r2mp
3049%endif
3050    mov               cntd, 4
3051    mov               tmpq, rsp
3052.loop1_full:
3053    VP9_IDCT32_1D   blockq, 1
3054    add             blockq, 16
3055    add               tmpq, 512
3056    dec               cntd
3057    jg .loop1_full
3058
3059%if ARCH_X86_64
3060    sub             blockq, 64
3061%else
3062    DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp
3063    mov            strideq, r1mp
3064%define cntd dword r3m
3065%endif
3066
3067    mov          stride30q, strideq         ; stride
3068    lea           stride2q, [strideq*2]     ; stride*2
3069    shl          stride30q, 5               ; stride*32
3070    mov               cntd, 4
3071    mov               tmpq, rsp
3072    sub          stride30q, stride2q        ; stride*30
3073.loop2_full:
3074    mov               dstq, dst_bakq
3075    lea           dst_endq, [dstq+stride30q]
3076    VP9_IDCT32_1D     tmpq, 2
3077    add           dst_bakq, 8
3078    add               tmpq, 16
3079    dec               cntd
3080    jg .loop2_full
3081
3082    ; at the end of the loop, m7 should still be zero
3083    ; use that to zero out block coefficients
3084%if ARCH_X86_32
3085    DEFINE_ARGS block
3086    mov             blockq, r2mp
3087%endif
3088    ZERO_BLOCK      blockq, 64, 32, m1
3089    RET
3090%endmacro
3091
3092VP9_IDCT_IDCT_32x32_ADD_XMM sse2
3093VP9_IDCT_IDCT_32x32_ADD_XMM ssse3
3094VP9_IDCT_IDCT_32x32_ADD_XMM avx
3095
3096; this is almost identical to VP9_STORE_2X, but it does two rows
3097; for slightly improved interleaving, and it omits vpermq since the
3098; input is DC so all values are identical
3099%macro VP9_STORE_YMM_DC_2X2 6 ; reg, tmp1, tmp2, tmp3, tmp4, zero
3100    mova               m%2, [dstq]
3101    mova               m%4, [dstq+strideq]
3102    punpckhbw          m%3, m%2, m%6
3103    punpcklbw          m%2, m%6
3104    punpckhbw          m%5, m%4, m%6
3105    punpcklbw          m%4, m%6
3106    paddw              m%3, m%1
3107    paddw              m%2, m%1
3108    paddw              m%5, m%1
3109    paddw              m%4, m%1
3110    packuswb           m%2, m%3
3111    packuswb           m%4, m%5
3112    mova  [dstq+strideq*0], m%2
3113    mova  [dstq+strideq*1], m%4
3114%endmacro
3115
3116%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
3117INIT_YMM avx2
3118cglobal vp9_idct_idct_32x32_add, 4, 9, 16, 2048, dst, stride, block, eob
3119    cmp eobd, 135
3120    jg .idctfull
3121    cmp eobd, 1
3122    jg .idct16x16
3123
3124    ; dc-only case
3125    mova                m1, [pw_11585x2]
3126    vpbroadcastw        m0, [blockq]
3127    pmulhrsw            m0, m1
3128    pmulhrsw            m0, m1
3129    pxor                m5, m5
3130    pmulhrsw            m0, [pw_512]
3131    movd          [blockq], xm5
3132
3133    DEFINE_ARGS dst, stride, cnt
3134    mov               cntd, 16
3135.loop_dc:
3136    VP9_STORE_YMM_DC_2X2 0, 1, 2, 3, 4, 5
3137    lea               dstq, [dstq+2*strideq]
3138    dec               cntd
3139    jg .loop_dc
3140    RET
3141
3142    DEFINE_ARGS dst_bak, stride, block, cnt, dst, stride30, dst_end, stride2, tmp
3143.idct16x16:
3144    mov               tmpq, rsp
3145    VP9_IDCT32_1D   blockq, 1, 16
3146
3147    mov          stride30q, strideq         ; stride
3148    lea           stride2q, [strideq*2]     ; stride*2
3149    shl          stride30q, 5               ; stride*32
3150    mov               cntd, 2
3151    sub          stride30q, stride2q        ; stride*30
3152.loop2_16x16:
3153    mov               dstq, dst_bakq
3154    lea           dst_endq, [dstq+stride30q]
3155    VP9_IDCT32_1D     tmpq, 2, 16
3156    add           dst_bakq, 16
3157    add               tmpq, 32
3158    dec               cntd
3159    jg .loop2_16x16
3160
3161    ; at the end of the loop, m1 should still be zero
3162    ; use that to zero out block coefficients
3163    ZERO_BLOCK      blockq, 64, 16, m1
3164    RET
3165
3166.idctfull:
3167    mov               cntd, 2
3168    mov               tmpq, rsp
3169.loop1_full:
3170    VP9_IDCT32_1D   blockq, 1
3171    add             blockq, 32
3172    add               tmpq, 1024
3173    dec               cntd
3174    jg .loop1_full
3175
3176    sub             blockq, 64
3177
3178    mov          stride30q, strideq         ; stride
3179    lea           stride2q, [strideq*2]     ; stride*2
3180    shl          stride30q, 5               ; stride*32
3181    mov               cntd, 2
3182    mov               tmpq, rsp
3183    sub          stride30q, stride2q        ; stride*30
3184.loop2_full:
3185    mov               dstq, dst_bakq
3186    lea           dst_endq, [dstq+stride30q]
3187    VP9_IDCT32_1D     tmpq, 2
3188    add           dst_bakq, 16
3189    add               tmpq, 32
3190    dec               cntd
3191    jg .loop2_full
3192
3193    ; at the end of the loop, m1 should still be zero
3194    ; use that to zero out block coefficients
3195    ZERO_BLOCK      blockq, 64, 32, m1
3196    RET
3197%endif
3198