;*****************************************************************************
;* Copyright (C) 2013-2020 MulticoreWare, Inc
;*
;* Authors: Nabajit Deka <nabajit@multicorewareinc.com>
;*          Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
;*          Li Cao <li@multicorewareinc.com>
;*          Praveen Kumar Tiwari <Praveen@multicorewareinc.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at license @ x265.com.
;*****************************************************************************

; TODO: further optimize these routines.

%include "x86inc.asm"
%include "x86util.asm"
SECTION_RODATA 64

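; HEVC core-transform coefficient tables.  tab_dct32 below is the full
; 32-point DCT-II matrix, one 32-entry row per dw line (coefficients scaled
; so the DC row is 64); tab_dct16 and tab_dct8 further down are the matching
; 16- and 8-point matrices (each is the set of even rows of the next larger
; matrix).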
tab_dct32:      dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
                dw 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13,  4, -4, -13, -22, -31, -38, -46, -54, -61, -67, -73, -78, -82, -85, -88, -90, -90
                dw 90, 87, 80, 70, 57, 43, 25,  9, -9, -25, -43, -57, -70, -80, -87, -90, -90, -87, -80, -70, -57, -43, -25, -9,  9, 25, 43, 57, 70, 80, 87, 90
                dw 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13, 13, 38, 61, 78, 88, 90, 85, 73, 54, 31,  4, -22, -46, -67, -82, -90
                dw 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89, 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89
                dw 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22, -22, -61, -85, -90, -73, -38,  4, 46, 78, 90, 82, 54, 13, -31, -67, -88
                dw 87, 57,  9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87, -87, -57, -9, 43, 80, 90, 70, 25, -25, -70, -90, -80, -43,  9, 57, 87
                dw 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31, 31, 78, 90, 61,  4, -54, -88, -82, -38, 22, 73, 90, 67, 13, -46, -85
                dw 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83
                dw 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67,  4, 73, 88, 38, -38, -88, -73, -4, 67, 90, 46, -31, -85, -78, -13, 61, 90, 54, -22, -82
                dw 80,  9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80, -80, -9, 70, 87, 25, -57, -90, -43, 43, 90, 57, -25, -87, -70,  9, 80
                dw 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46, 46, 90, 38, -54, -90, -31, 61, 88, 22, -67, -85, -13, 73, 82,  4, -78
                dw 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75, 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75
                dw 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54, -54, -85,  4, 88, 46, -61, -82, 13, 90, 38, -67, -78, 22, 90, 31, -73
                dw 70, -43, -87,  9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70, -70, 43, 87, -9, -90, -25, 80, 57, -57, -80, 25, 90,  9, -87, -43, 70
                dw 67, -54, -78, 38, 85, -22, -90,  4, 90, 13, -88, -31, 82, 46, -73, -61, 61, 73, -46, -82, 31, 88, -13, -90, -4, 90, 22, -85, -38, 78, 54, -67
                dw 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64
                dw 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67, -67, -54, 78, 38, -85, -22, 90,  4, -90, 13, 88, -31, -82, 46, 73, -61
                dw 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87,  9, -90, 25, 80, -57, -57, 80, 25, -90,  9, 87, -43, -70, 70, 43, -87, -9, 90, -25, -80, 57
                dw 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73, 73, 31, -90, 22, 78, -67, -38, 90, -13, -82, 61, 46, -88,  4, 85, -54
                dw 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50, 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50
                dw 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82,  4, 78, -78, -4, 82, -73, -13, 85, -67, -22, 88, -61, -31, 90, -54, -38, 90, -46
                dw 43, -90, 57, 25, -87, 70,  9, -80, 80, -9, -70, 87, -25, -57, 90, -43, -43, 90, -57, -25, 87, -70, -9, 80, -80,  9, 70, -87, 25, 57, -90, 43
                dw 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82, 82, -22, -54, 90, -61, -13, 78, -85, 31, 46, -90, 67,  4, -73, 88, -38
                dw 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36
                dw 31, -78, 90, -61,  4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85, -85, 46, 13, -67, 90, -73, 22, 38, -82, 88, -54, -4, 61, -90, 78, -31
                dw 25, -70, 90, -80, 43,  9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25, -25, 70, -90, 80, -43, -9, 57, -87, 87, -57,  9, 43, -80, 90, -70, 25
                dw 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88, 88, -67, 31, 13, -54, 82, -90, 78, -46,  4, 38, -73, 90, -85, 61, -22
                dw 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18, 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18
                dw 13, -38, 61, -78, 88, -90, 85, -73, 54, -31,  4, 22, -46, 67, -82, 90, -90, 82, -67, 46, -22, -4, 31, -54, 73, -85, 90, -88, 78, -61, 38, -13
                dw  9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9, -9, 25, -43, 57, -70, 80, -87, 90, -90, 87, -80, 70, -57, 43, -25,  9
                dw  4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90, 90, -90, 88, -85, 82, -78, 73, -67, 61, -54, 46, -38, 31, -22, 13, -4
tab_dct16:      dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
                dw 90, 87, 80, 70, 57, 43, 25,  9, -9, -25, -43, -57, -70, -80, -87, -90
                dw 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89
                dw 87, 57,  9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87
                dw 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83
                dw 80,  9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80
                dw 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75
                dw 70, -43, -87,  9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70
                dw 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64
                dw 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87,  9, -90, 25, 80, -57
                dw 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50
                dw 43, -90, 57, 25, -87, 70,  9, -80, 80, -9, -70, 87, -25, -57, 90, -43
                dw 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36
                dw 25, -70, 90, -80, 43,  9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25
                dw 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18
                dw  9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9

dct16_shuf_AVX512:  dq 0, 1, 8, 9, 4, 5, 12, 13
dct16_shuf1_AVX512: dq 2, 3, 10, 11, 6, 7, 14, 15
dct16_shuf3_AVX512: dq 0, 1, 4, 5, 8, 9, 12, 13
dct16_shuf4_AVX512: dq 2, 3, 6, 7, 10, 11, 14, 15
dct16_shuf2_AVX512: dd 0, 4, 8, 12, 2, 6, 10, 14, 16, 20, 24, 28, 18, 22, 26, 30

dct8_shuf5_AVX512: dq 0, 2, 4, 6, 1, 3, 5, 7
dct8_shuf6_AVX512: dq 0, 2, 4, 6, 1, 3, 5, 7
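; note: dct8_shuf6_AVX512 duplicates dct8_shuf5_AVX512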
dct8_shuf8_AVX512: dd 0, 2, 8, 10, 4, 6, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
dct8_shuf4_AVX512: times 2 dd 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
dct16_shuf7_AVX512: dd 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
dct16_shuf9_AVX512: dd 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15

dct32_shuf_AVX512:  dd 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
dct32_shuf4_AVX512: times 2 dd 0, 4, 8, 12, 0, 4, 8, 12
dct32_shuf5_AVX512: dd 0, 0, 0, 0, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0
dct32_shuf6_AVX512: dd 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, 0, 0, 0, 0
dct32_shuf7_AVX512: dd 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1
dct32_shuf8_AVX512: dd -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
dct16_shuf5_AVX512: dw 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
dct16_shuf6_AVX512: dw 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
dct16_shuf8_AVX512: dw 20, 0, 4, 2, 28, 8, 6, 10, 22, 16, 12, 18, 30, 24, 14, 26

dct8_shuf7_AVX512: dw 0, 2, 16, 18, 8, 10, 24, 26, 4, 6, 20, 22, 12, 14, 28, 30
dct8_shuf9_AVX512: times 2 dw 0, 8, 16, 24, 4, 12, 20, 28
dct32_shuf1_AVX512: dw 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16
dct32_shuf2_AVX512: dw 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23, 15, 14, 13, 12, 11, 10, 9, 8, 31, 30, 29, 28, 27, 26, 25, 24
dct32_shuf3_AVX512: times 2 dw 0, 8, 16, 24, 2, 10, 18, 26

dct8_shuf:         times 2 db 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9
dct8_shuf_AVX512:  times 2 db 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11

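; 8-point DCT-II matrix, one row per dw line.  tab_dct8_avx512 stores only
; the first four coefficients of each row, two rows per dw line; the
; remaining halves follow from the rows' (anti)symmetry.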
tab_dct8:       dw 64, 64, 64, 64, 64, 64, 64, 64
                dw 89, 75, 50, 18, -18, -50, -75, -89
                dw 83, 36, -36, -83, -83, -36, 36, 83
                dw 75, -18, -89, -50, 50, 89, 18, -75
                dw 64, -64, -64, 64, 64, -64, -64, 64
                dw 50, -89, 18, 75, -75, -18, 89, -50
                dw 36, -83, 83, -36, -36, 83, -83, 36
                dw 18, -50, 75, -89, 89, -75, 50, -18

tab_dct8_avx512: dw 64, 64, 64, 64, 89, 75, 50, 18
                 dw 83, 36, -36, -83, 75, -18, -89, -50
                 dw 64, -64, -64, 64, 50, -89, 18, 75
                 dw 36, -83, 83, -36, 18, -50, 75, -89

tab_dct16_1:    dw 64, 64, 64, 64, 64, 64, 64, 64
                dw 90, 87, 80, 70, 57, 43, 25,  9
                dw 89, 75, 50, 18, -18, -50, -75, -89
                dw 87, 57,  9, -43, -80, -90, -70, -25
                dw 83, 36, -36, -83, -83, -36, 36, 83
                dw 80,  9, -70, -87, -25, 57, 90, 43
                dw 75, -18, -89, -50, 50, 89, 18, -75
                dw 70, -43, -87,  9, 90, 25, -80, -57
                dw 64, -64, -64, 64, 64, -64, -64, 64
                dw 57, -80, -25, 90, -9, -87, 43, 70
                dw 50, -89, 18, 75, -75, -18, 89, -50
                dw 43, -90, 57, 25, -87, 70,  9, -80
                dw 36, -83, 83, -36, -36, 83, -83, 36
                dw 25, -70, 90, -80, 43,  9, -57, 87
                dw 18, -50, 75, -89, 89, -75, 50, -18
                dw  9, -25, 43, -57, 70, -80, 87, -90

tab_dct16_2:    dw 64, 64, 64, 64, 64, 64, 64, 64
                dw -9, -25, -43, -57, -70, -80, -87, -90
                dw -89, -75, -50, -18, 18, 50, 75, 89
                dw 25, 70, 90, 80, 43, -9, -57, -87
                dw 83, 36, -36, -83, -83, -36, 36, 83
                dw -43, -90, -57, 25, 87, 70, -9, -80
                dw -75, 18, 89, 50, -50, -89, -18, 75
                dw 57, 80, -25, -90, -9, 87, 43, -70
                dw 64, -64, -64, 64, 64, -64, -64, 64
                dw -70, -43, 87,  9, -90, 25, 80, -57
                dw -50, 89, -18, -75, 75, 18, -89, 50
                dw 80, -9, -70, 87, -25, -57, 90, -43
                dw 36, -83, 83, -36, -36, 83, -83, 36
                dw -87, 57, -9, -43, 80, -90, 70, -25
                dw -18, 50, -75, 89, -89, 75, -50, 18
                dw 90, -87, 80, -70, 57, -43, 25, -9

dct16_shuf1:     times 2 db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1

dct16_shuf2:    times 2 db 0, 1, 14, 15, 2, 3, 12, 13, 4, 5, 10, 11, 6, 7, 8, 9

tab_dct32_1:    dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
                dw 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13,  4
                dw 90, 87, 80, 70, 57, 43, 25,  9, -9, -25, -43, -57, -70, -80, -87, -90
                dw 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13
                dw 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89
                dw 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22
                dw 87, 57,  9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87
                dw 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31
                dw 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83
                dw 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67,  4, 73, 88, 38
                dw 80,  9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80
                dw 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46
                dw 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75
                dw 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54
                dw 70, -43, -87,  9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70
                dw 67, -54, -78, 38, 85, -22, -90,  4, 90, 13, -88, -31, 82, 46, -73, -61
                dw 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64
                dw 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67
                dw 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87,  9, -90, 25, 80, -57
                dw 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73
                dw 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50
                dw 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82,  4, 78
                dw 43, -90, 57, 25, -87, 70,  9, -80, 80, -9, -70, 87, -25, -57, 90, -43
                dw 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82
                dw 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36
                dw 31, -78, 90, -61,  4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85
                dw 25, -70, 90, -80, 43,  9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25
                dw 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88
                dw 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18
                dw 13, -38, 61, -78, 88, -90, 85, -73, 54, -31,  4, 22, -46, 67, -82, 90
                dw  9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9
                dw  4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90

tab_dct32_2:    dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
                dw -4, -13, -22, -31, -38, -46, -54, -61, -67, -73, -78, -82, -85, -88, -90, -90
                dw -90, -87, -80, -70, -57, -43, -25, -9,  9, 25, 43, 57, 70, 80, 87, 90
                dw 13, 38, 61, 78, 88, 90, 85, 73, 54, 31,  4, -22, -46, -67, -82, -90
                dw 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89
                dw -22, -61, -85, -90, -73, -38,  4, 46, 78, 90, 82, 54, 13, -31, -67, -88
                dw -87, -57, -9, 43, 80, 90, 70, 25, -25, -70, -90, -80, -43,  9, 57, 87
                dw 31, 78, 90, 61,  4, -54, -88, -82, -38, 22, 73, 90, 67, 13, -46, -85
                dw 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83
                dw -38, -88, -73, -4, 67, 90, 46, -31, -85, -78, -13, 61, 90, 54, -22, -82
                dw -80, -9, 70, 87, 25, -57, -90, -43, 43, 90, 57, -25, -87, -70,  9, 80
                dw 46, 90, 38, -54, -90, -31, 61, 88, 22, -67, -85, -13, 73, 82,  4, -78
                dw 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75
                dw -54, -85,  4, 88, 46, -61, -82, 13, 90, 38, -67, -78, 22, 90, 31, -73
                dw -70, 43, 87, -9, -90, -25, 80, 57, -57, -80, 25, 90,  9, -87, -43, 70
                dw 61, 73, -46, -82, 31, 88, -13, -90, -4, 90, 22, -85, -38, 78, 54, -67
                dw 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64
                dw -67, -54, 78, 38, -85, -22, 90,  4, -90, 13, 88, -31, -82, 46, 73, -61
                dw -57, 80, 25, -90,  9, 87, -43, -70, 70, 43, -87, -9, 90, -25, -80, 57
                dw 73, 31, -90, 22, 78, -67, -38, 90, -13, -82, 61, 46, -88,  4, 85, -54
                dw 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50
                dw -78, -4, 82, -73, -13, 85, -67, -22, 88, -61, -31, 90, -54, -38, 90, -46
                dw -43, 90, -57, -25, 87, -70, -9, 80, -80,  9, 70, -87, 25, 57, -90, 43
                dw 82, -22, -54, 90, -61, -13, 78, -85, 31, 46, -90, 67,  4, -73, 88, -38
                dw 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36
                dw -85, 46, 13, -67, 90, -73, 22, 38, -82, 88, -54, -4, 61, -90, 78, -31
                dw -25, 70, -90, 80, -43, -9, 57, -87, 87, -57,  9, 43, -80, 90, -70, 25
                dw 88, -67, 31, 13, -54, 82, -90, 78, -46,  4, 38, -73, 90, -85, 61, -22
                dw 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18
                dw -90, 82, -67, 46, -22, -4, 31, -54, 73, -85, 90, -88, 78, -61, 38, -13
                dw -9, 25, -43, 57, -70, 80, -87, 90, -90, 87, -80, 70, -57, 43, -25,  9
                dw 90, -90, 88, -85, 82, -78, 73, -67, 61, -54, 46, -38, 31, -22, 13, -4

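; idct8 constant tables: coefficients are pre-interleaved as pairs so that
; one pmaddwd yields a whole butterfly term; times 4 replicates each pair
; across a ymm register (avx2_*), times 8 across a zmm register (avx512_*).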
avx2_idct8_1:   times 4 dw 64, 83, 64, 36
                times 4 dw 64, 36, -64, -83
                times 4 dw 64, -36, -64, 83
                times 4 dw 64, -83, 64, -36

avx2_idct8_2:   times 4 dw 89, 75, 50, 18
                times 4 dw 75, -18, -89, -50
                times 4 dw 50, -89, 18, 75
                times 4 dw 18, -50, 75, -89

avx512_idct8_1:   times 8 dw 64, 83, 64, 36
                  times 8 dw 64, 36, -64, -83
                  times 8 dw 64, -36, -64, 83
                  times 8 dw 64, -83, 64, -36

avx512_idct8_2:   times 8 dw 89, 75, 50, 18
                  times 8 dw 75, -18, -89, -50
                  times 8 dw 50, -89, 18, 75
                  times 8 dw 18, -50, 75, -89

avx512_idct8_3:   dw 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36
                  dw 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83
                  dw 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83
                  dw -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36
                  dw 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 50, -89, 50, -89, 50, -89, 50, -89, 50, -89, 50, -89, 50, -89, 50, -89
                  dw 50, 18, 50, 18, 50, 18, 50, 18, 50, 18, 50, 18, 50, 18, 50, 18, 18, 75, 18, 75, 18, 75, 18, 75, 18, 75, 18, 75, 18, 75, 18, 75
                  dw 75, -18, 75, -18, 75, -18, 75, -18, 75, -18, 75, -18, 75, -18, 75, -18, 18, -50, 18, -50, 18, -50, 18, -50, 18, -50, 18, -50, 18, -50, 18, -50
                  dw -89, -50, -89, -50, -89, -50, -89, -50, -89, -50, -89, -50, -89, -50, -89, -50, 75, -89, 75, -89, 75, -89, 75, -89, 75, -89, 75, -89, 75, -89, 75, -89

idct8_shuf1:    dd 0, 2, 4, 6, 1, 3, 5, 7

const idct8_shuf2,    times 2 db 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15

idct8_shuf3:    times 2 db 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3


idct8_avx512_shuf3:    times 4 db 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3

tab_idct16_1:   dw 90, 87, 80, 70, 57, 43, 25, 9
                dw 87, 57, 9, -43, -80, -90, -70, -25
                dw 80, 9, -70, -87, -25, 57, 90, 43
                dw 70, -43, -87, 9, 90, 25, -80, -57
                dw 57, -80, -25, 90, -9, -87, 43, 70
                dw 43, -90, 57, 25, -87, 70, 9, -80
                dw 25, -70, 90, -80, 43, 9, -57, 87
                dw 9, -25, 43, -57, 70, -80, 87, -90

tab_idct16_2:   dw 64, 89, 83, 75, 64, 50, 36, 18
                dw 64, 75, 36, -18, -64, -89, -83, -50
                dw 64, 50, -36, -89, -64, 18, 83, 75
                dw 64, 18, -83, -50, 64, 75, -36, -89
                dw 64, -18, -83, 50, 64, -75, -36, 89
                dw 64, -50, -36, 89, -64, -18, 83, -75
                dw 64, -75, 36, 18, -64, 89, -83, 50
                dw 64, -89, 83, -75, 64, -50, 36, -18

idct16_shuff:   dd 0, 4, 2, 6, 1, 5, 3, 7

idct16_shuff1:  dd 2, 6, 0, 4, 3, 7, 1, 5
idct16_shuff2:  dw 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30
idct16_shuff3:  dw 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31
idct16_shuff4:  dd 0, 8, 2, 10, 4, 12, 6, 14
idct16_shuff5:  dd 1, 9, 3, 11, 5, 13, 7, 15


tab_AVX512_idct16_1:   dw 90, 87, 80, 70, 57, 43, 25, 9, 90, 87, 80, 70, 57, 43, 25, 9, 80, 9, -70, -87, -25, 57, 90, 43, 80, 9, -70, -87, -25, 57, 90, 43
                       dw 87, 57, 9, -43, -80, -90, -70, -25, 87, 57, 9, -43, -80, -90, -70, -25, 70, -43, -87, 9, 90, 25, -80, -57, 70, -43, -87, 9, 90, 25, -80, -57
                       dw 57, -80, -25, 90, -9, -87, 43, 70, 57, -80, -25, 90, -9, -87, 43, 70, 25, -70, 90, -80, 43, 9, -57, 87, 25, -70, 90, -80, 43, 9, -57, 87
                       dw 43, -90, 57, 25, -87, 70, 9, -80, 43, -90, 57, 25, -87, 70, 9, -80, 9, -25, 43, -57, 70, -80, 87, -90, 9, -25, 43, -57, 70, -80, 87, -90

tab_AVX512_idct16_2:   dw 64, 89, 83, 75, 64, 50, 36, 18, 64, 89, 83, 75, 64, 50, 36, 18, 64, 50, -36, -89, -64, 18, 83, 75, 64, 50, -36, -89, -64, 18, 83, 75
                       dw 64, 75, 36, -18, -64, -89, -83, -50, 64, 75, 36, -18, -64, -89, -83, -50, 64, 18, -83, -50, 64, 75, -36, -89, 64, 18, -83, -50, 64, 75, -36, -89
                       dw 64, -18, -83, 50, 64, -75, -36, 89, 64, -18, -83, 50, 64, -75, -36, 89, 64, -75, 36, 18, -64, 89, -83, 50, 64, -75, 36, 18, -64, 89, -83, 50
                       dw 64, -50, -36, 89, -64, -18, 83, -75, 64, -50, -36, 89, -64, -18, 83, -75, 64, -89, 83, -75, 64, -50, 36, -18, 64, -89, 83, -75, 64, -50, 36, -18

idct16_AVX512_shuff:   dd 0, 4, 2, 6, 1, 5, 3, 7, 8, 12, 10, 14, 9, 13, 11, 15

idct16_AVX512_shuff1:  dd 2, 6, 0, 4, 3, 7, 1, 5, 10, 14, 8, 12, 11, 15, 9, 13

idct16_AVX512_shuff2:   dq 0, 1, 8, 9, 4, 5, 12, 13
idct16_AVX512_shuff3:   dq 2, 3, 10, 11, 6, 7, 14, 15
idct16_AVX512_shuff4:   dq 4, 5, 12, 13, 0, 1, 8, 9
idct16_AVX512_shuff5:   dq 6, 7, 14, 15, 2, 3, 10, 11
idct16_AVX512_shuff6:   times 4 db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1

tab_idct32_1:   dw 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4
                dw 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13
                dw 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22
                dw 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31
                dw 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38
                dw 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46
                dw 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54
                dw 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61
                dw 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67
                dw 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73
                dw 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78
                dw 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82
                dw 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85
                dw 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88
                dw 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90
                dw 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90


tab_idct32_2:   dw 64, 89, 83, 75, 64, 50, 36, 18
                dw 64, 75, 36, -18, -64, -89, -83, -50
                dw 64, 50, -36, -89, -64, 18, 83, 75
                dw 64, 18, -83, -50, 64, 75, -36, -89
                dw 64, -18, -83, 50, 64, -75, -36, 89
                dw 64, -50, -36, 89, -64, -18, 83, -75
                dw 64, -75, 36, 18, -64, 89, -83, 50
                dw 64, -89, 83, -75, 64, -50, 36, -18


tab_idct32_3:   dw 90, 87, 80, 70, 57, 43, 25, 9
                dw 87, 57, 9, -43, -80, -90, -70, -25
                dw 80, 9, -70, -87, -25, 57, 90, 43
                dw 70, -43, -87, 9, 90, 25, -80, -57
                dw 57, -80, -25, 90, -9, -87, 43, 70
                dw 43, -90, 57, 25, -87, 70, 9, -80
                dw 25, -70, 90, -80, 43, 9, -57, 87
                dw 9, -25, 43, -57, 70, -80, 87, -90

tab_idct32_4:   dw 64, 90, 89, 87, 83, 80, 75, 70, 64, 57, 50, 43, 36, 25, 18, 9
                dw 64, 87, 75, 57, 36, 9, -18, -43, -64, -80, -89, -90, -83, -70, -50, -25
                dw 64, 80, 50, 9, -36, -70, -89, -87, -64, -25, 18, 57, 83, 90, 75, 43
                dw 64, 70, 18, -43, -83, -87, -50, 9, 64, 90, 75, 25, -36, -80, -89, -57
                dw 64, 57, -18, -80, -83, -25, 50, 90, 64, -9, -75, -87, -36, 43, 89, 70
                dw 64, 43, -50, -90, -36, 57, 89, 25, -64, -87, -18, 70, 83, 9, -75, -80
                dw 64, 25, -75, -70, 36, 90, 18, -80, -64, 43, 89, 9, -83, -57, 50, 87
                dw 64, 9, -89, -25, 83, 43, -75, -57, 64, 70, -50, -80, 36, 87, -18, -90
                dw 64, -9, -89, 25, 83, -43, -75, 57, 64, -70, -50, 80, 36, -87, -18, 90
                dw 64, -25, -75, 70, 36, -90, 18, 80, -64, -43, 89, -9, -83, 57, 50, -87
                dw 64, -43, -50, 90, -36, -57, 89, -25, -64, 87, -18, -70, 83, -9, -75, 80
                dw 64, -57, -18, 80, -83, 25, 50, -90, 64, 9, -75, 87, -36, -43, 89, -70
                dw 64, -70, 18, 43, -83, 87, -50, -9, 64, -90, 75, -25, -36, 80, -89, 57
                dw 64, -80, 50, -9, -36, 70, -89, 87, -64, 25, 18, -57, 83, -90, 75, -43
                dw 64, -87, 75, -57, 36, -9, -18, 43, -64, 80, -89, 90, -83, 70, -50, 25
                dw 64, -90, 89, -87, 83, -80, 75, -70, 64, -57, 50, -43, 36, -25, 18, -9


tab_idct32_AVX512_1:   dw 90, 90, 88, 85, 82, 78, 73, 67, 90, 90, 88, 85, 82, 78, 73, 67, 90, 82, 67, 46, 22, -4, -31, -54, 90, 82, 67, 46, 22, -4, -31, -54
                       dw 61, 54, 46, 38, 31, 22, 13, 4, 61, 54, 46, 38, 31, 22, 13, 4, -73, -85, -90, -88, -78, -61, -38, -13, -73, -85, -90, -88, -78, -61, -38, -13
                       dw 88, 67, 31, -13, -54, -82, -90, -78, 88, 67, 31, -13, -54, -82, -90, -78, 85, 46, -13, -67, -90, -73, -22, 38, 85, 46, -13, -67, -90, -73, -22, 38
                       dw -46, -4, 38, 73, 90, 85, 61, 22, -46, -4, 38, 73, 90, 85, 61, 22, 82, 88, 54, -4, -61, -90, -78, -31, 82, 88, 54, -4, -61, -90, -78, -31
                       dw 82, 22, -54, -90, -61, 13, 78, 85, 82, 22, -54, -90, -61, 13, 78, 85, 78, -4, -82, -73, 13, 85, 67, -22, 78, -4, -82, -73, 13, 85, 67, -22
                       dw 31, -46, -90, -67, 4, 73, 88, 38, 31, -46, -90, -67, 4, 73, 88, 38, -88, -61, 31, 90, 54, -38, -90, -46, -88, -61, 31, 90, 54, -38, -90, -46
                       dw 73, -31, -90, -22, 78, 67, -38, -90, 73, -31, -90, -22, 78, 67, -38, -90, 67, -54, -78, 38, 85, -22, -90, 4, 67, -54, -78, 38, 85, -22, -90, 4
                       dw -13, 82, 61, -46, -88, -4, 85, 54, -13, 82, 61, -46, -88, -4, 85, 54, 90, 13, -88, -31, 82, 46, -73, -61, 90, 13, -88, -31, 82, 46, -73, -61

tab_idct32_AVX512_5:   dw 4, -13, 22, -31, 38, -46, 54, -61, 4, -13, 22, -31, 38, -46, 54, -61, 13, -38, 61, -78, 88, -90, 85, -73, 13, -38, 61, -78, 88, -90, 85, -73
                       dw 67, -73, 78, -82, 85, -88, 90, -90, 67, -73, 78, -82, 85, -88, 90, -90, 54, -31, 4, 22, -46, 67, -82, 90, 54, -31, 4, 22, -46, 67, -82, 90
                       dw 22, -61, 85, -90, 73, -38, -4, 46, 22, -61, 85, -90, 73, -38, -4, 46, 31, -78, 90, -61, 4, 54, -88, 82, 31, -78, 90, -61, 4, 54, -88, 82
                       dw -78, 90, -82, 54, -13, -31, 67, -88, -78, 90, -82, 54, -13, -31, 67, -88, -38, -22, 73, -90, 67, -13, -46, 85, -38, -22, 73, -90, 67, -13, -46, 85
                       dw 38, -88, 73, -4, -67, 90, -46, -31, 38, -88, 73, -4, -67, 90, -46, -31, 46, -90, 38, 54, -90, 31, 61, -88, 46, -90, 38, 54, -90, 31, 61, -88
                       dw 85, -78, 13, 61, -90, 54, 22, -82, 85, -78, 13, 61, -90, 54, 22, -82, 22, 67, -85, 13, 73, -82, 4, 78, 22, 67, -85, 13, 73, -82, 4, 78
                       dw 54, -85, -4, 88, -46, -61, 82, 13, 54, -85, -4, 88, -46, -61, 82, 13, 61, -73, -46, 82, 31, -88, -13, 90, 61, -73, -46, 82, 31, -88, -13, 90
                       dw -90, 38, 67, -78, -22, 90, -31, -73, -90, 38, 67, -78, -22, 90, -31, -73, -4, -90, 22, 85, -38, -78, 54, 67, -4, -90, 22, 85, -38, -78, 54, 67


tab_idct32_AVX512_2:   dw 64, 89, 83, 75, 64, 50, 36, 18, 64, 89, 83, 75, 64, 50, 36, 18, 64, 75, 36, -18, -64, -89, -83, -50, 64, 75, 36, -18, -64, -89, -83, -50
                       dw 64, 50, -36, -89, -64, 18, 83, 75, 64, 50, -36, -89, -64, 18, 83, 75, 64, 18, -83, -50, 64, 75, -36, -89, 64, 18, -83, -50, 64, 75, -36, -89
                       dw 64, -18, -83, 50, 64, -75, -36, 89, 64, -18, -83, 50, 64, -75, -36, 89, 64, -50, -36, 89, -64, -18, 83, -75, 64, -50, -36, 89, -64, -18, 83, -75
                       dw 64, -75, 36, 18, -64, 89, -83, 50, 64, -75, 36, 18, -64, 89, -83, 50, 64, -89, 83, -75, 64, -50, 36, -18, 64, -89, 83, -75, 64, -50, 36, -18

tab_idct32_AVX512_3:   dw 90, 87, 80, 70, 57, 43, 25, 9, 90, 87, 80, 70, 57, 43, 25, 9, 87, 57, 9, -43, -80, -90, -70, -25, 87, 57, 9, -43, -80, -90, -70, -25
                       dw 80, 9, -70, -87, -25, 57, 90, 43, 80, 9, -70, -87, -25, 57, 90, 43, 70, -43, -87, 9, 90, 25, -80, -57, 70, -43, -87, 9, 90, 25, -80, -57
                       dw 57, -80, -25, 90, -9, -87, 43, 70, 57, -80, -25, 90, -9, -87, 43, 70, 43, -90, 57, 25, -87, 70, 9, -80, 43, -90, 57, 25, -87, 70, 9, -80
                       dw 25, -70, 90, -80, 43, 9, -57, 87, 25, -70, 90, -80, 43, 9, -57, 87, 9, -25, 43, -57, 70, -80, 87, -90, 9, -25, 43, -57, 70, -80, 87, -90

tab_idct32_AVX512_4:   dw 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4, 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4
                       dw 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13, 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13
                       dw 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22, 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22
                       dw 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31, 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31
                       dw 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38, 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38
                       dw 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46, 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46
                       dw 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54, 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54
                       dw 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61, 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61
                       dw 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67, 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67
                       dw 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73, 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73
                       dw 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78, 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78
                       dw 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82, 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82
                       dw 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85, 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85
                       dw 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88, 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88
                       dw 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90, 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90
                       dw 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90, 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90

tab_idct32_AVX512_6:   dw 64, 90, 89, 87, 83, 80, 75, 70, 64, 57, 50, 43, 36, 25, 18, 9, 64, 90, 89, 87, 83, 80, 75, 70, 64, 57, 50, 43, 36, 25, 18, 9
                       dw 64, 87, 75, 57, 36, 9, -18, -43, -64, -80, -89, -90, -83, -70, -50, -25, 64, 87, 75, 57, 36, 9, -18, -43, -64, -80, -89, -90, -83, -70, -50, -25
                       dw 64, 80, 50, 9, -36, -70, -89, -87, -64, -25, 18, 57, 83, 90, 75, 43, 64, 80, 50, 9, -36, -70, -89, -87, -64, -25, 18, 57, 83, 90, 75, 43
                       dw 64, 70, 18, -43, -83, -87, -50, 9, 64, 90, 75, 25, -36, -80, -89, -57, 64, 70, 18, -43, -83, -87, -50, 9, 64, 90, 75, 25, -36, -80, -89, -57
                       dw 64, 57, -18, -80, -83, -25, 50, 90, 64, -9, -75, -87, -36, 43, 89, 70, 64, 57, -18, -80, -83, -25, 50, 90, 64, -9, -75, -87, -36, 43, 89, 70
                       dw 64, 43, -50, -90, -36, 57, 89, 25, -64, -87, -18, 70, 83, 9, -75, -80, 64, 43, -50, -90, -36, 57, 89, 25, -64, -87, -18, 70, 83, 9, -75, -80
                       dw 64, 25, -75, -70, 36, 90, 18, -80, -64, 43, 89, 9, -83, -57, 50, 87, 64, 25, -75, -70, 36, 90, 18, -80, -64, 43, 89, 9, -83, -57, 50, 87
                       dw 64, 9, -89, -25, 83, 43, -75, -57, 64, 70, -50, -80, 36, 87, -18, -90, 64, 9, -89, -25, 83, 43, -75, -57, 64, 70, -50, -80, 36, 87, -18, -90
                       dw 64, -9, -89, 25, 83, -43, -75, 57, 64, -70, -50, 80, 36, -87, -18, 90, 64, -9, -89, 25, 83, -43, -75, 57, 64, -70, -50, 80, 36, -87, -18, 90
                       dw 64, -25, -75, 70, 36, -90, 18, 80, -64, -43, 89, -9, -83, 57, 50, -87, 64, -25, -75, 70, 36, -90, 18, 80, -64, -43, 89, -9, -83, 57, 50, -87
                       dw 64, -43, -50, 90, -36, -57, 89, -25, -64, 87, -18, -70, 83, -9, -75, 80, 64, -43, -50, 90, -36, -57, 89, -25, -64, 87, -18, -70, 83, -9, -75, 80
                       dw 64, -57, -18, 80, -83, 25, 50, -90, 64, 9, -75, 87, -36, -43, 89, -70, 64, -57, -18, 80, -83, 25, 50, -90, 64, 9, -75, 87, -36, -43, 89, -70
                       dw 64, -70, 18, 43, -83, 87, -50, -9, 64, -90, 75, -25, -36, 80, -89, 57, 64, -70, 18, 43, -83, 87, -50, -9, 64, -90, 75, -25, -36, 80, -89, 57
                       dw 64, -80, 50, -9, -36, 70, -89, 87, -64, 25, 18, -57, 83, -90, 75, -43, 64, -80, 50, -9, -36, 70, -89, 87, -64, 25, 18, -57, 83, -90, 75, -43
                       dw 64, -87, 75, -57, 36, -9, -18, 43, -64, 80, -89, 90, -83, 70, -50, 25, 64, -87, 75, -57, 36, -9, -18, 43, -64, 80, -89, 90, -83, 70, -50, 25
                       dw 64, -90, 89, -87, 83, -80, 75, -70, 64, -57, 50, -43, 36, -25, 18, -9, 64, -90, 89, -87, 83, -80, 75, -70, 64, -57, 50, -43, 36, -25, 18, -9


avx2_dct4:      dw 64, 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64, -64
                dw 83, 36, 83, 36, 83, 36, 83, 36, 36, -83, 36, -83, 36, -83, 36, -83

avx2_idct4_1:   dw 64, 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64, -64
                dw 83, 36, 83, 36, 83, 36, 83, 36, 36, -83, 36, -83, 36, -83, 36, -83

avx2_idct4_2:   dw 64, 64, 64, -64, 83, 36, 36, -83

const idct4_shuf1,    times 2 db 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15

idct4_shuf2:    times 2 db 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11

tab_dct4:       times 4 dw 64, 64
                times 4 dw 83, 36
                times 4 dw 64, -64
                times 4 dw 36, -83

dct4_shuf:      db 0, 1, 2, 3, 8, 9, 10, 11, 6, 7, 4, 5, 14, 15, 12, 13

tab_dst4:       times 2 dw 29, 55, 74, 84
                times 2 dw 74, 74,  0, -74
                times 2 dw 84, -29, -74, 55
                times 2 dw 55, -84, 74, -29

pw_dst4_tab:    times 4 dw 29,  55,  74,  84
                times 4 dw 74,  74,   0, -74
                times 4 dw 84, -29, -74,  55
                times 4 dw 55, -84,  74, -29
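; pw_dst4_tab repeats each tab_dst4 row four times (256 bits) so the AVX2
; dst4 below can feed full-width pmaddwd loads.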

tab_idst4:      times 4 dw  29,  84
                times 4 dw  74,  55
                times 4 dw  55, -29
                times 4 dw  74, -84
                times 4 dw  74, -74
                times 4 dw   0,  74
                times 4 dw  84,  55
                times 4 dw -74, -29

pw_idst4_tab:   times 4 dw  29,  84
                times 4 dw  55, -29
                times 4 dw  74,  55
                times 4 dw  74, -84
                times 4 dw  74, -74
                times 4 dw  84,  55
                times 4 dw   0,  74
                times 4 dw -74, -29
pb_idst4_shuf:  times 2 db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15

tab_dct8_1:     times 2 dw 89, 50, 75, 18
                times 2 dw 75, -89, -18, -50
                times 2 dw 50, 18, -89, 75
                times 2 dw 18, 75, -50, -89

tab_dct8_2:     times 2 dd 83, 36
                times 2 dd 36, 83
                times 1 dd 89, 75, 50, 18
                times 1 dd 75, -18, -89, -50
                times 1 dd 50, -89, 18, 75
                times 1 dd 18, -50, 75, -89

tab_idct8_3:    times 4 dw 89, 75
                times 4 dw 50, 18
                times 4 dw 75, -18
                times 4 dw -89, -50
                times 4 dw 50, -89
                times 4 dw 18, 75
                times 4 dw 18, -50
                times 4 dw 75, -89

pb_unpackhlw1:  db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15

pb_idct8even:   db 0, 1, 8, 9, 4, 5, 12, 13, 0, 1, 8, 9, 4, 5, 12, 13

tab_idct8_1:    times 1 dw 64, -64, 36, -83, 64, 64, 83, 36

tab_idct8_2:    times 1 dw 89, 75, 50, 18, 75, -18, -89, -50
                times 1 dw 50, -89, 18, 75, 18, -50, 75, -89
pb_idct8odd:    db 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15

; Scale bits tables for rdoQuant
tab_nonpsyRdo8:  dq 5, 7, 9, 11
tab_nonpsyRdo10: dq 9, 11, 13, 15
tab_nonpsyRdo12: dq 13, 15, 17, 19

SECTION .text
cextern pd_1
cextern pd_2
cextern pd_4
cextern pd_8
cextern pd_16
cextern pd_32
cextern pd_64
cextern pd_128
cextern pd_256
cextern pd_512
cextern pd_1024
cextern pd_2048
cextern pw_ppppmmmm
cextern trans8_shuf


%if BIT_DEPTH == 12
    %define     DCT4_SHIFT          5
    %define     DCT4_ROUND          16
    %define     IDCT_SHIFT          8
    %define     IDCT_ROUND          128
    %define     DST4_SHIFT          5
    %define     DST4_ROUND          16
    %define     DCT8_SHIFT1         6
    %define     DCT8_ROUND1         32
    %define     RDO_MAX_4           3
    %define     RDO_MAX_8           1
    %define     RDO_MAX_16          0
    %define     RDO_MAX_32          0
%elif BIT_DEPTH == 10
    %define     DCT4_SHIFT          3
    %define     DCT4_ROUND          4
    %define     IDCT_SHIFT          10
    %define     IDCT_ROUND          512
    %define     DST4_SHIFT          3
    %define     DST4_ROUND          4
    %define     DCT8_SHIFT1         4
    %define     DCT8_ROUND1         8
    %define     RDO_MAX_4           7
    %define     RDO_MAX_8           5
    %define     RDO_MAX_16          3
    %define     RDO_MAX_32          1
%elif BIT_DEPTH == 8
    %define     DCT4_SHIFT          1
    %define     DCT4_ROUND          1
    %define     IDCT_SHIFT          12
    %define     IDCT_ROUND          2048
    %define     DST4_SHIFT          1
    %define     DST4_ROUND          1
    %define     DCT8_SHIFT1         2
    %define     DCT8_ROUND1         2
    %define     RDO_MAX_4           11
    %define     RDO_MAX_8           9
    %define     RDO_MAX_16          7
    %define     RDO_MAX_32          5
%else
    %error Unsupported BIT_DEPTH!
%endif

%define         DCT8_ROUND2         256
%define         DCT8_SHIFT2         9
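; HEVC transform shift rules behind the constants above: forward pass 1
; shifts by log2(N) + BIT_DEPTH - 9 (DCT4_SHIFT, DST4_SHIFT, DCT8_SHIFT1),
; forward pass 2 by log2(N) + 6 (the hard-coded 8 in the 4-point routines,
; DCT8_SHIFT2 = 9 for the 8-point one); inverse pass 1 shifts by 7 and
; pass 2 by 20 - BIT_DEPTH (IDCT_SHIFT).  Every *_ROUND value is the
; matching 1 << (shift - 1).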

;------------------------------------------------------
;void dct4(const int16_t* src, int16_t* dst, intptr_t srcStride)
;------------------------------------------------------
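; Two passes of a 4-point butterfly: pass 1 over the rows shifts by
; DCT4_SHIFT, pass 2 over the intermediate columns shifts by 8 (rounding
; with pd_128).  A scalar sketch of one pass (illustrative only; these
; names are not in this file):
;
;     E[0] = s[0] + s[3];   O[0] = s[0] - s[3];
;     E[1] = s[1] + s[2];   O[1] = s[1] - s[2];
;     d[0 * line] = (64 * (E[0] + E[1]) + add) >> shift;
;     d[1 * line] = (83 * O[0] + 36 * O[1] + add) >> shift;
;     d[2 * line] = (64 * (E[0] - E[1]) + add) >> shift;
;     d[3 * line] = (36 * O[0] - 83 * O[1] + add) >> shift;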
INIT_XMM sse2
cglobal dct4, 3, 4, 8
    mova        m7, [pd_ %+ DCT4_ROUND]
    add         r2d, r2d
    lea         r3, [tab_dct4]

    mova        m4, [r3 + 0 * 16]
    mova        m5, [r3 + 1 * 16]
    mova        m6, [r3 + 2 * 16]
    movh        m0, [r0 + 0 * r2]
    movh        m1, [r0 + 1 * r2]
    punpcklqdq  m0, m1
    pshufd      m0, m0, 0xD8
    pshufhw     m0, m0, 0xB1

    lea         r0, [r0 + 2 * r2]
    movh        m1, [r0]
    movh        m2, [r0 + r2]
    punpcklqdq  m1, m2
    pshufd      m1, m1, 0xD8
    pshufhw     m1, m1, 0xB1

    punpcklqdq  m2, m0, m1
    punpckhqdq  m0, m1

    paddw       m1, m2, m0
    psubw       m2, m0
    pmaddwd     m0, m1, m4
    paddd       m0, m7
    psrad       m0, DCT4_SHIFT
    pmaddwd     m3, m2, m5
    paddd       m3, m7
    psrad       m3, DCT4_SHIFT
    packssdw    m0, m3
    pshufd      m0, m0, 0xD8
    pshufhw     m0, m0, 0xB1
    pmaddwd     m1, m6
    paddd       m1, m7
    psrad       m1, DCT4_SHIFT
    pmaddwd     m2, [r3 + 3 * 16]
    paddd       m2, m7
    psrad       m2, DCT4_SHIFT
    packssdw    m1, m2
    pshufd      m1, m1, 0xD8
    pshufhw     m1, m1, 0xB1

    punpcklqdq  m2, m0, m1
    punpckhqdq  m0, m1

    mova        m7, [pd_128]

    pmaddwd     m1, m2, m4
    pmaddwd     m3, m0, m4
    paddd       m1, m3
    paddd       m1, m7
    psrad       m1, 8

    pmaddwd     m4, m2, m5
    pmaddwd     m3, m0, m5
    psubd       m4, m3
    paddd       m4, m7
    psrad       m4, 8
    packssdw    m1, m4
    movu        [r1 + 0 * 16], m1

    pmaddwd     m1, m2, m6
    pmaddwd     m3, m0, m6
    paddd       m1, m3
    paddd       m1, m7
    psrad       m1, 8

    pmaddwd     m2, [r3 + 3 * 16]
    pmaddwd     m0, [r3 + 3 * 16]
    psubd       m2, m0
    paddd       m2, m7
    psrad       m2, 8
    packssdw    m1, m2
    movu        [r1 + 1 * 16], m1
    RET

; DCT 4x4
;
; Input parameters:
; - r0:     source
; - r1:     destination
; - r2:     source stride
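;
; AVX2 strategy: the whole 4x4 block is held in one ymm register; dct4_shuf
; swizzles each lane so the vpermq pair splits it into (s0,s1) and (s3,s2)
; halves, letting a single paddw/psubw produce every E/O butterfly term.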
INIT_YMM avx2
cglobal dct4, 3, 4, 8, src, dst, srcStride
    vbroadcasti128  m7, [pd_ %+ DCT4_ROUND]
    add             r2d, r2d
    lea             r3, [avx2_dct4]

    vbroadcasti128  m4, [dct4_shuf]
    mova            m5, [r3]
    mova            m6, [r3 + 32]
    movq            xm0, [r0]
    movhps          xm0, [r0 + r2]
    lea             r0, [r0 + 2 * r2]
    movq            xm1, [r0]
    movhps          xm1, [r0 + r2]

    vinserti128     m0, m0, xm1, 1
    pshufb          m0, m4
    vpermq          m1, m0, 11011101b
    vpermq          m0, m0, 10001000b
    paddw           m2, m0, m1
    psubw           m0, m1

    pmaddwd         m2, m5
    paddd           m2, m7
    psrad           m2, DCT4_SHIFT

    pmaddwd         m0, m6
    paddd           m0, m7
    psrad           m0, DCT4_SHIFT

    packssdw        m2, m0
    pshufb          m2, m4
    vpermq          m1, m2, 11011101b
    vpermq          m2, m2, 10001000b
    vbroadcasti128  m7, [pd_128]

    pmaddwd         m0, m2, m5
    pmaddwd         m3, m1, m5
    paddd           m3, m0
    paddd           m3, m7
    psrad           m3, 8

    pmaddwd         m2, m6
    pmaddwd         m1, m6
    psubd           m2, m1
    paddd           m2, m7
    psrad           m2, 8

    packssdw        m3, m2
    movu            [r1], m3
    RET

;-------------------------------------------------------
;void idct4(const int16_t* src, int16_t* dst, intptr_t dstStride)
;-------------------------------------------------------
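; Inverse 4x4 DCT in two butterfly passes: pass 1 rounds with pd_64 and
; shifts by 7, pass 2 rounds with IDCT_ROUND and shifts by IDCT_SHIFT;
; packssdw saturates each pass's E +/- O sums back to 16 bits.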
INIT_XMM sse2
cglobal idct4, 3, 4, 6
    add         r2d, r2d
    lea         r3, [tab_dct4]

    movu        m0, [r0 + 0 * 16]
    movu        m1, [r0 + 1 * 16]

    punpcklwd   m2, m0, m1
    pmaddwd     m3, m2, [r3 + 0 * 16]       ; m3 = E1
    paddd       m3, [pd_64]

    pmaddwd     m2, [r3 + 2 * 16]           ; m2 = E2
    paddd       m2, [pd_64]

    punpckhwd   m0, m1
    pmaddwd     m1, m0, [r3 + 1 * 16]       ; m1 = O1
    pmaddwd     m0, [r3 + 3 * 16]           ; m0 = O2

    paddd       m4, m3, m1
    psrad       m4, 7                       ; m4 = m128iA
    paddd       m5, m2, m0
    psrad       m5, 7
    packssdw    m4, m5                      ; m4 = m128iA

    psubd       m2, m0
    psrad       m2, 7
    psubd       m3, m1
    psrad       m3, 7
    packssdw    m2, m3                      ; m2 = m128iD

    punpcklwd   m1, m4, m2                  ; m1 = S0
    punpckhwd   m4, m2                      ; m4 = S8

    punpcklwd   m0, m1, m4                  ; m0 = m128iA
    punpckhwd   m1, m4                      ; m1 = m128iD

    punpcklwd   m2, m0, m1
    pmaddwd     m3, m2, [r3 + 0 * 16]
    paddd       m3, [pd_ %+ IDCT_ROUND]     ; m3 = E1

    pmaddwd     m2, [r3 + 2 * 16]
    paddd       m2, [pd_ %+ IDCT_ROUND]     ; m2 = E2

    punpckhwd   m0, m1
    pmaddwd     m1, m0, [r3 + 1 * 16]       ; m1 = O1
    pmaddwd     m0, [r3 + 3 * 16]           ; m0 = O2

    paddd       m4, m3, m1
    psrad       m4, IDCT_SHIFT              ; m4 = m128iA
    paddd       m5, m2, m0
    psrad       m5, IDCT_SHIFT
    packssdw    m4, m5                      ; m4 = m128iA

    psubd       m2, m0
    psrad       m2, IDCT_SHIFT
    psubd       m3, m1
    psrad       m3, IDCT_SHIFT
    packssdw    m2, m3                      ; m2 = m128iD

    punpcklwd   m1, m4, m2
    punpckhwd   m4, m2

    punpcklwd   m0, m1, m4
    movlps      [r1 + 0 * r2], m0
    movhps      [r1 + 1 * r2], m0

    punpckhwd   m1, m4
    movlps      [r1 + 2 * r2], m1
    lea         r1, [r1 + 2 * r2]
    movhps      [r1 + r2], m1
    RET

;------------------------------------------------------
;void dst4(const int16_t* src, int16_t* dst, intptr_t srcStride)
;------------------------------------------------------
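; 4x4 DST-VII (HEVC intra 4x4 luma), same two-pass layout as dct4 above but
; with the tab_dst4 coefficients.  SSE2 lacks a horizontal add, so each
; pmaddwd pair is reduced with a pshufd/paddd/pshufd sequence instead.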
INIT_XMM sse2
%if ARCH_X86_64
cglobal dst4, 3, 4, 8+4
  %define       coef0   m8
  %define       coef1   m9
  %define       coef2   m10
  %define       coef3   m11
%else ; ARCH_X86_64 = 0
cglobal dst4, 3, 4, 8
  %define       coef0   [r3 + 0 * 16]
  %define       coef1   [r3 + 1 * 16]
  %define       coef2   [r3 + 2 * 16]
  %define       coef3   [r3 + 3 * 16]
%endif ; ARCH_X86_64

    mova        m5, [pd_ %+ DST4_ROUND]
    add         r2d, r2d
    lea         r3, [tab_dst4]
%if ARCH_X86_64
    mova        coef0, [r3 + 0 * 16]
    mova        coef1, [r3 + 1 * 16]
    mova        coef2, [r3 + 2 * 16]
    mova        coef3, [r3 + 3 * 16]
%endif
    movh        m0, [r0 + 0 * r2]            ; load
    movhps      m0, [r0 + 1 * r2]
    lea         r0, [r0 + 2 * r2]
    movh        m1, [r0]
    movhps      m1, [r0 + r2]
    pmaddwd     m2, m0, coef0                ; DST1
    pmaddwd     m3, m1, coef0
    pshufd      m6, m2, q2301
    pshufd      m7, m3, q2301
    paddd       m2, m6
    paddd       m3, m7
    pshufd      m2, m2, q3120
    pshufd      m3, m3, q3120
    punpcklqdq  m2, m3
    paddd       m2, m5
    psrad       m2, DST4_SHIFT
    pmaddwd     m3, m0, coef1
    pmaddwd     m4, m1, coef1
    pshufd      m6, m4, q2301
    pshufd      m7, m3, q2301
    paddd       m4, m6
    paddd       m3, m7
    pshufd      m4, m4, q3120
    pshufd      m3, m3, q3120
    punpcklqdq  m3, m4
    paddd       m3, m5
    psrad       m3, DST4_SHIFT
    packssdw    m2, m3                       ; m2 = T70
    pmaddwd     m3, m0, coef2
    pmaddwd     m4, m1, coef2
    pshufd      m6, m4, q2301
    pshufd      m7, m3, q2301
    paddd       m4, m6
    paddd       m3, m7
    pshufd      m4, m4, q3120
    pshufd      m3, m3, q3120
    punpcklqdq  m3, m4
    paddd       m3, m5
    psrad       m3, DST4_SHIFT
    pmaddwd     m0, coef3
    pmaddwd     m1, coef3
    pshufd      m6, m0, q2301
    pshufd      m7, m1, q2301
    paddd       m0, m6
    paddd       m1, m7
    pshufd      m0, m0, q3120
    pshufd      m1, m1, q3120
    punpcklqdq  m0, m1
    paddd       m0, m5
    psrad       m0, DST4_SHIFT
    packssdw    m3, m0                       ; m3 = T71
    mova        m5, [pd_128]

    pmaddwd     m0, m2, coef0                ; DST2
    pmaddwd     m1, m3, coef0
    pshufd      m6, m0, q2301
    pshufd      m7, m1, q2301
    paddd       m0, m6
    paddd       m1, m7
    pshufd      m0, m0, q3120
    pshufd      m1, m1, q3120
    punpcklqdq  m0, m1
    paddd       m0, m5
    psrad       m0, 8

    pmaddwd     m4, m2, coef1
    pmaddwd     m1, m3, coef1
    pshufd      m6, m4, q2301
    pshufd      m7, m1, q2301
    paddd       m4, m6
    paddd       m1, m7
    pshufd      m4, m4, q3120
    pshufd      m1, m1, q3120
    punpcklqdq  m4, m1
    paddd       m4, m5
    psrad       m4, 8
    packssdw    m0, m4
    movu        [r1 + 0 * 16], m0

    pmaddwd     m0, m2, coef2
    pmaddwd     m1, m3, coef2
    pshufd      m6, m0, q2301
    pshufd      m7, m1, q2301
    paddd       m0, m6
    paddd       m1, m7
    pshufd      m0, m0, q3120
    pshufd      m1, m1, q3120
    punpcklqdq  m0, m1
    paddd       m0, m5
    psrad       m0, 8

    pmaddwd     m2, coef3
    pmaddwd     m3, coef3
    pshufd      m6, m2, q2301
    pshufd      m7, m3, q2301
    paddd       m2, m6
    paddd       m3, m7
    pshufd      m2, m2, q3120
    pshufd      m3, m3, q3120
    punpcklqdq  m2, m3
    paddd       m2, m5
    psrad       m2, 8
    packssdw    m0, m2
    movu        [r1 + 1 * 16], m0
    RET

;------------------------------------------------------
;void dst4(const int16_t* src, int16_t* dst, intptr_t srcStride)
;------------------------------------------------------
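; Same algorithm as the SSE2 dst4 above, with SSSE3's phaddd replacing the
; pshufd/paddd horizontal-add emulation; only coef2/coef3 stay in memory on
; 32-bit builds.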
INIT_XMM ssse3
%if ARCH_X86_64
cglobal dst4, 3, 4, 8+2
  %define       coef2   m8
  %define       coef3   m9
%else ; ARCH_X86_64 = 0
cglobal dst4, 3, 4, 8
  %define       coef2   [r3 + 2 * 16]
  %define       coef3   [r3 + 3 * 16]
%endif ; ARCH_X86_64
%define         coef0   m6
%define         coef1   m7

    mova        m5, [pd_ %+ DST4_ROUND]
    add         r2d, r2d
    lea         r3, [tab_dst4]
    mova        coef0, [r3 + 0 * 16]
    mova        coef1, [r3 + 1 * 16]
%if ARCH_X86_64
    mova        coef2, [r3 + 2 * 16]
    mova        coef3, [r3 + 3 * 16]
%endif
    movh        m0, [r0 + 0 * r2]            ; load
    movh        m1, [r0 + 1 * r2]
    punpcklqdq  m0, m1
    lea         r0, [r0 + 2 * r2]
    movh        m1, [r0]
    movh        m2, [r0 + r2]
    punpcklqdq  m1, m2
    pmaddwd     m2, m0, coef0                ; DST1
    pmaddwd     m3, m1, coef0
    phaddd      m2, m3
    paddd       m2, m5
    psrad       m2, DST4_SHIFT
    pmaddwd     m3, m0, coef1
    pmaddwd     m4, m1, coef1
    phaddd      m3, m4
    paddd       m3, m5
    psrad       m3, DST4_SHIFT
    packssdw    m2, m3                       ; m2 = T70
    pmaddwd     m3, m0, coef2
    pmaddwd     m4, m1, coef2
    phaddd      m3, m4
    paddd       m3, m5
    psrad       m3, DST4_SHIFT
    pmaddwd     m0, coef3
    pmaddwd     m1, coef3
    phaddd      m0, m1
    paddd       m0, m5
    psrad       m0, DST4_SHIFT
    packssdw    m3, m0                       ; m3 = T71
    mova        m5, [pd_128]

    pmaddwd     m0, m2, coef0                ; DST2
    pmaddwd     m1, m3, coef0
    phaddd      m0, m1
    paddd       m0, m5
    psrad       m0, 8

    pmaddwd     m4, m2, coef1
    pmaddwd     m1, m3, coef1
    phaddd      m4, m1
    paddd       m4, m5
    psrad       m4, 8
    packssdw    m0, m4
    movu        [r1 + 0 * 16], m0

    pmaddwd     m0, m2, coef2
    pmaddwd     m1, m3, coef2
    phaddd      m0, m1
    paddd       m0, m5
    psrad       m0, 8

    pmaddwd     m2, coef3
    pmaddwd     m3, coef3
    phaddd      m2, m3
    paddd       m2, m5
    psrad       m2, 8
    packssdw    m0, m2
    movu        [r1 + 1 * 16], m0
    RET

;------------------------------------------------------------------
;void dst4(const int16_t* src, int16_t* dst, intptr_t srcStride)
;------------------------------------------------------------------
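; AVX2 variant: all 16 input samples live in one ymm register, each pass is
; four pmaddwd against the widened pw_dst4_tab rows, and the trans8_shuf
; permute restores coefficient order after packssdw.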
1021INIT_YMM avx2
1022cglobal dst4, 3, 4, 6
1023    vbroadcasti128 m5, [pd_ %+ DST4_ROUND]
1024    mova        m4, [trans8_shuf]
1025    add         r2d, r2d
1026    lea         r3, [pw_dst4_tab]
1027
1028    movq        xm0, [r0 + 0 * r2]
1029    movhps      xm0, [r0 + 1 * r2]
1030    lea         r0, [r0 + 2 * r2]
1031    movq        xm1, [r0]
1032    movhps      xm1, [r0 + r2]
1033
1034    vinserti128 m0, m0, xm1, 1          ; m0 = src[0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
1035
1036    pmaddwd     m2, m0, [r3 + 0 * 32]
1037    pmaddwd     m1, m0, [r3 + 1 * 32]
1038    phaddd      m2, m1
1039    paddd       m2, m5
1040    psrad       m2, DST4_SHIFT
1041    pmaddwd     m3, m0, [r3 + 2 * 32]
1042    pmaddwd     m1, m0, [r3 + 3 * 32]
1043    phaddd      m3, m1
1044    paddd       m3, m5
1045    psrad       m3, DST4_SHIFT
1046    packssdw    m2, m3
1047    vpermd      m2, m4, m2
1048
1049    vpbroadcastd m5, [pd_128]
1050    pmaddwd     m0, m2, [r3 + 0 * 32]
1051    pmaddwd     m1, m2, [r3 + 1 * 32]
1052    phaddd      m0, m1
1053    paddd       m0, m5
1054    psrad       m0, 8
1055    pmaddwd     m3, m2, [r3 + 2 * 32]
1056    pmaddwd     m2, m2, [r3 + 3 * 32]
1057    phaddd      m3, m2
1058    paddd       m3, m5
1059    psrad       m3, 8
1060    packssdw    m0, m3
1061    vpermd      m0, m4, m0
1062    movu        [r1], m0
1063    RET
1064
1065;-------------------------------------------------------
1066;void idst4(const int16_t* src, int16_t* dst, intptr_t dstStride)
1067;-------------------------------------------------------
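; Same two-pass structure as the forward DST above, run against tab_idst4.
; A sketch of the rounding only (hypothetical dot4() helper, simplified
; indexing over the interleaved word pairs the code builds with punpcklwd):
;
;   tmp[j][i] = (int16_t)((dot4(tab_idst4[j], src_pair[i]) + 64) >> 7);
;   dst[j][i] = (int16_t)((dot4(tab_idst4[j], tmp_pair[i]) + IDCT_ROUND) >> IDCT_SHIFT);
;
; IDCT_ROUND/IDCT_SHIFT are the BIT_DEPTH-dependent values used by the
; inverse transforms in this file.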
1068INIT_XMM sse2
1069cglobal idst4, 3, 4, 7
1070    mova        m6, [pd_ %+ IDCT_ROUND]
1071    add         r2d, r2d
1072    lea         r3, [tab_idst4]
1073    mova        m5, [pd_64]
1074
1075    movu        m0, [r0 + 0 * 16]
1076    movu        m1, [r0 + 1 * 16]
1077
1078    punpcklwd   m2, m0, m1                  ; m2 = m128iAC
1079    punpckhwd   m0, m1                      ; m0 = m128iBD
1080
1081    pmaddwd     m1, m2, [r3 + 0 * 16]
1082    pmaddwd     m3, m0, [r3 + 1 * 16]
1083    paddd       m1, m3
1084    paddd       m1, m5
1085    psrad       m1, 7                       ; m1 = S0
1086
1087    pmaddwd     m3, m2, [r3 + 2 * 16]
1088    pmaddwd     m4, m0, [r3 + 3 * 16]
1089    paddd       m3, m4
1090    paddd       m3, m5
1091    psrad       m3, 7                       ; m3 = S8
1092    packssdw    m1, m3                      ; m1 = m128iA
1093
1094    pmaddwd     m3, m2, [r3 + 4 * 16]
1095    pmaddwd     m4, m0, [r3 + 5 * 16]
1096    paddd       m3, m4
1097    paddd       m3, m5
1098    psrad       m3, 7                       ; m3 = S0
1099
1100    pmaddwd     m2, [r3 + 6 * 16]
1101    pmaddwd     m0, [r3 + 7 * 16]
1102    paddd       m2, m0
1103    paddd       m2, m5
1104    psrad       m2, 7                       ; m2 = S8
1105    packssdw    m3, m2                      ; m3 = m128iD
1106
1107    punpcklwd   m0, m1, m3
1108    punpckhwd   m1, m3
1109
1110    punpcklwd   m2, m0, m1
1111    punpckhwd   m0, m1
1112    punpcklwd   m1, m2, m0
1113    punpckhwd   m2, m0
1114    pmaddwd     m0, m1, [r3 + 0 * 16]
1115    pmaddwd     m3, m2, [r3 + 1 * 16]
1116    paddd       m0, m3
1117    paddd       m0, m6
1118    psrad       m0, IDCT_SHIFT              ; m0 = S0
1119    pmaddwd     m3, m1, [r3 + 2 * 16]
1120    pmaddwd     m4, m2, [r3 + 3 * 16]
1121    paddd       m3, m4
1122    paddd       m3, m6
1123    psrad       m3, IDCT_SHIFT              ; m3 = S8
1124    packssdw    m0, m3                      ; m0 = m128iA
1125    pmaddwd     m3, m1, [r3 + 4 * 16]
1126    pmaddwd     m4, m2, [r3 + 5 * 16]
1127    paddd       m3, m4
1128    paddd       m3, m6
1129    psrad       m3, IDCT_SHIFT              ; m3 = S0
1130    pmaddwd     m1, [r3 + 6 * 16]
1131    pmaddwd     m2, [r3 + 7 * 16]
1132    paddd       m1, m2
1133    paddd       m1, m6
1134    psrad       m1, IDCT_SHIFT              ; m1 = S8
1135    packssdw    m3, m1                      ; m3 = m128iD
1136    punpcklwd   m1, m0, m3
1137    punpckhwd   m0, m3
1138
1139    punpcklwd   m2, m1, m0
1140    movlps      [r1 + 0 * r2], m2
1141    movhps      [r1 + 1 * r2], m2
1142
1143    punpckhwd   m1, m0
1144    movlps      [r1 + 2 * r2], m1
1145    lea         r1, [r1 + 2 * r2]
1146    movhps      [r1 + r2], m1
1147    RET
1148
1149;-----------------------------------------------------------------
1150;void idst4(const int16_t* src, int16_t* dst, intptr_t dstStride)
1151;-----------------------------------------------------------------
1152INIT_YMM avx2
1153cglobal idst4, 3, 4, 6
1154    vbroadcasti128 m4, [pd_ %+ IDCT_ROUND]
1155    add         r2d, r2d
1156    lea         r3, [pw_idst4_tab]
1157
1158    movu        xm0, [r0 + 0 * 16]
1159    movu        xm1, [r0 + 1 * 16]
1160
1161    punpcklwd   m2, m0, m1
1162    punpckhwd   m0, m1
1163
1164    vinserti128 m2, m2, xm2, 1
1165    vinserti128 m0, m0, xm0, 1
1166
1167    vpbroadcastd m5, [pd_64]
1168    pmaddwd     m1, m2, [r3 + 0 * 32]
1169    pmaddwd     m3, m0, [r3 + 1 * 32]
1170    paddd       m1, m3
1171    paddd       m1, m5
1172    psrad       m1, 7
1173    pmaddwd     m3, m2, [r3 + 2 * 32]
1174    pmaddwd     m0, [r3 + 3 * 32]
1175    paddd       m3, m0
1176    paddd       m3, m5
1177    psrad       m3, 7
1178
1179    packssdw    m0, m1, m3
1180    pshufb      m0, [pb_idst4_shuf]
1181    vpermq      m1, m0, 11101110b
1182
1183    punpcklwd   m2, m0, m1
1184    punpckhwd   m0, m1
1185    punpcklwd   m1, m2, m0
1186    punpckhwd   m2, m0
1187
1188    vpermq      m1, m1, 01000100b
1189    vpermq      m2, m2, 01000100b
1190
1191    pmaddwd     m0, m1, [r3 + 0 * 32]
1192    pmaddwd     m3, m2, [r3 + 1 * 32]
1193    paddd       m0, m3
1194    paddd       m0, m4
1195    psrad       m0, IDCT_SHIFT
1196    pmaddwd     m3, m1, [r3 + 2 * 32]
1197    pmaddwd     m2, m2, [r3 + 3 * 32]
1198    paddd       m3, m2
1199    paddd       m3, m4
1200    psrad       m3, IDCT_SHIFT
1201
1202    packssdw    m0, m3
1203    pshufb      m1, m0, [pb_idst4_shuf]
1204    vpermq      m0, m1, 11101110b
1205
1206    punpcklwd   m2, m1, m0
1207    movq        [r1 + 0 * r2], xm2
1208    movhps      [r1 + 1 * r2], xm2
1209
1210    punpckhwd   m1, m0
1211    movq        [r1 + 2 * r2], xm1
1212    lea         r1, [r1 + 2 * r2]
1213    movhps      [r1 + r2], xm1
1214    RET
1215
1216;-------------------------------------------------------
1217; void dct8(const int16_t* src, int16_t* dst, intptr_t srcStride)
1218;-------------------------------------------------------
1219INIT_XMM sse2
1220cglobal dct8, 3,6,8,0-16*mmsize
1221    ;------------------------
1222    ; Stack Mapping(dword)
1223    ;------------------------
1224    ; Row0[0-3] Row1[0-3]
1225    ; ...
1226    ; Row6[0-3] Row7[0-3]
    ; Row0[4-7] Row1[4-7]
1228    ; ...
1229    ; Row6[4-7] Row7[4-7]
1230    ;------------------------
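    ; The per-row butterfly below is the standard even/odd split (the same
    ; names appear in the register comments):
    ;   s[i] = x[i] + x[7 - i],  d[i] = x[i] - x[7 - i]    (i = 0..3)
    ;   EE[k] = s[k] + s[3 - k], EO[k] = s[k] - s[3 - k]   (k = 0..1)
    ; Rows 0/4 are produced from EE and rows 2/6 from EO (tab_dct4); the
    ; odd rows 1/3/5/7 come from d[] against tab_dct8_1.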
1231
1232    add         r2, r2
1233    lea         r3, [r2 * 3]
1234    mov         r5, rsp
1235%assign x 0
1236%rep 2
1237    movu        m0, [r0]
1238    movu        m1, [r0 + r2]
1239    movu        m2, [r0 + r2 * 2]
1240    movu        m3, [r0 + r3]
1241
1242    punpcklwd   m4, m0, m1
1243    punpckhwd   m0, m1
1244    punpcklwd   m5, m2, m3
1245    punpckhwd   m2, m3
1246    punpckldq   m1, m4, m5          ; m1 = [1 0]
1247    punpckhdq   m4, m5              ; m4 = [3 2]
1248    punpckldq   m3, m0, m2
1249    punpckhdq   m0, m2
1250    pshufd      m2, m3, 0x4E        ; m2 = [4 5]
1251    pshufd      m0, m0, 0x4E        ; m0 = [6 7]
1252
1253    paddw       m3, m1, m0
1254    psubw       m1, m0              ; m1 = [d1 d0]
1255    paddw       m0, m4, m2
1256    psubw       m4, m2              ; m4 = [d3 d2]
1257    punpcklqdq  m2, m3, m0          ; m2 = [s2 s0]
1258    punpckhqdq  m3, m0
1259    pshufd      m3, m3, 0x4E        ; m3 = [s1 s3]
1260
1261    punpcklwd   m0, m1, m4          ; m0 = [d2/d0]
1262    punpckhwd   m1, m4              ; m1 = [d3/d1]
1263    punpckldq   m4, m0, m1          ; m4 = [d3 d1 d2 d0]
1264    punpckhdq   m0, m1              ; m0 = [d3 d1 d2 d0]
1265
1266    ; odd
1267    lea         r4, [tab_dct8_1]
1268    pmaddwd     m1, m4, [r4 + 0*16]
1269    pmaddwd     m5, m0, [r4 + 0*16]
1270    pshufd      m1, m1, 0xD8
1271    pshufd      m5, m5, 0xD8
1272    mova        m7, m1
1273    punpckhqdq  m7, m5
1274    punpcklqdq  m1, m5
1275    paddd       m1, m7
1276    paddd       m1, [pd_ %+ DCT8_ROUND1]
1277    psrad       m1, DCT8_SHIFT1
1278  %if x == 1
1279    pshufd      m1, m1, 0x1B
1280  %endif
1281    mova        [r5 + 1*2*mmsize], m1 ; Row 1
1282
1283    pmaddwd     m1, m4, [r4 + 1*16]
1284    pmaddwd     m5, m0, [r4 + 1*16]
1285    pshufd      m1, m1, 0xD8
1286    pshufd      m5, m5, 0xD8
1287    mova        m7, m1
1288    punpckhqdq  m7, m5
1289    punpcklqdq  m1, m5
1290    paddd       m1, m7
1291    paddd       m1, [pd_ %+ DCT8_ROUND1]
1292    psrad       m1, DCT8_SHIFT1
1293  %if x == 1
1294    pshufd      m1, m1, 0x1B
1295  %endif
1296    mova        [r5 + 3*2*mmsize], m1 ; Row 3
1297
1298    pmaddwd     m1, m4, [r4 + 2*16]
1299    pmaddwd     m5, m0, [r4 + 2*16]
1300    pshufd      m1, m1, 0xD8
1301    pshufd      m5, m5, 0xD8
1302    mova        m7, m1
1303    punpckhqdq  m7, m5
1304    punpcklqdq  m1, m5
1305    paddd       m1, m7
1306    paddd       m1, [pd_ %+ DCT8_ROUND1]
1307    psrad       m1, DCT8_SHIFT1
1308  %if x == 1
1309    pshufd      m1, m1, 0x1B
1310  %endif
1311    mova        [r5 + 5*2*mmsize], m1 ; Row 5
1312
1313    pmaddwd     m4, [r4 + 3*16]
1314    pmaddwd     m0, [r4 + 3*16]
1315    pshufd      m4, m4, 0xD8
1316    pshufd      m0, m0, 0xD8
1317    mova        m7, m4
1318    punpckhqdq  m7, m0
1319    punpcklqdq  m4, m0
1320    paddd       m4, m7
1321    paddd       m4, [pd_ %+ DCT8_ROUND1]
1322    psrad       m4, DCT8_SHIFT1
1323  %if x == 1
1324    pshufd      m4, m4, 0x1B
1325  %endif
    mova        [r5 + 7*2*mmsize], m4 ; Row 7
1327
1328    ; even
1329    lea         r4, [tab_dct4]
1330    paddw       m0, m2, m3          ; m0 = [EE1 EE0]
1331    pshufd      m0, m0, 0xD8
1332    pshuflw     m0, m0, 0xD8
1333    pshufhw     m0, m0, 0xD8
1334    psubw       m2, m3              ; m2 = [EO1 EO0]
1335    pmullw      m2, [pw_ppppmmmm]
1336    pshufd      m2, m2, 0xD8
1337    pshuflw     m2, m2, 0xD8
1338    pshufhw     m2, m2, 0xD8
1339    pmaddwd     m3, m0, [r4 + 0*16]
1340    paddd       m3, [pd_ %+ DCT8_ROUND1]
1341    psrad       m3, DCT8_SHIFT1
1342  %if x == 1
1343    pshufd      m3, m3, 0x1B
1344  %endif
1345    mova        [r5 + 0*2*mmsize], m3 ; Row 0
1346    pmaddwd     m0, [r4 + 2*16]
1347    paddd       m0, [pd_ %+ DCT8_ROUND1]
1348    psrad       m0, DCT8_SHIFT1
1349  %if x == 1
1350    pshufd      m0, m0, 0x1B
1351  %endif
1352    mova        [r5 + 4*2*mmsize], m0 ; Row 4
1353    pmaddwd     m3, m2, [r4 + 1*16]
1354    paddd       m3, [pd_ %+ DCT8_ROUND1]
1355    psrad       m3, DCT8_SHIFT1
1356  %if x == 1
1357    pshufd      m3, m3, 0x1B
1358  %endif
1359    mova        [r5 + 2*2*mmsize], m3 ; Row 2
1360    pmaddwd     m2, [r4 + 3*16]
1361    paddd       m2, [pd_ %+ DCT8_ROUND1]
1362    psrad       m2, DCT8_SHIFT1
1363  %if x == 1
1364    pshufd      m2, m2, 0x1B
1365  %endif
1366    mova        [r5 + 6*2*mmsize], m2 ; Row 6
1367
1368  %if x != 1
1369    lea         r0, [r0 + r2 * 4]
1370    add         r5, mmsize
1371  %endif
1372%assign x x+1
1373%endrep
1374
1375    mov         r0, rsp                 ; r0 = pointer to Low Part
1376    lea         r4, [tab_dct8_2]
1377
1378%assign x 0
1379%rep 4
1380    mova        m0, [r0 + 0*2*mmsize]     ; [3 2 1 0]
1381    mova        m1, [r0 + 1*2*mmsize]
1382    paddd       m2, m0, [r0 + (0*2+1)*mmsize]
1383    pshufd      m2, m2, 0x9C            ; m2 = [s2 s1 s3 s0]
1384    paddd       m3, m1, [r0 + (1*2+1)*mmsize]
1385    pshufd      m3, m3, 0x9C            ; m3 = ^^
1386    psubd       m0, [r0 + (0*2+1)*mmsize]     ; m0 = [d3 d2 d1 d0]
1387    psubd       m1, [r0 + (1*2+1)*mmsize]     ; m1 = ^^
1388
1389    ; even
1390    pshufd      m4, m2, 0xD8
1391    pshufd      m3, m3, 0xD8
1392    mova        m7, m4
1393    punpckhqdq  m7, m3
1394    punpcklqdq  m4, m3
1395    mova        m2, m4
1396    paddd       m4, m7                  ; m4 = [EE1 EE0 EE1 EE0]
1397    psubd       m2, m7                  ; m2 = [EO1 EO0 EO1 EO0]
1398
1399    pslld       m4, 6                   ; m4 = [64*EE1 64*EE0]
1400    mova        m5, m2
1401    pmuludq     m5, [r4 + 0*16]
1402    pshufd      m7, m2, 0xF5
1403    movu        m6, [r4 + 0*16 + 4]
1404    pmuludq     m7, m6
1405    pshufd      m5, m5, 0x88
1406    pshufd      m7, m7, 0x88
1407    punpckldq   m5, m7                  ; m5 = [36*EO1 83*EO0]
1408    pshufd      m7, m2, 0xF5
1409    pmuludq     m2, [r4 + 1*16]
1410    movu        m6, [r4 + 1*16 + 4]
1411    pmuludq     m7, m6
1412    pshufd      m2, m2, 0x88
1413    pshufd      m7, m7, 0x88
1414    punpckldq   m2, m7                  ; m2 = [83*EO1 36*EO0]
1415
1416    pshufd      m3, m4, 0xD8
1417    pshufd      m5, m5, 0xD8
1418    mova        m7, m3
1419    punpckhqdq  m7, m5
1420    punpcklqdq  m3, m5
1421    paddd       m3, m7                  ; m3 = [Row2 Row0]
1422    paddd       m3, [pd_ %+ DCT8_ROUND2]
1423    psrad       m3, DCT8_SHIFT2
1424    pshufd      m4, m4, 0xD8
1425    pshufd      m2, m2, 0xD8
1426    mova        m7, m4
1427    punpckhqdq  m7, m2
1428    punpcklqdq  m4, m2
1429    psubd       m4, m7                  ; m4 = [Row6 Row4]
1430    paddd       m4, [pd_ %+ DCT8_ROUND2]
1431    psrad       m4, DCT8_SHIFT2
1432
1433    packssdw    m3, m3
1434    movd        [r1 + 0*mmsize], m3
1435    pshufd      m3, m3, 1
1436    movd        [r1 + 2*mmsize], m3
1437
1438    packssdw    m4, m4
1439    movd        [r1 + 4*mmsize], m4
1440    pshufd      m4, m4, 1
1441    movd        [r1 + 6*mmsize], m4
1442
1443    ; odd
1444    mova        m2, m0
1445    pmuludq     m2, [r4 + 2*16]
1446    pshufd      m7, m0, 0xF5
1447    movu        m6, [r4 + 2*16 + 4]
1448    pmuludq     m7, m6
1449    pshufd      m2, m2, 0x88
1450    pshufd      m7, m7, 0x88
1451    punpckldq   m2, m7
1452    mova        m3, m1
1453    pmuludq     m3, [r4 + 2*16]
1454    pshufd      m7, m1, 0xF5
1455    pmuludq     m7, m6
1456    pshufd      m3, m3, 0x88
1457    pshufd      m7, m7, 0x88
1458    punpckldq   m3, m7
1459    mova        m4, m0
1460    pmuludq     m4, [r4 + 3*16]
1461    pshufd      m7, m0, 0xF5
1462    movu        m6, [r4 + 3*16 + 4]
1463    pmuludq     m7, m6
1464    pshufd      m4, m4, 0x88
1465    pshufd      m7, m7, 0x88
1466    punpckldq   m4, m7
1467    mova        m5, m1
1468    pmuludq     m5, [r4 + 3*16]
1469    pshufd      m7, m1, 0xF5
1470    pmuludq     m7, m6
1471    pshufd      m5, m5, 0x88
1472    pshufd      m7, m7, 0x88
1473    punpckldq   m5, m7
1474    pshufd      m2, m2, 0xD8
1475    pshufd      m3, m3, 0xD8
1476    mova        m7, m2
1477    punpckhqdq  m7, m3
1478    punpcklqdq  m2, m3
1479    paddd       m2, m7
1480    pshufd      m4, m4, 0xD8
1481    pshufd      m5, m5, 0xD8
1482    mova        m7, m4
1483    punpckhqdq  m7, m5
1484    punpcklqdq  m4, m5
1485    paddd       m4, m7
1486    pshufd      m2, m2, 0xD8
1487    pshufd      m4, m4, 0xD8
1488    mova        m7, m2
1489    punpckhqdq  m7, m4
1490    punpcklqdq  m2, m4
1491    paddd       m2, m7                  ; m2 = [Row3 Row1]
1492    paddd       m2, [pd_ %+ DCT8_ROUND2]
1493    psrad       m2, DCT8_SHIFT2
1494
1495    packssdw    m2, m2
1496    movd        [r1 + 1*mmsize], m2
1497    pshufd      m2, m2, 1
1498    movd        [r1 + 3*mmsize], m2
1499
1500    mova        m2, m0
1501    pmuludq     m2, [r4 + 4*16]
1502    pshufd      m7, m0, 0xF5
1503    movu        m6, [r4 + 4*16 + 4]
1504    pmuludq     m7, m6
1505    pshufd      m2, m2, 0x88
1506    pshufd      m7, m7, 0x88
1507    punpckldq   m2, m7
1508    mova        m3, m1
1509    pmuludq     m3, [r4 + 4*16]
1510    pshufd      m7, m1, 0xF5
1511    pmuludq     m7, m6
1512    pshufd      m3, m3, 0x88
1513    pshufd      m7, m7, 0x88
1514    punpckldq   m3, m7
1515    mova        m4, m0
1516    pmuludq     m4, [r4 + 5*16]
1517    pshufd      m7, m0, 0xF5
1518    movu        m6, [r4 + 5*16 + 4]
1519    pmuludq     m7, m6
1520    pshufd      m4, m4, 0x88
1521    pshufd      m7, m7, 0x88
1522    punpckldq   m4, m7
1523    mova        m5, m1
1524    pmuludq     m5, [r4 + 5*16]
1525    pshufd      m7, m1, 0xF5
1526    pmuludq     m7, m6
1527    pshufd      m5, m5, 0x88
1528    pshufd      m7, m7, 0x88
1529    punpckldq   m5, m7
1530    pshufd      m2, m2, 0xD8
1531    pshufd      m3, m3, 0xD8
1532    mova        m7, m2
1533    punpckhqdq  m7, m3
1534    punpcklqdq  m2, m3
1535    paddd       m2, m7
1536    pshufd      m4, m4, 0xD8
1537    pshufd      m5, m5, 0xD8
1538    mova        m7, m4
1539    punpckhqdq  m7, m5
1540    punpcklqdq  m4, m5
1541    paddd       m4, m7
1542    pshufd      m2, m2, 0xD8
1543    pshufd      m4, m4, 0xD8
1544    mova        m7, m2
1545    punpckhqdq  m7, m4
1546    punpcklqdq  m2, m4
1547    paddd       m2, m7                  ; m2 = [Row7 Row5]
1548    paddd       m2, [pd_ %+ DCT8_ROUND2]
1549    psrad       m2, DCT8_SHIFT2
1550
1551    packssdw    m2, m2
1552    movd        [r1 + 5*mmsize], m2
1553    pshufd      m2, m2, 1
1554    movd        [r1 + 7*mmsize], m2
1555%if x < 3
1556    add         r1, mmsize/4
1557    add         r0, 2*2*mmsize
1558%endif
1559%assign x x+1
1560%endrep
1561
1562    RET
1563
1564;-------------------------------------------------------
1565; void dct8(const int16_t* src, int16_t* dst, intptr_t srcStride)
1566;-------------------------------------------------------
1567INIT_XMM sse4
1568cglobal dct8, 3,6,7,0-16*mmsize
1569    ;------------------------
1570    ; Stack Mapping(dword)
1571    ;------------------------
1572    ; Row0[0-3] Row1[0-3]
1573    ; ...
1574    ; Row6[0-3] Row7[0-3]
    ; Row0[4-7] Row1[4-7]
1576    ; ...
1577    ; Row6[4-7] Row7[4-7]
1578    ;------------------------
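    ; Same algorithm as the SSE2 dct8 above: SSE4.1's pmulld and SSSE3's
    ; phaddd/phsubd replace the pmuludq/pshufd/punpck sequences that SSE2
    ; needs to emulate packed 32-bit multiplies and horizontal add/sub,
    ; which shortens pass 2 considerably.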
1579    mova        m6, [pd_ %+ DCT8_ROUND1]
1580
1581    add         r2, r2
1582    lea         r3, [r2 * 3]
1583    mov         r5, rsp
1584%assign x 0
1585%rep 2
1586    movu        m0, [r0]
1587    movu        m1, [r0 + r2]
1588    movu        m2, [r0 + r2 * 2]
1589    movu        m3, [r0 + r3]
1590
1591    punpcklwd   m4, m0, m1
1592    punpckhwd   m0, m1
1593    punpcklwd   m5, m2, m3
1594    punpckhwd   m2, m3
1595    punpckldq   m1, m4, m5          ; m1 = [1 0]
1596    punpckhdq   m4, m5              ; m4 = [3 2]
1597    punpckldq   m3, m0, m2
1598    punpckhdq   m0, m2
1599    pshufd      m2, m3, 0x4E        ; m2 = [4 5]
1600    pshufd      m0, m0, 0x4E        ; m0 = [6 7]
1601
1602    paddw       m3, m1, m0
1603    psubw       m1, m0              ; m1 = [d1 d0]
1604    paddw       m0, m4, m2
1605    psubw       m4, m2              ; m4 = [d3 d2]
1606    punpcklqdq  m2, m3, m0          ; m2 = [s2 s0]
1607    punpckhqdq  m3, m0
1608    pshufd      m3, m3, 0x4E        ; m3 = [s1 s3]
1609
1610    punpcklwd   m0, m1, m4          ; m0 = [d2/d0]
1611    punpckhwd   m1, m4              ; m1 = [d3/d1]
1612    punpckldq   m4, m0, m1          ; m4 = [d3 d1 d2 d0]
1613    punpckhdq   m0, m1              ; m0 = [d3 d1 d2 d0]
1614
1615    ; odd
1616    lea         r4, [tab_dct8_1]
1617    pmaddwd     m1, m4, [r4 + 0*16]
1618    pmaddwd     m5, m0, [r4 + 0*16]
1619    phaddd      m1, m5
1620    paddd       m1, m6
1621    psrad       m1, DCT8_SHIFT1
1622  %if x == 1
1623    pshufd      m1, m1, 0x1B
1624  %endif
1625    mova        [r5 + 1*2*mmsize], m1 ; Row 1
1626
1627    pmaddwd     m1, m4, [r4 + 1*16]
1628    pmaddwd     m5, m0, [r4 + 1*16]
1629    phaddd      m1, m5
1630    paddd       m1, m6
1631    psrad       m1, DCT8_SHIFT1
1632  %if x == 1
1633    pshufd      m1, m1, 0x1B
1634  %endif
1635    mova        [r5 + 3*2*mmsize], m1 ; Row 3
1636
1637    pmaddwd     m1, m4, [r4 + 2*16]
1638    pmaddwd     m5, m0, [r4 + 2*16]
1639    phaddd      m1, m5
1640    paddd       m1, m6
1641    psrad       m1, DCT8_SHIFT1
1642  %if x == 1
1643    pshufd      m1, m1, 0x1B
1644  %endif
1645    mova        [r5 + 5*2*mmsize], m1 ; Row 5
1646
1647    pmaddwd     m4, [r4 + 3*16]
1648    pmaddwd     m0, [r4 + 3*16]
1649    phaddd      m4, m0
1650    paddd       m4, m6
1651    psrad       m4, DCT8_SHIFT1
1652  %if x == 1
1653    pshufd      m4, m4, 0x1B
1654  %endif
    mova        [r5 + 7*2*mmsize], m4 ; Row 7
1656
1657    ; even
1658    lea         r4, [tab_dct4]
1659    paddw       m0, m2, m3          ; m0 = [EE1 EE0]
1660    pshufb      m0, [pb_unpackhlw1]
1661    psubw       m2, m3              ; m2 = [EO1 EO0]
1662    psignw      m2, [pw_ppppmmmm]
1663    pshufb      m2, [pb_unpackhlw1]
1664    pmaddwd     m3, m0, [r4 + 0*16]
1665    paddd       m3, m6
1666    psrad       m3, DCT8_SHIFT1
1667  %if x == 1
1668    pshufd      m3, m3, 0x1B
1669  %endif
1670    mova        [r5 + 0*2*mmsize], m3 ; Row 0
1671    pmaddwd     m0, [r4 + 2*16]
1672    paddd       m0, m6
1673    psrad       m0, DCT8_SHIFT1
1674  %if x == 1
1675    pshufd      m0, m0, 0x1B
1676  %endif
1677    mova        [r5 + 4*2*mmsize], m0 ; Row 4
1678    pmaddwd     m3, m2, [r4 + 1*16]
1679    paddd       m3, m6
1680    psrad       m3, DCT8_SHIFT1
1681  %if x == 1
1682    pshufd      m3, m3, 0x1B
1683  %endif
1684    mova        [r5 + 2*2*mmsize], m3 ; Row 2
1685    pmaddwd     m2, [r4 + 3*16]
1686    paddd       m2, m6
1687    psrad       m2, DCT8_SHIFT1
1688  %if x == 1
1689    pshufd      m2, m2, 0x1B
1690  %endif
1691    mova        [r5 + 6*2*mmsize], m2 ; Row 6
1692
1693  %if x != 1
1694    lea         r0, [r0 + r2 * 4]
1695    add         r5, mmsize
1696  %endif
1697%assign x x+1
1698%endrep
1699
1700    mov         r2, 2
1701    mov         r0, rsp                 ; r0 = pointer to Low Part
1702    lea         r4, [tab_dct8_2]
1703    mova        m6, [pd_256]
1704
1705.pass2:
1706%rep 2
1707    mova        m0, [r0 + 0*2*mmsize]     ; [3 2 1 0]
1708    mova        m1, [r0 + 1*2*mmsize]
1709    paddd       m2, m0, [r0 + (0*2+1)*mmsize]
1710    pshufd      m2, m2, 0x9C            ; m2 = [s2 s1 s3 s0]
1711    paddd       m3, m1, [r0 + (1*2+1)*mmsize]
1712    pshufd      m3, m3, 0x9C            ; m3 = ^^
1713    psubd       m0, [r0 + (0*2+1)*mmsize]     ; m0 = [d3 d2 d1 d0]
1714    psubd       m1, [r0 + (1*2+1)*mmsize]     ; m1 = ^^
1715
1716    ; even
1717    phaddd      m4, m2, m3              ; m4 = [EE1 EE0 EE1 EE0]
1718    phsubd      m2, m3                  ; m2 = [EO1 EO0 EO1 EO0]
1719
1720    pslld       m4, 6                   ; m4 = [64*EE1 64*EE0]
1721    pmulld      m5, m2, [r4 + 0*16]     ; m5 = [36*EO1 83*EO0]
1722    pmulld      m2, [r4 + 1*16]         ; m2 = [83*EO1 36*EO0]
1723
1724    phaddd      m3, m4, m5              ; m3 = [Row2 Row0]
1725    paddd       m3, m6
1726    psrad       m3, 9
1727    phsubd      m4, m2                  ; m4 = [Row6 Row4]
1728    paddd       m4, m6
1729    psrad       m4, 9
1730
1731    packssdw    m3, m3
1732    movd        [r1 + 0*mmsize], m3
1733    pshufd      m3, m3, 1
1734    movd        [r1 + 2*mmsize], m3
1735
1736    packssdw    m4, m4
1737    movd        [r1 + 4*mmsize], m4
1738    pshufd      m4, m4, 1
1739    movd        [r1 + 6*mmsize], m4
1740
1741    ; odd
1742    pmulld      m2, m0, [r4 + 2*16]
1743    pmulld      m3, m1, [r4 + 2*16]
1744    pmulld      m4, m0, [r4 + 3*16]
1745    pmulld      m5, m1, [r4 + 3*16]
1746    phaddd      m2, m3
1747    phaddd      m4, m5
1748    phaddd      m2, m4                  ; m2 = [Row3 Row1]
1749    paddd       m2, m6
1750    psrad       m2, 9
1751
1752    packssdw    m2, m2
1753    movd        [r1 + 1*mmsize], m2
1754    pshufd      m2, m2, 1
1755    movd        [r1 + 3*mmsize], m2
1756
1757    pmulld      m2, m0, [r4 + 4*16]
1758    pmulld      m3, m1, [r4 + 4*16]
1759    pmulld      m4, m0, [r4 + 5*16]
1760    pmulld      m5, m1, [r4 + 5*16]
1761    phaddd      m2, m3
1762    phaddd      m4, m5
1763    phaddd      m2, m4                  ; m2 = [Row7 Row5]
1764    paddd       m2, m6
1765    psrad       m2, 9
1766
1767    packssdw    m2, m2
1768    movd        [r1 + 5*mmsize], m2
1769    pshufd      m2, m2, 1
1770    movd        [r1 + 7*mmsize], m2
1771
1772    add         r1, mmsize/4
1773    add         r0, 2*2*mmsize
1774%endrep
1775
1776    dec         r2
1777    jnz        .pass2
1778    RET
1779
1780;-------------------------------------------------------
1781; void idct8(const int16_t* src, int16_t* dst, intptr_t dstStride)
1782;-------------------------------------------------------
1783%if ARCH_X86_64
1784INIT_XMM sse2
1785cglobal idct8, 3, 6, 16, 0-5*mmsize
1786    mova        m9, [r0 + 1 * mmsize]
1787    mova        m1, [r0 + 3 * mmsize]
1788    mova        m7, m9
1789    punpcklwd   m7, m1
1790    punpckhwd   m9, m1
1791    mova        m14, [tab_idct8_3]
1792    mova        m3, m14
1793    pmaddwd     m14, m7
1794    pmaddwd     m3, m9
1795    mova        m0, [r0 + 5 * mmsize]
1796    mova        m10, [r0 + 7 * mmsize]
1797    mova        m2, m0
1798    punpcklwd   m2, m10
1799    punpckhwd   m0, m10
1800    mova        m15, [tab_idct8_3 + 1 * mmsize]
1801    mova        m11, [tab_idct8_3 + 1 * mmsize]
1802    pmaddwd     m15, m2
1803    mova        m4, [tab_idct8_3 + 2 * mmsize]
1804    pmaddwd     m11, m0
1805    mova        m1, [tab_idct8_3 + 2 * mmsize]
1806    paddd       m15, m14
1807    mova        m5, [tab_idct8_3 + 4 * mmsize]
1808    mova        m12, [tab_idct8_3 + 4 * mmsize]
1809    paddd       m11, m3
1810    mova        [rsp + 0 * mmsize], m11
1811    mova        [rsp + 1 * mmsize], m15
1812    pmaddwd     m4, m7
1813    pmaddwd     m1, m9
1814    mova        m14, [tab_idct8_3 + 3 * mmsize]
1815    mova        m3, [tab_idct8_3 + 3 * mmsize]
1816    pmaddwd     m14, m2
1817    pmaddwd     m3, m0
1818    paddd       m14, m4
1819    paddd       m3, m1
1820    mova        [rsp + 2 * mmsize], m3
1821    pmaddwd     m5, m9
1822    pmaddwd     m9, [tab_idct8_3 + 6 * mmsize]
1823    mova        m6, [tab_idct8_3 + 5 * mmsize]
1824    pmaddwd     m12, m7
1825    pmaddwd     m7, [tab_idct8_3 + 6 * mmsize]
1826    mova        m4, [tab_idct8_3 + 5 * mmsize]
1827    pmaddwd     m6, m2
1828    paddd       m6, m12
1829    pmaddwd     m2, [tab_idct8_3 + 7 * mmsize]
1830    paddd       m7, m2
1831    mova        [rsp + 3 * mmsize], m6
1832    pmaddwd     m4, m0
1833    pmaddwd     m0, [tab_idct8_3 + 7 * mmsize]
1834    paddd       m9, m0
1835    paddd       m5, m4
1836    mova        m6, [r0 + 0 * mmsize]
1837    mova        m0, [r0 + 4 * mmsize]
1838    mova        m4, m6
1839    punpcklwd   m4, m0
1840    punpckhwd   m6, m0
1841    mova        m12, [r0 + 2 * mmsize]
1842    mova        m0, [r0 + 6 * mmsize]
1843    mova        m13, m12
1844    mova        m8, [tab_dct4]
1845    punpcklwd   m13, m0
1846    mova        m10, [tab_dct4]
1847    punpckhwd   m12, m0
1848    pmaddwd     m8, m4
1849    mova        m3, m8
1850    pmaddwd     m4, [tab_dct4 + 2 * mmsize]
1851    pmaddwd     m10, m6
1852    mova        m2, [tab_dct4 + 1 * mmsize]
1853    mova        m1, m10
1854    pmaddwd     m6, [tab_dct4 + 2 * mmsize]
1855    mova        m0, [tab_dct4 + 1 * mmsize]
1856    pmaddwd     m2, m13
1857    paddd       m3, m2
1858    psubd       m8, m2
1859    mova        m2, m6
1860    pmaddwd     m13, [tab_dct4 + 3 * mmsize]
1861    pmaddwd     m0, m12
1862    paddd       m1, m0
1863    psubd       m10, m0
1864    mova        m0, m4
1865    pmaddwd     m12, [tab_dct4 + 3 * mmsize]
1866    paddd       m3, [pd_64]
1867    paddd       m1, [pd_64]
1868    paddd       m8, [pd_64]
1869    paddd       m10, [pd_64]
1870    paddd       m0, m13
1871    paddd       m2, m12
1872    paddd       m0, [pd_64]
1873    paddd       m2, [pd_64]
1874    psubd       m4, m13
1875    psubd       m6, m12
1876    paddd       m4, [pd_64]
1877    paddd       m6, [pd_64]
1878    mova        m12, m8
1879    psubd       m8, m7
1880    psrad       m8, 7
1881    paddd       m15, m3
1882    psubd       m3, [rsp + 1 * mmsize]
1883    psrad       m15, 7
1884    paddd       m12, m7
1885    psrad       m12, 7
1886    paddd       m11, m1
1887    mova        m13, m14
1888    psrad       m11, 7
1889    packssdw    m15, m11
1890    psubd       m1, [rsp + 0 * mmsize]
1891    psrad       m1, 7
1892    mova        m11, [rsp + 2 * mmsize]
1893    paddd       m14, m0
1894    psrad       m14, 7
1895    psubd       m0, m13
1896    psrad       m0, 7
1897    paddd       m11, m2
1898    mova        m13, [rsp + 3 * mmsize]
1899    psrad       m11, 7
1900    packssdw    m14, m11
1901    mova        m11, m6
1902    psubd       m6, m5
1903    paddd       m13, m4
1904    psrad       m13, 7
1905    psrad       m6, 7
1906    paddd       m11, m5
1907    psrad       m11, 7
1908    packssdw    m13, m11
1909    mova        m11, m10
1910    psubd       m4, [rsp + 3 * mmsize]
1911    psubd       m10, m9
1912    psrad       m4, 7
1913    psrad       m10, 7
1914    packssdw    m4, m6
1915    packssdw    m8, m10
1916    paddd       m11, m9
1917    psrad       m11, 7
1918    packssdw    m12, m11
1919    psubd       m2, [rsp + 2 * mmsize]
1920    mova        m5, m15
1921    psrad       m2, 7
1922    packssdw    m0, m2
1923    mova        m2, m14
1924    psrad       m3, 7
1925    packssdw    m3, m1
1926    mova        m6, m13
1927    punpcklwd   m5, m8
1928    punpcklwd   m2, m4
1929    mova        m1, m12
1930    punpcklwd   m6, m0
1931    punpcklwd   m1, m3
1932    mova        m9, m5
1933    punpckhwd   m13, m0
1934    mova        m0, m2
1935    punpcklwd   m9, m6
1936    punpckhwd   m5, m6
1937    punpcklwd   m0, m1
1938    punpckhwd   m2, m1
1939    punpckhwd   m15, m8
1940    mova        m1, m5
1941    punpckhwd   m14, m4
1942    punpckhwd   m12, m3
1943    mova        m6, m9
1944    punpckhwd   m9, m0
1945    punpcklwd   m1, m2
1946    mova        m4, [tab_idct8_3 + 0 * mmsize]
1947    punpckhwd   m5, m2
1948    punpcklwd   m6, m0
1949    mova        m2, m15
1950    mova        m0, m14
1951    mova        m7, m9
1952    punpcklwd   m2, m13
1953    punpcklwd   m0, m12
1954    punpcklwd   m7, m5
1955    punpckhwd   m14, m12
1956    mova        m10, m2
1957    punpckhwd   m15, m13
1958    punpckhwd   m9, m5
1959    pmaddwd     m4, m7
1960    mova        m13, m1
1961    punpckhwd   m2, m0
1962    punpcklwd   m10, m0
1963    mova        m0, m15
1964    punpckhwd   m15, m14
1965    mova        m12, m1
1966    mova        m3, [tab_idct8_3 + 0 * mmsize]
1967    punpcklwd   m0, m14
1968    pmaddwd     m3, m9
1969    mova        m11, m2
1970    punpckhwd   m2, m15
1971    punpcklwd   m11, m15
1972    mova        m8, [tab_idct8_3 + 1 * mmsize]
1973    punpcklwd   m13, m0
1974    punpckhwd   m12, m0
1975    pmaddwd     m8, m11
1976    paddd       m8, m4
1977    mova        [rsp + 4 * mmsize], m8
1978    mova        m4, [tab_idct8_3 + 2 * mmsize]
1979    pmaddwd     m4, m7
1980    mova        m15, [tab_idct8_3 + 2 * mmsize]
1981    mova        m5, [tab_idct8_3 + 1 * mmsize]
1982    pmaddwd     m15, m9
1983    pmaddwd     m5, m2
1984    paddd       m5, m3
1985    mova        [rsp + 3 * mmsize], m5
1986    mova        m14, [tab_idct8_3 + 3 * mmsize]
1987    mova        m5, [tab_idct8_3 + 3 * mmsize]
1988    pmaddwd     m14, m11
1989    paddd       m14, m4
1990    mova        [rsp + 2 * mmsize], m14
1991    pmaddwd     m5, m2
1992    paddd       m5, m15
1993    mova        [rsp + 1 * mmsize], m5
1994    mova        m15, [tab_idct8_3 + 4 * mmsize]
1995    mova        m5, [tab_idct8_3 + 4 * mmsize]
1996    pmaddwd     m15, m7
1997    pmaddwd     m7, [tab_idct8_3 + 6 * mmsize]
1998    pmaddwd     m5, m9
1999    pmaddwd     m9, [tab_idct8_3 + 6 * mmsize]
2000    mova        m4, [tab_idct8_3 + 5 * mmsize]
2001    pmaddwd     m4, m2
2002    paddd       m5, m4
2003    mova        m4, m6
2004    mova        m8, [tab_idct8_3 + 5 * mmsize]
2005    punpckhwd   m6, m10
2006    pmaddwd     m2, [tab_idct8_3 + 7 * mmsize]
2007    punpcklwd   m4, m10
2008    paddd       m9, m2
2009    pmaddwd     m8, m11
2010    mova        m10, [tab_dct4]
2011    paddd       m8, m15
2012    pmaddwd     m11, [tab_idct8_3 + 7 * mmsize]
2013    paddd       m7, m11
2014    mova        [rsp + 0 * mmsize], m8
2015    pmaddwd     m10, m6
2016    pmaddwd     m6, [tab_dct4 + 2 * mmsize]
2017    mova        m1, m10
2018    mova        m8, [tab_dct4]
2019    mova        m3, [tab_dct4 + 1 * mmsize]
2020    pmaddwd     m8, m4
2021    pmaddwd     m4, [tab_dct4 + 2 * mmsize]
2022    mova        m0, m8
2023    mova        m2, [tab_dct4 + 1 * mmsize]
2024    pmaddwd     m3, m13
2025    psubd       m8, m3
2026    paddd       m0, m3
2027    mova        m3, m6
2028    pmaddwd     m13, [tab_dct4 + 3 * mmsize]
2029    pmaddwd     m2, m12
2030    paddd       m1, m2
2031    psubd       m10, m2
2032    mova        m2, m4
2033    pmaddwd     m12, [tab_dct4 + 3 * mmsize]
2034    mova        m15, [pd_ %+ IDCT_ROUND]
2035    paddd       m0, m15
2036    paddd       m1, m15
2037    paddd       m8, m15
2038    paddd       m10, m15
2039    paddd       m2, m13
2040    paddd       m3, m12
2041    paddd       m2, m15
2042    paddd       m3, m15
2043    psubd       m4, m13
2044    psubd       m6, m12
2045    paddd       m4, m15
2046    paddd       m6, m15
2047    mova        m15, [rsp + 4 * mmsize]
2048    mova        m12, m8
2049    psubd       m8, m7
2050    psrad       m8, IDCT_SHIFT
2051    mova        m11, [rsp + 3 * mmsize]
2052    paddd       m15, m0
2053    psrad       m15, IDCT_SHIFT
2054    psubd       m0, [rsp + 4 * mmsize]
2055    psrad       m0, IDCT_SHIFT
2056    paddd       m12, m7
2057    paddd       m11, m1
2058    mova        m14, [rsp + 2 * mmsize]
2059    psrad       m11, IDCT_SHIFT
2060    packssdw    m15, m11
2061    psubd       m1, [rsp + 3 * mmsize]
2062    psrad       m1, IDCT_SHIFT
2063    mova        m11, [rsp + 1 * mmsize]
2064    paddd       m14, m2
2065    psrad       m14, IDCT_SHIFT
2066    packssdw    m0, m1
2067    psrad       m12, IDCT_SHIFT
2068    psubd       m2, [rsp + 2 * mmsize]
2069    paddd       m11, m3
2070    mova        m13, [rsp + 0 * mmsize]
2071    psrad       m11, IDCT_SHIFT
2072    packssdw    m14, m11
2073    mova        m11, m6
2074    psubd       m6, m5
2075    paddd       m13, m4
2076    psrad       m13, IDCT_SHIFT
2077    mova        m1, m15
2078    paddd       m11, m5
2079    psrad       m11, IDCT_SHIFT
2080    packssdw    m13, m11
2081    mova        m11, m10
2082    psubd       m10, m9
2083    psrad       m10, IDCT_SHIFT
2084    packssdw    m8, m10
2085    psrad       m6, IDCT_SHIFT
2086    psubd       m4, [rsp + 0 * mmsize]
2087    paddd       m11, m9
2088    psrad       m11, IDCT_SHIFT
2089    packssdw    m12, m11
2090    punpcklwd   m1, m14
2091    mova        m5, m13
2092    psrad       m4, IDCT_SHIFT
2093    packssdw    m4, m6
2094    psubd       m3, [rsp + 1 * mmsize]
2095    psrad       m2, IDCT_SHIFT
2096    mova        m6, m8
2097    psrad       m3, IDCT_SHIFT
2098    punpcklwd   m5, m12
2099    packssdw    m2, m3
2100    punpcklwd   m6, m4
2101    punpckhwd   m8, m4
2102    mova        m4, m1
2103    mova        m3, m2
2104    punpckhdq   m1, m5
2105    punpckldq   m4, m5
2106    punpcklwd   m3, m0
2107    punpckhwd   m2, m0
2108    mova        m0, m6
2109    lea         r2, [r2 + r2]
2110    lea         r4, [r2 + r2]
2111    lea         r3, [r4 + r2]
2112    lea         r4, [r4 + r3]
2113    lea         r0, [r4 + r2 * 2]
2114    movq        [r1], m4
2115    punpckhwd   m15, m14
2116    movhps      [r1 + r2], m4
2117    punpckhdq   m0, m3
2118    movq        [r1 + r2 * 2], m1
2119    punpckhwd   m13, m12
2120    movhps      [r1 + r3], m1
2121    mova        m1, m6
2122    punpckldq   m1, m3
2123    movq        [r1 + 8], m1
2124    movhps      [r1 + r2 + 8], m1
2125    movq        [r1 + r2 * 2 + 8], m0
2126    movhps      [r1 + r3 + 8], m0
2127    mova        m0, m15
2128    punpckhdq   m15, m13
2129    punpckldq   m0, m13
2130    movq        [r1 + r2 * 4], m0
2131    movhps      [r1 + r4], m0
2132    mova        m0, m8
2133    punpckhdq   m8, m2
2134    movq        [r1 + r3 * 2], m15
2135    punpckldq   m0, m2
2136    movhps      [r1 + r0], m15
2137    movq        [r1 + r2 * 4 + 8], m0
2138    movhps      [r1 + r4 + 8], m0
2139    movq        [r1 + r3 * 2 + 8], m8
2140    movhps      [r1 + r0 + 8], m8
2141    RET
2142%endif
2143
2144;-------------------------------------------------------
2145; void idct8(const int16_t* src, int16_t* dst, intptr_t dstStride)
2146;-------------------------------------------------------
2147INIT_XMM ssse3
cglobal partial_butterfly_inverse_internal_pass1
2149    movh        m0, [r0]
2150    movhps      m0, [r0 + 2 * 16]
2151    movh        m1, [r0 + 4 * 16]
2152    movhps      m1, [r0 + 6 * 16]
2153
2154    punpckhwd   m2, m0, m1                  ; [2 6]
2155    punpcklwd   m0, m1                      ; [0 4]
2156    pmaddwd     m1, m0, [r6]                ; EE[0]
2157    pmaddwd     m0, [r6 + 32]               ; EE[1]
2158    pmaddwd     m3, m2, [r6 + 16]           ; EO[0]
2159    pmaddwd     m2, [r6 + 48]               ; EO[1]
2160
2161    paddd       m4, m1, m3                  ; E[0]
2162    psubd       m1, m3                      ; E[3]
2163    paddd       m3, m0, m2                  ; E[1]
2164    psubd       m0, m2                      ; E[2]
2165
2166    ;E[K] = E[k] + add
2167    mova        m5, [pd_64]
2168    paddd       m0, m5
2169    paddd       m1, m5
2170    paddd       m3, m5
2171    paddd       m4, m5
2172
2173    movh        m2, [r0 + 16]
2174    movhps      m2, [r0 + 5 * 16]
2175    movh        m5, [r0 + 3 * 16]
2176    movhps      m5, [r0 + 7 * 16]
2177    punpcklwd   m6, m2, m5                  ;[1 3]
2178    punpckhwd   m2, m5                      ;[5 7]
2179
2180    pmaddwd     m5, m6, [r4]
2181    pmaddwd     m7, m2, [r4 + 16]
2182    paddd       m5, m7                      ; O[0]
2183
2184    paddd       m7, m4, m5
2185    psrad       m7, 7
2186
2187    psubd       m4, m5
2188    psrad       m4, 7
2189
2190    packssdw    m7, m4
2191    movh        [r5 + 0 * 16], m7
2192    movhps      [r5 + 7 * 16], m7
2193
2194    pmaddwd     m5, m6, [r4 + 32]
2195    pmaddwd     m4, m2, [r4 + 48]
2196    paddd       m5, m4                      ; O[1]
2197
2198    paddd       m4, m3, m5
2199    psrad       m4, 7
2200
2201    psubd       m3, m5
2202    psrad       m3, 7
2203
2204    packssdw    m4, m3
2205    movh        [r5 + 1 * 16], m4
2206    movhps      [r5 + 6 * 16], m4
2207
2208    pmaddwd     m5, m6, [r4 + 64]
2209    pmaddwd     m4, m2, [r4 + 80]
2210    paddd       m5, m4                      ; O[2]
2211
2212    paddd       m4, m0, m5
2213    psrad       m4, 7
2214
2215    psubd       m0, m5
2216    psrad       m0, 7
2217
2218    packssdw    m4, m0
2219    movh        [r5 + 2 * 16], m4
2220    movhps      [r5 + 5 * 16], m4
2221
2222    pmaddwd     m5, m6, [r4 + 96]
2223    pmaddwd     m4, m2, [r4 + 112]
2224    paddd       m5, m4                      ; O[3]
2225
2226    paddd       m4, m1, m5
2227    psrad       m4, 7
2228
2229    psubd       m1, m5
2230    psrad       m1, 7
2231
2232    packssdw    m4, m1
2233    movh        [r5 + 3 * 16], m4
2234    movhps      [r5 + 4 * 16], m4
2235
2236    ret
2237
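; Processes one row of the second idct8 pass. The even half is gathered by
; pb_idct8even and multiplied against tab_idct8_1; the odd half uses the
; shuffle at r6 (pb_idct8odd) and the coefficients at r4 (tab_idct8_2),
; both set up by idct8 below. Only the high qword of the phsubd result is
; kept by the punpckhqdq, so m5's prior contents are don't-care.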
2238%macro PARTIAL_BUTTERFLY_PROCESS_ROW 1
2239    pshufb      m4, %1, [pb_idct8even]
2240    pmaddwd     m4, [tab_idct8_1]
2241    phsubd      m5, m4
2242    pshufd      m4, m4, 0x4E
2243    phaddd      m4, m4
    punpckhqdq  m4, m5                      ; m4 = e[0 1 2 3] (dwords)
2245    paddd       m4, m6
2246
2247    pshufb      %1, %1, [r6]
2248    pmaddwd     m5, %1, [r4]
2249    pmaddwd     %1, [r4 + 16]
    phaddd      m5, %1                      ; m5 = O[0 1 2 3] (dwords)
2251
2252    paddd       %1, m4, m5
2253    psrad       %1, IDCT_SHIFT
2254
2255    psubd       m4, m5
2256    psrad       m4, IDCT_SHIFT
2257    pshufd      m4, m4, 0x1B
2258
2259    packssdw    %1, m4
2260%endmacro
2261
2262INIT_XMM ssse3
cglobal partial_butterfly_inverse_internal_pass2
2264    mova        m0, [r5]
2265    PARTIAL_BUTTERFLY_PROCESS_ROW m0
2266    movu        [r1], m0
2267
2268    mova        m2, [r5 + 16]
2269    PARTIAL_BUTTERFLY_PROCESS_ROW m2
2270    movu        [r1 + r2], m2
2271
2272    mova        m1, [r5 + 32]
2273    PARTIAL_BUTTERFLY_PROCESS_ROW m1
2274    movu        [r1 + 2 * r2], m1
2275
2276    mova        m3, [r5 + 48]
2277    PARTIAL_BUTTERFLY_PROCESS_ROW m3
2278    movu        [r1 + r3], m3
2279    ret
2280
2281INIT_XMM ssse3
2282cglobal idct8, 3,7,8 ;,0-16*mmsize
    ; align the stack to 64 bytes
2284    mov         r5, rsp
2285    sub         rsp, 16*mmsize + gprsize
2286    and         rsp, ~(64-1)
2287    mov         [rsp + 16*mmsize], r5
2288    mov         r5, rsp
2289
2290    lea         r4, [tab_idct8_3]
2291    lea         r6, [tab_dct4]
2292
    call        partial_butterfly_inverse_internal_pass1
2294
2295    add         r0, 8
2296    add         r5, 8
2297
    call        partial_butterfly_inverse_internal_pass1
2299
2300    mova        m6, [pd_ %+ IDCT_ROUND]
2301    add         r2, r2
2302    lea         r3, [r2 * 3]
2303    lea         r4, [tab_idct8_2]
2304    lea         r6, [pb_idct8odd]
2305    sub         r5, 8
2306
    call        partial_butterfly_inverse_internal_pass2
2308
2309    lea         r1, [r1 + 4 * r2]
2310    add         r5, 64
2311
    call        partial_butterfly_inverse_internal_pass2
2313
    ; restore the original stack pointer
2315    mov         rsp, [rsp + 16*mmsize]
2316    RET
2317
2318
2319;-----------------------------------------------------------------------------
2320; void denoise_dct(int16_t* dct, uint32_t* sum, uint16_t* offset, int size)
2321;-----------------------------------------------------------------------------
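; Roughly equivalent scalar logic (a sketch mirroring the pabsw/psubusw/
; pcmpgtw/psignw sequence below, ignoring the INT16_MIN corner case):
;
;   for (int i = 0; i < size; i++)
;   {
;       int16_t  level = dct[i];
;       uint16_t abs   = (uint16_t)(level < 0 ? -level : level);
;       sum[i] += abs;                                   /* pmovsxwd + paddd */
;       abs = (abs > offset[i]) ? abs - offset[i] : 0;   /* psubusw          */
;       dct[i] = (level < 0) ? -abs : abs;               /* psignw           */
;   }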
2322INIT_XMM sse4
2323cglobal denoise_dct, 4, 4, 6
2324    pxor     m5,  m5
2325    shr      r3d, 3
2326.loop:
2327    movu     m0, [r0]
2328    pabsw    m1, m0
2329    movu     m2, [r1]
2330    pmovsxwd m3, m1
2331    paddd    m2, m3
2332    movu     [r1], m2
2333    movu     m2, [r1 + 16]
2334    psrldq   m3, m1, 8
2335    pmovsxwd m4, m3
2336    paddd    m2, m4
2337    movu     [r1 + 16], m2
2338
2339    movu     m3, [r2]
2340    psubusw  m1, m3
2341    pcmpgtw  m4, m1, m5
2342    pand     m1, m4
2343    psignw   m1, m0
2344    movu     [r0], m1
2345    add      r0, 16
2346    add      r1, 32
2347    add      r2, 16
2348    dec      r3d
2349    jnz .loop
2350    RET
2351
2352INIT_YMM avx2
2353cglobal denoise_dct, 4, 4, 6
2354    pxor     m5,  m5
2355    shr      r3d, 4
2356.loop:
2357    movu     m0, [r0]
2358    pabsw    m1, m0
2359    movu     m2, [r1]
2360    pmovsxwd m4, xm1
2361    paddd    m2, m4
2362    movu     [r1], m2
2363    vextracti128 xm4, m1, 1
2364    movu     m2, [r1 + 32]
2365    pmovsxwd m3, xm4
2366    paddd    m2, m3
2367    movu     [r1 + 32], m2
2368    movu     m3, [r2]
2369    psubusw  m1, m3
2370    pcmpgtw  m4, m1, m5
2371    pand     m1, m4
2372    psignw   m1, m0
2373    movu     [r0], m1
2374    add      r0, 32
2375    add      r1, 64
2376    add      r2, 32
2377    dec      r3d
2378    jnz .loop
2379    RET
2380%if ARCH_X86_64 == 1
2381INIT_ZMM avx512
2382cglobal denoise_dct, 4, 4, 22
2383    pxor     m16,  m16
2384    sub      r3d,   16
2385    je       .coeff16
2386    add      r3d,   16
2387    shr      r3d,    5
2388    jmp      .loop
2389
2390.coeff16:
2391    movu          ym19,  [r0]
2392    pabsw         ym17, ym19
2393    movu            m2, [r1]
2394    pmovsxwd       m18, ym17
2395    paddd           m2,  m18
2396    movu          [r1],   m2
2397    movu           ym3, [r2]
2398    psubusw       ym17, ym3
2399    pcmpgtw       ym18, ym17, ym16
2400    pand          ym17, ym18
2401    psignw        ym17, ym19
2402    movu          [r0], ym17
2403    RET
2404
2405.loop:
2406    movu          m21, [r0]
2407    pabsw         m17, m21
2408    movu           m2, [r1]
2409    pmovsxwd       m4, ym17
2410    paddd          m2,  m4
2411    movu         [r1],  m2
2412    vextracti64x4 ym4, m17, 1
2413
2414    movu           m2, [r1 + mmsize]
2415    pmovsxwd       m3, ym4
2416    paddd          m2, m3
2417    movu           [r1 + mmsize], m2
2418    movu           m3, [r2]
2419    psubusw       m17, m3
2420
2421    vextracti64x4 ym20,  m17,    1
2422    pcmpgtw       ym18, ym17, ym16
2423    pcmpgtw       ym19, ym20, ym16
2424    vinserti64x4   m18,  m18, ym19, 1
2425
2426    pand           m17,  m18
2427    vextracti64x4 ym19,  m17, 1
2428    vextracti64x4 ym20,  m21, 1
2429    psignw        ym17, ym21
2430    psignw        ym19, ym20
2431    vinserti64x4   m17,  m17, ym19, 1
2432
2433    movu          [r0],  m17
2434    add             r0,  mmsize
2435    add             r1,  mmsize * 2
2436    add             r2,  mmsize
2437    dec             r3d
2438    jnz             .loop
2439    RET
2440%endif ; ARCH_X86_64 == 1
2441
2442%if ARCH_X86_64 == 1
2443%macro DCT8_PASS_1 4
2444    vpbroadcastq    m0,                 [r6 + %1]
2445    pmaddwd         m2,                 m%3, m0
2446    pmaddwd         m0,                 m%4
2447    phaddd          m2,                 m0
2448    paddd           m2,                 m5
2449    psrad           m2,                 DCT8_SHIFT1
2450    packssdw        m2,                 m2
2451    vpermq          m2,                 m2, 0x08
2452    mova            [r5 + %2],          xm2
2453%endmacro
2454
2455%macro DCT8_PASS_2 2
2456    vbroadcasti128  m4,                 [r6 + %1]
2457    pmaddwd         m6,                 m0, m4
2458    pmaddwd         m7,                 m1, m4
2459    pmaddwd         m8,                 m2, m4
2460    pmaddwd         m9,                 m3, m4
2461    phaddd          m6,                 m7
2462    phaddd          m8,                 m9
2463    phaddd          m6,                 m8
2464    paddd           m6,                 m5
2465    psrad           m6,                 DCT8_SHIFT2
2466
2467    vbroadcasti128  m4,                 [r6 + %2]
2468    pmaddwd         m10,                m0, m4
2469    pmaddwd         m7,                 m1, m4
2470    pmaddwd         m8,                 m2, m4
2471    pmaddwd         m9,                 m3, m4
2472    phaddd          m10,                m7
2473    phaddd          m8,                 m9
2474    phaddd          m10,                m8
2475    paddd           m10,                m5
2476    psrad           m10,                DCT8_SHIFT2
2477
2478    packssdw        m6,                 m10
2479    vpermq          m10,                m6, 0xD8
2480
2481%endmacro
2482
2483INIT_YMM avx2
2484cglobal dct8, 3, 7, 11, 0-8*16
    vbroadcasti128  m5,                [pd_ %+ DCT8_ROUND1]
2486%define             DCT_SHIFT2         9
2487
2488    add             r2d,               r2d
2489    lea             r3,                [r2 * 3]
2490    lea             r4,                [r0 + r2 * 4]
2491    mov             r5,                rsp
2492    lea             r6,                [tab_dct8]
2493    mova            m6,                [dct8_shuf]
2494
2495    ;pass1
2496    mova            xm0,               [r0]
2497    vinserti128     m0,                m0, [r4], 1
2498    mova            xm1,               [r0 + r2]
2499    vinserti128     m1,                m1, [r4 + r2], 1
2500    mova            xm2,               [r0 + r2 * 2]
2501    vinserti128     m2,                m2, [r4 + r2 * 2], 1
2502    mova            xm3,               [r0 + r3]
2503    vinserti128     m3,                m3,  [r4 + r3], 1
2504
2505    punpcklqdq      m4,                m0, m1
2506    punpckhqdq      m0,                m1
2507    punpcklqdq      m1,                m2, m3
2508    punpckhqdq      m2,                m3
2509
2510    pshufb          m0,                m6
2511    pshufb          m2,                m6
2512
2513    paddw           m3,                m4, m0
2514    paddw           m7,                m1, m2
2515
2516    psubw           m4,                m0
2517    psubw           m1,                m2
2518
2519    DCT8_PASS_1     0 * 16,             0 * 16, 3, 7
2520    DCT8_PASS_1     1 * 16,             2 * 16, 4, 1
2521    DCT8_PASS_1     2 * 16,             4 * 16, 3, 7
2522    DCT8_PASS_1     3 * 16,             6 * 16, 4, 1
2523    DCT8_PASS_1     4 * 16,             1 * 16, 3, 7
2524    DCT8_PASS_1     5 * 16,             3 * 16, 4, 1
2525    DCT8_PASS_1     6 * 16,             5 * 16, 3, 7
2526    DCT8_PASS_1     7 * 16,             7 * 16, 4, 1
2527
2528    ;pass2
2529    vbroadcasti128  m5,                [pd_ %+ DCT8_ROUND2]
2530
2531    mova            m0,                [r5]
2532    mova            m1,                [r5 + 32]
2533    mova            m2,                [r5 + 64]
2534    mova            m3,                [r5 + 96]
2535
2536    DCT8_PASS_2     0 * 16, 1 * 16
2537    movu            [r1],              m10
2538    DCT8_PASS_2     2 * 16, 3 * 16
2539    movu            [r1 + 32],         m10
2540    DCT8_PASS_2     4 * 16, 5 * 16
2541    movu            [r1 + 64],         m10
2542    DCT8_PASS_2     6 * 16, 7 * 16
2543    movu            [r1 + 96],         m10
2544    RET
2545
2546
2547%macro DCT8_AVX512_PASS_1 4
2548    vpmaddwd        m%2,               m3, m%1
2549    vpsrlq          m8,                m%2, 32
2550    vpaddd          m%2,               m8
2551    vpaddd          m%2,               m5
2552    vpsrad          m%2,               DCT8_SHIFT1
2553
2554    vpmaddwd        m%4,               m2, m%3
2555    vpsrlq          m8,                m%4, 32
2556    vpaddd          m%4,               m8
2557    vpaddd          m%4,               m5
2558    vpsrad          m%4,               DCT8_SHIFT1
2559
2560    vpackssdw       m%2,               m%4
2561    vpermw          m%2,               m1, m%2
2562%endmacro
2563
2564%macro DCT8_AVX512_PASS_2 4
2565    vpmaddwd         m0,               m9,  m%1
2566    vpmaddwd         m1,               m10, m%1
2567    vpsrldq          m2,               m0,  8
2568    vpsrldq          m3,               m1,  8
2569    vpaddd           m0,               m2
2570    vpaddd           m1,               m3
2571    vpsrlq           m2,               m0,  32
2572    vpsrlq           m3,               m1,  32
2573    vpaddd           m0,               m2
2574    vpaddd           m1,               m3
2575    vpaddd           m0,               m5
2576    vpsrad           m0,               DCT8_SHIFT2
2577    vpaddd           m1,               m5
2578    vpsrad           m1,               DCT8_SHIFT2
2579    vpackssdw        m0,               m1
2580    vpermw           m0,               m19, m0
2581
2582    vpmaddwd         m1,               m9,  m%2
2583    vpmaddwd         m2,               m10, m%2
2584    vpsrldq          m3,               m1,  8
2585    vpsrldq          m4,               m2,  8
2586    vpaddd           m1,               m3
2587    vpaddd           m2,               m4
2588    vpsrlq           m3,               m1,  32
2589    vpsrlq           m4,               m2,  32
2590    vpaddd           m1,               m3
2591    vpaddd           m2,               m4
2592    vpaddd           m1,               m5
2593    vpsrad           m1,               DCT8_SHIFT2
2594    vpaddd           m2,               m5
2595    vpsrad           m2,               DCT8_SHIFT2
2596    vpackssdw        m1,               m2
2597    vpermw           m1,               m19, m1
2598    vinserti128      ym0,              ym0, xm1, 1
2599
2600    vpmaddwd         m1,               m9,  m%3
2601    vpmaddwd         m2,               m10, m%3
2602    vpsrldq          m3,               m1,  8
2603    vpsrldq          m4,               m2,  8
2604    vpaddd           m1,               m3
2605    vpaddd           m2,               m4
2606    vpsrlq           m3,               m1,  32
2607    vpsrlq           m4,               m2,  32
2608    vpaddd           m1,               m3
2609    vpaddd           m2,               m4
2610    vpaddd           m1,               m5
2611    vpsrad           m1,               DCT8_SHIFT2
2612    vpaddd           m2,               m5
2613    vpsrad           m2,               DCT8_SHIFT2
2614    vpackssdw        m1,               m2
2615    vpermw           m1,               m19, m1
2616
2617    vpmaddwd         m2,               m9,  m%4
2618    vpmaddwd         m3,               m10, m%4
2619    vpsrldq          m4,               m2,  8
2620    vpsrldq          m6,               m3,  8
2621    vpaddd           m2,               m4
2622    vpaddd           m3,               m6
2623    vpsrlq           m4,               m2,  32
2624    vpsrlq           m6,               m3,  32
2625    vpaddd           m2,               m4
2626    vpaddd           m3,               m6
2627    vpaddd           m2,               m5
2628    vpsrad           m2,               DCT8_SHIFT2
2629    vpaddd           m3,               m5
2630    vpsrad           m3,               DCT8_SHIFT2
2631    vpackssdw        m2,               m3
2632    vpermw           m2,               m19, m2
2633
2634    vinserti128      ym1,              ym1, xm2, 1
2635    vinserti64x4     m0,               m0, ym1, 1
2636%endmacro
2637
2638INIT_ZMM avx512
2639cglobal dct8, 3, 7, 24
2640
2641    vbroadcasti32x4  m5,               [pd_ %+ DCT8_ROUND1]
2642    vbroadcasti32x8  m4,               [dct8_shuf]
2643    vbroadcasti32x4  m19,              [dct8_shuf9_AVX512]
2644
2645    add              r2d,              r2d
2646    lea              r3,               [r2 * 3]
2647    lea              r4,               [r0 + r2 * 4]
2648    lea              r5,               [tab_dct8]
2649    lea              r6,               [tab_dct8_avx512]
2650
2651    ;pass1
2652    mova            xm0,               [r0]
2653    vinserti128     ym0,               ym0, [r4], 1
2654    mova            xm1,               [r0 + r2]
2655    vinserti128     ym1,               ym1, [r4 + r2], 1
2656    mova            xm2,               [r0 + r2 * 2]
2657    vinserti128     ym2,               ym2, [r4 + r2 * 2], 1
2658    mova            xm3,               [r0 + r3]
2659    vinserti128     ym3,               ym3,  [r4 + r3], 1
2660
2661    vinserti64x4    m0,                m0, ym2, 1
2662    vinserti64x4    m1,                m1, ym3, 1
2663
2664    vpunpcklqdq     m2,                m0, m1
2665    vpunpckhqdq     m0,                m1
2666
2667    vpshufb         m0,                m4
2668    vpaddw          m3,                m2, m0
2669    vpsubw          m2,                m0
2670
2671    vbroadcasti32x8 m1,                [dct8_shuf7_AVX512]
2672
    ; Load all the coefficients together for better caching
2674    vpbroadcastq    m20,               [r6 + 0 * 8]
2675    vpbroadcastq    m21,               [r6 + 1 * 8]
2676    vpbroadcastq    m22,               [r6 + 2 * 8]
2677    vpbroadcastq    m23,               [r6 + 3 * 8]
2678    vpbroadcastq    m7,                [r6 + 4 * 8]
2679    vpbroadcastq    m12,               [r6 + 5 * 8]
2680    vpbroadcastq    m14,               [r6 + 6 * 8]
2681    vpbroadcastq    m16,               [r6 + 7 * 8]
2682
2683    DCT8_AVX512_PASS_1     20,       9, 21,      10
2684    DCT8_AVX512_PASS_1     22,      11, 23,      10
2685    DCT8_AVX512_PASS_1     7,       13, 12,      10
2686    DCT8_AVX512_PASS_1     14,      15, 16,      10
2687
2688    ;pass2
2689    vbroadcasti32x4        m5,          [pd_ %+ DCT8_ROUND2]
2690
2691    vinserti64x4           m9,          m9,  ym11, 1
2692    vinserti64x4           m10,         m13, ym15, 1
2693
    ; Load all the coefficients together for better caching and reuse common coefficients from pass 1
2695    vbroadcasti32x4    m21,                [r5 + 1 * 16]
2696    vbroadcasti32x4    m22,                [r5 + 2 * 16]
2697    vbroadcasti32x4    m23,                [r5 + 3 * 16]
2698    vbroadcasti32x4    m12,                [r5 + 5 * 16]
2699    vbroadcasti32x4    m14,                [r5 + 6 * 16]
2700    vbroadcasti32x4    m16,                [r5 + 7 * 16]
2701
2702    DCT8_AVX512_PASS_2     20, 21, 22, 23
2703    movu                   [r1],        m0
2704    DCT8_AVX512_PASS_2     7, 12, 14, 16
2705    movu                   [r1 + 64],   m0
2706    RET
2707
2708%macro DCT16_PASS_1_E 2
2709    vpbroadcastq    m7,                [r7 + %1]
2710
2711    pmaddwd         m4,                m0, m7
2712    pmaddwd         m6,                m2, m7
2713    phaddd          m4,                m6
2714
2715    paddd           m4,                m9
2716    psrad           m4,                DCT_SHIFT
2717
2718    packssdw        m4,                m4
2719    vpermq          m4,                m4, 0x08
2720
2721    mova            [r5 + %2],         xm4
2722%endmacro
2723
2724%macro DCT16_PASS_1_O 2
2725    vbroadcasti128  m7,                [r7 + %1]
2726
2727    pmaddwd         m10,               m0, m7
2728    pmaddwd         m11,               m2, m7
2729    phaddd          m10,               m11                 ; [d0 d0 d1 d1 d4 d4 d5 d5]
2730
2731    pmaddwd         m11,               m4, m7
2732    pmaddwd         m12,               m6, m7
2733    phaddd          m11,               m12                 ; [d2 d2 d3 d3 d6 d6 d7 d7]
2734
2735    phaddd          m10,               m11                 ; [d0 d1 d2 d3 d4 d5 d6 d7]
2736
2737    paddd           m10,               m9
2738    psrad           m10,               DCT_SHIFT
2739
2740    packssdw        m10,               m10                 ; [w0 w1 w2 w3 - - - - w4 w5 w6 w7 - - - -]
2741    vpermq          m10,               m10, 0x08
2742
2743    mova            [r5 + %2],         xm10
2744%endmacro
2745
%macro DCT16_PASS_2 2
    vbroadcasti128  m8,                [r7 + %1]
    vbroadcasti128  m13,               [r8 + %1]

    pmaddwd         m10,               m0, m8
    pmaddwd         m11,               m1, m13
    paddd           m10,               m11

    pmaddwd         m11,               m2, m8
    pmaddwd         m12,               m3, m13
    paddd           m11,               m12
    phaddd          m10,               m11

    pmaddwd         m11,               m4, m8
    pmaddwd         m12,               m5, m13
    paddd           m11,               m12

    pmaddwd         m12,               m6, m8
    pmaddwd         m13,               m7, m13
    paddd           m12,               m13
    phaddd          m11,               m12

    phaddd          m10,               m11
    paddd           m10,               m9
    psrad           m10,               DCT_SHIFT2

    vbroadcasti128  m8,                [r7 + %2]
    vbroadcasti128  m13,               [r8 + %2]

    pmaddwd         m14,               m0, m8
    pmaddwd         m11,               m1, m13
    paddd           m14,               m11

    pmaddwd         m11,               m2, m8
    pmaddwd         m12,               m3, m13
    paddd           m11,               m12
    phaddd          m14,               m11

    pmaddwd         m11,               m4, m8
    pmaddwd         m12,               m5, m13
    paddd           m11,               m12

    pmaddwd         m12,               m6, m8
    pmaddwd         m13,               m7, m13
    paddd           m12,               m13
    phaddd          m11,               m12

    phaddd          m14,               m11
    paddd           m14,               m9
    psrad           m14,               DCT_SHIFT2

    packssdw        m10,               m14
    vextracti128    xm14,              m10,       1
    movlhps         xm15,              xm10,      xm14
    movhlps         xm14,              xm10
%endmacro
INIT_YMM avx2
cglobal dct16, 3, 9, 16, 0-16*mmsize
%if BIT_DEPTH == 12
    %define         DCT_SHIFT          7
    vbroadcasti128  m9,                [pd_64]
%elif BIT_DEPTH == 10
    %define         DCT_SHIFT          5
    vbroadcasti128  m9,                [pd_16]
%elif BIT_DEPTH == 8
    %define         DCT_SHIFT          3
    vbroadcasti128  m9,                [pd_4]
%else
    %error Unsupported BIT_DEPTH!
%endif
%define             DCT_SHIFT2         10

    add             r2d,               r2d

    mova            m13,               [dct16_shuf1]
    mova            m14,               [dct16_shuf2]
    lea             r7,                [tab_dct16_1 + 8 * 16]
    lea             r8,                [tab_dct16_2 + 8 * 16]
    lea             r3,                [r2 * 3]
    mov             r5,                rsp
    mov             r4d,               2                   ; each iteration processes 8 rows, so 16/8 = 2 iterations

.pass1:
    lea             r6,                [r0 + r2 * 4]

    movu            m2,                [r0]
    movu            m1,                [r6]
    vperm2i128      m0,                m2, m1, 0x20        ; [row0lo  row4lo]
    vperm2i128      m1,                m2, m1, 0x31        ; [row0hi  row4hi]

    movu            m4,                [r0 + r2]
    movu            m3,                [r6 + r2]
    vperm2i128      m2,                m4, m3, 0x20        ; [row1lo  row5lo]
    vperm2i128      m3,                m4, m3, 0x31        ; [row1hi  row5hi]

    movu            m6,                [r0 + r2 * 2]
    movu            m5,                [r6 + r2 * 2]
    vperm2i128      m4,                m6, m5, 0x20        ; [row2lo  row6lo]
    vperm2i128      m5,                m6, m5, 0x31        ; [row2hi  row6hi]

    movu            m8,                [r0 + r3]
    movu            m7,                [r6 + r3]
    vperm2i128      m6,                m8, m7, 0x20        ; [row3lo  row7lo]
    vperm2i128      m7,                m8, m7, 0x31        ; [row3hi  row7hi]

    pshufb          m1,                m13
    pshufb          m3,                m13
    pshufb          m5,                m13
    pshufb          m7,                m13

    paddw           m8,                m0, m1              ;E
    psubw           m0,                m1                  ;O

    paddw           m1,                m2, m3              ;E
    psubw           m2,                m3                  ;O

    paddw           m3,                m4, m5              ;E
    psubw           m4,                m5                  ;O

    paddw           m5,                m6, m7              ;E
    psubw           m6,                m7                  ;O

    DCT16_PASS_1_O  -7 * 16,           1 * 32
    DCT16_PASS_1_O  -5 * 16,           3 * 32
    DCT16_PASS_1_O  -3 * 16,           1 * 32 + 16
    DCT16_PASS_1_O  -1 * 16,           3 * 32 + 16
    DCT16_PASS_1_O  1 * 16,            5 * 32
    DCT16_PASS_1_O  3 * 16,            7 * 32
    DCT16_PASS_1_O  5 * 16,            5 * 32 + 16
    DCT16_PASS_1_O  7 * 16,            7 * 32 + 16

    pshufb          m8,                m14
    pshufb          m1,                m14
    phaddw          m0,                m8, m1

    pshufb          m3,                m14
    pshufb          m5,                m14
    phaddw          m2,                m3, m5

    DCT16_PASS_1_E  -8 * 16,           0 * 32
    DCT16_PASS_1_E  -4 * 16,           0 * 32 + 16
    DCT16_PASS_1_E  0 * 16,            4 * 32
    DCT16_PASS_1_E  4 * 16,            4 * 32 + 16

    phsubw          m0,                m8, m1
    phsubw          m2,                m3, m5

    DCT16_PASS_1_E  -6 * 16,           2 * 32
    DCT16_PASS_1_E  -2 * 16,           2 * 32 + 16
    DCT16_PASS_1_E  2 * 16,            6 * 32
    DCT16_PASS_1_E  6 * 16,            6 * 32 + 16

    lea             r0,                [r0 + 8 * r2]
    add             r5,                256

    dec             r4d
    jnz             .pass1

    mov             r5,                rsp
    mov             r4d,               2
    mov             r2d,               32
    lea             r3,                [r2 * 3]
    vbroadcasti128  m9,                [pd_512]

.pass2:
    mova            m0,                [r5 + 0 * 32]        ; [row0lo  row4lo]
    mova            m1,                [r5 + 8 * 32]        ; [row0hi  row4hi]

    mova            m2,                [r5 + 1 * 32]        ; [row1lo  row5lo]
    mova            m3,                [r5 + 9 * 32]        ; [row1hi  row5hi]

    mova            m4,                [r5 + 2 * 32]        ; [row2lo  row6lo]
    mova            m5,                [r5 + 10 * 32]       ; [row2hi  row6hi]

    mova            m6,                [r5 + 3 * 32]        ; [row3lo  row7lo]
    mova            m7,                [r5 + 11 * 32]       ; [row3hi  row7hi]

    DCT16_PASS_2    -8 * 16, -7 * 16
    movu            [r1],              xm15
    movu            [r1 + r2],         xm14

    DCT16_PASS_2    -6 * 16, -5 * 16
    movu            [r1 + r2 * 2],     xm15
    movu            [r1 + r3],         xm14

    lea             r6,                [r1 + r2 * 4]
    DCT16_PASS_2    -4 * 16, -3 * 16
    movu            [r6],              xm15
    movu            [r6 + r2],         xm14

    DCT16_PASS_2    -2 * 16, -1 * 16
    movu            [r6 + r2 * 2],     xm15
    movu            [r6 + r3],         xm14

    lea             r6,                [r6 + r2 * 4]
    DCT16_PASS_2    0 * 16, 1 * 16
    movu            [r6],              xm15
    movu            [r6 + r2],         xm14

    DCT16_PASS_2    2 * 16, 3 * 16
    movu            [r6 + r2 * 2],     xm15
    movu            [r6 + r3],         xm14

    lea             r6,                [r6 + r2 * 4]
    DCT16_PASS_2    4 * 16, 5 * 16
    movu            [r6],              xm15
    movu            [r6 + r2],         xm14

    DCT16_PASS_2    6 * 16, 7 * 16
    movu            [r6 + r2 * 2],     xm15
    movu            [r6 + r3],         xm14

    add             r1,                16
    add             r5,                128

    dec             r4d
    jnz             .pass2
    RET
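; DCT16_avx512_PASS_1_O %1, %2, %3, %4: two odd output rows per call.
; %1/%2 select the coefficient rows in tab_dct16_1 (from r5); the odd terms
; are expected in m6/m8/m10/m2, m%3 is used as scratch, and the packed result
; of both rows is returned in m%4 via the m25/m26 permutes.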
%macro DCT16_avx512_PASS_1_O 4
    vbroadcasti32x4  m1,               [r5 + %1]

    pmaddwd          m3,               m6,  m1
    vpsrldq          m11,              m3,  8
    vpaddd           m3,               m11

    pmaddwd          m11,              m8,  m1
    vpsrldq          m12,              m11, 8
    vpaddd           m11,              m12

    vpunpcklqdq      m12,              m3, m11
    vpsrldq          m11,              m12, 4
    vpaddd           m11,              m12

    pmaddwd          m3,               m10, m1
    vpsrldq          m12,              m3,  8
    vpaddd           m3,               m12

    pmaddwd          m12,              m2,  m1
    vpsrldq          m13,              m12, 8
    vpaddd           m12,              m13

    vpunpcklqdq      m13,              m3, m12
    vpsrldq          m12,              m13, 4
    vpaddd           m12,              m13

    mova             m%3,              m26
    vpermi2d         m%3,              m11, m12
    paddd            m%3,              m0
    psrad            m%3,              DCT_SHIFT

    ; next row start
    vbroadcasti32x4  m1,               [r5 + %2]

    pmaddwd          m3,               m6,  m1
    vpsrldq          m11,              m3,  8
    vpaddd           m3,               m11

    pmaddwd          m11,              m8,  m1
    vpsrldq          m12,              m11, 8
    vpaddd           m11,              m12

    vpunpcklqdq      m12,              m3, m11
    vpsrldq          m11,              m12, 4
    vpaddd           m11,              m12

    pmaddwd          m3,               m10, m1
    vpsrldq          m12,              m3,  8
    vpaddd           m3,               m12

    pmaddwd          m12,              m2,  m1
    vpsrldq          m13,              m12, 8
    vpaddd           m12,              m13

    vpunpcklqdq      m13,              m3, m12
    vpsrldq          m12,              m13, 4
    vpaddd           m12,              m13

    mova             m%4,              m26
    vpermi2d         m%4,              m11, m12
    paddd            m%4,              m0
    psrad            m%4,              DCT_SHIFT
    ; next row end

    packssdw         m%3,              m%4
    vpermw           m%4,              m25, m%3
%endmacro

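; DCT16_AVX512_PASS_1_LOOP: loads all 16 input lines (r0/r4 point at the two
; halves of the block and are advanced by eight lines mid-macro), reverses
; the high eight pixels of each line and forms the butterflies: the sums (E)
; are left in m4/m5/m7/m9 and the differences (O) in m6/m8/m10/m2.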
%macro DCT16_AVX512_PASS_1_LOOP 0
    vbroadcasti32x8 m1,                [dct16_shuf1]
    mova            m2,                [dct16_shuf3_AVX512]
    mova            m3,                [dct16_shuf4_AVX512]

    movu            ym4,               [r0]
    movu            ym5,               [r0 + r2]
    vinserti64x4    m4,                m4, ym5, 1

    movu            ym5,               [r0 + 2 * r2]
    movu            ym6,               [r0 + r3]
    vinserti64x4    m5,                m5, ym6, 1

    mova            m6,                m2
    mova            m7,                m3
    vpermi2q        m6,                m4, m5
    vpermi2q        m7,                m4, m5

    movu            ym4,               [r4]
    movu            ym5,               [r4 + r2]
    vinserti64x4    m4,                m4, ym5, 1

    movu            ym5,               [r4 + 2 * r2]
    movu            ym8,               [r4 + r3]
    vinserti64x4    m5,                m5, ym8, 1

    mova            m8,                m2
    mova            m9,                m3
    vpermi2q        m8,                m4, m5
    vpermi2q        m9,                m4, m5

    vpshufb         m7,                m1
    vpshufb         m9,                m1

    paddw           m4,                m6, m7
    psubw           m6,                m7

    paddw           m5,                m8, m9
    psubw           m8,                m9

    lea             r0,                [r0 + 8 * r2]
    lea             r4,                [r0 + r2 * 4]

    movu            ym7,               [r0]
    movu            ym9,               [r0 + r2]
    vinserti64x4    m7,                m7, ym9, 1

    movu            ym9,               [r0 + 2 * r2]
    movu            ym10,              [r0 + r3]
    vinserti64x4    m9,                m9, ym10, 1

    mova            m10,               m2
    mova            m11,               m3
    vpermi2q        m10,               m7, m9
    vpermi2q        m11,               m7, m9

    vpshufb         m11,               m1
    paddw           m7,                m10, m11
    psubw           m10,               m11

    movu            ym9,               [r4]
    movu            ym11,              [r4 + r2]
    vinserti64x4    m9,                m9, ym11, 1

    movu            ym11,              [r4 + 2 * r2]
    movu            ym12,              [r4 + r3]
    vinserti64x4    m11,               m11, ym12, 1

    vpermi2q        m2,                m9, m11
    vpermi2q        m3,                m9, m11

    vpshufb         m3,                m1
    paddw           m9,                m2, m3
    psubw           m2,                m3
%endmacro

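; DCT16_avx512_PASS_1_E %1, %2, %3, %4: two even output rows per call.
; %1/%3 select the coefficient quadwords in tab_dct16_1 (from r5); the packed
; even terms are expected in m11/m13, m%2 is scratch, and the result of both
; rows is returned in m%4 via the m25/m27 permutes.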
%macro DCT16_avx512_PASS_1_E 4
    vpbroadcastq      m1,              [r5 + %1]

    pmaddwd          m19,              m11,  m1
    vpsrldq          m12,              m19,  4
    vpaddd           m12,              m19

    pmaddwd          m19,              m13,  m1
    vpsrldq          m18,              m19,  4
    vpaddd           m18,              m19

    mova             m%2,              m27
    vpermi2d         m%2,              m12, m18
    paddd            m%2,              m0
    psrad            m%2,              DCT_SHIFT

    ; 2nd row
    vpbroadcastq      m1,              [r5 + %3]

    pmaddwd          m19,              m11,  m1
    vpsrldq          m12,              m19,  4
    vpaddd           m12,              m19

    pmaddwd          m19,              m13,  m1
    vpsrldq          m18,              m19,  4
    vpaddd           m18,              m19

    mova             m%4,              m27
    vpermi2d         m%4,              m12, m18
    paddd            m%4,              m0
    psrad            m%4,              DCT_SHIFT

    packssdw         m%2,              m%4
    vpermw           m%4,              m25, m%2
%endmacro

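; DCT16_PASS2_AVX512: m%1 holds the broadcast tab_dct16 coefficient row and
; m%2-m%9 the eight pass-1 result registers; one 16-word output row is
; returned in m%10 after rounding (m0) and DCT_SHIFT2, with m2/m3 holding the
; merge shuffles loaded below.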
%macro DCT16_PASS2_AVX512 10
    vpmaddwd         m5,   m%2, m%1
    vpsrldq          m6,   m5,  8
    vpaddd           m5,   m6
    vpsrldq          m6,   m5,  4
    vpaddd           m5,   m6

    vpmaddwd         m6,   m%3, m%1
    vpsrldq          m7,   m6,  8
    vpaddd           m6,   m7
    vpsrldq          m7,   m6,  4
    vpaddd           m6,   m7
    vpunpckldq       m7,   m5, m6

    vpmaddwd         m5,   m%4, m%1
    vpsrldq          m6,   m5,  8
    vpaddd           m5,   m6
    vpsrldq          m6,   m5,  4
    vpaddd           m5,   m6

    vpmaddwd         m6,   m%5, m%1
    vpsrldq          m8,   m6,  8
    vpaddd           m6,   m8
    vpsrldq          m8,   m6,  4
    vpaddd           m6,   m8
    vpunpckldq       m8,   m5, m6

    vpunpcklqdq      m5,   m7, m8
    vpermd           m5,   m2, m5
    vpsrldq          m6,   m5,  4
    vpaddd           m5,   m6

    vpmaddwd         m6,   m%6, m%1
    vpsrldq          m7,   m6,  8
    vpaddd           m6,   m7
    vpsrldq          m7,   m6,  4
    vpaddd           m6,   m7

    vpmaddwd         m7,   m%7, m%1
    vpsrldq          m8,   m7,  8
    vpaddd           m7,   m8
    vpsrldq          m8,   m7,  4
    vpaddd           m7,   m8
    vpunpckldq       m8,   m6, m7

    vpmaddwd         m6,   m%8, m%1
    vpsrldq          m7,   m6,  8
    vpaddd           m6,   m7
    vpsrldq          m7,   m6,  4
    vpaddd           m6,   m7

    vpmaddwd         m7,   m%9, m%1
    vpsrldq          m4,   m7,  8
    vpaddd           m7,   m4
    vpsrldq          m4,   m7,  4
    vpaddd           m7,   m4
    vpunpckldq       m4,   m6, m7

    vpunpcklqdq      m6,   m8, m4
    vpermd           m6,   m2, m6
    vpsrldq          m7,   m6,  4
    vpaddd           m6,   m7

    paddd            m5, m0
    psrad            m5, DCT_SHIFT2
    paddd            m6, m0
    psrad            m6, DCT_SHIFT2

    packssdw         m5, m6
    vpermw           m%10, m3, m5
%endmacro

INIT_ZMM avx512
cglobal dct16, 3, 6, 29

%if BIT_DEPTH == 12
    %define          DCT_SHIFT          7
    vbroadcasti32x4  m0,                [pd_64]
%elif BIT_DEPTH == 10
    %define          DCT_SHIFT          5
    vbroadcasti32x4  m0,                [pd_16]
%elif BIT_DEPTH == 8
    %define          DCT_SHIFT          3
    vbroadcasti32x4  m0,                [pd_4]
%else
    %error Unsupported BIT_DEPTH!
%endif
%define             DCT_SHIFT2         10

    add             r2d,               r2d
    lea             r3,                [r2 * 3]
    lea             r4,                [r0 + r2 * 4]
    lea             r5,                [tab_dct16_1 + 8 * 16]

    ; Load the reusable tables once to save memory movements
    mova             m25,              [dct16_shuf5_AVX512]
    mova             m26,              [dct16_shuf2_AVX512]
    mova             m27,              [dct16_shuf7_AVX512]
    vbroadcasti32x8  m28,              [dct16_shuf6_AVX512]

    DCT16_AVX512_PASS_1_LOOP
    DCT16_avx512_PASS_1_O              -7 * 16, -5 * 16, 15, 14    ;row 1,   3
    DCT16_avx512_PASS_1_O              -3 * 16, -1 * 16, 16, 15    ;row 5,   7
    DCT16_avx512_PASS_1_O               1 * 16,  3 * 16, 17, 16    ;row 9,  11
    DCT16_avx512_PASS_1_O               5 * 16,  7 * 16, 18, 17    ;row 13, 15

    vbroadcasti32x8 m1,                [dct16_shuf2]
    pshufb          m4,                m1
    pshufb          m5,                m1
    pshufb          m7,                m1
    pshufb          m9,                m1

    vpsrldq          m3,              m4,  2
    vpsubw           m11,             m4,  m3
    vpsrldq          m6,              m5,  2
    vpsubw           m12,             m5,  m6
    vpsrldq          m8,              m7,  2
    vpsubw           m13,             m7,  m8
    vpsrldq          m10,             m9,  2
    vpsubw           m18,             m9,  m10

    vpermw           m11,             m28, m11
    vpermw           m12,             m28, m12
    vinserti64x4     m11,             m11, ym12, 1

    vpermw           m13,             m28, m13
    vpermw           m18,             m28, m18
    vinserti64x4     m13,             m13, ym18, 1

    DCT16_avx512_PASS_1_E            -6 * 16, 21, -2 * 16, 20    ; row  2,  6
    DCT16_avx512_PASS_1_E             2 * 16, 22,  6 * 16, 21    ; row 10, 14

    vpaddw           m11,             m4,  m3
    vpaddw           m12,             m5,  m6
    vpaddw           m13,             m7,  m8
    vpaddw           m18,             m9,  m10

    vpermw           m11,             m28, m11
    vpermw           m12,             m28, m12
    vinserti64x4     m11,             m11, ym12, 1

    vpermw           m13,             m28, m13
    vpermw           m18,             m28, m18
    vinserti64x4     m13,             m13, ym18, 1

    DCT16_avx512_PASS_1_E            -8 * 16, 23, 0 * 16, 22    ; row 0, 8
    DCT16_avx512_PASS_1_E            -4 * 16, 24, 4 * 16, 23    ; row 4, 12

    ;PASS2
    vbroadcasti32x4   m0,             [pd_512]

    lea              r5,              [tab_dct16]
    mova             m2,              [dct16_shuf9_AVX512]
    vbroadcasti32x8  m3,              [dct16_shuf8_AVX512]

    vbroadcasti32x8  m1,              [r5 + 0 * 32]
    DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 9
    vbroadcasti32x8  m1,              [r5 + 1 * 32]
    DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 10
    vinserti64x4     m9,              m9, ym10, 1
    movu             [r1 + 0 * 64],   m9

    vbroadcasti32x8  m1,              [r5 + 2 * 32]
    DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 9
    vbroadcasti32x8  m1,              [r5 + 3 * 32]
    DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 10
    vinserti64x4     m9,              m9, ym10, 1
    movu             [r1 + 1 * 64],   m9

    vbroadcasti32x8  m1,              [r5 + 4 * 32]
    DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 9
    vbroadcasti32x8  m1,              [r5 + 5 * 32]
    DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 10
    vinserti64x4     m9,              m9, ym10, 1
    movu             [r1 + 2 * 64],   m9

    vbroadcasti32x8  m1,              [r5 + 6 * 32]
    DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 9
    vbroadcasti32x8  m1,              [r5 + 7 * 32]
    DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 10
    vinserti64x4     m9,              m9, ym10, 1
    movu             [r1 + 3 * 64],   m9

    vbroadcasti32x8  m1,              [r5 + 8 * 32]
    DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 9
    vbroadcasti32x8  m1,              [r5 + 9 * 32]
    DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 10
    vinserti64x4     m9,              m9, ym10, 1
    movu             [r1 + 4 * 64],   m9

    vbroadcasti32x8  m1,              [r5 + 10 * 32]
    DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 9
    vbroadcasti32x8  m1,              [r5 + 11 * 32]
    DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 10
    vinserti64x4     m9,              m9, ym10, 1
    movu             [r1 + 5 * 64],   m9

    vbroadcasti32x8  m1,              [r5 + 12 * 32]
    DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 9
    vbroadcasti32x8  m1,              [r5 + 13 * 32]
    DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 10
    vinserti64x4     m9,              m9, ym10, 1
    movu             [r1 + 6 * 64],   m9

    vbroadcasti32x8  m1,              [r5 + 14 * 32]
    DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 9
    vbroadcasti32x8  m1,              [r5 + 15 * 32]
    DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 10
    vinserti64x4     m9,              m9, ym10, 1
    movu             [r1 + 7 * 64],   m9
    RET

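; DCT32_PASS_1 %1, %2, %3, %4: pass-1 helper for the AVX2 32x32 DCT. Uses the
; tab_dct32_1 row at [r7 + %1] (and its +32/+48 halves against the odd terms
; in m4-m7); m%3/m%4 select the even-term register pair (m0/m2 or m1/m3, per
; the callers below). Eight packed-word results are stored as two quadwords
; at [r5 + %2] and [r5 + %2 + 64].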
%macro DCT32_PASS_1 4
    vbroadcasti128  m8,                [r7 + %1]
    pmaddwd         m11,               m%3, m8
    pmaddwd         m12,               m%4, m8
    phaddd          m11,               m12

    vbroadcasti128  m8,                [r7 + %1 + 32]
    vbroadcasti128  m10,               [r7 + %1 + 48]
    pmaddwd         m12,               m5, m8
    pmaddwd         m13,               m6, m10
    phaddd          m12,               m13

    pmaddwd         m13,               m4, m8
    pmaddwd         m14,               m7, m10
    phaddd          m13,               m14

    phaddd          m12,               m13

    phaddd          m11,               m12
    paddd           m11,               m9
    psrad           m11,               DCT_SHIFT

    vpermq          m11,               m11, 0xD8
    packssdw        m11,               m11
    movq            [r5 + %2],         xm11
    vextracti128    xm10,              m11, 1
    movq            [r5 + %2 + 64],    xm10
%endmacro

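; DCT32_PASS_2 %1: one full 32-tap dot product per output row. The first
; sixteen coefficients come from tab_dct32_1 at [r7 + %1] and the second
; sixteen from tab_dct32_2 at [r8 + %1]; four packed results are left in
; xm11 for the caller to store, after rounding (xm9) and DCT_SHIFT2.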
%macro DCT32_PASS_2 1
    mova            m8,                [r7 + %1]
    mova            m10,               [r8 + %1]
    pmaddwd         m11,               m0, m8
    pmaddwd         m12,               m1, m10
    paddd           m11,               m12

    pmaddwd         m12,               m2, m8
    pmaddwd         m13,               m3, m10
    paddd           m12,               m13

    phaddd          m11,               m12

    pmaddwd         m12,               m4, m8
    pmaddwd         m13,               m5, m10
    paddd           m12,               m13

    pmaddwd         m13,               m6, m8
    pmaddwd         m14,               m7, m10
    paddd           m13,               m14

    phaddd          m12,               m13

    phaddd          m11,               m12
    vextracti128    xm10,              m11, 1
    paddd           xm11,              xm10

    paddd           xm11,               xm9
    psrad           xm11,               DCT_SHIFT2
    packssdw        xm11,               xm11

%endmacro

INIT_YMM avx2
cglobal dct32, 3, 9, 16, 0-64*mmsize
%if BIT_DEPTH == 12
    %define         DCT_SHIFT          8
    vpbroadcastq    m9,                [pd_128]
%elif BIT_DEPTH == 10
    %define         DCT_SHIFT          6
    vpbroadcastq    m9,                [pd_32]
%elif BIT_DEPTH == 8
    %define         DCT_SHIFT          4
    vpbroadcastq    m9,                [pd_8]
%else
    %error Unsupported BIT_DEPTH!
%endif
%define             DCT_SHIFT2         11

    add             r2d,               r2d

    lea             r7,                [tab_dct32_1]
    lea             r8,                [tab_dct32_2]
    lea             r3,                [r2 * 3]
    mov             r5,                rsp
    mov             r4d,               8
    mova            m15,               [dct16_shuf1]

.pass1:
    movu            m2,                [r0]
    movu            m1,                [r0 + 32]
    pshufb          m1,                m15
    vpermq          m1,                m1, 0x4E
    psubw           m7,                m2, m1
    paddw           m2,                m1

    movu            m1,                [r0 + r2 * 2]
    movu            m0,                [r0 + r2 * 2 + 32]
    pshufb          m0,                m15
    vpermq          m0,                m0, 0x4E
    psubw           m8,                m1, m0
    paddw           m1,                m0
    vperm2i128      m0,                m2, m1, 0x20        ; [row0lo  row2lo] for E
    vperm2i128      m3,                m2, m1, 0x31        ; [row0hi  row2hi] for E
    pshufb          m3,                m15
    psubw           m1,                m0, m3
    paddw           m0,                m3

    vperm2i128      m5,                m7, m8, 0x20        ; [row0lo  row2lo] for O
    vperm2i128      m6,                m7, m8, 0x31        ; [row0hi  row2hi] for O

    movu            m4,                [r0 + r2]
    movu            m2,                [r0 + r2 + 32]
    pshufb          m2,                m15
    vpermq          m2,                m2, 0x4E
    psubw           m10,               m4, m2
    paddw           m4,                m2

    movu            m3,                [r0 + r3]
    movu            m2,                [r0 + r3 + 32]
    pshufb          m2,                m15
    vpermq          m2,                m2, 0x4E
    psubw           m11,               m3, m2
    paddw           m3,                m2
    vperm2i128      m2,                m4, m3, 0x20        ; [row1lo  row3lo] for E
    vperm2i128      m8,                m4, m3, 0x31        ; [row1hi  row3hi] for E
    pshufb          m8,                m15
    psubw           m3,                m2, m8
    paddw           m2,                m8

    vperm2i128      m4,                m10, m11, 0x20      ; [row1lo  row3lo] for O
    vperm2i128      m7,                m10, m11, 0x31      ; [row1hi  row3hi] for O

    DCT32_PASS_1    0 * 32,            0 * 64, 0, 2
    DCT32_PASS_1    2 * 32,            2 * 64, 1, 3
    DCT32_PASS_1    4 * 32,            4 * 64, 0, 2
    DCT32_PASS_1    6 * 32,            6 * 64, 1, 3
    DCT32_PASS_1    8 * 32,            8 * 64, 0, 2
    DCT32_PASS_1    10 * 32,           10 * 64, 1, 3
    DCT32_PASS_1    12 * 32,           12 * 64, 0, 2
    DCT32_PASS_1    14 * 32,           14 * 64, 1, 3
    DCT32_PASS_1    16 * 32,           16 * 64, 0, 2
    DCT32_PASS_1    18 * 32,           18 * 64, 1, 3
    DCT32_PASS_1    20 * 32,           20 * 64, 0, 2
    DCT32_PASS_1    22 * 32,           22 * 64, 1, 3
    DCT32_PASS_1    24 * 32,           24 * 64, 0, 2
    DCT32_PASS_1    26 * 32,           26 * 64, 1, 3
    DCT32_PASS_1    28 * 32,           28 * 64, 0, 2
    DCT32_PASS_1    30 * 32,           30 * 64, 1, 3

    add             r5,                8
    lea             r0,                [r0 + r2 * 4]

    dec             r4d
    jnz             .pass1

    mov             r2d,               64
    lea             r3,                [r2 * 3]
    mov             r5,                rsp
    mov             r4d,               8
    vpbroadcastq    m9,                [pd_1024]

.pass2:
    mova            m0,                [r5 + 0 * 64]
    mova            m1,                [r5 + 0 * 64 + 32]

    mova            m2,                [r5 + 1 * 64]
    mova            m3,                [r5 + 1 * 64 + 32]

    mova            m4,                [r5 + 2 * 64]
    mova            m5,                [r5 + 2 * 64 + 32]

    mova            m6,                [r5 + 3 * 64]
    mova            m7,                [r5 + 3 * 64 + 32]

    DCT32_PASS_2    0 * 32
    movq            [r1],              xm11
    DCT32_PASS_2    1 * 32
    movq            [r1 + r2],         xm11
    DCT32_PASS_2    2 * 32
    movq            [r1 + r2 * 2],     xm11
    DCT32_PASS_2    3 * 32
    movq            [r1 + r3],         xm11

    lea             r6,                [r1 + r2 * 4]
    DCT32_PASS_2    4 * 32
    movq            [r6],              xm11
    DCT32_PASS_2    5 * 32
    movq            [r6 + r2],         xm11
    DCT32_PASS_2    6 * 32
    movq            [r6 + r2 * 2],     xm11
    DCT32_PASS_2    7 * 32
    movq            [r6 + r3],         xm11

    lea             r6,                [r6 + r2 * 4]
    DCT32_PASS_2    8 * 32
    movq            [r6],              xm11
    DCT32_PASS_2    9 * 32
    movq            [r6 + r2],         xm11
    DCT32_PASS_2    10 * 32
    movq            [r6 + r2 * 2],     xm11
    DCT32_PASS_2    11 * 32
    movq            [r6 + r3],         xm11

    lea             r6,                [r6 + r2 * 4]
    DCT32_PASS_2    12 * 32
    movq            [r6],              xm11
    DCT32_PASS_2    13 * 32
    movq            [r6 + r2],         xm11
    DCT32_PASS_2    14 * 32
    movq            [r6 + r2 * 2],     xm11
    DCT32_PASS_2    15 * 32
    movq            [r6 + r3],         xm11

    lea             r6,                [r6 + r2 * 4]
    DCT32_PASS_2    16 * 32
    movq            [r6],              xm11
    DCT32_PASS_2    17 * 32
    movq            [r6 + r2],         xm11
    DCT32_PASS_2    18 * 32
    movq            [r6 + r2 * 2],     xm11
    DCT32_PASS_2    19 * 32
    movq            [r6 + r3],         xm11

    lea             r6,                [r6 + r2 * 4]
    DCT32_PASS_2    20 * 32
    movq            [r6],              xm11
    DCT32_PASS_2    21 * 32
    movq            [r6 + r2],         xm11
    DCT32_PASS_2    22 * 32
    movq            [r6 + r2 * 2],     xm11
    DCT32_PASS_2    23 * 32
    movq            [r6 + r3],         xm11

    lea             r6,                [r6 + r2 * 4]
    DCT32_PASS_2    24 * 32
    movq            [r6],              xm11
    DCT32_PASS_2    25 * 32
    movq            [r6 + r2],         xm11
    DCT32_PASS_2    26 * 32
    movq            [r6 + r2 * 2],     xm11
    DCT32_PASS_2    27 * 32
    movq            [r6 + r3],         xm11

    lea             r6,                [r6 + r2 * 4]
    DCT32_PASS_2    28 * 32
    movq            [r6],              xm11
    DCT32_PASS_2    29 * 32
    movq            [r6 + r2],         xm11
    DCT32_PASS_2    30 * 32
    movq            [r6 + r2 * 2],     xm11
    DCT32_PASS_2    31 * 32
    movq            [r6 + r3],         xm11

    add             r5,                256
    add             r1,                8

    dec             r4d
    jnz             .pass2
    RET

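; DCT32_avx512_LOOP %1, %2, %3, %4: loads four input lines, reverses the high
; sixteen pixels of each (m31) and butterflies them; the odd parts are left
; in m%1/m%2 while the even parts are spilled to the scratch area at
; [r9 + %3 * 64] and [r9 + %4 * 64] for the later EO/EEO stages.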
%macro DCT32_avx512_LOOP 4
    movu            m1,               [r0]
    movu            m2,               [r0 + r2]

    vinserti64x4    m3,               m1, ym2, 1    ; row 0l, 1l
    vextracti64x4   ym4,              m1, 1
    vinserti64x4    m2,               m2, ym4, 0    ; row 0h, 1h
    vpermw          m2,               m31, m2

    psubw           m%1,              m3, m2        ; O
    paddw           m3,               m2            ; E
    mova            [r9 + %3 * 64],   m3

    movu            m1,               [r0 + 2 * r2]
    movu            m5,               [r0 + r3]

    vinserti64x4    m6,               m1, ym5, 1    ; row 2l, 3l
    vextracti64x4   ym7,              m1, 1
    vinserti64x4    m5,               m5, ym7, 0    ; row 2h, 3h
    vpermw          m5,               m31, m5

    psubw           m%2,              m6, m5        ; O
    paddw           m6,               m5            ; E
    mova            [r9 + %4 * 64],   m6
%endmacro

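; DCT32_avx512_PASS_1_O %1, %2, %3: four results of one odd pass-1 row.
; m9 holds the broadcast coefficient row and m%2/m%3 the odd terms of four
; input lines; the reduced sums are rounded (m0), shifted by DCT_SHIFT and
; stored as one quadword at [r5 + %1].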
%macro DCT32_avx512_PASS_1_O 3
    pmaddwd          m10,              m%2,  m9
    vpsrldq          m11,              m10, 8
    vpaddd           m10,              m11

    pmaddwd          m11,              m%3,  m9
    vpsrldq          m12,              m11, 8
    vpaddd           m11,              m12

    mova             m12,              m8
    vpermi2d         m12,              m10, m11
    vpsrldq          m10,              m12, 8
    vpaddd           m12,              m10
    vpsrldq          m10,              m12, 4
    vpaddd           m12,              m10

    vpaddd           m12,              m0
    vpsrad           m12,              DCT_SHIFT
    vpackssdw        m12,              m12
    vpermw           m12,              m30, m12
    movq             [r5 + %1],        xm12
%endmacro

%macro DCT32_avx512_PASS_1_ROW_O 0
    vbroadcasti32x8  m9,               [r7 + 1 * 32]

    DCT32_avx512_LOOP 13, 14, 0, 1
    DCT32_avx512_PASS_1_O              1 * 64 + 0 * 8, 13, 14

    lea             r0,                [r0 + 4 * r2]
    DCT32_avx512_LOOP 15, 16, 2, 3
    DCT32_avx512_PASS_1_O              1 * 64 + 1 * 8, 15, 16

    lea             r0,                [r0 + 4 * r2]
    DCT32_avx512_LOOP 17, 18, 4, 5
    DCT32_avx512_PASS_1_O              1 * 64 + 2 * 8, 17, 18

    lea             r0,                [r0 + 4 * r2]
    DCT32_avx512_LOOP 19, 20, 6, 7
    DCT32_avx512_PASS_1_O              1 * 64 + 3 * 8, 19, 20

    lea             r0,                [r0 + 4 * r2]
    DCT32_avx512_LOOP 21, 22, 8, 9
    DCT32_avx512_PASS_1_O              1 * 64 + 4 * 8, 21, 22

    lea             r0,                [r0 + 4 * r2]
    DCT32_avx512_LOOP 23, 24, 10, 11
    DCT32_avx512_PASS_1_O              1 * 64 + 5 * 8, 23, 24

    lea             r0,                [r0 + 4 * r2]
    DCT32_avx512_LOOP 25, 26, 12, 13
    DCT32_avx512_PASS_1_O              1 * 64 + 6 * 8, 25, 26

    lea             r0,                [r0 + 4 * r2]
    DCT32_avx512_LOOP 27, 28, 14, 15
    DCT32_avx512_PASS_1_O              1 * 64 + 7 * 8, 27, 28
%endmacro

%macro DCT32_avx512_PASS_1_ROW_O_1_7 1
    vbroadcasti32x8  m9,               [r7 + %1 * 32]

    DCT32_avx512_PASS_1_O              %1 * 64 + 0 * 8, 13, 14
    DCT32_avx512_PASS_1_O              %1 * 64 + 1 * 8, 15, 16
    DCT32_avx512_PASS_1_O              %1 * 64 + 2 * 8, 17, 18
    DCT32_avx512_PASS_1_O              %1 * 64 + 3 * 8, 19, 20
    DCT32_avx512_PASS_1_O              %1 * 64 + 4 * 8, 21, 22
    DCT32_avx512_PASS_1_O              %1 * 64 + 5 * 8, 23, 24
    DCT32_avx512_PASS_1_O              %1 * 64 + 6 * 8, 25, 26
    DCT32_avx512_PASS_1_O              %1 * 64 + 7 * 8, 27, 28
%endmacro

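; DCT32_avx512_LOOP_EO %1, %2, %3, %4: reloads two even-part lines from the
; scratch area (slots %3/%4, in units of 64 bytes), re-orders them with m8
; and splits them again: EO differences in m%1, EE sums in m%2.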
%macro DCT32_avx512_LOOP_EO 4
    mova            m4,                [rsp + 32 * mmsize + %3 * 64]
    vpermw          m4,                m8, m4
    vextracti64x4   ym5,               m4, 1

    mova            m6,                [rsp + 32 * mmsize + %4 * 64]
    vpermw          m6,                m8, m6
    vextracti64x4   ym7,               m6, 1

    vinserti64x4    m4,                m4, ym6, 1
    vinserti64x4    m5,                m5, ym7, 1

    psubw           m%1,               m4, m5      ; EO
    paddw           m%2,               m4, m5      ; EE
%endmacro

%macro DCT32_avx512_PASS_1_ROW_EO 2
    pmaddwd          m29,              m%2,  m12
    vpsrldq          m30,              m29,  8
    vpaddd           m30,              m29
    vpsrldq          m29,              m30,  4
    vpaddd           m29,              m30

    vpaddd           m29,              m0
    vpsrad           m29,              DCT_SHIFT
    vpackssdw        m29,              m29

    vpermw           m29,              m11, m29
    movq             [r5 + %1],        xm29
%endmacro

%macro DCT32_avx512_PASS_1_ROW_EO_0 0

    mova            m8,               [dct32_shuf2_AVX512]
    vbroadcasti32x4 m12,              [r7 + 2 * 32]

    DCT32_avx512_LOOP_EO 13, 14, 0, 1
    DCT32_avx512_PASS_1_ROW_EO    2 * 64 + 0 * 8, 13

    lea             r9,           [r9 + 4 * r2]
    DCT32_avx512_LOOP_EO 15, 16, 2, 3
    DCT32_avx512_PASS_1_ROW_EO    2 * 64 + 1 * 8, 15

    lea             r9,           [r9 + 4 * r2]
    DCT32_avx512_LOOP_EO 17, 18, 4, 5
    DCT32_avx512_PASS_1_ROW_EO    2 * 64 + 2 * 8, 17

    lea             r9,           [r9 + 4 * r2]
    DCT32_avx512_LOOP_EO 19, 20, 6, 7
    DCT32_avx512_PASS_1_ROW_EO    2 * 64 + 3 * 8, 19

    lea             r9,           [r9 + 4 * r2]
    DCT32_avx512_LOOP_EO 21, 22, 8, 9
    DCT32_avx512_PASS_1_ROW_EO    2 * 64 + 4 * 8, 21

    lea             r9,           [r9 + 4 * r2]
    DCT32_avx512_LOOP_EO 23, 24, 10, 11
    DCT32_avx512_PASS_1_ROW_EO    2 * 64 + 5 * 8, 23

    lea             r9,           [r9 + 4 * r2]
    DCT32_avx512_LOOP_EO 25, 26, 12, 13
    DCT32_avx512_PASS_1_ROW_EO    2 * 64 + 6 * 8, 25

    lea             r9,           [r9 + 4 * r2]
    DCT32_avx512_LOOP_EO 27, 28, 14, 15
    DCT32_avx512_PASS_1_ROW_EO    2 * 64 + 7 * 8, 27

%endmacro

%macro DCT32_avx512_PASS_1_ROW_EO_1_7 1

    vbroadcasti32x4 m12,         [r7 + %1 * 32]

    DCT32_avx512_PASS_1_ROW_EO   %1 * 64 + 0 * 8, 13
    DCT32_avx512_PASS_1_ROW_EO   %1 * 64 + 1 * 8, 15
    DCT32_avx512_PASS_1_ROW_EO   %1 * 64 + 2 * 8, 17
    DCT32_avx512_PASS_1_ROW_EO   %1 * 64 + 3 * 8, 19
    DCT32_avx512_PASS_1_ROW_EO   %1 * 64 + 4 * 8, 21
    DCT32_avx512_PASS_1_ROW_EO   %1 * 64 + 5 * 8, 23
    DCT32_avx512_PASS_1_ROW_EO   %1 * 64 + 6 * 8, 25
    DCT32_avx512_PASS_1_ROW_EO   %1 * 64 + 7 * 8, 27

%endmacro

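; DCT32_avx512_LOOP_EEO: third butterfly level. Splits the EE terms held in
; the m14/m16, m18/m20, m22/m24 and m26/m28 pairs into EEE sums
; (m16/m20/m24/m28) and EEO differences (m2/m3/m4/m5), using m31 (dct8_shuf)
; to reverse the high halves.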
%macro DCT32_avx512_LOOP_EEO 0
    vpunpcklqdq        m2,  m14, m16
    vpunpckhqdq        m14, m16
    vpshufb            m14, m31

    vpaddw             m16, m2, m14     ; EEE
    vpsubw             m2,  m14         ; EEO

    vpunpcklqdq        m3,  m18, m20
    vpunpckhqdq        m18, m20
    vpshufb            m18, m31

    vpaddw             m20, m3, m18     ; EEE
    vpsubw             m3,  m18         ; EEO

    vpunpcklqdq        m4,  m22, m24
    vpunpckhqdq        m22, m24
    vpshufb            m22, m31

    vpaddw             m24, m4, m22     ; EEE
    vpsubw             m4,  m22         ; EEO

    vpunpcklqdq        m5,  m26, m28
    vpunpckhqdq        m26, m28
    vpshufb            m26, m31

    vpaddw             m28, m5, m26     ; EEE
    vpsubw             m5,  m26         ; EEO
%endmacro

%macro DCT32_avx512_PASS_1_ROW_EEO 2
    pmaddwd          m30,              m%2,  m1
    vpsrldq          m29,              m30,  4
    vpaddd           m29,              m30

    vpaddd           m29,              m0
    vpsrad           m29,              DCT_SHIFT
    vpackssdw        m29,              m29

    vpermw           m29,              m27, m29
    movu             [r5 + %1],        xm29
%endmacro

%macro DCT32_avx512_PASS_1_ROW_EEO_1_4 1

    vpbroadcastq     m1,            [r7 + %1 * 32]
    DCT32_avx512_PASS_1_ROW_EEO     %1 * 64 + 0 * 16, 2
    DCT32_avx512_PASS_1_ROW_EEO     %1 * 64 + 1 * 16, 3
    DCT32_avx512_PASS_1_ROW_EEO     %1 * 64 + 2 * 16, 4
    DCT32_avx512_PASS_1_ROW_EEO     %1 * 64 + 3 * 16, 5

%endmacro

%macro DCT32_avx512_PASS_1_ROW_EEEO_1_4 1

    vpbroadcastq     m1,            [r7 + %1 * 32]
    DCT32_avx512_PASS_1_ROW_EEO     %1 * 64 + 0 * 16, 16
    DCT32_avx512_PASS_1_ROW_EEO     %1 * 64 + 1 * 16, 20
    DCT32_avx512_PASS_1_ROW_EEO     %1 * 64 + 2 * 16, 24
    DCT32_avx512_PASS_1_ROW_EEO     %1 * 64 + 3 * 16, 28

%endmacro

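; DCT32_avx512_PASS2_OPT %1-%4, %5: four coefficients of one pass-2 output
; row. Same arithmetic as DCT32_avx512_PASS2 below, but the source vectors
; are already resident in registers m%1-m%4 instead of being reloaded from
; the stack; m1 holds the tab_dct32 row, the m27-m30 masks and the m31
; permute merge the four partial sums, and one quadword is stored at
; [r1 + %5].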
%macro DCT32_avx512_PASS2_OPT 5
    pmaddwd         m9,                m1,  m%1
    vpsrldq         m10,               m9,  8
    vpaddd          m9,                m10

    pmaddwd         m10,               m1,  m%2
    vpsrldq         m11,               m10, 8
    vpaddd          m10,               m11

    pmaddwd         m11,               m1,  m%3
    vpsrldq         m12,               m11, 8
    vpaddd          m11,               m12

    pmaddwd         m12,               m1,  m%4
    vpsrldq         m13,               m12, 8
    vpaddd          m12,               m13

    vpsrldq         m13,               m9,  4
    vpaddd          m9,                m13
    vpsrldq         m13,               m10, 4
    vpaddd          m10,               m13
    vpsrldq         m13,               m11, 4
    vpaddd          m11,               m13
    vpsrldq         m13,               m12, 4
    vpaddd          m12,               m13

    vpermd           m9,               m31,  m9
    vpermd          m10,               m31, m10
    vpermd          m11,               m31, m11
    vpermd          m12,               m31, m12

    vpandd          m9,                m27
    vpandd          m10,               m30
    vpandd          m11,               m29
    vpandd          m12,               m28

    vpaddd          m9,                m10
    vpaddd          m11,               m12
    vpaddd          m9,                m11

    vpsrldq         m10,               m9, 8
    vpaddd          m9,                m10
    vpsrldq         m10,               m9, 4
    vpaddd          m9,                m10

    vpermd          m9,                m31, m9
    vpaddd          m9,                m0
    vpsrad          m9,                DCT_SHIFT2
    vpackssdw       m9,                m9
    movq            [r1 + %5],         xm9

%endmacro

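; DCT32_avx512_PASS2 %1-%4, %5: as DCT32_avx512_PASS2_OPT above, but loads
; the four source vectors from the pass-1 buffer at [r5 + %1..%4] (the rows
; that did not fit in spare registers).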
%macro DCT32_avx512_PASS2 5

    mova            m9,                [r5 + %1]
    mova            m10,               [r5 + %2]
    mova            m11,               [r5 + %3]
    mova            m12,               [r5 + %4]

    pmaddwd         m9,                m1,  m9
    vpsrldq         m13,               m9,  8
    vpaddd          m9,                m13

    pmaddwd         m10,               m1,  m10
    vpsrldq         m13,               m10, 8
    vpaddd          m10,               m13

    pmaddwd         m11,               m1,  m11
    vpsrldq         m13,               m11, 8
    vpaddd          m11,               m13

    pmaddwd         m12,               m1,  m12
    vpsrldq         m13,               m12, 8
    vpaddd          m12,               m13

    vpsrldq         m13,               m9,  4
    vpaddd          m9,                m13
    vpsrldq         m13,               m10, 4
    vpaddd          m10,               m13
    vpsrldq         m13,               m11, 4
    vpaddd          m11,               m13
    vpsrldq         m13,               m12, 4
    vpaddd          m12,               m13

    vpermd           m9,               m31,  m9
    vpermd          m10,               m31, m10
    vpermd          m11,               m31, m11
    vpermd          m12,               m31, m12

    vpandd          m9,                m27
    vpandd          m10,               m30
    vpandd          m11,               m29
    vpandd          m12,               m28

    vpaddd          m9,                m10
    vpaddd          m11,               m12
    vpaddd          m9,                m11

    vpsrldq         m10,               m9, 8
    vpaddd          m9,                m10
    vpsrldq         m10,               m9, 4
    vpaddd          m9,                m10

    vpermd          m9,                m31, m9
    vpaddd          m9,                m0
    vpsrad          m9,                DCT_SHIFT2
    vpackssdw       m9,                m9
    movq            [r1 + %5],         xm9

%endmacro

%macro DCT32_avx512_PASS2_1_ROW 1

    mova            m1,                [r8 + %1 * 64]

    DCT32_avx512_PASS2_OPT  2,  3,  4, 14, %1 * 64 + 0 * 8
    DCT32_avx512_PASS2_OPT 15, 16, 17, 18, %1 * 64 + 1 * 8
    DCT32_avx512_PASS2_OPT 19, 20, 21, 22, %1 * 64 + 2 * 8
    DCT32_avx512_PASS2_OPT 23, 24, 25, 26, %1 * 64 + 3 * 8
    DCT32_avx512_PASS2_OPT  5,  6,  7,  8, %1 * 64 + 4 * 8

    DCT32_avx512_PASS2 20 * 64, 21 * 64, 22 * 64, 23 * 64, %1 * 64 + 5 * 8
    DCT32_avx512_PASS2 24 * 64, 25 * 64, 26 * 64, 27 * 64, %1 * 64 + 6 * 8
    DCT32_avx512_PASS2 28 * 64, 29 * 64, 30 * 64, 31 * 64, %1 * 64 + 7 * 8

%endmacro

INIT_ZMM avx512
cglobal dct32, 3, 10, 32, 0-(32*mmsize + 16*mmsize)

%if BIT_DEPTH == 12
    %define         DCT_SHIFT          8
    vpbroadcastq    m0,                [pd_128]
%elif BIT_DEPTH == 10
    %define         DCT_SHIFT          6
    vpbroadcastq    m0,                [pd_32]
%elif BIT_DEPTH == 8
    %define         DCT_SHIFT          4
    vpbroadcastq    m0,                [pd_8]
%else
    %error Unsupported BIT_DEPTH!
%endif
%define             DCT_SHIFT2         11

    add             r2d,               r2d
    lea             r7,                [tab_dct32_1]
    lea             r8,                [tab_dct32]
    lea             r3,                [r2 * 3]
    mov             r5,                rsp
    mov             r9,                2048    ; 32 * mmsize
    add             r9,                rsp

    mova            m31,               [dct32_shuf1_AVX512]

    ; PASS 1

    vbroadcasti32x8 m30,               [dct8_shuf9_AVX512]
    mova            m8,                [dct32_shuf_AVX512]

    DCT32_avx512_PASS_1_ROW_O
    DCT32_avx512_PASS_1_ROW_O_1_7  3
    DCT32_avx512_PASS_1_ROW_O_1_7  5
    DCT32_avx512_PASS_1_ROW_O_1_7  7
    DCT32_avx512_PASS_1_ROW_O_1_7  9
    DCT32_avx512_PASS_1_ROW_O_1_7 11
    DCT32_avx512_PASS_1_ROW_O_1_7 13
    DCT32_avx512_PASS_1_ROW_O_1_7 15
    DCT32_avx512_PASS_1_ROW_O_1_7 17
    DCT32_avx512_PASS_1_ROW_O_1_7 19
    DCT32_avx512_PASS_1_ROW_O_1_7 21
    DCT32_avx512_PASS_1_ROW_O_1_7 23
    DCT32_avx512_PASS_1_ROW_O_1_7 25
    DCT32_avx512_PASS_1_ROW_O_1_7 27
    DCT32_avx512_PASS_1_ROW_O_1_7 29
    DCT32_avx512_PASS_1_ROW_O_1_7 31

    vbroadcasti32x8  m11,               [dct8_shuf9_AVX512]

    DCT32_avx512_PASS_1_ROW_EO_0
    DCT32_avx512_PASS_1_ROW_EO_1_7 6
    DCT32_avx512_PASS_1_ROW_EO_1_7 10
    DCT32_avx512_PASS_1_ROW_EO_1_7 14
    DCT32_avx512_PASS_1_ROW_EO_1_7 18
    DCT32_avx512_PASS_1_ROW_EO_1_7 22
    DCT32_avx512_PASS_1_ROW_EO_1_7 26
    DCT32_avx512_PASS_1_ROW_EO_1_7 30

    vbroadcasti32x4  m31,               [dct8_shuf]
    vbroadcasti32x8  m27,               [dct32_shuf3_AVX512]

    DCT32_avx512_LOOP_EEO
    DCT32_avx512_PASS_1_ROW_EEO_1_4 4
    DCT32_avx512_PASS_1_ROW_EEO_1_4 12
    DCT32_avx512_PASS_1_ROW_EEO_1_4 20
    DCT32_avx512_PASS_1_ROW_EEO_1_4 28

    DCT32_avx512_PASS_1_ROW_EEEO_1_4 0
    DCT32_avx512_PASS_1_ROW_EEEO_1_4 16
    DCT32_avx512_PASS_1_ROW_EEEO_1_4 8
    DCT32_avx512_PASS_1_ROW_EEEO_1_4 24

    ; PASS 2

    vpbroadcastq    m0,               [pd_1024]
    vbroadcasti32x8 m31,              [dct32_shuf4_AVX512]
    movu            m30,              [dct32_shuf5_AVX512]
    movu            m29,              [dct32_shuf6_AVX512]
    movu            m28,              [dct32_shuf7_AVX512]
    movu            m27,              [dct32_shuf8_AVX512]

    ; Load the source coefficients into free registers and reuse them for all rows

    mova            m2,               [r5 +  0 * 64]
    mova            m3,               [r5 +  1 * 64]
    mova            m4,               [r5 +  2 * 64]
    mova            m14,              [r5 +  3 * 64]
    mova            m15,              [r5 +  4 * 64]
    mova            m16,              [r5 +  5 * 64]
    mova            m17,              [r5 +  6 * 64]
    mova            m18,              [r5 +  7 * 64]
    mova            m19,              [r5 +  8 * 64]
    mova            m20,              [r5 +  9 * 64]
    mova            m21,              [r5 + 10 * 64]
    mova            m22,              [r5 + 11 * 64]
    mova            m23,              [r5 + 12 * 64]
    mova            m24,              [r5 + 13 * 64]
    mova            m25,              [r5 + 14 * 64]
    mova            m26,              [r5 + 15 * 64]
    mova             m5,              [r5 + 16 * 64]
    mova             m6,              [r5 + 17 * 64]
    mova             m7,              [r5 + 18 * 64]
    mova             m8,              [r5 + 19 * 64]

    DCT32_avx512_PASS2_1_ROW 0
    DCT32_avx512_PASS2_1_ROW 1
    DCT32_avx512_PASS2_1_ROW 2
    DCT32_avx512_PASS2_1_ROW 3
    DCT32_avx512_PASS2_1_ROW 4
    DCT32_avx512_PASS2_1_ROW 5
    DCT32_avx512_PASS2_1_ROW 6
    DCT32_avx512_PASS2_1_ROW 7
    DCT32_avx512_PASS2_1_ROW 8
    DCT32_avx512_PASS2_1_ROW 9
    DCT32_avx512_PASS2_1_ROW 10
    DCT32_avx512_PASS2_1_ROW 11
    DCT32_avx512_PASS2_1_ROW 12
    DCT32_avx512_PASS2_1_ROW 13
    DCT32_avx512_PASS2_1_ROW 14
    DCT32_avx512_PASS2_1_ROW 15
    DCT32_avx512_PASS2_1_ROW 16
    DCT32_avx512_PASS2_1_ROW 17
    DCT32_avx512_PASS2_1_ROW 18
    DCT32_avx512_PASS2_1_ROW 19
    DCT32_avx512_PASS2_1_ROW 20
    DCT32_avx512_PASS2_1_ROW 21
    DCT32_avx512_PASS2_1_ROW 22
    DCT32_avx512_PASS2_1_ROW 23
    DCT32_avx512_PASS2_1_ROW 24
    DCT32_avx512_PASS2_1_ROW 25
    DCT32_avx512_PASS2_1_ROW 26
    DCT32_avx512_PASS2_1_ROW 27
    DCT32_avx512_PASS2_1_ROW 28
    DCT32_avx512_PASS2_1_ROW 29
    DCT32_avx512_PASS2_1_ROW 30
    DCT32_avx512_PASS2_1_ROW 31

    RET

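; IDCT8_PASS_1 %1: vertical pass of the AVX2 8x8 IDCT. Even contributions
; come from avx2_idct8_1 (r5) and odd ones from avx2_idct8_2 (r6) at offset
; %1; the sums and differences are rounded (m11), shifted by IDCT_SHIFT1 and
; returned packed in m3 and m6.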
%macro IDCT8_PASS_1 1
    vpbroadcastd    m7,                [r5 + %1]
    vpbroadcastd    m10,               [r5 + %1 + 4]
    pmaddwd         m5,                m4, m7
    pmaddwd         m6,                m0, m10
    paddd           m5,                m6

    vpbroadcastd    m7,                [r6 + %1]
    vpbroadcastd    m10,               [r6 + %1 + 4]
    pmaddwd         m6,                m1, m7
    pmaddwd         m3,                m2, m10
    paddd           m6,                m3

    paddd           m3,                m5, m6
    paddd           m3,                m11
    psrad           m3,                IDCT_SHIFT1

    psubd           m5,                m6
    paddd           m5,                m11
    psrad           m5,                IDCT_SHIFT1

    vpbroadcastd    m7,                [r5 + %1 + 32]
    vpbroadcastd    m10,               [r5 + %1 + 36]
    pmaddwd         m6,                m4, m7
    pmaddwd         m8,                m0, m10
    paddd           m6,                m8

    vpbroadcastd    m7,                [r6 + %1 + 32]
    vpbroadcastd    m10,               [r6 + %1 + 36]
    pmaddwd         m8,                m1, m7
    pmaddwd         m9,                m2, m10
    paddd           m8,                m9

    paddd           m9,                m6, m8
    paddd           m9,                m11
    psrad           m9,                IDCT_SHIFT1

    psubd           m6,                m8
    paddd           m6,                m11
    psrad           m6,                IDCT_SHIFT1

    packssdw        m3,                m9
    vpermq          m3,                m3, 0xD8

    packssdw        m6,                m5
    vpermq          m6,                m6, 0xD8
%endmacro

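; IDCT8_PASS_2: horizontal pass over the two interleaved lines in m0/m1,
; multiplying by the table rows at [r5]/[r6]; the packed outputs are
; returned in m8 and m9 after rounding (m12) and IDCT_SHIFT2.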
4183%macro IDCT8_PASS_2 0
4184    punpcklqdq      m2,                m0, m1
4185    punpckhqdq      m0,                m1
4186
4187    pmaddwd         m3,                m2, [r5]
4188    pmaddwd         m5,                m2, [r5 + 32]
4189    pmaddwd         m6,                m2, [r5 + 64]
4190    pmaddwd         m7,                m2, [r5 + 96]
4191    phaddd          m3,                m5
4192    phaddd          m6,                m7
4193    pshufb          m3,                [idct8_shuf2]
4194    pshufb          m6,                [idct8_shuf2]
4195    punpcklqdq      m7,                m3, m6
4196    punpckhqdq      m3,                m6
4197
4198    pmaddwd         m5,                m0, [r6]
4199    pmaddwd         m6,                m0, [r6 + 32]
4200    pmaddwd         m8,                m0, [r6 + 64]
4201    pmaddwd         m9,                m0, [r6 + 96]
4202    phaddd          m5,                m6
4203    phaddd          m8,                m9
4204    pshufb          m5,                [idct8_shuf2]
4205    pshufb          m8,                [idct8_shuf2]
4206    punpcklqdq      m6,                m5, m8
4207    punpckhqdq      m5,                m8
4208
4209    paddd           m8,                m7, m6
4210    paddd           m8,                m12
4211    psrad           m8,                IDCT_SHIFT2
4212
4213    psubd           m7,                m6
4214    paddd           m7,                m12
4215    psrad           m7,                IDCT_SHIFT2
4216
4217    pshufb          m7,                [idct8_shuf3]
4218    packssdw        m8,                 m7
4219
4220    paddd           m9,                m3, m5
4221    paddd           m9,                m12
4222    psrad           m9,                IDCT_SHIFT2
4223
4224    psubd           m3,                m5
4225    paddd           m3,                m12
4226    psrad           m3,                IDCT_SHIFT2
4227
4228    pshufb          m3,                [idct8_shuf3]
4229    packssdw        m9,                m3
4230%endmacro
4231
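;-------------------------------------------------------
; void idct8(const int16_t* src, int16_t* dst, intptr_t dstStride)
;
; Scalar sketch of the butterfly each pass implements (pass 1 shown for one
; column, using x265's g_t8 coefficient table; pass 2 is the same with
; IDCT_SHIFT2 and the per-depth rounding constant instead of 7 and 64):
;
;   for (int k = 0; k < 4; k++)
;   {
;       int E = 0, O = 0;                                    // even/odd parts
;       for (int j = 0; j < 4; j++)
;       {
;           E += g_t8[2 * j][k]     * src[(2 * j) * 8];      // rows 0,2,4,6
;           O += g_t8[2 * j + 1][k] * src[(2 * j + 1) * 8];  // rows 1,3,5,7
;       }
;       dst[k]     = Clip3(-32768, 32767, (E + O + 64) >> 7);
;       dst[7 - k] = Clip3(-32768, 32767, (E - O + 64) >> 7);
;   }
;-------------------------------------------------------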
4232INIT_YMM avx2
4233cglobal idct8, 3, 7, 13, 0-8*16
4234%if BIT_DEPTH == 12
4235    %define         IDCT_SHIFT2        8
4236    vpbroadcastd    m12,                [pd_128]
4237%elif BIT_DEPTH == 10
4238    %define         IDCT_SHIFT2        10
4239    vpbroadcastd    m12,                [pd_512]
4240%elif BIT_DEPTH == 8
4241    %define         IDCT_SHIFT2        12
4242    vpbroadcastd    m12,                [pd_2048]
4243%else
4244    %error Unsupported BIT_DEPTH!
4245%endif
4246%define             IDCT_SHIFT1         7
4247
4248    vbroadcasti128  m11,               [pd_64]
4249
4250    mov             r4,                rsp
4251    lea             r5,                [avx2_idct8_1]
4252    lea             r6,                [avx2_idct8_2]
4253
4254    ;pass1
4255    mova            m1,                [r0 + 0 * 32]     ; [0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1]
4256    mova            m0,                [r0 + 1 * 32]     ; [2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3]
4257    vpunpcklwd      m5,      m1,       m0                ; [0 2 0 2 0 2 0 2 1 3 1 3 1 3 1 3]
4258    vpunpckhwd      m1,      m0                          ; [0 2 0 2 0 2 0 2 1 3 1 3 1 3 1 3]
4259    vinserti128     m4,      m5,       xm1,       1      ; [0 2 0 2 0 2 0 2 0 2 0 2 0 2 0 2]
4260    vextracti128    xm2,     m5,       1                 ; [1 3 1 3 1 3 1 3]
4261    vinserti128     m1,      m1,       xm2,       0      ; [1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3]
4262
4263    mova            m2,                [r0 + 2 * 32]     ; [4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5]
4264    mova            m0,                [r0 + 3 * 32]     ; [6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7]
4265    vpunpcklwd      m5,      m2,       m0                ; [4 6 4 6 4 6 4 6 5 7 5 7 5 7 5 7]
4266    vpunpckhwd      m2,      m0                          ; [4 6 4 6 4 6 4 6 5 7 5 7 5 7 5 7]
4267    vinserti128     m0,      m5,       xm2,       1     ; [4 6 4 6 4 6 4 6 4 6 4 6 4 6 4 6]
4268    vextracti128    xm5,     m5,       1                ; [5 7 5 7 5 7 5 7]
4269    vinserti128     m2,      m2,       xm5,       0     ; [5 7 5 7 5 7 5 7 5 7 5 7 5 7 5 7]
4270
4271    mova            m5,                [idct8_shuf1]
4272    vpermd          m4,                m5, m4
4273    vpermd          m0,                m5, m0
4274    vpermd          m1,                m5, m1
4275    vpermd          m2,                m5, m2
4276
4277    IDCT8_PASS_1    0
4278    mova            [r4],              m3
4279    mova            [r4 + 96],         m6
4280
4281    IDCT8_PASS_1    64
4282    mova            [r4 + 32],         m3
4283    mova            [r4 + 64],         m6
4284
4285    ;pass2
4286    add             r2d,               r2d
4287    lea             r3,                [r2 * 3]
4288
4289    mova            m0,                [r4]
4290    mova            m1,                [r4 + 32]
4291    IDCT8_PASS_2
4292
4293    vextracti128    xm3,               m8, 1
4294    mova            [r1],              xm8
4295    mova            [r1 + r2],         xm3
4296    vextracti128    xm3,               m9, 1
4297    mova            [r1 + r2 * 2],     xm9
4298    mova            [r1 + r3],         xm3
4299
4300    lea             r1,                [r1 + r2 * 4]
4301    mova            m0,                [r4 + 64]
4302    mova            m1,                [r4 + 96]
4303    IDCT8_PASS_2
4304
4305    vextracti128    xm3,               m8, 1
4306    mova            [r1],              xm8
4307    mova            [r1 + r2],         xm3
4308    vextracti128    xm3,               m9, 1
4309    mova            [r1 + r2 * 2],     xm9
4310    mova            [r1 + r3],         xm3
4311    RET
4312
4313
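; IDCT8_AVX512_PASS_1: same even/odd butterfly as IDCT8_PASS_1 on zmm
; registers, with the coefficients preloaded into m17-m24 and the permuted
; input rows in m25/m26/m29/m30; m11 holds the pass-1 rounding constant.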
4314%macro IDCT8_AVX512_PASS_1 0
4315    pmaddwd         m5,                m29, m17
4316    pmaddwd         m6,                m25, m18
4317    paddd           m5,                m6
4318
4319    pmaddwd         m6,                m30, m21
4320    pmaddwd         m3,                m26, m22
4321    paddd           m6,                m3
4322
4323    paddd           m3,                m5, m6
4324    paddd           m3,                m11
4325    psrad           m3,                IDCT_SHIFT1
4326
4327    psubd           m5,                m6
4328    paddd           m5,                m11
4329    psrad           m5,                IDCT_SHIFT1
4330
4331    pmaddwd         m6,                m29, m19
4332    pmaddwd         m8,                m25, m20
4333    paddd           m6,                m8
4334
4335    pmaddwd         m8,                m30, m23
4336    pmaddwd         m9,                m26, m24
4337    paddd           m8,                m9
4338
4339    paddd           m9,                m6, m8
4340    paddd           m9,                m11
4341    psrad           m9,                IDCT_SHIFT1
4342
4343    psubd           m6,                m8
4344    paddd           m6,                m11
4345    psrad           m6,                IDCT_SHIFT1
4346
4347    packssdw        m3,                m9
4348    vpermq          m3,                m3, 0xD8
4349
4350    packssdw        m6,                m5
4351    vpermq          m6,                m6, 0xD8
4352%endmacro
4353
4354
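; IDCT8_AVX512_PASS_2: second pass of the AVX-512 8x8 IDCT. There is no
; 512-bit phaddd, so each horizontal pairwise add is emulated with a
; shift/add pair (vpsrldq/vpslldq + paddd) whose halves are merged under
; k1 (0xAAAA); the interleaved element order is absorbed by the later
; punpck/pshufb steps.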
4355%macro IDCT8_AVX512_PASS_2 0
4356    mov             r7d, 0xAAAA
4357    kmovd           k1, r7d
4358    punpcklqdq      m2,                m3, m13
4359    punpckhqdq      m0,                m3, m13
4360
4361    pmaddwd         m3,                m2, [r5]
4362    pmaddwd         m5,                m2, [r5 + 1 * mmsize]
4363    pmaddwd         m6,                m2, [r5 + 2 * mmsize]
4364    pmaddwd         m7,                m2, [r5 + 3 * mmsize]
4365
4366    vpsrldq         m14,   m3, 4
4367    paddd            m3,  m14
4368    vpslldq         m16,   m5, 4
4369    paddd            m5,  m16
4370    vmovdqu32        m3   {k1}, m5
4371
4372    vpsrldq         m14,   m6, 4
4373    paddd            m6,  m14
4374    vpslldq         m16,   m7, 4
4375    paddd            m7,  m16
4376    vmovdqu32        m6   {k1}, m7
4377
4378    punpcklqdq      m7,                m3, m6
4379    punpckhqdq      m3,                m6
4380
4381    pmaddwd         m5,                m0, [r6]
4382    pmaddwd         m6,                m0, [r6 + 1 * mmsize]
4383    pmaddwd         m8,                m0, [r6 + 2 * mmsize]
4384    pmaddwd         m9,                m0, [r6 + 3 * mmsize]
4385
4386    vpsrldq         m14,   m5, 4
4387    paddd            m5,  m14
4388    vpslldq         m16,   m6, 4
4389    paddd            m6,  m16
4390    vmovdqu32        m5   {k1}, m6
4391
4392    vpsrldq         m14,   m8, 4
4393    paddd            m8,  m14
4394    vpslldq         m16,   m9, 4
4395    paddd            m9,  m16
4396    vmovdqu32        m8   {k1}, m9
4397
4398    punpcklqdq      m6,                m5, m8
4399    punpckhqdq      m5,                m8
4400
4401    paddd           m8,                m7, m6
4402    paddd           m8,                m12
4403    psrad           m8,                IDCT_SHIFT2
4404
4405    psubd           m7,                m6
4406    paddd           m7,                m12
4407    psrad           m7,                IDCT_SHIFT2
4408
4409    pshufb          m7,                [idct8_avx512_shuf3]
4410    packssdw        m8,                 m7
4411
4412    paddd           m9,                m3, m5
4413    paddd           m9,                m12
4414    psrad           m9,                IDCT_SHIFT2
4415
4416    psubd           m3,                m5
4417    paddd           m3,                m12
4418    psrad           m3,                IDCT_SHIFT2
4419
4420    pshufb          m3,                [idct8_avx512_shuf3]
4421    packssdw        m9,                m3
4422%endmacro
4423
4424
4425%if ARCH_X86_64
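;-------------------------------------------------------
; void idct8(const int16_t* src, int16_t* dst, intptr_t dstStride)
;-------------------------------------------------------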
4426INIT_ZMM avx512
4427cglobal idct8, 3, 8, 31
4428%if BIT_DEPTH == 12
4429    %define         IDCT_SHIFT2        8
4430    vpbroadcastd    m12,                [pd_128]
4431%elif BIT_DEPTH == 10
4432    %define         IDCT_SHIFT2        10
4433    vpbroadcastd    m12,                [pd_512]
4434%elif BIT_DEPTH == 8
4435    %define         IDCT_SHIFT2        12
4436    vpbroadcastd    m12,                [pd_2048]
4437%else
4438    %error Unsupported BIT_DEPTH!
4439%endif
4440%define             IDCT_SHIFT1         7
4441
4442    vpbroadcastd     m11,               [pd_64]
4443
4444    lea             r4,                [avx512_idct8_3]
4445    lea             r5,                [avx2_idct8_1]
4446    lea             r6,                [avx2_idct8_2]
4447    movu           m16,                [idct16_shuff2]
4448    movu           m17,                [idct16_shuff3]
4449
4450    ;pass1
4451    mova            ym1, [r0 + 0 * 32]
4452    mova            ym0, [r0 + 1 * 32]
4453    mova            ym25, ym16
4454    mova            ym26, ym17
4455    vpermi2w        ym25,  ym1, ym0
4456    vpermi2w        ym26,  ym1, ym0
4457
4458    mova            ym1, [r0 + 2 * 32]
4459    mova            ym0, [r0 + 3 * 32]
4460    mova            ym27, ym16
4461    mova            ym28, ym17
4462    vpermi2w        ym27,  ym1, ym0
4463    vpermi2w        ym28,  ym1, ym0
4464
4465    vperm2i128      ym29, ym25, ym26, 0x20
4466    vperm2i128      ym30, ym25, ym26, 0x31
4467    vperm2i128      ym25, ym27, ym28, 0x20
4468    vperm2i128      ym26, ym27, ym28, 0x31
4469
4470    vinserti64x4    m29,        m29,      ym29, 1
4471    vinserti64x4    m25,        m25,      ym25, 1
4472    vinserti64x4    m30,        m30,      ym30, 1
4473    vinserti64x4    m26,        m26,      ym26, 1
4474
4475    movu            m17,                [r4]
4476    movu            m18,                [r4 + 1 * mmsize]
4477    movu            m19,                [r4 + 2 * mmsize]
4478    movu            m20,                [r4 + 3 * mmsize]
4479    movu            m21,                [r4 + 4 * mmsize]
4480    movu            m22,                [r4 + 5 * mmsize]
4481    movu            m23,                [r4 + 6 * mmsize]
4482    movu            m24,                [r4 + 7 * mmsize]
4483
4484    IDCT8_AVX512_PASS_1
4485
4486    vextracti64x4   ym13,       m3,      1
4487    vextracti64x4   ym14,       m6,      1
4488    vinserti64x4      m3,       m3,      ym14, 1
4489    vinserti64x4     m13,      m13,       ym6, 1
4490
4491    ;pass2
4492    add             r2d,               r2d
4493    lea             r3,                [r2 * 3]
4494    lea             r5,                [avx512_idct8_1]
4495    lea             r6,                [avx512_idct8_2]
4496
4497    IDCT8_AVX512_PASS_2
4498
4499    vextracti128    xm3,               ym8, 1
4500    mova            [r1],              xm8
4501    mova            [r1 + r2],         xm3
4502    vextracti128    xm3,               ym9, 1
4503    mova            [r1 + r2 * 2],     xm9
4504    mova            [r1 + r3],         xm3
4505
4506    lea             r1,                [r1 + r2 * 4]
4507
4508    vextracti64x4   ym10,   m8, 1
4509    vextracti64x4   ym11,   m9, 1
4510
4511    vextracti128    xm3,               ym10, 1
4512    mova            [r1],              xm10
4513    mova            [r1 + r2],         xm3
4514    vextracti128    xm3,               ym11, 1
4515    mova            [r1 + r2 * 2],     xm11
4516    mova            [r1 + r3],         xm3
4517    RET
4518%endif
4519
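; IDCT_PASS1: one column pass of the 16x16 IDCT. Produces coefficient rows %1
; and %1+1 from (even + odd) and the mirrored rows %2 and %2+1 from
; (even - odd), storing all four to the stack transpose buffer at r3.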
4520%macro IDCT_PASS1 2
4521    vbroadcasti128  m5, [tab_idct16_2 + %1 * 16]
4522
4523    pmaddwd         m9, m0, m5
4524    pmaddwd         m10, m7, m5
4525    phaddd          m9, m10
4526
4527    pmaddwd         m10, m6, m5
4528    pmaddwd         m11, m8, m5
4529    phaddd          m10, m11
4530
4531    phaddd          m9, m10
4532    vbroadcasti128  m5, [tab_idct16_1 + %1 * 16]
4533
4534    pmaddwd         m10, m1, m5
4535    pmaddwd         m11, m3, m5
4536    phaddd          m10, m11
4537
4538    pmaddwd         m11, m4, m5
4539    pmaddwd         m12, m2, m5
4540    phaddd          m11, m12
4541
4542    phaddd          m10, m11
4543
4544    paddd           m11, m9, m10
4545    paddd           m11, m14
4546    psrad           m11, IDCT_SHIFT1
4547
4548    psubd           m9, m10
4549    paddd           m9, m14
4550    psrad           m9, IDCT_SHIFT1
4551
4552    vbroadcasti128  m5, [tab_idct16_2 + %1 * 16 + 16]
4553
4554    pmaddwd         m10, m0, m5
4555    pmaddwd         m12, m7, m5
4556    phaddd          m10, m12
4557
4558    pmaddwd         m12, m6, m5
4559    pmaddwd         m13, m8, m5
4560    phaddd          m12, m13
4561
4562    phaddd          m10, m12
    vbroadcasti128  m5, [tab_idct16_1 + %1 * 16 + 16]
4564
4565    pmaddwd         m12, m1, m5
4566    pmaddwd         m13, m3, m5
4567    phaddd          m12, m13
4568
4569    pmaddwd         m13, m4, m5
4570    pmaddwd         m5, m2
4571    phaddd          m13, m5
4572
4573    phaddd          m12, m13
4574
4575    paddd           m5, m10, m12
4576    paddd           m5, m14
4577    psrad           m5, IDCT_SHIFT1
4578
4579    psubd           m10, m12
4580    paddd           m10, m14
4581    psrad           m10, IDCT_SHIFT1
4582
4583    packssdw        m11, m5
4584    packssdw        m9, m10
4585
4586    mova            m10, [idct16_shuff]
4587    mova            m5,  [idct16_shuff1]
4588
4589    vpermd          m12, m10, m11
4590    vpermd          m13, m5, m9
4591    mova            [r3 + %1 * 16 * 2], xm12
4592    mova            [r3 + %2 * 16 * 2], xm13
4593    vextracti128    [r3 + %2 * 16 * 2 + 32], m13, 1
4594    vextracti128    [r3 + %1 * 16 * 2 + 32], m12, 1
4595%endmacro
4596
4597;-------------------------------------------------------
4598; void idct16(const int16_t* src, int16_t* dst, intptr_t dstStride)
4599;-------------------------------------------------------
4600INIT_YMM avx2
4601cglobal idct16, 3, 7, 16, 0-16*mmsize
4602%if BIT_DEPTH == 12
4603    %define         IDCT_SHIFT2        8
4604    vpbroadcastd    m15,                [pd_128]
4605%elif BIT_DEPTH == 10
4606    %define         IDCT_SHIFT2        10
4607    vpbroadcastd    m15,                [pd_512]
4608%elif BIT_DEPTH == 8
4609    %define         IDCT_SHIFT2        12
4610    vpbroadcastd    m15,                [pd_2048]
4611%else
4612    %error Unsupported BIT_DEPTH!
4613%endif
4614%define             IDCT_SHIFT1         7
4615
4616    vbroadcasti128  m14,               [pd_64]
4617
4618    add             r2d,               r2d
4619    mov             r3, rsp
4620    mov             r4d, 2
4621
4622.pass1:
4623     movu            xm0, [r0 +  0 * 32]
4624     movu            xm1, [r0 +  8 * 32]
4625     punpckhqdq      xm2, xm0, xm1
4626     punpcklqdq      xm0, xm1
4627     vinserti128     m0, m0, xm2, 1
4628
4629     movu            xm1, [r0 +  1 * 32]
4630     movu            xm2, [r0 +  9 * 32]
4631     punpckhqdq      xm3, xm1, xm2
4632     punpcklqdq      xm1, xm2
4633     vinserti128     m1, m1, xm3, 1
4634
4635     movu            xm2, [r0 + 2  * 32]
4636     movu            xm3, [r0 + 10 * 32]
4637     punpckhqdq      xm4, xm2, xm3
4638     punpcklqdq      xm2, xm3
4639     vinserti128     m2, m2, xm4, 1
4640
4641     movu            xm3, [r0 + 3  * 32]
4642     movu            xm4, [r0 + 11 * 32]
4643     punpckhqdq      xm5, xm3, xm4
4644     punpcklqdq      xm3, xm4
4645     vinserti128     m3, m3, xm5, 1
4646
4647     movu            xm4, [r0 + 4  * 32]
4648     movu            xm5, [r0 + 12 * 32]
4649     punpckhqdq      xm6, xm4, xm5
4650     punpcklqdq      xm4, xm5
4651     vinserti128     m4, m4, xm6, 1
4652
4653     movu            xm5, [r0 + 5  * 32]
4654     movu            xm6, [r0 + 13 * 32]
4655     punpckhqdq      xm7, xm5, xm6
4656     punpcklqdq      xm5, xm6
4657     vinserti128     m5, m5, xm7, 1
4658
4659     movu            xm6, [r0 + 6  * 32]
4660     movu            xm7, [r0 + 14 * 32]
4661     punpckhqdq      xm8, xm6, xm7
4662     punpcklqdq      xm6, xm7
4663     vinserti128     m6, m6, xm8, 1
4664
4665     movu            xm7, [r0 + 7  * 32]
4666     movu            xm8, [r0 + 15 * 32]
4667     punpckhqdq      xm9, xm7, xm8
4668     punpcklqdq      xm7, xm8
4669     vinserti128     m7, m7, xm9, 1
4670
4671    punpckhwd       m8, m0, m2                ;[8 10]
4672    punpcklwd       m0, m2                    ;[0 2]
4673
4674    punpckhwd       m2, m1, m3                ;[9 11]
4675    punpcklwd       m1, m3                    ;[1 3]
4676
4677    punpckhwd       m3, m4, m6                ;[12 14]
4678    punpcklwd       m4, m6                    ;[4 6]
4679
4680    punpckhwd       m6, m5, m7                ;[13 15]
4681    punpcklwd       m5, m7                    ;[5 7]
4682
4683    punpckhdq       m7, m0, m4                ;[02 22 42 62 03 23 43 63 06 26 46 66 07 27 47 67]
4684    punpckldq       m0, m4                    ;[00 20 40 60 01 21 41 61 04 24 44 64 05 25 45 65]
4685
4686    punpckhdq       m4, m8, m3                ;[82 102 122 142 83 103 123 143 86 106 126 146 87 107 127 147]
4687    punpckldq       m8, m3                    ;[80 100 120 140 81 101 121 141 84 104 124 144 85 105 125 145]
4688
4689    punpckhdq       m3, m1, m5                ;[12 32 52 72 13 33 53 73 16 36 56 76 17 37 57 77]
4690    punpckldq       m1, m5                    ;[10 30 50 70 11 31 51 71 14 34 54 74 15 35 55 75]
4691
4692    punpckhdq       m5, m2, m6                ;[92 112 132 152 93 113 133 153 96 116 136 156 97 117 137 157]
4693    punpckldq       m2, m6                    ;[90 110 130 150 91 111 131 151 94 114 134 154 95 115 135 155]
4694
4695    punpckhqdq      m6, m0, m8                ;[01 21 41 61 81 101 121 141 05 25 45 65 85 105 125 145]
4696    punpcklqdq      m0, m8                    ;[00 20 40 60 80 100 120 140 04 24 44 64 84 104 124 144]
4697
    punpckhqdq      m8, m7, m4                ;[03 23 43 63 83 103 123 143 07 27 47 67 87 107 127 147]
4699    punpcklqdq      m7, m4                    ;[02 22 42 62 82 102 122 142 06 26 46 66 86 106 126 146]
4700
4701    punpckhqdq      m4, m1, m2                ;[11 31 51 71 91 111 131 151 15 35 55 75 95 115 135 155]
4702    punpcklqdq      m1, m2                    ;[10 30 50 70 90 110 130 150 14 34 54 74 94 114 134 154]
4703
4704    punpckhqdq      m2, m3, m5                ;[13 33 53 73 93 113 133 153 17 37 57 77 97 117 137 157]
4705    punpcklqdq      m3, m5                    ;[12 32 52 72 92 112 132 152 16 36 56 76 96 116 136 156]
4706
4707    IDCT_PASS1      0, 14
4708    IDCT_PASS1      2, 12
4709    IDCT_PASS1      4, 10
4710    IDCT_PASS1      6, 8
4711
4712    add             r0, 16
4713    add             r3, 16
4714    dec             r4d
4715    jnz             .pass1
4716
4717    mov             r3, rsp
4718    mov             r4d, 8
4719    lea             r5, [tab_idct16_2]
4720    lea             r6, [tab_idct16_1]
4721
4722    vbroadcasti128  m7,  [r5]
4723    vbroadcasti128  m8,  [r5 + 16]
4724    vbroadcasti128  m9,  [r5 + 32]
4725    vbroadcasti128  m10, [r5 + 48]
4726    vbroadcasti128  m11, [r5 + 64]
4727    vbroadcasti128  m12, [r5 + 80]
4728    vbroadcasti128  m13, [r5 + 96]
4729
4730.pass2:
4731    movu            m1, [r3]
4732    vpermq          m0, m1, 0xD8
4733
4734    pmaddwd         m1, m0, m7
4735    pmaddwd         m2, m0, m8
4736    phaddd          m1, m2
4737
4738    pmaddwd         m2, m0, m9
4739    pmaddwd         m3, m0, m10
4740    phaddd          m2, m3
4741
4742    phaddd          m1, m2
4743
4744    pmaddwd         m2, m0, m11
4745    pmaddwd         m3, m0, m12
4746    phaddd          m2, m3
4747
4748    vbroadcasti128  m14, [r5 + 112]
4749    pmaddwd         m3, m0, m13
4750    pmaddwd         m4, m0, m14
4751    phaddd          m3, m4
4752
4753    phaddd          m2, m3
4754
4755    movu            m3, [r3 + 32]
4756    vpermq          m0, m3, 0xD8
4757
4758    vbroadcasti128  m14, [r6]
4759    pmaddwd         m3, m0, m14
4760    vbroadcasti128  m14, [r6 + 16]
4761    pmaddwd         m4, m0, m14
4762    phaddd          m3, m4
4763
4764    vbroadcasti128  m14, [r6 + 32]
4765    pmaddwd         m4, m0, m14
4766    vbroadcasti128  m14, [r6 + 48]
4767    pmaddwd         m5, m0, m14
4768    phaddd          m4, m5
4769
4770    phaddd          m3, m4
4771
4772    vbroadcasti128  m14, [r6 + 64]
4773    pmaddwd         m4, m0, m14
4774    vbroadcasti128  m14, [r6 + 80]
4775    pmaddwd         m5, m0, m14
4776    phaddd          m4, m5
4777
4778    vbroadcasti128  m14, [r6 + 96]
4779    pmaddwd         m6, m0, m14
4780    vbroadcasti128  m14, [r6 + 112]
4781    pmaddwd         m0, m14
4782    phaddd          m6, m0
4783
4784    phaddd          m4, m6
4785
4786    paddd           m5, m1, m3
4787    paddd           m5, m15
4788    psrad           m5, IDCT_SHIFT2
4789
4790    psubd           m1, m3
4791    paddd           m1, m15
4792    psrad           m1, IDCT_SHIFT2
4793
4794    paddd           m6, m2, m4
4795    paddd           m6, m15
4796    psrad           m6, IDCT_SHIFT2
4797
4798    psubd           m2, m4
4799    paddd           m2, m15
4800    psrad           m2, IDCT_SHIFT2
4801
4802    packssdw        m5, m6
4803    packssdw        m1, m2
4804    pshufb          m2, m1, [dct16_shuf1]
4805
4806    mova            [r1], xm5
4807    mova            [r1 + 16], xm2
4808    vextracti128    [r1 + r2], m5, 1
4809    vextracti128    [r1 + r2 + 16], m2, 1
4810
4811    lea             r1, [r1 + 2 * r2]
4812    add             r3, 64
4813    dec             r4d
4814    jnz             .pass2
4815    RET
4816
4817
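; IDCT16_AVX512_PASS1: AVX-512 column pass of the 16x16 IDCT. The two phaddd
; levels of the AVX2 version are emulated with shift/add pairs merged under
; k1 (0xAAAA, dword level) and k2 (0xCCCC, qword level). %1 selects the
; coefficient rows; the permuted row pairs are returned in m%2/m%3.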
4818%macro IDCT16_AVX512_PASS1 3
4819    movu            m5,  [tab_AVX512_idct16_2 + %1 * 64]
4820    pmaddwd         m9, m4, m5
4821    pmaddwd         m10, m6, m5
4822
4823    vpsrldq         m16,   m9, 4
4824    paddd            m9,  m16
4825    vpslldq         m17,   m10, 4
4826    paddd            m10,  m17
4827    vmovdqu32        m9   {k1}, m10
4828
4829    pmaddwd         m10, m7, m5
4830    pmaddwd         m11, m8, m5
4831
4832    vpsrldq         m16,   m10, 4
4833    paddd            m10,  m16
4834    vpslldq         m17,   m11, 4
4835    paddd            m11,  m17
4836    vmovdqu32        m10   {k1}, m11
4837
4838    vpsrldq         m16,   m9, 8
4839    paddd            m9,  m16
4840    vpslldq         m17,   m10, 8
4841    paddd            m10,  m17
4842    vmovdqu32        m9   {k2}, m10
4843
4844    mova            m5,  [tab_AVX512_idct16_1 + %1 * 64]
4845    pmaddwd         m10, m28, m5
4846    pmaddwd         m11, m29, m5
4847
4848    vpsrldq         m16,   m10, 4
4849    paddd            m10,  m16
4850    vpslldq         m17,   m11, 4
4851    paddd            m11,  m17
4852    vmovdqu32        m10   {k1}, m11
4853
4854    pmaddwd         m11, m30, m5
4855    pmaddwd         m12, m31, m5
4856
4857    vpsrldq         m16,   m11, 4
4858    paddd            m11,  m16
4859    vpslldq         m17,   m12, 4
4860    paddd            m12,  m17
4861    vmovdqu32        m11   {k1}, m12
4862
4863    vpsrldq         m16,   m10, 8
4864    paddd            m10,  m16
4865    vpslldq         m17,   m11, 8
4866    paddd            m11,  m17
4867    vmovdqu32        m10   {k2}, m11
4868
4869    paddd           m11, m9, m10
4870    paddd           m11, m14
4871    psrad           m11, IDCT_SHIFT1
4872
4873    psubd           m9, m10
4874    paddd           m9, m14
4875    psrad           m9, IDCT_SHIFT1
4876
4877    mova            m5,  [tab_AVX512_idct16_2 + %1 * 64 + 64]
4878    pmaddwd         m10, m4, m5
    pmaddwd         m12, m6, m5

4882    vpsrldq         m16,   m10, 4
4883    paddd            m10,  m16
4884    vpslldq         m17,   m12, 4
4885    paddd            m12,  m17
4886    vmovdqu32        m10   {k1}, m12
4887
4888    pmaddwd         m12, m7, m5
    pmaddwd         m13, m8, m5

4892    vpsrldq         m16,   m12, 4
4893    paddd            m12,  m16
4894    vpslldq         m17,   m13, 4
4895    paddd            m13,  m17
    vmovdqu32        m12   {k1}, m13

4899    vpsrldq         m16,   m10, 8
4900    paddd            m10,  m16
4901    vpslldq         m17,   m12, 8
4902    paddd            m12,  m17
    vmovdqu32        m10   {k2}, m12

4907    mova            m5,  [tab_AVX512_idct16_1 + %1 * 64 + 64]
4908    pmaddwd         m12, m28, m5
    pmaddwd         m13, m29, m5

4912    vpsrldq         m16,   m12, 4
4913    paddd            m12,  m16
4914    vpslldq         m17,   m13, 4
4915    paddd            m13,  m17
4916    vmovdqu32        m12   {k1}, m13
4917
4918    pmaddwd         m13, m30, m5
    pmaddwd         m5, m31

4922    vpsrldq         m16,   m13, 4
4923    paddd            m13,  m16
4924    vpslldq         m17,   m5, 4
4925    paddd            m5,  m17
    vmovdqu32        m13   {k1}, m5

4929    vpsrldq         m16,   m12, 8
4930    paddd            m12,  m16
4931    vpslldq         m17,   m13, 8
4932    paddd            m13,  m17
    vmovdqu32        m12   {k2}, m13

4936    paddd           m5, m10, m12
4937    paddd           m5, m14
4938    psrad           m5, IDCT_SHIFT1
4939
4940    psubd           m10, m12
4941    paddd           m10, m14
4942    psrad           m10, IDCT_SHIFT1
4943
4944    packssdw        m11, m5
4945    packssdw        m9, m10
4946
4947    mova            m10, [idct16_AVX512_shuff]
4948    mova            m5,  [idct16_AVX512_shuff1]
4949
4950    vpermd          m%2, m10, m11
4951    vpermd          m%3, m5, m9
4952%endmacro
4953
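; IDCT16_AVX512_PASS2: row pass of the AVX-512 16x16 IDCT. m%1/m%2 hold the
; transposed pass-1 rows; the packed results are left in m5/m2 for the
; caller to store.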
4954%macro IDCT16_AVX512_PASS2 2
4955    vpermq          m0, m%1, 0xD8
4956
4957    pmaddwd         m1, m0, m7
    pmaddwd         m2, m0, m8

4961    vpsrldq         m14,   m1, 4
4962    paddd            m1,  m14
4963    vpslldq         m31,   m2, 4
4964    paddd            m2,  m31
4965    vmovdqu32        m1   {k1}, m2
4966
4967    pmaddwd         m2, m0, m9
    pmaddwd         m3, m0, m10

4971    vpsrldq         m14,   m2, 4
4972    paddd            m2,  m14
4973    vpslldq         m31,   m3, 4
4974    paddd            m3,  m31
    vmovdqu32        m2   {k1}, m3

4978    vpsrldq         m14,   m1, 8
4979    paddd            m1,  m14
4980    vpslldq         m31,   m2, 8
4981    paddd            m2,  m31
4982    vmovdqu32        m1   {k2}, m2
4983
4984    pmaddwd         m2, m0, m11
    pmaddwd         m3, m0, m12

4988    vpsrldq         m14,   m2, 4
4989    paddd            m2,  m14
4990    vpslldq         m31,   m3, 4
4991    paddd            m3,  m31
4992    vmovdqu32        m2   {k1}, m3
4993
4994    vbroadcasti64x2  m14, [r5 + 112]
4995    pmaddwd         m3, m0, m13
    pmaddwd         m4, m0, m14

4999    vpsrldq         m14,   m3, 4
5000    paddd            m3,  m14
5001    vpslldq         m31,   m4, 4
5002    paddd            m4,  m31
    vmovdqu32        m3   {k1}, m4

5006    vpsrldq         m14,   m2, 8
5007    paddd            m2,  m14
5008    vpslldq         m31,   m3, 8
5009    paddd            m3,  m31
5010    vmovdqu32        m2   {k2}, m3
5011
5012    vpermq          m0, m%2, 0xD8
5013    pmaddwd         m3, m0, m16
    pmaddwd         m4, m0, m17

5017    vpsrldq         m14,   m3, 4
5018    paddd            m3,  m14
5019    vpslldq         m31,   m4, 4
5020    paddd            m4,  m31
5021    vmovdqu32        m3   {k1}, m4
5022
5023    pmaddwd         m4, m0, m19
    pmaddwd         m5, m0, m23

5027    vpsrldq         m14,   m4, 4
5028    paddd            m4,  m14
5029    vpslldq         m31,   m5, 4
5030    paddd            m5,  m31
    vmovdqu32        m4   {k1}, m5

5034    vpsrldq         m14,   m3, 8
5035    paddd            m3,  m14
5036    vpslldq         m31,   m4, 8
5037    paddd            m4,  m31
    vmovdqu32        m3   {k2}, m4

5041    pmaddwd         m4, m0, m28
5042    pmaddwd         m5, m0, m29
5043
5044    vpsrldq         m14,   m4, 4
5045    paddd            m4,  m14
5046    vpslldq         m31,   m5, 4
5047    paddd            m5,  m31
5048    vmovdqu32        m4   {k1}, m5
5049
5050    pmaddwd         m6, m0, m30
5051    vbroadcasti64x2  m31, [r6 + 112]
    pmaddwd         m0, m31

5055    vpsrldq         m14,   m6, 4
5056    paddd            m6,  m14
5057    vpslldq         m31,   m0, 4
5058    paddd            m0,  m31
    vmovdqu32        m6   {k1}, m0

5062    vpsrldq         m14,   m4, 8
5063    paddd            m4,  m14
5064    vpslldq         m31,   m6, 8
5065    paddd            m6,  m31
5066    vmovdqu32        m4   {k2}, m6
5067
5068    paddd           m5, m1, m3
5069    paddd           m5, m15
5070    psrad           m5, IDCT_SHIFT2
5071
5072    psubd           m1, m3
5073    paddd           m1, m15
5074    psrad           m1, IDCT_SHIFT2
5075
5076    paddd           m6, m2, m4
5077    paddd           m6, m15
5078    psrad           m6, IDCT_SHIFT2
5079
5080    psubd           m2, m4
5081    paddd           m2, m15
5082    psrad           m2, IDCT_SHIFT2
5083
5084    packssdw        m5, m6
5085    packssdw        m1, m2
5086    pshufb          m2, m1, [idct16_AVX512_shuff6]
5087%endmacro
5088
5089
5090;-------------------------------------------------------
5091; void idct16(const int16_t* src, int16_t* dst, intptr_t dstStride)
5092;-------------------------------------------------------
5093INIT_ZMM avx512
5094cglobal idct16, 3, 8, 32
5095%if BIT_DEPTH == 12
5096    %define         IDCT_SHIFT2        8
5097    vpbroadcastd    m15,                [pd_128]
5098%elif BIT_DEPTH == 10
5099    %define         IDCT_SHIFT2        10
5100    vpbroadcastd    m15,                [pd_512]
5101%elif BIT_DEPTH == 8
5102    %define         IDCT_SHIFT2        12
5103    vpbroadcastd    m15,                [pd_2048]
5104%else
5105    %error Unsupported BIT_DEPTH!
5106%endif
5107%define             IDCT_SHIFT1         7
5108
5109    vpbroadcastd    m14,               [pd_64]
5110
5111    add             r2d,               r2d
5112
5113    mov             r7d,    0xAAAA
5114    kmovd            k1,    r7d
5115    mov             r7d,    0xCCCC
5116    kmovd            k2,    r7d
5117    mova          ym2, [idct16_shuff2]
5118    mova          ym3, [idct16_shuff3]
5119    mova         ym26, [idct16_shuff4]
5120    mova         ym27, [idct16_shuff5]
5121
5122.pass1:
5123    movu          xm0, [r0 + 0 * 32]
5124    vinserti128   ym0, ym0, [r0 + 8 * 32], 1
5125    movu          xm1, [r0 + 2 * 32]
5126    vinserti128   ym1, ym1, [r0 + 10 * 32], 1
5127
5128    mova          ym9, ym2
5129    mova         ym10, ym3
5130    vpermi2w      ym9, ym0, ym1
5131    vpermi2w     ym10, ym0, ym1
5132
5133    movu          xm0, [r0 + 4 * 32]
5134    vinserti128   ym0, ym0, [r0 + 12 * 32], 1
5135    movu          xm1, [r0 + 6 * 32]
5136    vinserti128   ym1, ym1, [r0 + 14 * 32], 1
5137
5138    mova         ym11, ym2
5139    mova         ym12, ym3
5140    vpermi2w     ym11, ym0,  ym1
5141    vpermi2w     ym12, ym0,  ym1
5142
5143    mova         ym4,  ym26
5144    mova         ym6,  ym27
5145    vpermi2d     ym4,   ym9, ym11
5146    vpermi2d     ym6,   ym9, ym11
5147
5148    mova         ym7, ym26
5149    mova         ym8, ym27
5150    vpermi2d     ym7, ym10, ym12
5151    vpermi2d     ym8, ym10, ym12
5152
5153    vpermq       ym4, ym4,  q3120
5154    vpermq       ym6, ym6,  q3120
5155    vpermq       ym7, ym7,  q3120
5156    vpermq       ym8, ym8,  q3120
5157
5158    movu          xm0, [r0 + 1 * 32]
5159    vinserti128   ym0, ym0, [r0 + 9 * 32], 1
5160    movu          xm1, [r0 + 3 * 32]
5161    vinserti128   ym1, ym1, [r0 + 11 * 32], 1
5162
5163    mova          ym9, ym2
5164    mova         ym10, ym3
5165    vpermi2w      ym9,  ym0, ym1
5166    vpermi2w     ym10,  ym0, ym1
5167
5168    movu          xm0, [r0 + 5 * 32]
5169    vinserti128   ym0, ym0, [r0 + 13 * 32], 1
5170    movu          xm1, [r0 + 7 * 32]
5171    vinserti128   ym1, ym1, [r0 + 15 * 32], 1
5172
5173    mova         ym11,  ym2
5174    mova         ym12,  ym3
5175    vpermi2w     ym11,  ym0,  ym1
5176    vpermi2w     ym12,  ym0,  ym1
5177
5178    mova         ym28,  ym26
5179    mova         ym29,  ym27
5180    vpermi2d     ym28,  ym9, ym11
5181    vpermi2d     ym29,  ym9, ym11
5182
5183    mova         ym30, ym26
5184    mova         ym31, ym27
5185    vpermi2d     ym30, ym10, ym12
5186    vpermi2d     ym31, ym10, ym12
5187
5188    vpermq       ym28, ym28,  q3120
5189    vpermq       ym29, ym29,  q3120
5190    vpermq       ym30, ym30,  q3120
5191    vpermq       ym31, ym31,  q3120
5192
5193    vinserti64x4    m4,          m4,      ym4, 1
5194    vinserti64x4    m6,          m6,      ym6, 1
5195    vinserti64x4    m7,          m7,      ym7, 1
5196    vinserti64x4    m8,          m8,      ym8, 1
5197    vinserti64x4    m28,        m28,      ym28, 1
5198    vinserti64x4    m29,        m29,      ym29, 1
5199    vinserti64x4    m30,        m30,      ym30, 1
5200    vinserti64x4    m31,        m31,      ym31, 1
5201
5202    IDCT16_AVX512_PASS1      0, 18, 19
5203    IDCT16_AVX512_PASS1      2, 20, 21
5204
5205    add             r0, 16
5206
5207    movu          xm0, [r0 + 0 * 32]
5208    vinserti128   ym0, ym0, [r0 + 8 * 32], 1
5209    movu          xm1, [r0 + 2 * 32]
5210    vinserti128   ym1, ym1, [r0 + 10 * 32], 1
5211
5212    mova          ym9, ym2
5213    mova         ym10, ym3
5214    vpermi2w      ym9, ym0, ym1
5215    vpermi2w     ym10, ym0, ym1
5216
5217    movu          xm0, [r0 + 4 * 32]
5218    vinserti128   ym0, ym0, [r0 + 12 * 32], 1
5219    movu          xm1, [r0 + 6 * 32]
5220    vinserti128   ym1, ym1, [r0 + 14 * 32], 1
5221
5222    mova         ym11, ym2
5223    mova         ym12, ym3
5224    vpermi2w     ym11, ym0,  ym1
5225    vpermi2w     ym12, ym0,  ym1
5226
5227    mova         ym4,  ym26
5228    mova         ym6,  ym27
5229    vpermi2d     ym4,   ym9, ym11
5230    vpermi2d     ym6,   ym9, ym11
5231
5232    mova         ym7, ym26
5233    mova         ym8, ym27
5234    vpermi2d     ym7, ym10, ym12
5235    vpermi2d     ym8, ym10, ym12
5236
5237    vpermq       ym4, ym4,  q3120
5238    vpermq       ym6, ym6,  q3120
5239    vpermq       ym7, ym7,  q3120
5240    vpermq       ym8, ym8,  q3120
5241
5242    movu          xm0, [r0 + 1 * 32]
5243    vinserti128   ym0, ym0, [r0 + 9 * 32], 1
5244    movu          xm1, [r0 + 3 * 32]
5245    vinserti128   ym1, ym1, [r0 + 11 * 32], 1
5246
5247    mova          ym9, ym2
5248    mova         ym10, ym3
5249    vpermi2w      ym9,  ym0, ym1
5250    vpermi2w     ym10,  ym0, ym1
5251
5252    movu          xm0, [r0 + 5 * 32]
5253    vinserti128   ym0, ym0, [r0 + 13 * 32], 1
5254    movu          xm1, [r0 + 7 * 32]
5255    vinserti128   ym1, ym1, [r0 + 15 * 32], 1
5256
5257    mova         ym11,  ym2
5258    mova         ym12,  ym3
5259    vpermi2w     ym11,  ym0,  ym1
5260    vpermi2w     ym12,  ym0,  ym1
5261
5262    mova         ym28,  ym26
5263    mova         ym29,  ym27
5264    vpermi2d     ym28,  ym9, ym11
5265    vpermi2d     ym29,  ym9, ym11
5266
5267    mova         ym30, ym26
5268    mova         ym31, ym27
5269    vpermi2d     ym30, ym10, ym12
5270    vpermi2d     ym31, ym10, ym12
5271
5272    vpermq       ym28, ym28,  q3120
5273    vpermq       ym29, ym29,  q3120
5274    vpermq       ym30, ym30,  q3120
5275    vpermq       ym31, ym31,  q3120
5276
5277    vinserti64x4    m4,          m4,      ym4, 1
5278    vinserti64x4    m6,          m6,      ym6, 1
5279    vinserti64x4    m7,          m7,      ym7, 1
5280    vinserti64x4    m8,          m8,      ym8, 1
5281    vinserti64x4    m28,        m28,      ym28, 1
5282    vinserti64x4    m29,        m29,      ym29, 1
5283    vinserti64x4    m30,        m30,      ym30, 1
    vinserti64x4    m31,        m31,      ym31, 1

5287    IDCT16_AVX512_PASS1      0, 22, 23
5288    IDCT16_AVX512_PASS1      2, 24, 25
5289
5290    mova       m26,    [idct16_AVX512_shuff2]
5291    mova       m27,    [idct16_AVX512_shuff3]
5292    vpermi2q   m26,    m18, m22
5293    vpermi2q   m27,    m18, m22
5294    mova       m18,    [idct16_AVX512_shuff2]
5295    mova       m22,    [idct16_AVX512_shuff3]
5296    vpermi2q   m18,    m20, m24
5297    vpermi2q   m22,    m20, m24
5298    mova       m20,    [idct16_AVX512_shuff4]
5299    mova       m24,    [idct16_AVX512_shuff5]
5300    vpermi2q   m20,    m21, m25
5301    vpermi2q   m24,    m21, m25
5302    mova       m21,    [idct16_AVX512_shuff4]
5303    mova       m25,    [idct16_AVX512_shuff5]
5304    vpermi2q   m21,    m19, m23
5305    vpermi2q   m25,    m19, m23
5306
5307    lea             r5, [tab_idct16_2]
5308    lea             r6, [tab_idct16_1]
5309
5310    vbroadcasti64x2  m7,  [r5]
5311    vbroadcasti64x2  m8,  [r5 + 16]
5312    vbroadcasti64x2  m9,  [r5 + 32]
5313    vbroadcasti64x2  m10, [r5 + 48]
5314    vbroadcasti64x2  m11, [r5 + 64]
5315    vbroadcasti64x2  m12, [r5 + 80]
5316    vbroadcasti64x2  m13, [r5 + 96]
5317
5318    vbroadcasti64x2  m16, [r6]
5319    vbroadcasti64x2  m17, [r6 + 16]
5320    vbroadcasti64x2  m19, [r6 + 32]
5321    vbroadcasti64x2  m23, [r6 + 48]
5322    vbroadcasti64x2  m28, [r6 + 64]
5323    vbroadcasti64x2  m29, [r6 + 80]
    vbroadcasti64x2  m30, [r6 + 96]

5327    IDCT16_AVX512_PASS2 26, 27
5328     mova            [r1], xm5
5329     mova            [r1 + 16], xm2
5330     vextracti128    [r1 + r2], ym5, 1
5331     vextracti128    [r1 + r2 + 16], ym2, 1
5332     vextracti64x4   ym14, m5, 1
5333     vextracti64x4   ym31, m2, 1
5334     lea             r1, [r1 + 2 * r2]
5335     mova            [r1], xm14
5336     mova            [r1 + 16], xm31
5337     vextracti128    [r1 + r2], ym14, 1
5338     vextracti128    [r1 + r2 + 16], ym31, 1
5339
5340    IDCT16_AVX512_PASS2 18, 22
5341     lea             r1, [r1 + 2 * r2]
5342     mova            [r1], xm5
5343     mova            [r1 + 16], xm2
5344     vextracti128    [r1 + r2], ym5, 1
5345     vextracti128    [r1 + r2 + 16], ym2, 1
5346     vextracti64x4   ym14, m5, 1
5347     vextracti64x4   ym31, m2, 1
5348     lea             r1, [r1 + 2 * r2]
5349     mova            [r1], xm14
5350     mova            [r1 + 16], xm31
5351     vextracti128    [r1 + r2], ym14, 1
5352     vextracti128    [r1 + r2 + 16], ym31, 1
5353
5354    IDCT16_AVX512_PASS2 20, 24
5355     lea             r1, [r1 + 2 * r2]
5356     mova            [r1], xm5
5357     mova            [r1 + 16], xm2
5358     vextracti128    [r1 + r2], ym5, 1
5359     vextracti128    [r1 + r2 + 16], ym2, 1
5360     vextracti64x4   ym14, m5, 1
5361     vextracti64x4   ym31, m2, 1
5362     lea             r1, [r1 + 2 * r2]
5363     mova            [r1], xm14
5364     mova            [r1 + 16], xm31
5365     vextracti128    [r1 + r2], ym14, 1
5366     vextracti128    [r1 + r2 + 16], ym31, 1
5367
5368    IDCT16_AVX512_PASS2 21, 25
5369     lea             r1, [r1 + 2 * r2]
5370     mova            [r1], xm5
5371     mova            [r1 + 16], xm2
5372     vextracti128    [r1 + r2], ym5, 1
5373     vextracti128    [r1 + r2 + 16], ym2, 1
5374     vextracti64x4   ym14, m5, 1
5375     vextracti64x4   ym31, m2, 1
5376     lea             r1, [r1 + 2 * r2]
5377     mova            [r1], xm14
5378     mova            [r1 + 16], xm31
5379     vextracti128    [r1 + r2], ym14, 1
5380     vextracti128    [r1 + r2 + 16], ym31, 1
5381    RET
5382
5383
5384
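; IDCT32_PASS1: one column pass of the 32x32 IDCT producing output rows %1,
; 15-%1, 16+%1 and 31-%1. tab_idct32_1 supplies the odd part and
; tab_idct32_2/tab_idct32_3 the even part; the results are scattered into the
; stack transpose buffer with movd/pextrd.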
5385%macro IDCT32_PASS1 1
5386    vbroadcasti128  m3, [tab_idct32_1 + %1 * 32]
5387    vbroadcasti128  m13, [tab_idct32_1 + %1 * 32 + 16]
5388    pmaddwd         m9, m4, m3
5389    pmaddwd         m10, m8, m13
5390    phaddd          m9, m10
5391
5392    pmaddwd         m10, m2, m3
5393    pmaddwd         m11, m1, m13
5394    phaddd          m10, m11
5395
5396    phaddd          m9, m10
5397
5398    vbroadcasti128  m3, [tab_idct32_1 + (15 - %1) * 32]
    vbroadcasti128  m13, [tab_idct32_1 + (15 - %1) * 32 + 16]
5400    pmaddwd         m10, m4, m3
5401    pmaddwd         m11, m8, m13
5402    phaddd          m10, m11
5403
5404    pmaddwd         m11, m2, m3
5405    pmaddwd         m12, m1, m13
5406    phaddd          m11, m12
5407
5408    phaddd          m10, m11
5409    phaddd          m9, m10                       ;[row0s0 row2s0 row0s15 row2s15 row1s0 row3s0 row1s15 row3s15]
5410
5411    vbroadcasti128  m3, [tab_idct32_2 + %1 * 16]
5412    pmaddwd         m10, m0, m3
5413    pmaddwd         m11, m7, m3
5414    phaddd          m10, m11
5415    phaddd          m10, m10
5416
5417    vbroadcasti128  m3, [tab_idct32_3 + %1 * 16]
5418    pmaddwd         m11, m5, m3
5419    pmaddwd         m12, m6, m3
5420    phaddd          m11, m12
5421    phaddd          m11, m11
5422
    paddd           m12, m10, m11                 ;[row0a0 row2a0 NIL NIL row1a0 row3a0 NIL NIL]
5424    psubd           m10, m11                      ;[row0a15 row2a15 NIL NIL row1a15 row3a15 NIL NIL]
5425
5426    punpcklqdq      m12, m10                      ;[row0a0 row2a0 row0a15 row2a15 row1a0 row3a0 row1a15 row3a15]
5427    paddd           m10, m9, m12
5428    paddd           m10, m15
5429    psrad           m10, IDCT_SHIFT1
5430
5431    psubd           m12, m9
5432    paddd           m12, m15
5433    psrad           m12, IDCT_SHIFT1
5434
5435    packssdw        m10, m12
5436    vextracti128    xm12, m10, 1
5437    movd            [r3 + %1 * 64], xm10
5438    movd            [r3 + 32 + %1 * 64], xm12
5439    pextrd          [r4 - %1 * 64], xm10, 1
    pextrd          [r4 + 32 - %1 * 64], xm12, 1
    pextrd          [r3 + 16 * 64 + %1 * 64], xm10, 3
5442    pextrd          [r3 + 16 * 64 + 32 + %1 * 64], xm12, 3
5443    pextrd          [r4 + 16 * 64 - %1 * 64], xm10, 2
5444    pextrd          [r4 + 16 * 64 + 32 - %1 * 64], xm12, 2
5445%endmacro
5446
5447;-------------------------------------------------------
5448; void idct32(const int16_t* src, int16_t* dst, intptr_t dstStride)
5449;-------------------------------------------------------
5450
; TODO: reduce the number of PHADDD instructions by replacing them with PADDD
5452
5453INIT_YMM avx2
5454cglobal idct32, 3, 6, 16, 0-32*64
5455
5456%define             IDCT_SHIFT1         7
5457
5458    vbroadcasti128  m15, [pd_64]
5459
5460    mov             r3, rsp
5461    lea             r4, [r3 + 15 * 64]
5462    mov             r5d, 8
5463
5464.pass1:
5465    movq            xm0,    [r0 +  2 * 64]
5466    movq            xm1,    [r0 + 18 * 64]
5467    punpcklqdq      xm0, xm0, xm1
5468    movq            xm1,    [r0 +  0 * 64]
5469    movq            xm2,    [r0 + 16 * 64]
5470    punpcklqdq      xm1, xm1, xm2
5471    vinserti128     m0,  m0,  xm1, 1             ;[2 18 0 16]
5472
5473    movq            xm1,    [r0 + 1 * 64]
5474    movq            xm2,    [r0 + 9 * 64]
5475    punpcklqdq      xm1, xm1, xm2
5476    movq            xm2,    [r0 + 17 * 64]
5477    movq            xm3,    [r0 + 25 * 64]
5478    punpcklqdq      xm2, xm2, xm3
5479    vinserti128     m1,  m1,  xm2, 1             ;[1 9 17 25]
5480
5481    movq            xm2,    [r0 + 6 * 64]
5482    movq            xm3,    [r0 + 22 * 64]
5483    punpcklqdq      xm2, xm2, xm3
5484    movq            xm3,    [r0 + 4 * 64]
5485    movq            xm4,    [r0 + 20 * 64]
5486    punpcklqdq      xm3, xm3, xm4
5487    vinserti128     m2,  m2,  xm3, 1             ;[6 22 4 20]
5488
5489    movq            xm3,    [r0 + 3 * 64]
5490    movq            xm4,    [r0 + 11 * 64]
5491    punpcklqdq      xm3, xm3, xm4
5492    movq            xm4,    [r0 + 19 * 64]
5493    movq            xm5,    [r0 + 27 * 64]
5494    punpcklqdq      xm4, xm4, xm5
    vinserti128     m3,  m3,  xm4, 1             ;[3 11 19 27]
5496
5497    movq            xm4,    [r0 + 10 * 64]
5498    movq            xm5,    [r0 + 26 * 64]
5499    punpcklqdq      xm4, xm4, xm5
5500    movq            xm5,    [r0 + 8 * 64]
5501    movq            xm6,    [r0 + 24 * 64]
5502    punpcklqdq      xm5, xm5, xm6
5503    vinserti128     m4,  m4,  xm5, 1             ;[10 26 8 24]
5504
5505    movq            xm5,    [r0 + 5 * 64]
5506    movq            xm6,    [r0 + 13 * 64]
5507    punpcklqdq      xm5, xm5, xm6
5508    movq            xm6,    [r0 + 21 * 64]
5509    movq            xm7,    [r0 + 29 * 64]
5510    punpcklqdq      xm6, xm6, xm7
    vinserti128     m5,  m5,  xm6, 1             ;[5 13 21 29]
5512
5513    movq            xm6,    [r0 + 14 * 64]
5514    movq            xm7,    [r0 + 30 * 64]
5515    punpcklqdq      xm6, xm6, xm7
5516    movq            xm7,    [r0 + 12 * 64]
5517    movq            xm8,    [r0 + 28 * 64]
5518    punpcklqdq      xm7, xm7, xm8
5519    vinserti128     m6,  m6,  xm7, 1             ;[14 30 12 28]
5520
5521    movq            xm7,    [r0 + 7 * 64]
5522    movq            xm8,    [r0 + 15 * 64]
5523    punpcklqdq      xm7, xm7, xm8
5524    movq            xm8,    [r0 + 23 * 64]
5525    movq            xm9,    [r0 + 31 * 64]
5526    punpcklqdq      xm8, xm8, xm9
5527    vinserti128     m7,  m7,  xm8, 1             ;[7 15 23 31]
5528
5529    punpckhwd       m8, m0, m2                  ;[18 22 16 20]
5530    punpcklwd       m0, m2                      ;[2 6 0 4]
5531
5532    punpckhwd       m2, m1, m3                  ;[9 11 25 27]
5533    punpcklwd       m1, m3                      ;[1 3 17 19]
5534
5535    punpckhwd       m3, m4, m6                  ;[26 30 24 28]
5536    punpcklwd       m4, m6                      ;[10 14 8 12]
5537
5538    punpckhwd       m6, m5, m7                  ;[13 15 29 31]
5539    punpcklwd       m5, m7                      ;[5 7 21 23]
5540
5541    punpckhdq       m7, m0, m4                  ;[22 62 102 142 23 63 103 143 02 42 82 122 03 43 83 123]
5542    punpckldq       m0, m4                      ;[20 60 100 140 21 61 101 141 00 40 80 120 01 41 81 121]
5543
5544    punpckhdq       m4, m8, m3                  ;[182 222 262 302 183 223 263 303 162 202 242 282 163 203 243 283]
5545    punpckldq       m8, m3                      ;[180 220 260 300 181 221 261 301 160 200 240 280 161 201 241 281]
5546
5547    punpckhdq       m3, m1, m5                  ;[12 32 52 72 13 33 53 73 172 192 212 232 173 193 213 233]
5548    punpckldq       m1, m5                      ;[10 30 50 70 11 31 51 71 170 190 210 230 171 191 211 231]
5549
5550    punpckhdq       m5, m2, m6                  ;[92 112 132 152 93 113 133 153 252 272 292 312 253 273 293 313]
5551    punpckldq       m2, m6                      ;[90 110 130 150 91 111 131 151 250 270 290 310 251 271 291 311]
5552
5553    punpckhqdq      m6, m0, m8                  ;[21 61 101 141 181 221 261 301 01 41 81 121 161 201 241 281]
5554    punpcklqdq      m0, m8                      ;[20 60 100 140 180 220 260 300 00 40 80 120 160 200 240 280]
5555
5556    punpckhqdq      m8, m7, m4                  ;[23 63 103 143 183 223 263 303 03 43 83 123 163 203 243 283]
5557    punpcklqdq      m7, m4                      ;[22 62 102 142 182 222 262 302 02 42 82 122 162 202 242 282]
5558
5559    punpckhqdq      m4, m1, m2                  ;[11 31 51 71 91 111 131 151 171 191 211 231 251 271 291 311]
5560    punpcklqdq      m1, m2                      ;[10 30 50 70 90 110 130 150 170 190 210 230 250 270 290 310]
5561
5562    punpckhqdq      m2, m3, m5                  ;[13 33 53 73 93 113 133 153 173 193 213 233 253 273 293 313]
5563    punpcklqdq      m3, m5                      ;[12 32 52 72 92 112 132 152 172 192 212 232 252 272 292 312]
5564
5565    vperm2i128      m5, m0, m6, 0x20            ;[20 60 100 140 180 220 260 300 21 61 101 141 181 221 261 301]
5566    vperm2i128      m0, m0, m6, 0x31            ;[00 40 80 120 160 200 240 280 01 41 81 121 161 201 241 281]
5567
5568    vperm2i128      m6, m7, m8, 0x20            ;[22 62 102 142 182 222 262 302 23 63 103 143 183 223 263 303]
5569    vperm2i128      m7, m7, m8, 0x31            ;[02 42 82 122 162 202 242 282 03 43 83 123 163 203 243 283]
5570
5571    vperm2i128      m8, m1, m4, 0x31            ;[170 190 210 230 250 270 290 310 171 191 211 231 251 271 291 311]
5572    vperm2i128      m4, m1, m4, 0x20            ;[10 30 50 70 90 110 130 150 11 31 51 71 91 111 131 151]
5573
5574    vperm2i128      m1, m3, m2, 0x31            ;[172 192 212 232 252 272 292 312 173 193 213 233 253 273 293 313]
5575    vperm2i128      m2, m3, m2, 0x20            ;[12 32 52 72 92 112 132 152 13 33 53 73 93 113 133 153]
5576
5577    IDCT32_PASS1 0
5578    IDCT32_PASS1 1
5579    IDCT32_PASS1 2
5580    IDCT32_PASS1 3
5581    IDCT32_PASS1 4
5582    IDCT32_PASS1 5
5583    IDCT32_PASS1 6
5584    IDCT32_PASS1 7
5585
5586    add             r0, 8
5587    add             r3, 4
5588    add             r4, 4
5589    dec             r5d
5590    jnz             .pass1
5591
5592%if BIT_DEPTH == 12
5593    %define         IDCT_SHIFT2        8
5594    vpbroadcastd    m15,                [pd_128]
5595%elif BIT_DEPTH == 10
5596    %define         IDCT_SHIFT2        10
5597    vpbroadcastd    m15,                [pd_512]
5598%elif BIT_DEPTH == 8
5599    %define         IDCT_SHIFT2        12
5600    vpbroadcastd    m15,                [pd_2048]
5601%else
5602    %error Unsupported BIT_DEPTH!
5603%endif
5604
5605    mov             r3, rsp
5606    add             r2d, r2d
5607    mov             r4d, 32
5608
5609    mova            m7,  [tab_idct32_4]
5610    mova            m8,  [tab_idct32_4 + 32]
5611    mova            m9,  [tab_idct32_4 + 64]
5612    mova            m10, [tab_idct32_4 + 96]
5613    mova            m11, [tab_idct32_4 + 128]
5614    mova            m12, [tab_idct32_4 + 160]
5615    mova            m13, [tab_idct32_4 + 192]
5616    mova            m14, [tab_idct32_4 + 224]
5617.pass2:
5618    movu            m0, [r3]
5619    movu            m1, [r3 + 32]
5620
5621    pmaddwd         m2, m0, m7
5622    pmaddwd         m3, m0, m8
5623    phaddd          m2, m3
5624
5625    pmaddwd         m3, m0, m9
5626    pmaddwd         m4, m0, m10
5627    phaddd          m3, m4
5628
5629    phaddd          m2, m3
5630
5631    pmaddwd         m3, m0, m11
5632    pmaddwd         m4, m0, m12
5633    phaddd          m3, m4
5634
5635    pmaddwd         m4, m0, m13
5636    pmaddwd         m5, m0, m14
5637    phaddd          m4, m5
5638
5639    phaddd          m3, m4
5640
5641    vperm2i128      m4, m2, m3, 0x31
5642    vperm2i128      m2, m2, m3, 0x20
5643    paddd           m2, m4
5644
5645    pmaddwd         m3, m0, [tab_idct32_4 + 256]
5646    pmaddwd         m4, m0, [tab_idct32_4 + 288]
5647    phaddd          m3, m4
5648
5649    pmaddwd         m4, m0, [tab_idct32_4 + 320]
5650    pmaddwd         m5, m0, [tab_idct32_4 + 352]
5651    phaddd          m4, m5
5652
5653    phaddd          m3, m4
5654
5655    pmaddwd         m4, m0, [tab_idct32_4 + 384]
5656    pmaddwd         m5, m0, [tab_idct32_4 + 416]
5657    phaddd          m4, m5
5658
5659    pmaddwd         m5, m0, [tab_idct32_4 + 448]
5660    pmaddwd         m0,     [tab_idct32_4 + 480]
5661    phaddd          m5, m0
5662
5663    phaddd          m4, m5
5664
5665    vperm2i128      m0, m3, m4, 0x31
5666    vperm2i128      m3, m3, m4, 0x20
5667    paddd           m3, m0
5668
5669    pmaddwd         m4, m1, [tab_idct32_1]
5670    pmaddwd         m0, m1, [tab_idct32_1 + 32]
5671    phaddd          m4, m0
5672
5673    pmaddwd         m5, m1, [tab_idct32_1 + 64]
5674    pmaddwd         m0, m1, [tab_idct32_1 + 96]
5675    phaddd          m5, m0
5676
5677    phaddd          m4, m5
5678
5679    pmaddwd         m5, m1, [tab_idct32_1 + 128]
5680    pmaddwd         m0, m1, [tab_idct32_1 + 160]
5681    phaddd          m5, m0
5682
5683    pmaddwd         m6, m1, [tab_idct32_1 + 192]
5684    pmaddwd         m0, m1, [tab_idct32_1 + 224]
5685    phaddd          m6, m0
5686
5687    phaddd          m5, m6
5688
5689    vperm2i128      m0, m4, m5, 0x31
5690    vperm2i128      m4, m4, m5, 0x20
5691    paddd           m4, m0
5692
5693    pmaddwd         m5, m1, [tab_idct32_1 + 256]
5694    pmaddwd         m0, m1, [tab_idct32_1 + 288]
5695    phaddd          m5, m0
5696
5697    pmaddwd         m6, m1, [tab_idct32_1 + 320]
5698    pmaddwd         m0, m1, [tab_idct32_1 + 352]
5699    phaddd          m6, m0
5700
5701    phaddd          m5, m6
5702
5703    pmaddwd         m6, m1, [tab_idct32_1 + 384]
5704    pmaddwd         m0, m1, [tab_idct32_1 + 416]
5705    phaddd          m6, m0
5706
5707    pmaddwd         m0, m1, [tab_idct32_1 + 448]
5708    pmaddwd         m1,     [tab_idct32_1 + 480]
5709    phaddd          m0, m1
5710
5711    phaddd          m6, m0
5712
5713    vperm2i128      m0, m5, m6, 0x31
5714    vperm2i128      m5, m5, m6, 0x20
5715    paddd           m5, m0
5716
5717    paddd           m6, m2, m4
5718    paddd           m6, m15
5719    psrad           m6, IDCT_SHIFT2
5720
5721    psubd           m2, m4
5722    paddd           m2, m15
5723    psrad           m2, IDCT_SHIFT2
5724
5725    paddd           m4, m3, m5
5726    paddd           m4, m15
5727    psrad           m4, IDCT_SHIFT2
5728
5729    psubd           m3, m5
5730    paddd           m3, m15
5731    psrad           m3, IDCT_SHIFT2
5732
5733    packssdw        m6, m4
5734    packssdw        m2, m3
5735
5736    vpermq          m6, m6, 0xD8
5737    vpermq          m2, m2, 0x8D
5738    pshufb          m2, [dct16_shuf1]
5739
5740    mova            [r1], m6
5741    mova            [r1 + 32], m2
5742
5743    add             r1, r2
5744    add             r3, 64
5745    dec             r4d
5746    jnz             .pass2
5747    RET
5748
5749
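; IDCT32_AVX512_PASS1: AVX-512 column pass of the 32x32 IDCT; the phaddd
; chains are emulated with shift/add sequences merged under the k1-k4 masks
; (set up by the caller). Each invocation handles coefficient rows %1 and
; %1+1 and scatters their mirrored outputs into the stack transpose buffer.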
%macro IDCT32_AVX512_PASS1 5
    pmaddwd         m9,  m8, m%4
    pmaddwd         m10, m7, m%5

    paddd            m9,  m10
    vpsrldq          m0,   m9, 8
    paddd            m9,   m0
    vpsrldq          m0,   m9, 4
    paddd            m9,   m0

    pmaddwd         m10, m4, m%4
    pmaddwd         m11, m1, m%5

    paddd           m10,   m11
    vpsrldq          m0,   m10, 8
    paddd           m10,   m0
    vpslldq          m0,   m10, 4
    paddd           m10,    m0

    vmovdqu32       m9 {k3}, m10

    mova            m6,  [tab_idct32_AVX512_5 + %1 * 64]
    mova            m5,  [tab_idct32_AVX512_5 + %1 * 64 + 64]

    pmaddwd         m10, m8, m6
    pmaddwd         m11, m7, m5

    paddd           m10,  m11
    vpslldq         m0,   m10, 8
    paddd           m10,   m0
    vpsrldq          m0,  m10, 4
    paddd           m10,   m0

    pmaddwd         m11, m4, m6
    pmaddwd         m12, m1, m5

    paddd           m11,   m12
    vpslldq          m0,   m11, 8
    paddd           m11,    m0
    vpslldq          m0,   m11, 4
    paddd           m11,    m0

    vmovdqu32        m10  {k4},  m11
    vmovdqu32        m9  {k2}, m10

    pmaddwd         m10, m3, m%2
    pmaddwd         m11, m14, m%2

    vpsrldq          m0,   m10, 4
    paddd           m10,    m0
    vpslldq          m5,   m11, 4
    paddd           m11,    m5
    vmovdqu32       m10   {k1}, m11

    vpsrldq         m0,    m10, 8
    paddd           m10,    m0

    pmaddwd         m11, m2, m%3
    pmaddwd         m12, m13, m%3

    vpsrldq          m0,   m11, 4
    paddd           m11,    m0
    vpslldq          m5,   m12, 4
    paddd           m12,    m5
    vmovdqu32       m11   {k1}, m12

    vpsrldq          m0,   m11, 8
    paddd           m11,    m0

    paddd           m12, m10, m11
    psubd           m10, m11

    punpcklqdq      m12, m10
    paddd           m10, m9, m12
    paddd           m10, m15
    psrad           m10, IDCT_SHIFT1

    psubd           m12, m9
    paddd           m12, m15
    psrad           m12, IDCT_SHIFT1

    packssdw        m10, m12
    vextracti128    xm12, m10, 1
    vextracti64x4   ym5,  m10, 1
    vextracti128    xm0, ym5, 1

    movd            [r3 + %1 * 64], xm10
    movd            [r3 + 32 + %1 * 64], xm12
    pextrd          [r4 - %1 * 64], xm10, 1
    pextrd          [r4 + 32 - %1 * 64], xm12, 1
    pextrd          [r3 + 16 * 64 + %1 * 64], xm10, 3
    pextrd          [r3 + 16 * 64 + 32 + %1 * 64], xm12, 3
    pextrd          [r4 + 16 * 64 - %1 * 64], xm10, 2
    pextrd          [r4 + 16 * 64 + 32 - %1 * 64], xm12, 2

    movd            [r3 + (%1 + 1) * 64], xm5
    movd            [r3 + 32 + (%1 + 1) * 64], xm0
    pextrd          [r4 - (%1 + 1) * 64], xm5, 1
    pextrd          [r4 + 32 - (%1 + 1) * 64], xm0, 1
    pextrd          [r3 + 16 * 64 + (%1 + 1) * 64], xm5, 3
    pextrd          [r3 + 16 * 64 + 32 + (%1 + 1) * 64], xm0, 3
    pextrd          [r4 + 16 * 64 - (%1 + 1) * 64], xm5, 2
    pextrd          [r4 + 16 * 64 + 32 - (%1 + 1) * 64], xm0, 2
%endmacro

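; IDCT32_AVX512_PASS2: horizontal (pass 2) transform for two rows of
; pass-1 output held in m0/m1.  Partial sums are built with pmaddwd and
; the same shift/add reduction as pass 1, recombined through the
; idct16_AVX512_shuff2/3 qword permutes, rounded (m15) and shifted by
; IDCT_SHIFT2, then packed to int16: m6 receives columns 0-15 and m2
; columns 16-31 of the two rows.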
%macro IDCT32_AVX512_PASS2 0
    pmaddwd         m2, m0, m7
    pmaddwd         m3, m0, m8

    vpsrldq         m24,   m2, 4
    paddd            m2,  m24
    vpslldq         m25,   m3, 4
    paddd            m3,  m25
    vmovdqu32        m2   {k1}, m3

    pmaddwd         m3, m0, m9
    pmaddwd         m4, m0, m10

    vpsrldq         m24,   m3, 4
    paddd            m3,  m24
    vpslldq         m25,   m4, 4
    paddd            m4,  m25
    vmovdqu32        m3   {k1}, m4

    vpsrldq         m24,   m2, 8
    paddd            m2,  m24
    vpslldq         m25,   m3, 8
    paddd            m3,  m25
    vmovdqu32        m2   {k2}, m3

    pmaddwd         m3, m0, m11
    pmaddwd         m4, m0, m12

    vpsrldq         m24,   m3, 4
    paddd            m3,  m24
    vpslldq         m25,   m4, 4
    paddd            m4,  m25
    vmovdqu32        m3   {k1}, m4

    pmaddwd         m4, m0, m13
    pmaddwd         m5, m0, m14

    vpsrldq         m24,   m4, 4
    paddd            m4,  m24
    vpslldq         m25,   m5, 4
    paddd            m5,  m25
    vmovdqu32        m4   {k1}, m5

    vpsrldq         m24,   m3, 8
    paddd            m3,  m24
    vpslldq         m25,   m4, 8
    paddd            m4,  m25
    vmovdqu32        m3   {k2}, m4

    mova           m24,        [idct16_AVX512_shuff3]
    mova           m25,        [idct16_AVX512_shuff2]
    vpermi2q       m24,        m2,       m3
    vpermi2q       m25,        m2,       m3
    paddd           m2, m25, m24

    pmaddwd         m3, m0, m16
    pmaddwd         m4, m0, m17

    vpsrldq         m24,   m3, 4
    paddd            m3,  m24
    vpslldq         m25,   m4, 4
    paddd            m4,  m25
    vmovdqu32        m3   {k1}, m4

    pmaddwd         m4, m0, m18
    pmaddwd         m5, m0, m19

    vpsrldq         m24,   m4, 4
    paddd            m4,  m24
    vpslldq         m25,   m5, 4
    paddd            m5,  m25
    vmovdqu32        m4   {k1}, m5

    vpsrldq         m24,   m3, 8
    paddd            m3,  m24
    vpslldq         m25,   m4, 8
    paddd            m4,  m25
    vmovdqu32        m3   {k2}, m4

    pmaddwd         m4, m0, m20
    pmaddwd         m5, m0, m21

    vpsrldq         m24,   m4, 4
    paddd            m4,  m24
    vpslldq         m25,   m5, 4
    paddd            m5,  m25
    vmovdqu32        m4   {k1}, m5

    pmaddwd         m5, m0, m22
    pmaddwd         m0,     m23

    vpsrldq         m24,   m5, 4
    paddd            m5,  m24
    vpslldq         m25,   m0, 4
    paddd            m0,  m25
    vmovdqu32        m5   {k1}, m0

    vpsrldq         m24,   m4, 8
    paddd            m4,  m24
    vpslldq         m25,   m5, 8
    paddd            m5,  m25
    vmovdqu32        m4   {k2}, m5

    mova           m24,        [idct16_AVX512_shuff3]
    mova           m25,        [idct16_AVX512_shuff2]
    vpermi2q       m24,        m3,       m4
    vpermi2q       m25,        m3,       m4
    paddd           m3, m25, m24

    pmaddwd         m4, m1, m26
    pmaddwd         m0, m1, m27

    vpsrldq         m24,   m4, 4
    paddd            m4,  m24
    vpslldq         m25,   m0, 4
    paddd            m0,  m25
    vmovdqu32        m4   {k1}, m0

    pmaddwd         m5, m1, m28
    pmaddwd         m0, m1, m29

    vpsrldq         m24,   m5, 4
    paddd            m5,  m24
    vpslldq         m25,   m0, 4
    paddd            m0,  m25
    vmovdqu32        m5   {k1}, m0


    vpsrldq         m24,   m4, 8
    paddd            m4,  m24
    vpslldq         m25,   m5, 8
    paddd            m5,  m25
    vmovdqu32        m4   {k2}, m5

    pmaddwd         m5, m1, m30
    pmaddwd         m0, m1, m31

    vpsrldq         m24,   m5, 4
    paddd            m5,  m24
    vpslldq         m25,   m0, 4
    paddd            m0,  m25
    vmovdqu32        m5   {k1}, m0

    pmaddwd         m6, m1, [tab_idct32_AVX512_4 + 6 * mmsize]
    pmaddwd         m0, m1, [tab_idct32_AVX512_4 + 7 * mmsize]

    vpsrldq         m24,   m6, 4
    paddd            m6,  m24
    vpslldq         m25,   m0, 4
    paddd            m0,  m25
    vmovdqu32        m6   {k1}, m0

    vpsrldq         m24,   m5, 8
    paddd            m5,  m24
    vpslldq         m25,   m6, 8
    paddd            m6,  m25
    vmovdqu32        m5   {k2}, m6

    mova           m24,        [idct16_AVX512_shuff3]
    mova           m25,        [idct16_AVX512_shuff2]
    vpermi2q       m24,        m4,       m5
    vpermi2q       m25,        m4,       m5
    paddd           m4, m25, m24

    pmaddwd         m5, m1, [tab_idct32_AVX512_4 + 8 * mmsize]
    pmaddwd         m0, m1, [tab_idct32_AVX512_4 + 9 * mmsize]

    vpsrldq         m24,   m5, 4
    paddd            m5,  m24
    vpslldq         m25,   m0, 4
    paddd            m0,  m25
    vmovdqu32        m5   {k1}, m0

    pmaddwd         m6, m1, [tab_idct32_AVX512_4 + 10 * mmsize]
    pmaddwd         m0, m1, [tab_idct32_AVX512_4 + 11 * mmsize]

    vpsrldq         m24,   m6, 4
    paddd            m6,  m24
    vpslldq         m25,   m0, 4
    paddd            m0,  m25
    vmovdqu32        m6   {k1}, m0

    vpsrldq         m24,   m5, 8
    paddd            m5,  m24
    vpslldq         m25,   m6, 8
    paddd            m6,  m25
    vmovdqu32        m5   {k2}, m6

    pmaddwd         m6, m1, [tab_idct32_AVX512_4 + 12 * mmsize]
    pmaddwd         m0, m1, [tab_idct32_AVX512_4 + 13 * mmsize]

    vpsrldq         m24,   m6, 4
    paddd            m6,  m24
    vpslldq         m25,   m0, 4
    paddd            m0,  m25
    vmovdqu32        m6   {k1}, m0

    pmaddwd         m0, m1, [tab_idct32_AVX512_4 + 14 * mmsize]
    pmaddwd         m1,     [tab_idct32_AVX512_4 + 15 * mmsize]

    vpsrldq         m24,   m0, 4
    paddd            m0,  m24
    vpslldq         m25,   m1, 4
    paddd            m1,  m25
    vmovdqu32        m0   {k1}, m1

    vpsrldq         m24,   m6, 8
    paddd            m6,  m24
    vpslldq         m25,   m0, 8
    paddd            m0,  m25
    vmovdqu32        m6   {k2}, m0

    mova           m24,        [idct16_AVX512_shuff3]
    mova           m25,        [idct16_AVX512_shuff2]
    vpermi2q       m24,        m5,       m6
    vpermi2q       m25,        m5,       m6
    paddd           m5, m25, m24

    paddd           m6, m2, m4
    paddd           m6, m15
    psrad           m6, IDCT_SHIFT2

    psubd           m2, m4
    paddd           m2, m15
    psrad           m2, IDCT_SHIFT2

    paddd           m4, m3, m5
    paddd           m4, m15
    psrad           m4, IDCT_SHIFT2

    psubd           m3, m5
    paddd           m3, m15
    psrad           m3, IDCT_SHIFT2

    packssdw        m6, m4
    packssdw        m2, m3

    vpermq          m6, m6, 0xD8
    vpermq          m2, m2, 0x8D
    pshufb          m2, [idct16_AVX512_shuff6]
%endmacro

;-------------------------------------------------------------------
; void idct32(const int16_t* src, int16_t* dst, intptr_t dstStride)
;-------------------------------------------------------------------
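;
; A scalar sketch of the arithmetic vectorized below (helper names
; hypothetical; g_t32 stands for the 32x32 HEVC transform matrix).
; Pass 1 works on columns with a fixed shift, pass 2 on rows with a
; bit-depth dependent shift, each rounding by half the divisor:
;
;    // pass 1 (columns): IDCT_SHIFT1 = 7, rounding offset 1 << 6 = 64
;    tmp[j][i] = (sum_k(src[k][j] * g_t32[k][i]) + 64) >> 7;
;    // pass 2 (rows): IDCT_SHIFT2 = 20 - BIT_DEPTH
;    dst[i][j] = clip16((sum_k(tmp[i][k] * g_t32[k][j])
;                        + (1 << (IDCT_SHIFT2 - 1))) >> IDCT_SHIFT2);
;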

INIT_ZMM avx512
cglobal idct32, 3, 8, 32, 0-32*64

%define             IDCT_SHIFT1         7

    vbroadcasti128  m15, [pd_64]

    mov             r3,  rsp
    lea             r4,  [r3 + 15 * 64]
    mov             r5d, 8
    mov             r7d, 0xAAAA
    kmovd            k1, r7d                      ; odd dwords
    mov             r7d, 0xCCCC
    kmovd            k2, r7d                      ; dwords 2-3 of each group of 4
    mov             r7d, 0x2222
    kmovd            k3, r7d                      ; dword 1 of each group of 4
    mov             r7d, 0x8888
    kmovd            k4, r7d                      ; dword 3 of each group of 4


    mova            m16, [tab_idct32_AVX512_2 + 0 * 64]
    mova            m17, [tab_idct32_AVX512_2 + 1 * 64]
    mova            m18, [tab_idct32_AVX512_2 + 2 * 64]
    mova            m19, [tab_idct32_AVX512_2 + 3 * 64]

    mova            m20, [tab_idct32_AVX512_3 + 0 * 64]
    mova            m21, [tab_idct32_AVX512_3 + 1 * 64]
    mova            m22, [tab_idct32_AVX512_3 + 2 * 64]
    mova            m23, [tab_idct32_AVX512_3 + 3 * 64]

    mova            m24, [tab_idct32_AVX512_1 + 0 * 64]
    mova            m25, [tab_idct32_AVX512_1 + 1 * 64]
    mova            m26, [tab_idct32_AVX512_1 + 2 * 64]
    mova            m27, [tab_idct32_AVX512_1 + 3 * 64]
    mova            m28, [tab_idct32_AVX512_1 + 4 * 64]
    mova            m29, [tab_idct32_AVX512_1 + 5 * 64]
    mova            m30, [tab_idct32_AVX512_1 + 6 * 64]
    mova            m31, [tab_idct32_AVX512_1 + 7 * 64]

.pass1:
    movq            xm0,    [r0 +  2 * 64]
    movq            xm1,    [r0 + 18 * 64]
    punpcklqdq      xm0,    xm0,  xm1
    movq            xm1,    [r0 +  0 * 64]
    movq            xm2,    [r0 + 16 * 64]
    punpcklqdq      xm1,    xm1,  xm2
    vinserti128     ym0,    ym0,  xm1, 1             ;[2 18 0 16]

    movq            xm1,    [r0 + 1 * 64]
    movq            xm2,    [r0 + 9 * 64]
    punpcklqdq      xm1,    xm1,  xm2
    movq            xm2,    [r0 + 17 * 64]
    movq            xm3,    [r0 + 25 * 64]
    punpcklqdq      xm2,    xm2,  xm3
    vinserti128     ym1,    ym1,  xm2, 1             ;[1 9 17 25]

    movq            xm2,    [r0 + 6 * 64]
    movq            xm3,    [r0 + 22 * 64]
    punpcklqdq      xm2,    xm2,  xm3
    movq            xm3,    [r0 + 4 * 64]
    movq            xm4,    [r0 + 20 * 64]
    punpcklqdq      xm3,    xm3,  xm4
    vinserti128     ym2,    ym2,  xm3, 1             ;[6 22 4 20]

    movq            xm3,    [r0 + 3 * 64]
    movq            xm4,    [r0 + 11 * 64]
    punpcklqdq      xm3,    xm3,  xm4
    movq            xm4,    [r0 + 19 * 64]
    movq            xm5,    [r0 + 27 * 64]
    punpcklqdq      xm4,    xm4,  xm5
    vinserti128     ym3,    ym3,  xm4, 1             ;[3 11 19 27]

    movq            xm4,    [r0 + 10 * 64]
    movq            xm5,    [r0 + 26 * 64]
    punpcklqdq      xm4,    xm4,  xm5
    movq            xm5,    [r0 + 8 * 64]
    movq            xm6,    [r0 + 24 * 64]
    punpcklqdq      xm5,    xm5,  xm6
    vinserti128     ym4,    ym4,  xm5, 1             ;[10 26 8 24]

    movq            xm5,    [r0 + 5 * 64]
    movq            xm6,    [r0 + 13 * 64]
    punpcklqdq      xm5,    xm5,  xm6
    movq            xm6,    [r0 + 21 * 64]
    movq            xm7,    [r0 + 29 * 64]
    punpcklqdq      xm6,    xm6,  xm7
    vinserti128     ym5,    ym5,  xm6, 1             ;[5 13 21 29]

    movq            xm6,    [r0 + 14 * 64]
    movq            xm7,    [r0 + 30 * 64]
    punpcklqdq      xm6,    xm6,  xm7
    movq            xm7,    [r0 + 12 * 64]
    movq            xm8,    [r0 + 28 * 64]
    punpcklqdq      xm7,    xm7,  xm8
    vinserti128     ym6,    ym6,  xm7, 1             ;[14 30 12 28]

    movq            xm7,    [r0 + 7 * 64]
    movq            xm8,    [r0 + 15 * 64]
    punpcklqdq      xm7,    xm7,  xm8
    movq            xm8,    [r0 + 23 * 64]
    movq            xm9,    [r0 + 31 * 64]
    punpcklqdq      xm8,    xm8,  xm9
    vinserti128     ym7,    ym7,  xm8, 1             ;[7 15 23 31]

    punpckhwd       ym8, ym0, ym2                  ;[18 22 16 20]
    punpcklwd       ym0, ym2                       ;[2 6 0 4]

    punpckhwd       ym2, ym1, ym3                  ;[9 11 25 27]
    punpcklwd       ym1, ym3                       ;[1 3 17 19]

    punpckhwd       ym3, ym4, ym6                  ;[26 30 24 28]
    punpcklwd       ym4, ym6                       ;[10 14 8 12]

    punpckhwd       ym6, ym5, ym7                  ;[13 15 29 31]
    punpcklwd       ym5, ym7                       ;[5 7 21 23]

    punpckhdq       ym7, ym0, ym4                  ;[22 62 102 142 23 63 103 143 02 42 82 122 03 43 83 123]
    punpckldq       ym0, ym4                       ;[20 60 100 140 21 61 101 141 00 40 80 120 01 41 81 121]

    punpckhdq       ym4, ym8, ym3                  ;[182 222 262 302 183 223 263 303 162 202 242 282 163 203 243 283]
    punpckldq       ym8, ym3                       ;[180 220 260 300 181 221 261 301 160 200 240 280 161 201 241 281]

    punpckhdq       ym3, ym1, ym5                  ;[12 32 52 72 13 33 53 73 172 192 212 232 173 193 213 233]
    punpckldq       ym1, ym5                       ;[10 30 50 70 11 31 51 71 170 190 210 230 171 191 211 231]

    punpckhdq       ym5, ym2, ym6                  ;[92 112 132 152 93 113 133 153 252 272 292 312 253 273 293 313]
    punpckldq       ym2, ym6                       ;[90 110 130 150 91 111 131 151 250 270 290 310 251 271 291 311]

    punpckhqdq      ym6, ym0, ym8                  ;[21 61 101 141 181 221 261 301 01 41 81 121 161 201 241 281]
    punpcklqdq      ym0, ym8                       ;[20 60 100 140 180 220 260 300 00 40 80 120 160 200 240 280]

    punpckhqdq      ym8, ym7, ym4                  ;[23 63 103 143 183 223 263 303 03 43 83 123 163 203 243 283]
    punpcklqdq      ym7, ym4                       ;[22 62 102 142 182 222 262 302 02 42 82 122 162 202 242 282]

    punpckhqdq      ym4, ym1, ym2                  ;[11 31 51 71 91 111 131 151 171 191 211 231 251 271 291 311]
    punpcklqdq      ym1, ym2                       ;[10 30 50 70 90 110 130 150 170 190 210 230 250 270 290 310]

    punpckhqdq      ym2, ym3, ym5                  ;[13 33 53 73 93 113 133 153 173 193 213 233 253 273 293 313]
    punpcklqdq      ym3, ym5                       ;[12 32 52 72 92 112 132 152 172 192 212 232 252 272 292 312]

    vinserti64x4    m7,        m7,      ym7, 1
    vinserti64x4    m8,        m8,      ym8, 1
    movu           m13,        [idct16_AVX512_shuff2]
    movu           m14,        [idct16_AVX512_shuff3]
    vpermi2q       m13,        m7,       m8
    vpermi2q       m14,        m7,       m8

    vinserti64x4    m1,        m1,      ym1, 1
    vinserti64x4    m4,        m4,      ym4, 1
    movu            m7,        [idct16_AVX512_shuff3]
    movu            m8,        [idct16_AVX512_shuff2]
    vpermi2q        m7,        m1,       m4
    vpermi2q        m8,        m1,       m4

    vinserti64x4    m3,        m3,      ym3, 1
    vinserti64x4    m2,        m2,      ym2, 1
    movu            m1,        [idct16_AVX512_shuff3]
    movu            m4,        [idct16_AVX512_shuff2]
    vpermi2q        m1,        m3,       m2
    vpermi2q        m4,        m3,       m2

    vinserti64x4    m0,        m0,      ym0, 1
    vinserti64x4    m6,        m6,      ym6, 1
    movu            m2,        [idct16_AVX512_shuff2]
    movu            m3,        [idct16_AVX512_shuff3]
    vpermi2q        m2,        m0,       m6
    vpermi2q        m3,        m0,       m6


    IDCT32_AVX512_PASS1 0, 16, 20, 24, 25
    IDCT32_AVX512_PASS1 2, 17, 21, 26, 27
    IDCT32_AVX512_PASS1 4, 18, 22, 28, 29
    IDCT32_AVX512_PASS1 6, 19, 23, 30, 31

    add             r0, 8
    add             r3, 4
    add             r4, 4
    dec             r5d
    jnz             .pass1

%if BIT_DEPTH == 12
    %define         IDCT_SHIFT2        8
    vpbroadcastd    m15,                [pd_128]
%elif BIT_DEPTH == 10
    %define         IDCT_SHIFT2        10
    vpbroadcastd    m15,                [pd_512]
%elif BIT_DEPTH == 8
    %define         IDCT_SHIFT2        12
    vpbroadcastd    m15,                [pd_2048]
%else
    %error Unsupported BIT_DEPTH!
%endif

    mov             r3,  rsp
    add             r2d, r2d
    mov             r4d, 16
    mov             r6d, 0xFFFF0000
    kmovd            k3, r6d

    mova            m7,  [tab_idct32_AVX512_6]
    mova            m8,  [tab_idct32_AVX512_6 + 1 * mmsize]
    mova            m9,  [tab_idct32_AVX512_6 + 2 * mmsize]
    mova            m10, [tab_idct32_AVX512_6 + 3 * mmsize]
    mova            m11, [tab_idct32_AVX512_6 + 4 * mmsize]
    mova            m12, [tab_idct32_AVX512_6 + 5 * mmsize]
    mova            m13, [tab_idct32_AVX512_6 + 6 * mmsize]
    mova            m14, [tab_idct32_AVX512_6 + 7 * mmsize]
    mova            m16, [tab_idct32_AVX512_6 + 8 * mmsize]
    mova            m17, [tab_idct32_AVX512_6 + 9 * mmsize]
    mova            m18, [tab_idct32_AVX512_6 + 10 * mmsize]
    mova            m19, [tab_idct32_AVX512_6 + 11 * mmsize]
    mova            m20, [tab_idct32_AVX512_6 + 12 * mmsize]
    mova            m21, [tab_idct32_AVX512_6 + 13 * mmsize]
    mova            m22, [tab_idct32_AVX512_6 + 14 * mmsize]
    mova            m23, [tab_idct32_AVX512_6 + 15 * mmsize]
    mova            m26, [tab_idct32_AVX512_4]
    mova            m27, [tab_idct32_AVX512_4 + 1 * mmsize]
    mova            m28, [tab_idct32_AVX512_4 + 2 * mmsize]
    mova            m29, [tab_idct32_AVX512_4 + 3 * mmsize]
    mova            m30, [tab_idct32_AVX512_4 + 4 * mmsize]
    mova            m31, [tab_idct32_AVX512_4 + 5 * mmsize]

.pass2:
    movu            ym0, [r3]
    movu            ym1, [r3 + 32]
    vmovdqu16        m0  {k3}, [r3 + 32]
    vmovdqu16        m1  {k3}, [r3 + 64]

    IDCT32_AVX512_PASS2
    movu            [r1],      ym6
    movu            [r1 + 32], ym2
    vextracti64x4   ym24,       m6, 1
    vextracti64x4   ym25,       m2, 1
    add             r1,         r2
    movu            [r1],      ym24
    movu            [r1 + 32], ym25

    add             r1, r2
    add             r3, 128
    dec             r4d
    jnz             .pass2
    RET

;-------------------------------------------------------
; void idct4(const int16_t* src, int16_t* dst, intptr_t dstStride)
;-------------------------------------------------------
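; Same two-pass structure as idct32, on a 4x4 block: each pass computes
;     out[j] = (sum_k(in[k] * g_t4[k][j]) + (1 << (shift - 1))) >> shift
; (g_t4 standing for the 4x4 HEVC transform matrix), with
; shift = IDCT_SHIFT1 = 7 in pass 1 and shift = IDCT_SHIFT2 =
; 20 - BIT_DEPTH in pass 2, hence the pd_128/pd_512/pd_2048 rounding
; constants selected below.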
INIT_YMM avx2
cglobal idct4, 3, 4, 6

%define             IDCT_SHIFT1         7
%if BIT_DEPTH == 12
    %define         IDCT_SHIFT2        8
    vpbroadcastd    m5,                [pd_128]
%elif BIT_DEPTH == 10
    %define         IDCT_SHIFT2        10
    vpbroadcastd    m5,                [pd_512]
%elif BIT_DEPTH == 8
    %define         IDCT_SHIFT2        12
    vpbroadcastd    m5,                [pd_2048]
%else
    %error Unsupported BIT_DEPTH!
%endif
    vbroadcasti128  m4, [pd_64]

    add             r2d, r2d
    lea             r3, [r2 * 3]

    movu            m0, [r0]                      ;[00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33]

    pshufb          m0, [idct4_shuf1]             ;[00 02 01 03 10 12 11 13 20 22 21 23 30 32 31 33]
    vextracti128    xm1, m0, 1                    ;[20 22 21 23 30 32 31 33]
    punpcklwd       xm2, xm0, xm1                 ;[00 20 02 22 01 21 03 23]
    punpckhwd       xm0, xm1                      ;[10 30 12 32 11 31 13 33]
    vinserti128     m2, m2, xm2, 1                ;[00 20 02 22 01 21 03 23 00 20 02 22 01 21 03 23]
    vinserti128     m0, m0, xm0, 1                ;[10 30 12 32 11 31 13 33 10 30 12 32 11 31 13 33]

    mova            m1, [avx2_idct4_1]
    mova            m3, [avx2_idct4_1 + 32]
    pmaddwd         m1, m2
    pmaddwd         m3, m0

    paddd           m0, m1, m3
    paddd           m0, m4
    psrad           m0, IDCT_SHIFT1               ;[00 20 10 30 01 21 11 31]

    psubd           m1, m3
    paddd           m1, m4
    psrad           m1, IDCT_SHIFT1               ;[03 23 13 33 02 22 12 32]

    packssdw        m0, m1                        ;[00 20 10 30 03 23 13 33 01 21 11 31 02 22 12 32]
    vmovshdup       m1, m0                        ;[10 30 10 30 13 33 13 33 11 31 11 31 12 32 12 32]
    vmovsldup       m0, m0                        ;[00 20 00 20 03 23 03 23 01 21 01 21 02 22 02 22]

    vpbroadcastq    m2, [avx2_idct4_2]
    vpbroadcastq    m3, [avx2_idct4_2 + 8]
    pmaddwd         m0, m2
    pmaddwd         m1, m3

    paddd           m2, m0, m1
    paddd           m2, m5
    psrad           m2, IDCT_SHIFT2               ;[00 01 10 11 30 31 20 21]

    psubd           m0, m1
    paddd           m0, m5
    psrad           m0, IDCT_SHIFT2               ;[03 02 13 12 33 32 23 22]

    pshufb          m0, [idct4_shuf2]             ;[02 03 12 13 32 33 22 23]
    punpcklqdq      m1, m2, m0                    ;[00 01 02 03 10 11 12 13]
    punpckhqdq      m2, m0                        ;[30 31 32 33 20 21 22 23]
    packssdw        m1, m2                        ;[00 01 02 03 30 31 32 33 10 11 12 13 20 21 22 23]
    vextracti128    xm0, m1, 1

    movq            [r1], xm1
    movq            [r1 + r2], xm0
    movhps          [r1 + 2 * r2], xm0
    movhps          [r1 + r3], xm1
    RET

;static void nonPsyRdoQuant_c(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos)
;{
;    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
;    const int scaleBits = SCALE_BITS - 2 * transformShift;
;    const uint32_t trSize = 1 << log2TrSize;

;    for (int y = 0; y < MLS_CG_SIZE; y++)
;    {
;        for (int x = 0; x < MLS_CG_SIZE; x++)
;        {
;             int signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
;             costUncoded[blkPos + x] = static_cast<int64_t>((double)(signCoef * signCoef)) << scaleBits;
;             *totalUncodedCost += costUncoded[blkPos + x];
;             *totalRdCost += costUncoded[blkPos + x];
;        }
;        blkPos += trSize;
;    }
;}
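;
; Worked example, assuming the usual HEVC values MAX_TR_DYNAMIC_RANGE = 15
; and SCALE_BITS = 15 (check the C source for the actual constants): for
; an 8-bit 4x4 block (log2TrSize = 2), transformShift = 15 - 8 - 2 = 5
; and scaleBits = 15 - 2 * 5 = 5.  The kernels below do not recompute
; this; the per-size shift counts are preloaded in the
; tab_nonpsyRdo8/10/12 tables, indexed by block size.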

;---------------------------------------------------------------------------------------------------------------------------------------------------------
; void nonPsyRdoQuant_c(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos)
;---------------------------------------------------------------------------------------------------------------------------------------------------------
INIT_ZMM avx512
cglobal nonPsyRdoQuant4, 5, 5, 8
    mov            r4d,        r4m
    lea             r0,        [r0 + 2 * r4]
    lea             r4,        [4 * r4]
    lea             r1,        [r1 + 2 * r4]
%if BIT_DEPTH == 12
    mov             r4,        [tab_nonpsyRdo12]
%elif BIT_DEPTH == 10
    mov             r4,        [tab_nonpsyRdo10]
%elif BIT_DEPTH == 8
    mov             r4,        [tab_nonpsyRdo8]
%else
    %error Unsupported BIT_DEPTH!
%endif
    movq           xm3,        r4
    movq           xm6,        [r2]
    movq           xm7,        [r3]
    vpxor           m4,        m4
    vpxor           m5,        m5
;Row 1, 2
    movu           xm0,        [r0]
    vpmovsxwq      m1,         xm0
    vcvtqq2pd      m2,         m1                              ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements
    vfmadd213pd    m2,         m2,             m5              ; Square the coefficients: m2 = m2 * m2 + 0
    vcvtpd2qq      m1,         m2
    vpsllq         m1,         xm3                              ; costUncoded
    paddq          m4,         m1
    movu           [r1],       m1
    ;Row 3, 4
    movu           xm0,        [r0 + 16]
    vpmovsxwq      m1,         xm0
    vcvtqq2pd      m2,         m1
    vfmadd213pd    m2,         m2,             m5
    vcvtpd2qq      m1,         m2
    vpsllq         m1,         xm3                              ; costUncoded
    paddq          m4,         m1
    movu           [r1 + 64],  m1
    vextracti32x8  ym2,        m4,             1
    paddq          ym4,        ym2
    vextracti32x4  xm2,        m4,             1
    paddq          xm4,        xm2
    punpckhqdq     xm2,        xm4,            xm5
    paddq          xm4,        xm2

    paddq          xm6,        xm4
    paddq          xm7,        xm4

    movq           [r2],       xm6
    movq           [r3],       xm7
    RET
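
; nonPsyRdoQuant8/16/32 below follow the same pattern as nonPsyRdoQuant4;
; only the stride between the four coefficient rows, the destination
; offsets and the tab_nonpsyRdo* entry (hence scaleBits) change with the
; block size.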
INIT_ZMM avx512
cglobal nonPsyRdoQuant8, 5, 5, 8
    mov            r4d,        r4m
    lea             r0,        [r0 + 2 * r4]
    lea             r4,        [4 * r4]
    lea             r1,        [r1 + 2 * r4]
%if BIT_DEPTH == 12
    mov             r4,        [tab_nonpsyRdo12 + 8]
%elif BIT_DEPTH == 10
    mov             r4,        [tab_nonpsyRdo10 + 8]
%elif BIT_DEPTH == 8
    mov             r4,        [tab_nonpsyRdo8 + 8]
%else
    %error Unsupported BIT_DEPTH!
%endif
    movq           xm3,        r4
    movq           xm6,        [r2]
    movq           xm7,        [r3]
    vpxor           m4,        m4
    vpxor           m5,        m5

;Row 1, 2
    movq           xm0,        [r0]
    pinsrq         xm0,        [r0 + mmsize/4], 1
    vpmovsxwq      m1,         xm0
    vcvtqq2pd      m2,         m1                              ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements
    vfmadd213pd    m2,         m2,             m5              ; Square the coefficients: m2 = m2 * m2 + 0
    vcvtpd2qq      m1,         m2
    vpsllq         m1,         xm3                              ; costUncoded
    paddq          m4,         m1
    movu           [r1],       ym1
    vextracti32x8  [r1 + mmsize],  m1, 1

    ;Row 3, 4
    movq           xm0,        [r0 + mmsize/2]
    pinsrq         xm0,        [r0 + 3 * mmsize/4],      1
    vpmovsxwq      m1,         xm0
    vcvtqq2pd      m2,         m1
    vfmadd213pd    m2,         m2,             m5
    vcvtpd2qq      m1,         m2
    vpsllq         m1,         xm3                              ; costUncoded
    paddq          m4,         m1
    movu           [r1 + 2 * mmsize], ym1
    vextracti32x8  [r1 + 3 * mmsize], m1, 1

    vextracti32x8  ym2,        m4,             1
    paddq          ym4,        ym2
    vextracti32x4  xm2,        m4,             1
    paddq          xm4,        xm2
    punpckhqdq     xm2,        xm4,            xm5
    paddq          xm4,        xm2

    paddq          xm6,        xm4
    paddq          xm7,        xm4

    movq           [r2],       xm6
    movq           [r3],       xm7
    RET
INIT_ZMM avx512
cglobal nonPsyRdoQuant16, 5, 5, 8
    mov            r4d,        r4m
    lea             r0,        [r0 + 2 * r4]
    lea             r4,        [4 * r4]
    lea             r1,        [r1 + 2 * r4]
%if BIT_DEPTH == 12
    mov             r4,        [tab_nonpsyRdo12 + 16]
%elif BIT_DEPTH == 10
    mov             r4,        [tab_nonpsyRdo10 + 16]
%elif BIT_DEPTH == 8
    mov             r4,        [tab_nonpsyRdo8 + 16]
%else
    %error Unsupported BIT_DEPTH!
%endif
    movq           xm3,        r4
    movq           xm6,        [r2]
    movq           xm7,        [r3]
    vpxor           m4,        m4
    vpxor           m5,        m5

;Row 1, 2
    movq           xm0,        [r0]
    pinsrq         xm0,        [r0 + mmsize/2],       1
    vpmovsxwq      m1,         xm0
    vcvtqq2pd      m2,         m1                              ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements
    vfmadd213pd    m2,         m2,             m5              ; Square the coefficients: m2 = m2 * m2 + 0
    vcvtpd2qq      m1,         m2
    vpsllq         m1,         xm3                              ; costUncoded
    paddq          m4,         m1
    movu           [r1],       ym1
    vextracti32x8  [r1 + 2 * mmsize],  m1,     1

    ;Row 3, 4
    movq           xm0,        [r0 + mmsize]
    pinsrq         xm0,        [r0 + 3 * mmsize/2],      1
    vpmovsxwq      m1,         xm0
    vcvtqq2pd      m2,         m1
    vfmadd213pd    m2,         m2,             m5
    vcvtpd2qq      m1,         m2
    vpsllq         m1,         xm3                              ; costUncoded
    paddq          m4,         m1
    movu           [r1 + 4 * mmsize],         ym1
    vextracti32x8  [r1 + 6 * mmsize],          m1, 1

    vextracti32x8  ym2,        m4,             1
    paddq          ym4,        ym2
    vextracti32x4  xm2,        m4,             1
    paddq          xm4,        xm2
    punpckhqdq     xm2,        xm4,            xm5
    paddq          xm4,        xm2

    paddq          xm6,        xm4
    paddq          xm7,        xm4

    movq           [r2],       xm6
    movq           [r3],       xm7
    RET
INIT_ZMM avx512
cglobal nonPsyRdoQuant32, 5, 5, 8
    mov            r4d,        r4m
    lea             r0,        [r0 + 2 * r4]
    lea             r4,        [4 * r4]
    lea             r1,        [r1 + 2 * r4]
%if BIT_DEPTH == 12
    mov             r4,        [tab_nonpsyRdo12 + 24]
%elif BIT_DEPTH == 10
    mov             r4,        [tab_nonpsyRdo10 + 24]
%elif BIT_DEPTH == 8
    mov             r4,        [tab_nonpsyRdo8 + 24]
%else
    %error Unsupported BIT_DEPTH!
%endif
    movq           xm3,        r4
    movq           xm6,        [r2]
    movq           xm7,        [r3]
    vpxor           m4,        m4
    vpxor           m5,        m5

;Row 1, 2
    movq           xm0,        [r0]
    pinsrq         xm0,        [r0 + mmsize],  1
    vpmovsxwq      m1,         xm0
    vcvtqq2pd      m2,         m1                              ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements
    vfmadd213pd    m2,         m2,             m5              ; Square the coefficients: m2 = m2 * m2 + 0
    vcvtpd2qq      m1,         m2
    vpsllq         m1,         xm3                              ; costUncoded
    paddq          m4,         m1
    movu           [r1],       ym1
    vextracti32x8  [r1 + 4 * mmsize],  m1,     1

    ;Row 3, 4
    movq           xm0,        [r0 + 2 * mmsize]
    pinsrq         xm0,        [r0 + 3 * mmsize],      1
    vpmovsxwq      m1,         xm0
    vcvtqq2pd      m2,         m1
    vfmadd213pd    m2,         m2,             m5
    vcvtpd2qq      m1,         m2
    vpsllq         m1,         xm3                              ; costUncoded
    paddq          m4,         m1
    movu           [r1 + 8 * mmsize],         ym1
    vextracti32x8  [r1 + 12 * mmsize],         m1, 1

    vextracti32x8  ym2,        m4,             1
    paddq          ym4,        ym2
    vextracti32x4  xm2,        m4,             1
    paddq          xm4,        xm2
    punpckhqdq     xm2,        xm4,            xm5
    paddq          xm4,        xm2

    paddq          xm6,        xm4
    paddq          xm7,        xm4

    movq           [r2],       xm6
    movq           [r3],       xm7
    RET
;static void psyRdoQuant_c(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
;{
;    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
;    const int scaleBits = SCALE_BITS - 2 * transformShift;
;    const uint32_t trSize = 1 << log2TrSize;
;    int max = X265_MAX(0, (2 * transformShift + 1));
;
;    for (int y = 0; y < MLS_CG_SIZE; y++)
;    {
;        for (int x = 0; x < MLS_CG_SIZE; x++)
;        {
;            int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
;            int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT */
;
;            costUncoded[blkPos + x] = static_cast<int64_t>((double)(signCoef * signCoef)) << scaleBits;
;
;            /* when no residual coefficient is coded, predicted coef == recon coef */
;            costUncoded[blkPos + x] -= static_cast<int64_t>((*psyScale * (predictedCoef)) >> max);
;
;            *totalUncodedCost += costUncoded[blkPos + x];
;            *totalRdCost += costUncoded[blkPos + x];
;        }
;        blkPos += trSize;
;    }
;}
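;
; Under the same assumptions as above, max = 2 * transformShift + 1
; evaluates to 11/9/7/5 for 8-bit 4x4/8x8/16x16/32x32 blocks, which is
; what the RDO_MAX_4/8/16/32 shift counts used below are expected to
; encode.  The kernels widen the int16 coefficients to int64 and form
; both 64-bit products in double precision (vfmadd213pd / vcvtpd2qq)
; before the final shifts; per coefficient this is roughly:
;
;    cost = ((int64_t)(double)(c * c) << scaleBits)
;         - ((int64_t)((double)*psyScale * (double)(fenc - c)) >> max);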

;---------------------------------------------------------------------------------------------------------------------------------------------------------
; void psyRdoQuant(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
;---------------------------------------------------------------------------------------------------------------------------------------------------------
INIT_ZMM avx512
cglobal psyRdoQuant4, 5, 9, 13
%if WIN64
    mov             r5,        r5m
%endif
    mov            r6d,        r6m
    vpbroadcastq   m12,        [r5]                              ; psyScale
    lea             r0,        [r0 + 2 * r6]
    lea             r1,        [r1 + 2 * r6]
    lea             r6,        [4 * r6]
    lea             r2,        [r2 + 2 * r6]
    movq           xm0,        [r3]
    movq           xm1,        [r4]

%if BIT_DEPTH == 12
    mov            r5,         [tab_nonpsyRdo12]                 ; scaleBits
%elif BIT_DEPTH == 10
    mov            r5,         [tab_nonpsyRdo10]
%elif BIT_DEPTH == 8
    mov            r5,         [tab_nonpsyRdo8]
%else
    %error Unsupported BIT_DEPTH!
%endif

    movq           xm2,        r5
    vpxor           m4,        m4
    vpxor           m3,        m3

;Row 1, 2
    vpmovsxwq       m6,        [r0]
    vpmovsxwq       m7,        [r1]
    psubq           m7,        m6                              ; predictedCoef

    vcvtqq2pd       m9,        m6
    vfmadd213pd     m9,        m9,             m3
    vcvtpd2qq       m8,        m9
    vpsllq          m8,        xm2                             ;(signCoef * signCoef) << scaleBits

    vcvtqq2pd      m10,        m7
    vcvtqq2pd      m11,        m12
    vfmadd213pd    m10,        m11,            m3
    vcvtpd2qq       m9,        m10
    vpsraq          m9,        RDO_MAX_4                       ;(psyScale * predictedCoef) >> max

    psubq           m8,        m9
    paddq           m4,        m8
    movu           [r2],       m8

    ;Row 3, 4
    vpmovsxwq       m6,        [r0 + 16]
    vpmovsxwq       m7,        [r1 + 16]
    psubq           m7,        m6                              ; predictedCoef

    vcvtqq2pd       m9,        m6
    vfmadd213pd     m9,        m9,             m3
    vcvtpd2qq       m8,        m9
    vpsllq          m8,        xm2                             ;(signCoef * signCoef) << scaleBits

    vcvtqq2pd      m10,        m7
    vcvtqq2pd      m11,        m12
    vfmadd213pd    m10,        m11,             m3
    vcvtpd2qq       m9,        m10
    vpsraq          m9,        RDO_MAX_4                      ;(psyScale * predictedCoef) >> max

    psubq           m8,         m9
    paddq           m4,         m8
    movu           [r2 + 64],   m8

    vextracti32x8  ym2,         m4,            1
    paddq          ym4,        ym2
    vextracti32x4  xm2,         m4,            1
    paddq          xm4,        xm2
    punpckhqdq     xm2,        xm4,            xm3
    paddq          xm4,        xm2

    paddq          xm0,        xm4
    paddq          xm1,        xm4

    movq           [r3],       xm0
    movq           [r4],       xm1
    RET
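
; psyRdoQuant8/16/32 below repeat this block with the per-size row
; strides, destination offsets, tab_nonpsyRdo* entries and RDO_MAX_*
; shift counts swapped in.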

;---------------------------------------------------------------------------------------------------------------------------------------------------------
; void psyRdoQuant(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
;---------------------------------------------------------------------------------------------------------------------------------------------------------
INIT_ZMM avx512
cglobal psyRdoQuant8, 5, 9, 15
%if WIN64
    mov             r5,        r5m
%endif
    mov            r6d,        r6m
    vpbroadcastq   m12,        [r5]                              ; psyScale
    lea             r0,        [r0 + 2 * r6]
    lea             r1,        [r1 + 2 * r6]
    lea             r6,        [4 * r6]
    lea             r2,        [r2 + 2 * r6]
    movq           xm0,        [r3]
    movq           xm1,        [r4]

%if BIT_DEPTH == 12
    mov            r5,         [tab_nonpsyRdo12 + 8]                 ; scaleBits
%elif BIT_DEPTH == 10
    mov            r5,         [tab_nonpsyRdo10 + 8]
%elif BIT_DEPTH == 8
    mov            r5,         [tab_nonpsyRdo8 + 8]
%else
    %error Unsupported BIT_DEPTH!
%endif

    movq           xm2,        r5
    vpxor           m4,        m4
    vpxor           m3,        m3

;Row 1, 2
    movq           xm13,       [r0]
    movq           xm14,       [r1]
    pinsrq         xm13,       [r0 + mmsize/4], 1
    pinsrq         xm14,       [r1 + mmsize/4], 1
    vpmovsxwq       m6,        xm13
    vpmovsxwq       m7,        xm14
    psubq           m7,        m6                              ; predictedCoef

    vcvtqq2pd       m9,        m6
    vfmadd213pd     m9,        m9,             m3
    vcvtpd2qq       m8,        m9
    vpsllq          m8,        xm2                             ;(signCoef * signCoef) << scaleBits

    vcvtqq2pd      m10,        m7
    vcvtqq2pd      m11,        m12
    vfmadd213pd    m10,        m11,            m3
    vcvtpd2qq       m9,        m10
    vpsraq          m9,        RDO_MAX_8                       ;(psyScale * predictedCoef) >> max

    psubq           m8,        m9
    paddq           m4,        m8
    movu           [r2],       ym8
    vextracti32x8  [r2 + mmsize],  m8, 1

    ;Row 3, 4
    movq           xm13,       [r0 + mmsize/2]
    movq           xm14,       [r1 + mmsize/2]
    pinsrq         xm13,       [r0 + 3 * mmsize/4],      1
    pinsrq         xm14,       [r1 + 3 * mmsize/4],      1
    vpmovsxwq       m6,        xm13
    vpmovsxwq       m7,        xm14
    psubq           m7,        m6                              ; predictedCoef

    vcvtqq2pd       m9,        m6
    vfmadd213pd     m9,        m9,             m3
    vcvtpd2qq       m8,        m9
    vpsllq          m8,        xm2                             ;(signCoef * signCoef) << scaleBits

    vcvtqq2pd      m10,        m7
    vcvtqq2pd      m11,        m12
    vfmadd213pd    m10,        m11,             m3
    vcvtpd2qq       m9,        m10
    vpsraq          m9,        RDO_MAX_8                      ;(psyScale * predictedCoef) >> max

    psubq           m8,         m9
    paddq           m4,         m8
    movu           [r2 + 2 * mmsize],       ym8
    vextracti32x8  [r2 + 3 * mmsize],  m8, 1

    vextracti32x8  ym2,         m4,            1
    paddq          ym4,        ym2
    vextracti32x4  xm2,         m4,            1
    paddq          xm4,        xm2
    punpckhqdq     xm2,        xm4,            xm3
    paddq          xm4,        xm2

    paddq          xm0,        xm4
    paddq          xm1,        xm4

    movq           [r3],       xm0
    movq           [r4],       xm1
    RET

;---------------------------------------------------------------------------------------------------------------------------------------------------------
; void psyRdoQuant(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
;---------------------------------------------------------------------------------------------------------------------------------------------------------
INIT_ZMM avx512
cglobal psyRdoQuant16, 5, 9, 15
%if WIN64
    mov             r5,        r5m
%endif
    mov            r6d,        r6m
    vpbroadcastq   m12,        [r5]                              ; psyScale
    lea             r0,        [r0 + 2 * r6]
    lea             r1,        [r1 + 2 * r6]
    lea             r6,        [4 * r6]
    lea             r2,        [r2 + 2 * r6]
    movq           xm0,        [r3]
    movq           xm1,        [r4]

%if BIT_DEPTH == 12
    mov            r5,         [tab_nonpsyRdo12 + 16]                 ; scaleBits
%elif BIT_DEPTH == 10
    mov            r5,         [tab_nonpsyRdo10 + 16]
%elif BIT_DEPTH == 8
    mov            r5,         [tab_nonpsyRdo8 + 16]
%else
    %error Unsupported BIT_DEPTH!
%endif

    movq           xm2,        r5
    vpxor           m4,        m4
    vpxor           m3,        m3

;Row 1, 2
    movq           xm13,       [r0]
    movq           xm14,       [r1]
    pinsrq         xm13,       [r0 + mmsize/2], 1
    pinsrq         xm14,       [r1 + mmsize/2], 1
    vpmovsxwq       m6,        xm13
    vpmovsxwq       m7,        xm14
    psubq           m7,        m6                              ; predictedCoef

    vcvtqq2pd       m9,        m6
    vfmadd213pd     m9,        m9,             m3
    vcvtpd2qq       m8,        m9
    vpsllq          m8,        xm2                             ;(signCoef * signCoef) << scaleBits

    vcvtqq2pd      m10,        m7
    vcvtqq2pd      m11,        m12
    vfmadd213pd    m10,        m11,            m3
    vcvtpd2qq       m9,        m10
    vpsraq          m9,        RDO_MAX_16                      ;(psyScale * predictedCoef) >> max

    psubq           m8,        m9
    paddq           m4,        m8
    movu           [r2],       ym8
    vextracti32x8  [r2 + 2 * mmsize],  m8, 1

    ;Row 3, 4
    movq           xm13,       [r0 + mmsize]
    movq           xm14,       [r1 + mmsize]
    pinsrq         xm13,       [r0 + 3 * mmsize/2],      1
    pinsrq         xm14,       [r1 + 3 * mmsize/2],      1
    vpmovsxwq       m6,        xm13
    vpmovsxwq       m7,        xm14
    psubq           m7,        m6                              ; predictedCoef

    vcvtqq2pd       m9,        m6
    vfmadd213pd     m9,        m9,             m3
    vcvtpd2qq       m8,        m9
    vpsllq          m8,        xm2                             ;(signCoef * signCoef) << scaleBits

    vcvtqq2pd      m10,        m7
    vcvtqq2pd      m11,        m12
    vfmadd213pd    m10,        m11,             m3
    vcvtpd2qq       m9,        m10
    vpsraq          m9,        RDO_MAX_16                      ;(psyScale * predictedCoef) >> max

    psubq           m8,         m9
    paddq           m4,         m8
    movu           [r2 + 4 * mmsize],       ym8
    vextracti32x8  [r2 + 6 * mmsize],  m8, 1

    vextracti32x8  ym2,         m4,            1
    paddq          ym4,        ym2
    vextracti32x4  xm2,         m4,            1
    paddq          xm4,        xm2
    punpckhqdq     xm2,        xm4,            xm3
    paddq          xm4,        xm2

    paddq          xm0,        xm4
    paddq          xm1,        xm4

    movq           [r3],       xm0
    movq           [r4],       xm1
    RET

;---------------------------------------------------------------------------------------------------------------------------------------------------------
; void psyRdoQuant(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
;---------------------------------------------------------------------------------------------------------------------------------------------------------
INIT_ZMM avx512
cglobal psyRdoQuant32, 5, 9, 15
%if WIN64
    mov             r5,        r5m
%endif
    mov            r6d,        r6m
    vpbroadcastq   m12,        [r5]                              ; psyScale
    lea             r0,        [r0 + 2 * r6]
    lea             r1,        [r1 + 2 * r6]
    lea             r6,        [4 * r6]
    lea             r2,        [r2 + 2 * r6]
    movq           xm0,        [r3]
    movq           xm1,        [r4]

%if BIT_DEPTH == 12
    mov            r5,         [tab_nonpsyRdo12 + 24]                 ; scaleBits
%elif BIT_DEPTH == 10
    mov            r5,         [tab_nonpsyRdo10 + 24]
%elif BIT_DEPTH == 8
    mov            r5,         [tab_nonpsyRdo8 + 24]
%else
    %error Unsupported BIT_DEPTH!
%endif

    movq           xm2,        r5
    vpxor           m4,        m4
    vpxor           m3,        m3

;Row 1, 2
    movq           xm13,       [r0]
    movq           xm14,       [r1]
    pinsrq         xm13,       [r0 + mmsize], 1
    pinsrq         xm14,       [r1 + mmsize], 1
    vpmovsxwq       m6,        xm13
    vpmovsxwq       m7,        xm14
    psubq           m7,        m6                              ; predictedCoef

    vcvtqq2pd       m9,        m6
    vfmadd213pd     m9,        m9,             m3
    vcvtpd2qq       m8,        m9
    vpsllq          m8,        xm2                             ;(signCoef * signCoef) << scaleBits

    vcvtqq2pd      m10,        m7
    vcvtqq2pd      m11,        m12
    vfmadd213pd    m10,        m11,            m3
    vcvtpd2qq       m9,        m10
    vpsraq          m9,        RDO_MAX_32                      ;(psyScale * predictedCoef) >> max

    psubq           m8,        m9
    paddq           m4,        m8
    movu           [r2],       ym8
    vextracti32x8  [r2 + 4 * mmsize],  m8, 1

    ;Row 3, 4
    movq           xm13,       [r0 + 2 * mmsize]
    movq           xm14,       [r1 + 2 * mmsize]
    pinsrq         xm13,       [r0 + 3 * mmsize],      1
    pinsrq         xm14,       [r1 + 3 * mmsize],      1
    vpmovsxwq       m6,        xm13
    vpmovsxwq       m7,        xm14
    psubq           m7,        m6                              ; predictedCoef

    vcvtqq2pd       m9,        m6
    vfmadd213pd     m9,        m9,             m3
    vcvtpd2qq       m8,        m9
    vpsllq          m8,        xm2                             ;(signCoef * signCoef) << scaleBits

    vcvtqq2pd      m10,        m7
    vcvtqq2pd      m11,        m12
    vfmadd213pd    m10,        m11,             m3
    vcvtpd2qq       m9,        m10
    vpsraq          m9,        RDO_MAX_32                      ;(psyScale * predictedCoef) >> max

    psubq           m8,         m9
    paddq           m4,         m8
    movu           [r2 + 8 * mmsize],       ym8
    vextracti32x8  [r2 + 12 * mmsize], m8, 1

    vextracti32x8  ym2,         m4,            1
    paddq          ym4,        ym2
    vextracti32x4  xm2,         m4,            1
    paddq          xm4,        xm2
    punpckhqdq     xm2,        xm4,            xm3
    paddq          xm4,        xm2

    paddq          xm0,        xm4
    paddq          xm1,        xm4

    movq           [r3],       xm0
    movq           [r4],       xm1
    RET

INIT_YMM avx2
cglobal nonPsyRdoQuant4, 5, 9, 16
    mov            r4d,        r4m
    lea             r0,        [r0 + 2 * r4]
    lea             r4,        [4 * r4]
    lea             r1,        [r1 + 2 * r4]
    movq           xm0,        [r2]
    movq           xm1,        [r3]

%if BIT_DEPTH == 12
    mov            r5,         [tab_nonpsyRdo12]                 ; scaleBits
%elif BIT_DEPTH == 10
    mov            r5,         [tab_nonpsyRdo10]
%elif BIT_DEPTH == 8
    mov            r5,         [tab_nonpsyRdo8]
%else
    %error Unsupported BIT_DEPTH!
%endif
    movq           xm2,        r5
    vpxor           m4,        m4
    vpxor           m3,        m3
    vpxor           m13,       m13

    vpmovsxwd                  m6,        [r0]
    vcvtdq2pd                  m9,        xm6
    vfmadd213pd                m9,        m9,             m3
    vcvtpd2dq                  xm8,       m9
    vpmovsxdq                  m13,       xm8                              ; 32 bit int to 64 bit int
    vpsllq                     m13,       xm2                             ;(signCoef * signCoef) << scaleBits
    paddq                      m4,        m13
    movu                       [r1],       m13

    vpmovsxwd                 m6,        [r0 + 8]
    vcvtdq2pd                 m9,        xm6
    vfmadd213pd               m9,        m9,             m3
    vcvtpd2dq                 xm8,       m9
    vpmovsxdq                 m13,       xm8                              ; 32 bit int to 64 bit int
    vpsllq                    m13,       xm2                             ;(signCoef * signCoef) << scaleBits
    paddq                     m4,        m13
    movu                      [r1 + 32], m13

    vpmovsxwd                 m6,        [r0 + 16]
    vcvtdq2pd                 m9,        xm6
    vfmadd213pd               m9,        m9,             m3
    vcvtpd2dq                 xm8,       m9
    vpmovsxdq                 m13,       xm8                              ; 32 bit int to 64 bit int
    vpsllq                    m13,       xm2                             ;(signCoef * signCoef) << scaleBits
    paddq                     m4,        m13
    movu                      [r1 + 64], m13

    vpmovsxwd                 m6,        [r0 + 24]
    vcvtdq2pd                 m9,        xm6
    vfmadd213pd               m9,        m9,             m3
    vcvtpd2dq                 xm8,       m9
    vpmovsxdq                 m13,       xm8                              ; 32 bit int to 64 bit int
    vpsllq                    m13,       xm2                             ;(signCoef * signCoef) << scaleBits
    paddq                     m4,        m13
    movu                      [r1 + 96], m13


    vextracti128              xm2,       m4,            1
    paddq                     xm4,       xm2
    punpckhqdq                xm2,       xm4,            xm3
    paddq                     xm4,       xm2

    paddq                     xm0,       xm4
    paddq                     xm1,       xm4

    movq                      [r2],      xm0
    movq                      [r3],      xm1
    RET
7133
7134
7135
7136INIT_YMM avx2
7137cglobal nonPsyRdoQuant8, 5, 5, 8
7138    mov            r4d,        r4m
7139    lea             r0,        [r0 + 2 * r4]
7140    lea             r4,        [4 * r4]
7141    lea             r1,        [r1 + 2 * r4]
7142%if BIT_DEPTH == 12
7143    mov             r4,        [tab_nonpsyRdo12 + 8]
7144%elif BIT_DEPTH == 10
7145    mov             r4,        [tab_nonpsyRdo10 + 8]
7146%elif BIT_DEPTH == 8
7147    mov             r4,        [tab_nonpsyRdo8 + 8]
7148%else
7149    %error Unsupported BIT_DEPTH!
7150 %endif
7151    movq           xm3,        r4
7152    movq           xm6,        [r2]
7153    movq           xm7,        [r3]
7154    vpxor           m4,        m4
7155    vpxor           m5,        m5
7156    movq           xm0,        [r0]
7157    vpmovsxwd       m1,         xm0
7158    vcvtdq2pd       m2,         xm1                              ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements
7159    vfmadd213pd     m2,         m2,             m5              ; Multiply packed double-precision (64-bit) floating-point elements
7160    vcvtpd2dq       xm1,        m2
7161    vpmovsxdq       m0 ,        xm1
7162    vpsllq          m0,         xm3                              ; costUncoded
7163    paddq           m4,         m0
7164    movu            [r1],       ym0
7165    vpxor           m0,         m0
    movq            xm0,        [r0 + mmsize/2]
    vpmovsxwd       m1,         xm0
    vcvtdq2pd       m2,         xm1                              ; convert packed 32-bit integers to packed double-precision floats
    vfmadd213pd     m2,         m2,             m5              ; m2 = m2 * m2 + m5 (m5 == 0): square the coefficients
    vcvtpd2dq       xm1,        m2
    vpmovsxdq       m0,         xm1
    vpsllq          m0,         xm3                              ; costUncoded
    paddq           m4,         m0
    movu            [r1 + 2*mmsize],      m0
7175    vpxor           m0,         m0
    movq            xm0,        [r0 + mmsize]
    vpmovsxwd       m1,         xm0
    vcvtdq2pd       m2,         xm1                              ; convert packed 32-bit integers to packed double-precision floats
    vfmadd213pd     m2,         m2,             m5              ; m2 = m2 * m2 + m5 (m5 == 0): square the coefficients
    vcvtpd2dq       xm1,        m2
    vpmovsxdq       m0,         xm1
    vpsllq          m0,         xm3                              ; costUncoded
    paddq           m4,         m0
    movu            [r1 + 4*mmsize],      m0
7185    vpxor           m0,         m0
    movq            xm0,        [r0 + 3*mmsize/2]
    vpmovsxwd       m1,         xm0
    vcvtdq2pd       m2,         xm1                              ; convert packed 32-bit integers to packed double-precision floats
    vfmadd213pd     m2,         m2,             m5              ; m2 = m2 * m2 + m5 (m5 == 0): square the coefficients
    vcvtpd2dq       xm1,        m2
    vpmovsxdq       m0,         xm1
    vpsllq          m0,         xm3                              ; costUncoded
    paddq           m4,         m0
    movu            [r1 + 6*mmsize],      m0
7195
7196    vextracti128    xm2,        m4,             1
7197    paddq           xm4,        xm2
7198    punpckhqdq      xm2,        xm4,            xm5
7199    paddq           xm4,        xm2
7200
7201    paddq          xm6,        xm4
7202    paddq          xm7,        xm4
7203
7204    movq           [r2],       xm6
7205    movq           [r3],       xm7
7206    RET
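
;-----------------------------------------------------------------------------
; nonPsyRdoQuant16: same operation as nonPsyRdoQuant8 (prototype assumed
; identical), for one 4x4 group of a 16x16 block (row stride: 16
; coefficients).
;-----------------------------------------------------------------------------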
7207INIT_YMM avx2
7208cglobal nonPsyRdoQuant16, 5, 5, 8
7209    mov            r4d,        r4m
7210    lea             r0,        [r0 + 2 * r4]
7211    lea             r4,        [4 * r4]
7212    lea             r1,        [r1 + 2 * r4]
7213%if BIT_DEPTH == 12
7214    mov             r4,        [tab_nonpsyRdo12 + 16]
7215%elif BIT_DEPTH == 10
7216    mov             r4,        [tab_nonpsyRdo10 + 16]
7217%elif BIT_DEPTH == 8
7218    mov             r4,        [tab_nonpsyRdo8 + 16]
7219%else
7220    %error Unsupported BIT_DEPTH!
%endif
7222    movq           xm3,        r4
7223    movq           xm6,        [r2]
7224    movq           xm7,        [r3]
7225    vpxor           m4,        m4
7226    vpxor           m5,        m5
7227
; process the 4x4 group one row (4 coefficients) at a time
7229    movq           xm0,        [r0]
7230    vpmovsxwd      m1,         xm0
    vcvtdq2pd      m2,         xm1                              ; convert packed 32-bit integers to packed double-precision floats
    vfmadd213pd    m2,         m2,             m5              ; m2 = m2 * m2 + m5 (m5 == 0): square the coefficients
    vcvtpd2dq      xm1,        m2
    vpmovsxdq      m0,         xm1
    vpsllq         m0,         xm3                              ; costUncoded
7236    paddq          m4,         m0
7237    movu           [r1],       ym0
7238
    movq           xm0,        [r0 + mmsize]
    vpmovsxwd      m1,         xm0
    vcvtdq2pd      m2,         xm1                              ; convert packed 32-bit integers to packed double-precision floats
    vfmadd213pd    m2,         m2,             m5              ; m2 = m2 * m2 + m5 (m5 == 0): square the coefficients
    vcvtpd2dq      xm1,        m2
    vpmovsxdq      m0,         xm1
    vpsllq         m0,         xm3                              ; costUncoded
    paddq          m4,         m0
    movu           [r1 + 4*mmsize],      ym0
7248
7249    movq           xm0,        [r0 + 2*mmsize]
7250    vpmovsxwd      m1,         xm0
    vcvtdq2pd      m2,         xm1                              ; convert packed 32-bit integers to packed double-precision floats
    vfmadd213pd    m2,         m2,             m5              ; m2 = m2 * m2 + m5 (m5 == 0): square the coefficients
    vcvtpd2dq      xm1,        m2
    vpmovsxdq      m0,         xm1
    vpsllq         m0,         xm3                              ; costUncoded
    paddq          m4,         m0
    movu           [r1 + 8*mmsize],      ym0
7258
7259    movq           xm0,        [r0 + 3*mmsize]
7260    vpmovsxwd      m1,         xm0
    vcvtdq2pd      m2,         xm1                              ; convert packed 32-bit integers to packed double-precision floats
    vfmadd213pd    m2,         m2,             m5              ; m2 = m2 * m2 + m5 (m5 == 0): square the coefficients
    vcvtpd2dq      xm1,        m2
    vpmovsxdq      m0,         xm1
    vpsllq         m0,         xm3                              ; costUncoded
    paddq          m4,         m0
    movu           [r1 + 12*mmsize],      ym0
7268
7269
    vextracti128   xm2,        m4,             1
7271    paddq          xm4,        xm2
7272    punpckhqdq     xm2,        xm4,            xm5
7273    paddq          xm4,        xm2
7274
7275    paddq          xm6,        xm4
7276    paddq          xm7,        xm4
7277
7278    movq           [r2],       xm6
7279    movq           [r3],       xm7
7280    RET
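
;-----------------------------------------------------------------------------
; nonPsyRdoQuant32: same operation as nonPsyRdoQuant8 (prototype assumed
; identical), for one 4x4 group of a 32x32 block (row stride: 32
; coefficients).
;-----------------------------------------------------------------------------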
7281INIT_YMM avx2
7282cglobal nonPsyRdoQuant32, 5, 5, 8
7283    mov            r4d,        r4m
7284    lea             r0,        [r0 + 2 * r4]
7285    lea             r4,        [4 * r4]
7286    lea             r1,        [r1 + 2 * r4]
7287%if BIT_DEPTH == 12
7288    mov             r4,        [tab_nonpsyRdo12 + 24]
7289%elif BIT_DEPTH == 10
7290    mov             r4,        [tab_nonpsyRdo10 + 24]
7291%elif BIT_DEPTH == 8
7292    mov             r4,        [tab_nonpsyRdo8 + 24]
7293%else
7294    %error Unsupported BIT_DEPTH!
%endif
7296    movq           xm3,        r4
7297    movq           xm6,        [r2]
7298    movq           xm7,        [r3]
7299    vpxor           m4,        m4
7300    vpxor           m5,        m5
7301
7302    movq           xm0,        [r0]
7303    vpmovsxwd      m1,         xm0
    vcvtdq2pd      m2,         xm1                              ; convert packed 32-bit integers to packed double-precision floats
    vfmadd213pd    m2,         m2,             m5              ; m2 = m2 * m2 + m5 (m5 == 0): square the coefficients
    vcvtpd2dq      xm1,        m2
    vpmovsxdq      m0,         xm1
    vpsllq         m0,         xm3                              ; costUncoded
7309    paddq          m4,         m0
7310    movu           [r1],       m0
7311    vpxor           m0,        m0
7312
    movq           xm0,        [r0 + 2*mmsize]
    vpmovsxwd      m1,         xm0
    vcvtdq2pd      m2,         xm1                              ; convert packed 32-bit integers to packed double-precision floats
    vfmadd213pd    m2,         m2,             m5              ; m2 = m2 * m2 + m5 (m5 == 0): square the coefficients
    vcvtpd2dq      xm1,        m2
    vpmovsxdq      m0,         xm1
    vpsllq         m0,         xm3                              ; costUncoded
    paddq          m4,         m0
    movu           [r1 + 8*mmsize],       m0
7322    vpxor           m0,        m0
7323
    movq           xm0,        [r0 + 4*mmsize]
    vpmovsxwd      m1,         xm0
    vcvtdq2pd      m2,         xm1                              ; convert packed 32-bit integers to packed double-precision floats
    vfmadd213pd    m2,         m2,             m5              ; m2 = m2 * m2 + m5 (m5 == 0): square the coefficients
    vcvtpd2dq      xm1,        m2
    vpmovsxdq      m0,         xm1
    vpsllq         m0,         xm3                              ; costUncoded
    paddq          m4,         m0
    movu           [r1 + 16*mmsize],       m0
7333    vpxor           m0,        m0
7334
    movq           xm0,        [r0 + 6*mmsize]
    vpmovsxwd      m1,         xm0
    vcvtdq2pd      m2,         xm1                              ; convert packed 32-bit integers to packed double-precision floats
    vfmadd213pd    m2,         m2,             m5              ; m2 = m2 * m2 + m5 (m5 == 0): square the coefficients
    vcvtpd2dq      xm1,        m2
    vpmovsxdq      m0,         xm1
    vpsllq         m0,         xm3                              ; costUncoded
    paddq          m4,         m0
    movu           [r1 + 24*mmsize],       m0
7344
7345    vextracti128   xm2,        m4,             1
7346    paddq          xm4,        xm2
7347    punpckhqdq     xm2,        xm4,            xm5
7348    paddq          xm4,        xm2
7349
7350    paddq          xm6,        xm4
7351    paddq          xm7,        xm4
7352
7353    movq           [r2],       xm6
7354    movq           [r3],       xm7
7355    RET
7356
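;-----------------------------------------------------------------------------
; psyRdoQuant_1p4: apparently the uncoded-cost pass of the split psy-RDOQ
; primitive; it computes the same (signCoef * signCoef) << scaleBits costs
; as the nonPsyRdoQuant kernels above, here for a 4x4 block (row stride: 4
; coefficients). Prototype assumed to match nonPsyRdoQuant8's.
;-----------------------------------------------------------------------------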
7357INIT_YMM avx2
7358cglobal psyRdoQuant_1p4, 5, 9, 16
7359    mov            r4d,        r4m
7360    lea             r0,        [r0 + 2 * r4]
7361    lea             r4,        [4 * r4]
7362    lea             r1,        [r1 + 2 * r4]
7363    movq           xm0,        [r2]
7364    movq           xm1,        [r3]
7365
7366%if BIT_DEPTH == 12
7367    mov            r5,         [tab_nonpsyRdo12]                 ; scaleBits
7368%elif BIT_DEPTH == 10
7369    mov            r5,         [tab_nonpsyRdo10]
7370%elif BIT_DEPTH == 8
7371    mov            r5,         [tab_nonpsyRdo8]
7372%else
7373    %error Unsupported BIT_DEPTH!
7374%endif
7375    movq           xm2,        r5
7376    vpxor           m4,        m4
7377    vpxor           m3,        m3
7378    vpxor           m13,       m13
7379
    vpmovsxwd                  m6,        [r0]
    vcvtdq2pd                  m9,        xm6                             ; convert packed 32-bit integers to packed double-precision floats
    vfmadd213pd                m9,        m9,             m3              ; m9 = m9 * m9 + m3 (m3 == 0): square the coefficients
7383    vcvtpd2dq                  xm8,       m9
7384    vpmovsxdq                  m13,       xm8                              ; 32 bit int to 64 bit int
7385    vpsllq                     m13,       xm2                             ;(signCoef * signCoef) << scaleBits
7386    paddq                      m4,        m13
7387    movu                       [r1],       m13
7388
7389    vpmovsxwd                 m6,        [r0 + 8]
7390    vcvtdq2pd                 m9,        xm6
7391    vfmadd213pd               m9,        m9,             m3
7392    vcvtpd2dq                 xm8,       m9
7393    vpmovsxdq                 m13,       xm8                              ; 32 bit int to 64 bit int
7394    vpsllq                    m13,       xm2                             ;(signCoef * signCoef) << scaleBits
7395    paddq                     m4,        m13
7396    movu                      [r1 + 32], m13
7397
7398    vpmovsxwd                 m6,        [r0 + 16]
7399    vcvtdq2pd                 m9,        xm6
7400    vfmadd213pd               m9,        m9,             m3
7401    vcvtpd2dq                 xm8,       m9
7402    vpmovsxdq                 m13,       xm8                              ; 32 bit int to 64 bit int
7403    vpsllq                    m13,       xm2                             ;(signCoef * signCoef) << scaleBits
7404    paddq                     m4,        m13
7405    movu                      [r1 + 64], m13
7406
    vpmovsxwd                 m6,        [r0 + 24]
7408    vcvtdq2pd                 m9,        xm6
7409    vfmadd213pd               m9,        m9,             m3
7410    vcvtpd2dq                 xm8,       m9
7411    vpmovsxdq                 m13,       xm8                              ; 32 bit int to 64 bit int
7412    vpsllq                    m13,       xm2                             ;(signCoef * signCoef) << scaleBits
7413    paddq                     m4,        m13
    movu                      [r1 + 96], m13

7417    vextracti128              xm2,       m4,            1
7418    paddq                     xm4,       xm2
7419    punpckhqdq                xm2,       xm4,            xm3
7420    paddq                     xm4,       xm2
7421
7422    paddq                     xm0,       xm4
7423    paddq                     xm1,       xm4
7424
7425    movq                      [r2],      xm0
7426    movq                      [r3],      xm1
7427    RET
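
;-----------------------------------------------------------------------------
; psyRdoQuant_1p8: as psyRdoQuant_1p4, for one 4x4 group of an 8x8 block
; (row stride: 8 coefficients).
;-----------------------------------------------------------------------------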
7428INIT_YMM avx2
7429cglobal psyRdoQuant_1p8, 7, 9, 16
7430    mov            r4d,        r4m
7431    lea             r0,        [r0 + 2 * r4]
7432    lea             r4,        [4 * r4]
7433    lea             r1,        [r1 + 2 * r4]
7434    movq           xm0,        [r2]
7435    movq           xm1,        [r3]
7436%if BIT_DEPTH == 12
    mov            r5,         [tab_nonpsyRdo12 + 8]                 ; scaleBits
%elif BIT_DEPTH == 10
    mov            r5,         [tab_nonpsyRdo10 + 8]
%elif BIT_DEPTH == 8
    mov            r5,         [tab_nonpsyRdo8 + 8]
7442%else
7443    %error Unsupported BIT_DEPTH!
7444%endif
7445    movq           xm2,        r5
7446    vpxor           m4,        m4
7447    vpxor           m3,        m3
    vpxor           m13,       m13

7451    vpmovsxwd                  m6,        [r0]
7452    vcvtdq2pd                  m9,        xm6
7453    vfmadd213pd                m9,        m9,             m3
7454    vcvtpd2dq                  xm8,       m9
7455    vpmovsxdq                  m13,       xm8                              ; 32 bit int to 64 bit int
7456    vpsllq                     m13,       xm2                             ;(signCoef * signCoef) << scaleBits
7457    paddq                      m4,        m13
7458    movu                       [r1],       m13
7459
7460    vpmovsxwd                  m6,        [r0 + 16]
7461    vcvtdq2pd                  m9,        xm6
7462    vfmadd213pd                m9,        m9,             m3
7463    vcvtpd2dq                  xm8,       m9
7464    vpmovsxdq                  m13,       xm8                              ; 32 bit int to 64 bit int
7465    vpsllq                     m13,       xm2                             ;(signCoef * signCoef) << scaleBits
7466    paddq                      m4,        m13
7467    movu                       [r1 + 64],       m13
7468
    vpmovsxwd                  m6,        [r0 + 32]
7470    vcvtdq2pd                  m9,        xm6
7471    vfmadd213pd                m9,        m9,             m3
7472    vcvtpd2dq                  xm8,       m9
7473    vpmovsxdq                  m13,       xm8                              ; 32 bit int to 64 bit int
7474    vpsllq                     m13,       xm2                             ;(signCoef * signCoef) << scaleBits
7475    paddq                      m4,        m13
    movu                       [r1 + 128],       m13
7477
7478    vpmovsxwd                  m6,        [r0 + 48]
7479    vcvtdq2pd                  m9,        xm6
7480    vfmadd213pd                m9,        m9,             m3
7481    vcvtpd2dq                  xm8,       m9
7482    vpmovsxdq                  m13,       xm8                              ; 32 bit int to 64 bit int
7483    vpsllq                     m13,       xm2                             ;(signCoef * signCoef) << scaleBits
7484    paddq                      m4,        m13
7485    movu                       [r1 + 192],       m13
7486
7487    vextracti128              xm2,       m4,            1
7488    paddq                     xm4,       xm2
7489    punpckhqdq                xm2,       xm4,            xm3
7490    paddq                     xm4,       xm2
7491
7492    paddq                     xm0,       xm4
7493    paddq                     xm1,       xm4
7494
7495    movq                      [r2],      xm0
7496    movq                      [r3],      xm1
7497    RET
7498
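;-----------------------------------------------------------------------------
; psyRdoQuant_1p16: as psyRdoQuant_1p4, for one 4x4 group of a 16x16 block
; (row stride: 16 coefficients).
;-----------------------------------------------------------------------------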
7499INIT_YMM avx2
7500cglobal psyRdoQuant_1p16, 7, 9, 16
7501    mov            r4d,        r4m
7502    lea             r0,        [r0 + 2 * r4]
7503    lea             r4,        [4 * r4]
7504    lea             r1,        [r1 + 2 * r4]
7505    movq           xm0,        [r2]
7506    movq           xm1,        [r3]
7507%if BIT_DEPTH == 12
7508    mov            r5,         [tab_nonpsyRdo12 + 16]                 ; scaleBits
7509%elif BIT_DEPTH == 10
7510    mov            r5,         [tab_nonpsyRdo10 + 16]
7511%elif BIT_DEPTH == 8
    mov            r5,         [tab_nonpsyRdo8 + 16]
7513%else
7514    %error Unsupported BIT_DEPTH!
7515%endif
7516    movq           xm2,        r5
7517    vpxor           m4,        m4
7518    vpxor           m3,        m3
7519    vpxor           m13,       m13
7520
7521    vpmovsxwd                  m6,        [r0]
7522    vcvtdq2pd                  m9,        xm6
7523    vfmadd213pd                m9,        m9,             m3
7524    vcvtpd2dq                  xm8,       m9
7525    vpmovsxdq                  m13,       xm8                              ; 32 bit int to 64 bit int
7526    vpsllq                     m13,       xm2                             ;(signCoef * signCoef) << scaleBits
7527    paddq                      m4,        m13
7528    movu                       [r1],       m13
7529
    vpmovsxwd                  m6,        [r0 + mmsize]
7532    vcvtdq2pd                  m9,        xm6
7533    vfmadd213pd                m9,        m9,             m3
7534    vcvtpd2dq                  xm8,       m9
7535    vpmovsxdq                  m13,       xm8                              ; 32 bit int to 64 bit int
7536    vpsllq                     m13,       xm2                             ;(signCoef * signCoef) << scaleBits
7537    paddq                      m4,        m13
7538    movu                       [r1 + 4*mmsize],       m13
7539
7540    vpmovsxwd                  m6,        [r0 + 2 * mmsize]
7541    vcvtdq2pd                  m9,        xm6
7542    vfmadd213pd                m9,        m9,             m3
7543    vcvtpd2dq                  xm8,       m9
7544    vpmovsxdq                  m13,       xm8                              ; 32 bit int to 64 bit int
7545    vpsllq                     m13,       xm2                             ;(signCoef * signCoef) << scaleBits
7546    paddq                      m4,        m13
7547    movu                       [r1 + 8*mmsize],       m13
7548
7549    vpmovsxwd                  m6,        [r0 + 3 * mmsize]
7550    vcvtdq2pd                  m9,        xm6
7551    vfmadd213pd                m9,        m9,             m3
7552    vcvtpd2dq                  xm8,       m9
7553    vpmovsxdq                  m13,       xm8                              ; 32 bit int to 64 bit int
7554    vpsllq                     m13,       xm2                             ;(signCoef * signCoef) << scaleBits
7555    paddq                      m4,        m13
7556    movu                       [r1 + 12*mmsize],       m13
7557
7558    vextracti128              xm2,       m4,            1
7559    paddq                     xm4,       xm2
7560    punpckhqdq                xm2,       xm4,            xm3
7561    paddq                     xm4,       xm2
7562
7563    paddq                     xm0,       xm4
7564    paddq                     xm1,       xm4
7565
7566    movq                      [r2],      xm0
7567    movq                      [r3],      xm1
7568    RET
7569
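;-----------------------------------------------------------------------------
; psyRdoQuant_1p32: as psyRdoQuant_1p4, for one 4x4 group of a 32x32 block
; (row stride: 32 coefficients).
;-----------------------------------------------------------------------------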
7570INIT_YMM avx2
7571cglobal psyRdoQuant_1p32, 7, 9, 16
    mov            r4d,        r4m
7573    lea             r0,        [r0 + 2 * r4]
7574    lea             r4,        [4 * r4]
7575    lea             r1,        [r1 + 2 * r4]
7576    movq           xm0,        [r2]
7577    movq           xm1,        [r3]
7578%if BIT_DEPTH == 12
7579    mov            r5,         [tab_nonpsyRdo12 + 24]                 ; scaleBits
7580%elif BIT_DEPTH == 10
7581    mov            r5,         [tab_nonpsyRdo10 + 24]
7582%elif BIT_DEPTH == 8
7583    mov            r5,         [tab_nonpsyRdo8 + 24]
7584%else
7585    %error Unsupported BIT_DEPTH!
7586%endif
7587    movq           xm2,        r5
7588    vpxor           m4,        m4
7589    vpxor           m3,        m3
    vpxor           m13,       m13

7593    vpmovsxwd                  m6,        [r0]
7594    vcvtdq2pd                  m9,        xm6
7595    vfmadd213pd                m9,        m9,             m3
7596    vcvtpd2dq                  xm8,       m9
7597    vpmovsxdq                  m13,       xm8                              ; 32 bit int to 64 bit int
7598    vpsllq                     m13,       xm2                             ;(signCoef * signCoef) << scaleBits
7599    paddq                      m4,        m13
7600    movu                       [r1],       m13
7601
7602    vpmovsxwd                  m6,        [r0 + 2 * mmsize]
7603    vcvtdq2pd                  m9,        xm6
7604    vfmadd213pd                m9,        m9,             m3
7605    vcvtpd2dq                  xm8,       m9
7606    vpmovsxdq                  m13,       xm8                              ; 32 bit int to 64 bit int
7607    vpsllq                     m13,       xm2                             ;(signCoef * signCoef) << scaleBits
7608    paddq                      m4,        m13
7609    movu                       [r1 + 8 * mmsize],       m13
7610
7611    vpmovsxwd                  m6,        [r0 + 4 * mmsize]
7612    vcvtdq2pd                  m9,        xm6
7613    vfmadd213pd                m9,        m9,             m3
7614    vcvtpd2dq                  xm8,       m9
7615    vpmovsxdq                  m13,       xm8                              ; 32 bit int to 64 bit int
7616    vpsllq                     m13,       xm2                             ;(signCoef * signCoef) << scaleBits
7617    paddq                      m4,        m13
7618    movu                       [r1 + 16 * mmsize],       m13
7619
7620    vpmovsxwd                  m6,        [r0 + 6 * mmsize]
7621    vcvtdq2pd                  m9,        xm6
7622    vfmadd213pd                m9,        m9,             m3
7623    vcvtpd2dq                  xm8,       m9
7624    vpmovsxdq                  m13,       xm8                              ; 32 bit int to 64 bit int
7625    vpsllq                     m13,       xm2                             ;(signCoef * signCoef) << scaleBits
7626    paddq                      m4,        m13
    movu                       [r1 + 24*mmsize],       m13
7628
7629    vextracti128              xm2,       m4,            1
7630    paddq                     xm4,       xm2
7631    punpckhqdq                xm2,       xm4,            xm3
7632    paddq                     xm4,       xm2
7633
7634    paddq                     xm0,       xm4
7635    paddq                     xm1,       xm4
7636
7637    movq                      [r2],      xm0
7638    movq                      [r3],      xm1
7639    RET
7640
7641%endif
7642